diff --git a/Makefile b/Makefile index 692a9a5..319723c 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -VERSION := 0.9.5.2 +VERSION := 0.9.5.1 .PHONY: lint vis clean common client server passwd subpkgs install uninstall reinstall ## Tag version of binaries with build info wrt. diff --git a/bacillus/ci_pushbuild.sh b/bacillus/ci_pushbuild.sh index b6092c2..b94f39e 100755 --- a/bacillus/ci_pushbuild.sh +++ b/bacillus/ci_pushbuild.sh @@ -28,13 +28,11 @@ ls ############ stage "Build" ############ -echo "Removing any vendor/ dir if present ..." -rm -rf vendor echo "Recreating go.mod from scratch ..." -mv go.mod go.mod.git || true -mv go.sum go.sum.git || true -go mod init -go mod tidy +#!mv go.mod go.mod.git || true +#!mv go.sum go.sum.git || true +#!go mod init +#!go mod tidy echo "Cleaning go mod cache ..." go clean -modcache diff --git a/go.mod b/go.mod index 9086e68..f711150 100644 --- a/go.mod +++ b/go.mod @@ -1,31 +1,30 @@ module blitter.com/go/xs -go 1.18 +go 1.17 require ( blitter.com/go/cryptmt v1.0.2 blitter.com/go/goutmp v1.0.6 blitter.com/go/herradurakex v1.0.0 - blitter.com/go/hopscotch v0.0.0-20220617051533-4b42ccd4e00a + blitter.com/go/hopscotch v0.0.0-20211113042251-b8a306eea4dc blitter.com/go/kyber v0.0.0-20200130200857-6f2021cb88d9 blitter.com/go/newhope v0.0.0-20200130200750-192fc08a8aae github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da github.com/creack/pty v1.1.18 github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f github.com/kuking/go-frodokem v1.0.2 - github.com/mattn/go-isatty v0.0.16 + github.com/mattn/go-isatty v0.0.14 github.com/xtaci/kcp-go v5.4.20+incompatible - golang.org/x/crypto v0.0.0-20220829220503-c86fa9a7ed90 - golang.org/x/sys v0.0.0-20220909162455-aba9fc2a8ff2 + golang.org/x/crypto v0.0.0-20220408190544-5352b0902921 + golang.org/x/sys v0.0.0-20220408201424-a24fb2fb8a0f gopkg.in/hlandau/passlib.v1 v1.0.11 ) require ( blitter.com/go/chacha20 v0.0.0-20200130200441-214e4085f54c // indirect - blitter.com/go/groestl v0.0.0-20220410000905-c4decbf31d64 // indirect blitter.com/go/mtwist v1.0.1 // indirect - github.com/klauspost/cpuid/v2 v2.1.1 // indirect - github.com/klauspost/reedsolomon v1.11.0 // indirect + github.com/klauspost/cpuid/v2 v2.0.6 // indirect + github.com/klauspost/reedsolomon v1.9.16 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/templexxx/cpufeat v0.0.0-20180724012125-cef66df7f161 // indirect github.com/templexxx/xor v0.0.0-20191217153810-f85b25db303b // indirect diff --git a/go.sum b/go.sum index 677e1c7..4217963 100644 --- a/go.sum +++ b/go.sum @@ -4,12 +4,10 @@ blitter.com/go/cryptmt v1.0.2 h1:ZcLhQk7onUssXyQwG3GdXDXctCVnNL+b7aFuvwOdKXc= blitter.com/go/cryptmt v1.0.2/go.mod h1:tdME2J3O4agaDAYIYNQzzuB28yVGnPSMmV3a/ucSU84= blitter.com/go/goutmp v1.0.6 h1:jRKRw2WalVBza4T50etAfbvT2xp9G5uykIHTvyB5r0k= blitter.com/go/goutmp v1.0.6/go.mod h1:DnK/uLBu1/1yLFiuVlmwvWErzAWVp+pDv7t6ZaQRLNc= -blitter.com/go/groestl v0.0.0-20220410000905-c4decbf31d64 h1:SH6cZ4JiOTmWGeVd5hCgt8gsMvfPPHWpEwNdxfsBugM= -blitter.com/go/groestl v0.0.0-20220410000905-c4decbf31d64/go.mod h1:YMdIR/gCtFwU/a09jyWAwUu2J9CQejUFwkfD+PyVg+4= blitter.com/go/herradurakex v1.0.0 h1:6XaxY+JLT1HUWPF0gYJnjX3pVjrw4YhYZEzZ1U0wkyc= blitter.com/go/herradurakex v1.0.0/go.mod h1:m3+vYZX+2dDjdo+n/HDnXEYJX9pwmNeQLgAfJM8mtxw= -blitter.com/go/hopscotch v0.0.0-20220617051533-4b42ccd4e00a h1:1fEN7eJMG9TweQuGMAgQlTJ0Wl7lsdDL4Nt5gHZijhY= -blitter.com/go/hopscotch v0.0.0-20220617051533-4b42ccd4e00a/go.mod h1:LtcFd2/R9xcau5SZIYeaHvdqAM7Y5pyvdZYT5J9HAME= +blitter.com/go/hopscotch v0.0.0-20211113042251-b8a306eea4dc h1:IS+jxdKSdlqp6TWG3yMoBde/cctBEMwMDg588JHxgTE= +blitter.com/go/hopscotch v0.0.0-20211113042251-b8a306eea4dc/go.mod h1:9Da1oy0t9aUw3wviba+2mP1inbLGbDuCKAO3mmGQha4= blitter.com/go/kyber v0.0.0-20200130200857-6f2021cb88d9 h1:D45AnrNphtvczBXRp5JQicZRTgaK/Is5bgPDDvRKhTc= blitter.com/go/kyber v0.0.0-20200130200857-6f2021cb88d9/go.mod h1:SK6QfGG72lIfKW1Td0wH7f0wwN5nSIhV3K+wvzGNjrw= blitter.com/go/mtwist v1.0.1 h1:PxmoWexfMpLmc8neHP/PcRc3s17ct7iz4d5W/qJVt04= @@ -47,14 +45,14 @@ github.com/google/go-cmp v0.3.1/go.mod h1:8QqcDgzrUqlUb/G2PQTWiueGozuR1884gddMyw github.com/google/go-cmp v0.4.0/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f h1:UWGE8Vi+1Agt0lrvnd7UsmvwqWKRzb9byK9iQmsbY0Y= github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f/go.mod h1:u+9Snq0w+ZdYKi8BBoaxnEwWu0fY4Kvu9ByFpM51t1s= -github.com/klauspost/cpuid/v2 v2.1.1 h1:t0wUqjowdm8ezddV5k0tLWVklVuvLJpoHeb4WBdydm0= -github.com/klauspost/cpuid/v2 v2.1.1/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY= -github.com/klauspost/reedsolomon v1.11.0 h1:fc24kMFf4I6dXJwSkVAsw8Za/dMcJrV5ImeDjG3ss1M= -github.com/klauspost/reedsolomon v1.11.0/go.mod h1:FXLZzlJIdfqEnQLdUKWNRuMZg747hZ4oYp2Ml60Lb/k= +github.com/klauspost/cpuid/v2 v2.0.6 h1:dQ5ueTiftKxp0gyjKSx5+8BtPWkyQbd95m8Gys/RarI= +github.com/klauspost/cpuid/v2 v2.0.6/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= +github.com/klauspost/reedsolomon v1.9.16 h1:mR0AwphBwqFv/I3B9AHtNKvzuowI1vrj8/3UX4XRmHA= +github.com/klauspost/reedsolomon v1.9.16/go.mod h1:eqPAcE7xar5CIzcdfwydOEdcmchAKAP/qs14y4GCBOk= github.com/kuking/go-frodokem v1.0.2 h1:sxdguENCyr6WnLbJ/cjz0AYCW75H1b+E6zXY2ldZnUU= github.com/kuking/go-frodokem v1.0.2/go.mod h1:83ZX1kHOd72ouCsvbffCqJIj7Ih83MQTAjH2QbqzLZk= -github.com/mattn/go-isatty v0.0.16 h1:bq3VjFmv/sOjHtdEhmkEV4x1AJtvUvOJ2PFAZ5+peKQ= -github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= +github.com/mattn/go-isatty v0.0.14 h1:yVuAays6BHfxijgZPzw+3Zlu5yQgKGP2/hcQbHb7S9Y= +github.com/mattn/go-isatty v0.0.14/go.mod h1:7GGIvUiUoEMVVmxf/4nioHXj79iQHKdU27kJ6hsGG94= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -81,8 +79,8 @@ golang.org/x/crypto v0.0.0-20200128174031-69ecbb4d6d5d/go.mod h1:LzIPMQfyMNhhGPh golang.org/x/crypto v0.0.0-20200510223506-06a226fb4e37/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20201012173705-84dcc777aaee/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= -golang.org/x/crypto v0.0.0-20220829220503-c86fa9a7ed90 h1:Y/gsMcFOcR+6S6f3YeMKl5g+dZMEWqcz5Czj/GWYbkM= -golang.org/x/crypto v0.0.0-20220829220503-c86fa9a7ed90/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= +golang.org/x/crypto v0.0.0-20220408190544-5352b0902921 h1:iU7T1X1J6yxDr0rda54sWGkHgOp5XJrqm79gcNlC2VM= +golang.org/x/crypto v0.0.0-20220408190544-5352b0902921/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= @@ -104,12 +102,16 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190902133755-9109b7679e13/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= -golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220909162455-aba9fc2a8ff2 h1:wM1k/lXfpc5HdkJJyW9GELpd8ERGdnh8sMGL6Gzq3Ho= -golang.org/x/sys v0.0.0-20220909162455-aba9fc2a8ff2/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220408201424-a24fb2fb8a0f h1:8w7RhxzTVgUzw/AH/9mUV5q0vMgy40SQRursCcfmkCw= +golang.org/x/sys v0.0.0-20220408201424-a24fb2fb8a0f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= diff --git a/vendor/blitter.com/go/hopscotch/hopscotch.go b/vendor/blitter.com/go/hopscotch/hopscotch.go index d462344..aea1aea 100644 --- a/vendor/blitter.com/go/hopscotch/hopscotch.go +++ b/vendor/blitter.com/go/hopscotch/hopscotch.go @@ -25,11 +25,10 @@ import ( _ "crypto/sha512" b2b "golang.org/x/crypto/blake2b" - groestl "blitter.com/go/groestl" ) const ( - maxResched = 99 // above 20 starts to show outlines in 'tuxtest' ... so 10 max + maxResched = 10 // above 20 starts to show outlines in 'tuxtest' ... so 10 max ) type Cipher struct { @@ -72,10 +71,9 @@ func New(r io.Reader, w io.Writer, resched int, key []byte) (c *Cipher) { } // Init all the hash algs we're going to 'hop' around with initial keystream - c.h = make([]hash.Hash, 3) + c.h = make([]hash.Hash, 2) c.h[0] = sha512.New() c.h[1], _ = b2b.New512(c.k) - c.h[2] = groestl.New512() c.keyUpdate(c.k) c.rekeyCtr = len(c.hs) * c.resched // lower multiplier == greater security, lower speed @@ -111,11 +109,6 @@ func (c *Cipher) keyUpdate(data []byte) { sliceTmp := b2b.Sum512(data) c.hs = append(c.hs, sliceTmp[:]...) } - { - c.h[2].Write(data) - sliceTmp := groestl.Sum512(data) - c.hs = append(c.hs, sliceTmp[:]...) - } } func (c *Cipher) yield(ib byte) (ob byte) { @@ -125,38 +118,25 @@ func (c *Cipher) yield(ib byte) (ob byte) { //fmt.Fprintf(os.Stderr, "[c.hidx:%v c.idx:%v]\n", c.hidx, c.idx) // NOTE: using a non-prime modulus degrades CV % from ~ 0.055 to ~ 0.07 - switch c.ctr % 5 { + switch c.ctr % 3 { case 0: ob = c.bTmp ^ ib ^ byte(c.ctr) ^ byte(c.idx) ^ c.hs[len(c.hs)-19] ^ c.hs[len(c.hs)-2] ^ c.hs[len(c.hs)-3] ^ c.hs[len(c.hs)-5] ^ - c.hs[len(c.hs)-7] ^ c.hs[len(c.hs)-11] ^ c.hs[len(c.hs)-13] ^ c.hs[len(c.hs)-17] ^ - c.hs[len(c.hs)-47] ^ c.hs[len(c.hs)-43] ^ c.hs[len(c.hs)-41] ^ c.hs[len(c.hs)-39] + c.hs[len(c.hs)-7] ^ c.hs[len(c.hs)-11] ^ c.hs[len(c.hs)-13] ^ c.hs[len(c.hs)-17] case 1: ob = c.bTmp ^ ib ^ byte(c.ctr) ^ byte(c.idx) ^ c.hs[len(c.hs)-5] ^ c.hs[len(c.hs)-7] ^ c.hs[len(c.hs)-11] ^ c.hs[len(c.hs)-13] ^ - c.hs[len(c.hs)-17] ^ c.hs[len(c.hs)-19] ^ c.hs[len(c.hs)-23] ^ c.hs[len(c.hs)-29] ^ - c.hs[len(c.hs)-43] ^ c.hs[len(c.hs)-41] ^ c.hs[len(c.hs)-39] ^ c.hs[len(c.hs)-37] + c.hs[len(c.hs)-17] ^ c.hs[len(c.hs)-19] ^ c.hs[len(c.hs)-23] ^ c.hs[len(c.hs)-29] case 2: ob = c.bTmp ^ ib ^ byte(c.ctr) ^ byte(c.idx) ^ c.hs[len(c.hs)-13] ^ c.hs[len(c.hs)-17] ^ c.hs[len(c.hs)-23] ^ c.hs[len(c.hs)-27] ^ - c.hs[len(c.hs)-29] ^ c.hs[len(c.hs)-31] ^ c.hs[len(c.hs)-2] ^ c.hs[len(c.hs)-3] ^ - c.hs[len(c.hs)-37] ^ c.hs[len(c.hs)-41] ^ c.hs[len(c.hs)-39] ^ c.hs[len(c.hs)-47] - case 3: - ob = c.bTmp ^ ib ^ byte(c.ctr) ^ byte(c.idx) ^ - c.hs[len(c.hs)-13] ^ c.hs[len(c.hs)-17] ^ c.hs[len(c.hs)-23] ^ c.hs[len(c.hs)-27] ^ - c.hs[len(c.hs)-29] ^ c.hs[len(c.hs)-31] ^ c.hs[len(c.hs)-5] ^ c.hs[len(c.hs)-3] ^ - c.hs[len(c.hs)-43] ^ c.hs[len(c.hs)-41] ^ c.hs[len(c.hs)-39] ^ c.hs[len(c.hs)-37] - case 4: - ob = c.bTmp ^ ib ^ byte(c.ctr) ^ byte(c.idx) ^ - c.hs[len(c.hs)-13] ^ c.hs[len(c.hs)-17] ^ c.hs[len(c.hs)-23] ^ c.hs[len(c.hs)-27] ^ - c.hs[len(c.hs)-29] ^ c.hs[len(c.hs)-31] ^ c.hs[len(c.hs)-7] ^ c.hs[len(c.hs)-3] ^ - c.hs[len(c.hs)-33] ^ c.hs[len(c.hs)-41] ^ c.hs[len(c.hs)-45] ^ c.hs[len(c.hs)-43] + c.hs[len(c.hs)-29] ^ c.hs[len(c.hs)-31] ^ c.hs[len(c.hs)-2] ^ c.hs[len(c.hs)-3] } if c.ctr%c.rekeyCtr == 0 { - bufTmp := make([]byte, 16*3) + bufTmp := make([]byte, 32) _, _ = c.prng.Read(bufTmp) c.keyUpdate(bufTmp) } diff --git a/vendor/github.com/klauspost/cpuid/v2/.travis.yml b/vendor/github.com/klauspost/cpuid/v2/.travis.yml new file mode 100644 index 0000000..aa9bad7 --- /dev/null +++ b/vendor/github.com/klauspost/cpuid/v2/.travis.yml @@ -0,0 +1,67 @@ +language: go + +os: + - linux + - osx + - windows + +arch: + - amd64 + - arm64 + +go: + - 1.13.x + - 1.14.x + - 1.15.x + - 1.16.x + - master + +env: + - CGO_ENABLED=0 + +script: + - go vet ./... + - go test -test.v -test.run ^TestCPUID$ + - CGO_ENABLED=1 go test -race ./... + - go test -tags=nounsafe -test.v -test.run ^TestCPUID$ + - go test -tags=noasm ./... + - go run ./cmd/cpuid/main.go + - go run ./cmd/cpuid/main.go -json + +matrix: + allow_failures: + - go: 'master' + fast_finish: true + include: + - stage: other + go: 1.16.x + os: linux + arch: amd64 + script: + - diff <(gofmt -d .) <(printf "") + - diff <(gofmt -d ./private) <(printf "") + - curl -sfL https://git.io/goreleaser | VERSION=v0.157.0 sh -s -- check # check goreleaser config for deprecations + - curl -sL https://git.io/goreleaser | VERSION=v0.157.0 sh -s -- --snapshot --skip-publish --rm-dist + - go get github.com/klauspost/asmfmt&&go install github.com/klauspost/asmfmt/cmd/asmfmt + - diff <(asmfmt -d .) <(printf "") + - GOOS=linux GOARCH=386 go test . + - ./test-architectures.sh + - stage: other + go: 1.15.x + os: linux + arch: amd64 + script: + - ./test-architectures.sh + +deploy: + - provider: script + skip_cleanup: true + script: curl -sL https://git.io/goreleaser | VERSION=v0.157.0 bash || true + on: + tags: true + condition: ($TRAVIS_OS_NAME = linux) && ($TRAVIS_CPU_ARCH = amd64) + go: 1.16.x +branches: + only: + - master + - /^v\d+\.\d+(\.\d+)?(-\S*)?$/ diff --git a/vendor/github.com/klauspost/cpuid/v2/README.md b/vendor/github.com/klauspost/cpuid/v2/README.md index ea7df3d..465f4b7 100644 --- a/vendor/github.com/klauspost/cpuid/v2/README.md +++ b/vendor/github.com/klauspost/cpuid/v2/README.md @@ -39,10 +39,10 @@ func main() { fmt.Println("ThreadsPerCore:", CPU.ThreadsPerCore) fmt.Println("LogicalCores:", CPU.LogicalCores) fmt.Println("Family", CPU.Family, "Model:", CPU.Model, "Vendor ID:", CPU.VendorID) - fmt.Println("Features:", strings.Join(CPU.FeatureSet(), ",")) + fmt.Println("Features:", fmt.Sprintf(strings.Join(CPU.FeatureSet(), ","))) fmt.Println("Cacheline bytes:", CPU.CacheLine) fmt.Println("L1 Data Cache:", CPU.Cache.L1D, "bytes") - fmt.Println("L1 Instruction Cache:", CPU.Cache.L1I, "bytes") + fmt.Println("L1 Instruction Cache:", CPU.Cache.L1D, "bytes") fmt.Println("L2 Cache:", CPU.Cache.L2, "bytes") fmt.Println("L3 Cache:", CPU.Cache.L3, "bytes") fmt.Println("Frequency", CPU.Hz, "hz") @@ -132,127 +132,6 @@ func main() { } ``` -## commandline - -Download as binary from: https://github.com/klauspost/cpuid/releases - -Install from source: - -`go install github.com/klauspost/cpuid/v2/cmd/cpuid@latest` - -### Example - -``` -λ cpuid -Name: AMD Ryzen 9 3950X 16-Core Processor -Vendor String: AuthenticAMD -Vendor ID: AMD -PhysicalCores: 16 -Threads Per Core: 2 -Logical Cores: 32 -CPU Family 23 Model: 113 -Features: ADX,AESNI,AVX,AVX2,BMI1,BMI2,CLMUL,CLZERO,CMOV,CMPXCHG8,CPBOOST,CX16,F16C,FMA3,FXSR,FXSROPT,HTT,HYPERVISOR,LAHF,LZCNT,MCAOVERFLOW,MMX,MMXEXT,MOVBE,NX,OSXSAVE,POPCNT,RDRAND,RDSEED,RDTSCP,SCE,SHA,SSE,SSE2,SSE3,SSE4,SSE42,SSE4A,SSSE3,SUCCOR,X87,XSAVE -Microarchitecture level: 3 -Cacheline bytes: 64 -L1 Instruction Cache: 32768 bytes -L1 Data Cache: 32768 bytes -L2 Cache: 524288 bytes -L3 Cache: 16777216 bytes - -``` -### JSON Output: - -``` -λ cpuid --json -{ - "BrandName": "AMD Ryzen 9 3950X 16-Core Processor", - "VendorID": 2, - "VendorString": "AuthenticAMD", - "PhysicalCores": 16, - "ThreadsPerCore": 2, - "LogicalCores": 32, - "Family": 23, - "Model": 113, - "CacheLine": 64, - "Hz": 0, - "BoostFreq": 0, - "Cache": { - "L1I": 32768, - "L1D": 32768, - "L2": 524288, - "L3": 16777216 - }, - "SGX": { - "Available": false, - "LaunchControl": false, - "SGX1Supported": false, - "SGX2Supported": false, - "MaxEnclaveSizeNot64": 0, - "MaxEnclaveSize64": 0, - "EPCSections": null - }, - "Features": [ - "ADX", - "AESNI", - "AVX", - "AVX2", - "BMI1", - "BMI2", - "CLMUL", - "CLZERO", - "CMOV", - "CMPXCHG8", - "CPBOOST", - "CX16", - "F16C", - "FMA3", - "FXSR", - "FXSROPT", - "HTT", - "HYPERVISOR", - "LAHF", - "LZCNT", - "MCAOVERFLOW", - "MMX", - "MMXEXT", - "MOVBE", - "NX", - "OSXSAVE", - "POPCNT", - "RDRAND", - "RDSEED", - "RDTSCP", - "SCE", - "SHA", - "SSE", - "SSE2", - "SSE3", - "SSE4", - "SSE42", - "SSE4A", - "SSSE3", - "SUCCOR", - "X87", - "XSAVE" - ], - "X64Level": 3 -} -``` - -### Check CPU microarch level - -``` -λ cpuid --check-level=3 -2022/03/18 17:04:40 AMD Ryzen 9 3950X 16-Core Processor -2022/03/18 17:04:40 Microarchitecture level 3 is supported. Max level is 3. -Exit Code 0 - -λ cpuid --check-level=4 -2022/03/18 17:06:18 AMD Ryzen 9 3950X 16-Core Processor -2022/03/18 17:06:18 Microarchitecture level 4 not supported. Max level is 3. -Exit Code 1 -``` - # license This code is published under an MIT license. See LICENSE file for more information. diff --git a/vendor/github.com/klauspost/cpuid/v2/cpuid.go b/vendor/github.com/klauspost/cpuid/v2/cpuid.go index 27f3325..43e9cc1 100644 --- a/vendor/github.com/klauspost/cpuid/v2/cpuid.go +++ b/vendor/github.com/klauspost/cpuid/v2/cpuid.go @@ -14,7 +14,6 @@ import ( "flag" "fmt" "math" - "math/bits" "os" "runtime" "strings" @@ -84,7 +83,6 @@ const ( AVX512DQ // AVX-512 Doubleword and Quadword Instructions AVX512ER // AVX-512 Exponential and Reciprocal Instructions AVX512F // AVX-512 Foundation - AVX512FP16 // AVX-512 FP16 Instructions AVX512IFMA // AVX-512 Integer Fused Multiply-Add Instructions AVX512PF // AVX-512 Prefetch Instructions AVX512VBMI // AVX-512 Vector Bit Manipulation Instructions @@ -93,32 +91,21 @@ const ( AVX512VNNI // AVX-512 Vector Neural Network Instructions AVX512VP2INTERSECT // AVX-512 Intersect for D/Q AVX512VPOPCNTDQ // AVX-512 Vector Population Count Doubleword and Quadword - AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one - AVXVNNI // AVX (VEX encoded) VNNI neural network instructions + AVXSLOW // Indicates the CPU performs 2 128 bit operations instead of one. BMI1 // Bit Manipulation Instruction Set 1 BMI2 // Bit Manipulation Instruction Set 2 - CETIBT // Intel CET Indirect Branch Tracking - CETSS // Intel CET Shadow Stack CLDEMOTE // Cache Line Demote CLMUL // Carry-less Multiplication - CLZERO // CLZERO instruction supported CMOV // i686 CMOV - CMPSB_SCADBS_SHORT // Fast short CMPSB and SCASB - CMPXCHG8 // CMPXCHG8 instruction - CPBOOST // Core Performance Boost CX16 // CMPXCHG16B Instruction ENQCMD // Enqueue Command ERMS // Enhanced REP MOVSB/STOSB F16C // Half-precision floating-point conversion FMA3 // Intel FMA 3. Does not imply AVX. FMA4 // Bulldozer FMA4 functions - FXSR // FXSAVE, FXRESTOR instructions, CR4 bit 9 - FXSROPT // FXSAVE/FXRSTOR optimizations - GFNI // Galois Field New Instructions. May require other features (AVX, AVX512VL,AVX512F) based on usage. + GFNI // Galois Field New Instructions HLE // Hardware Lock Elision - HRESET // If set CPU supports history reset and the IA32_HRESET_ENABLE MSR HTT // Hyperthreading (enabled) - HWA // Hardware assert supported. Indicates support for MSRC001_10 HYPERVISOR // This bit has been reserved by Intel & AMD for use by hypervisors IBPB // Indirect Branch Restricted Speculation (IBRS) and Indirect Branch Predictor Barrier (IBPB) IBS // Instruction Based Sampling (AMD) @@ -130,48 +117,22 @@ const ( IBSOPSAM // Instruction Based Sampling Feature (AMD) IBSRDWROPCNT // Instruction Based Sampling Feature (AMD) IBSRIPINVALIDCHK // Instruction Based Sampling Feature (AMD) - IBS_PREVENTHOST // Disallowing IBS use by the host supported - INT_WBINVD // WBINVD/WBNOINVD are interruptible. - INVLPGB // NVLPGB and TLBSYNC instruction supported - LAHF // LAHF/SAHF in long mode - LAM // If set, CPU supports Linear Address Masking - LBRVIRT // LBR virtualization LZCNT // LZCNT instruction - MCAOVERFLOW // MCA overflow recovery support. - MCOMMIT // MCOMMIT instruction supported MMX // standard MMX MMXEXT // SSE integer functions or AMD MMX ext - MOVBE // MOVBE instruction (big-endian) MOVDIR64B // Move 64 Bytes as Direct Store MOVDIRI // Move Doubleword as Direct Store - MOVSB_ZL // Fast Zero-Length MOVSB MPX // Intel MPX (Memory Protection Extensions) - MSRIRC // Instruction Retired Counter MSR available - MSR_PAGEFLUSH // Page Flush MSR available - NRIPS // Indicates support for NRIP save on VMEXIT NX // NX (No-Execute) bit - OSXSAVE // XSAVE enabled by OS - PCONFIG // PCONFIG for Intel Multi-Key Total Memory Encryption POPCNT // POPCNT instruction - RDPRU // RDPRU instruction supported RDRAND // RDRAND instruction is available RDSEED // RDSEED instruction is available RDTSCP // RDTSCP Instruction RTM // Restricted Transactional Memory - RTM_ALWAYS_ABORT // Indicates that the loaded microcode is forcing RTM abort. SERIALIZE // Serialize Instruction Execution - SEV // AMD Secure Encrypted Virtualization supported - SEV_64BIT // AMD SEV guest execution only allowed from a 64-bit host - SEV_ALTERNATIVE // AMD SEV Alternate Injection supported - SEV_DEBUGSWAP // Full debug state swap supported for SEV-ES guests - SEV_ES // AMD SEV Encrypted State supported - SEV_RESTRICTED // AMD SEV Restricted Injection supported - SEV_SNP // AMD SEV Secure Nested Paging supported SGX // Software Guard Extensions SGXLC // Software Guard Extensions Launch Control SHA // Intel SHA Extensions - SME // AMD Secure Memory Encryption supported - SME_COHERENT // AMD Hardware cache coherency across encryption domains enforced SSE // SSE functions SSE2 // P4 SSE functions SSE3 // Prescott SSE3 functions @@ -180,38 +141,14 @@ const ( SSE4A // AMD Barcelona microarchitecture SSE4a instructions SSSE3 // Conroe SSSE3 functions STIBP // Single Thread Indirect Branch Predictors - STOSB_SHORT // Fast short STOSB - SUCCOR // Software uncorrectable error containment and recovery capability. - SVM // AMD Secure Virtual Machine - SVMDA // Indicates support for the SVM decode assists. - SVMFBASID // SVM, Indicates that TLB flush events, including CR3 writes and CR4.PGE toggles, flush only the current ASID's TLB entries. Also indicates support for the extended VMCBTLB_Control - SVML // AMD SVM lock. Indicates support for SVM-Lock. - SVMNP // AMD SVM nested paging - SVMPF // SVM pause intercept filter. Indicates support for the pause intercept filter - SVMPFT // SVM PAUSE filter threshold. Indicates support for the PAUSE filter cycle count threshold - SYSCALL // System-Call Extension (SCE): SYSCALL and SYSRET instructions. - SYSEE // SYSENTER and SYSEXIT instructions TBM // AMD Trailing Bit Manipulation - TOPEXT // TopologyExtensions: topology extensions support. Indicates support for CPUID Fn8000_001D_EAX_x[N:0]-CPUID Fn8000_001E_EDX. - TME // Intel Total Memory Encryption. The following MSRs are supported: IA32_TME_CAPABILITY, IA32_TME_ACTIVATE, IA32_TME_EXCLUDE_MASK, and IA32_TME_EXCLUDE_BASE. - TSCRATEMSR // MSR based TSC rate control. Indicates support for MSR TSC ratio MSRC000_0104 TSXLDTRK // Intel TSX Suspend Load Address Tracking - VAES // Vector AES. AVX(512) versions requires additional checks. - VMCBCLEAN // VMCB clean bits. Indicates support for VMCB clean bits. - VMPL // AMD VM Permission Levels supported - VMSA_REGPROT // AMD VMSA Register Protection supported + VAES // Vector AES VMX // Virtual Machine Extensions - VPCLMULQDQ // Carry-Less Multiplication Quadword. Requires AVX for 3 register versions. - VTE // AMD Virtual Transparent Encryption supported + VPCLMULQDQ // Carry-Less Multiplication Quadword WAITPKG // TPAUSE, UMONITOR, UMWAIT WBNOINVD // Write Back and Do Not Invalidate Cache - X87 // FPU - XGETBV1 // Supports XGETBV with ECX = 1 XOP // Bulldozer XOP functions - XSAVE // XSAVE, XRESTOR, XSETBV, XGETBV - XSAVEC // Supports XSAVEC and the compacted form of XRSTOR. - XSAVEOPT // XSAVEOPT available - XSAVES // Supports XSAVES/XRSTORS and IA32_XSS // ARM features: AESARM // AES instructions @@ -238,6 +175,7 @@ const ( SM3 // SM3 instructions SM4 // SM4 instructions SVE // Scalable Vector Extension + // Keep it last. It automatically defines the size of []flagSet lastID @@ -255,10 +193,8 @@ type CPUInfo struct { LogicalCores int // Number of physical cores times threads that can run on each core through the use of hyperthreading. Will be 0 if undetectable. Family int // CPU family number Model int // CPU model number - Stepping int // CPU stepping info CacheLine int // Cache line size in bytes. Will be 0 if undetectable. - Hz int64 // Clock speed, if known, 0 otherwise. Will attempt to contain base clock speed. - BoostFreq int64 // Max clock speed, if known, 0 otherwise + Hz int64 // Clock speed, if known, 0 otherwise Cache struct { L1I int // L1 Instruction Cache (per core or shared). Will be -1 if undetected L1D int // L1 Data Cache (per core or shared). Will be -1 if undetected @@ -362,41 +298,6 @@ func (c CPUInfo) Has(id FeatureID) bool { return c.featureSet.inSet(id) } -// AnyOf returns whether the CPU supports one or more of the requested features. -func (c CPUInfo) AnyOf(ids ...FeatureID) bool { - for _, id := range ids { - if c.featureSet.inSet(id) { - return true - } - } - return false -} - -// https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels -var level1Features = flagSetWith(CMOV, CMPXCHG8, X87, FXSR, MMX, SYSCALL, SSE, SSE2) -var level2Features = flagSetWith(CMOV, CMPXCHG8, X87, FXSR, MMX, SYSCALL, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3) -var level3Features = flagSetWith(CMOV, CMPXCHG8, X87, FXSR, MMX, SYSCALL, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3, AVX, AVX2, BMI1, BMI2, F16C, FMA3, LZCNT, MOVBE, OSXSAVE) -var level4Features = flagSetWith(CMOV, CMPXCHG8, X87, FXSR, MMX, SYSCALL, SSE, SSE2, CX16, LAHF, POPCNT, SSE3, SSE4, SSE42, SSSE3, AVX, AVX2, BMI1, BMI2, F16C, FMA3, LZCNT, MOVBE, OSXSAVE, AVX512F, AVX512BW, AVX512CD, AVX512DQ, AVX512VL) - -// X64Level returns the microarchitecture level detected on the CPU. -// If features are lacking or non x64 mode, 0 is returned. -// See https://en.wikipedia.org/wiki/X86-64#Microarchitecture_levels -func (c CPUInfo) X64Level() int { - if c.featureSet.hasSet(level4Features) { - return 4 - } - if c.featureSet.hasSet(level3Features) { - return 3 - } - if c.featureSet.hasSet(level2Features) { - return 2 - } - if c.featureSet.hasSet(level1Features) { - return 1 - } - return 0 -} - // Disable will disable one or several features. func (c *CPUInfo) Disable(ids ...FeatureID) bool { for _, id := range ids { @@ -419,10 +320,11 @@ func (c CPUInfo) IsVendor(v Vendor) bool { return c.VendorID == v } -// FeatureSet returns all available features as strings. func (c CPUInfo) FeatureSet() []string { - s := make([]string, 0, c.featureSet.nEnabled()) - s = append(s, c.featureSet.Strings()...) + s := make([]string, 0) + for _, f := range c.featureSet.Strings() { + s = append(s, f) + } return s } @@ -461,42 +363,25 @@ func (c CPUInfo) LogicalCPU() int { return int(ebx >> 24) } -// frequencies tries to compute the clock speed of the CPU. If leaf 15 is +// hertz tries to compute the clock speed of the CPU. If leaf 15 is // supported, use it, otherwise parse the brand string. Yes, really. -func (c *CPUInfo) frequencies() { - c.Hz, c.BoostFreq = 0, 0 +func hertz(model string) int64 { mfi := maxFunctionID() if mfi >= 0x15 { eax, ebx, ecx, _ := cpuid(0x15) if eax != 0 && ebx != 0 && ecx != 0 { - c.Hz = (int64(ecx) * int64(ebx)) / int64(eax) + return int64((int64(ecx) * int64(ebx)) / int64(eax)) } } - if mfi >= 0x16 { - a, b, _, _ := cpuid(0x16) - // Base... - if a&0xffff > 0 { - c.Hz = int64(a&0xffff) * 1_000_000 - } - // Boost... - if b&0xffff > 0 { - c.BoostFreq = int64(b&0xffff) * 1_000_000 - } - } - if c.Hz > 0 { - return - } - // computeHz determines the official rated speed of a CPU from its brand // string. This insanity is *actually the official documented way to do // this according to Intel*, prior to leaf 0x15 existing. The official // documentation only shows this working for exactly `x.xx` or `xxxx` // cases, e.g., `2.50GHz` or `1300MHz`; this parser will accept other // sizes. - model := c.BrandName hz := strings.LastIndex(model, "Hz") if hz < 3 { - return + return 0 } var multiplier int64 switch model[hz-1] { @@ -508,7 +393,7 @@ func (c *CPUInfo) frequencies() { multiplier = 1000 * 1000 * 1000 * 1000 } if multiplier == 0 { - return + return 0 } freq := int64(0) divisor := int64(0) @@ -520,22 +405,21 @@ func (c *CPUInfo) frequencies() { decimalShift *= 10 } else if model[i] == '.' { if divisor != 0 { - return + return 0 } divisor = decimalShift } else { - return + return 0 } } // we didn't find a space if i < 0 { - return + return 0 } if divisor != 0 { - c.Hz = (freq * multiplier) / divisor - return + return (freq * multiplier) / divisor } - c.Hz = freq * multiplier + return freq * multiplier } // VM Will return true if the cpu id indicates we are in @@ -584,32 +468,6 @@ func (s *flagSet) or(other flagSet) { } } -// hasSet returns whether all features are present. -func (s flagSet) hasSet(other flagSet) bool { - for i, v := range other[:] { - if s[i]&v != v { - return false - } - } - return true -} - -// nEnabled will return the number of enabled flags. -func (s flagSet) nEnabled() (n int) { - for _, v := range s[:] { - n += bits.OnesCount64(uint64(v)) - } - return n -} - -func flagSetWith(feat ...FeatureID) flagSet { - var res flagSet - for _, f := range feat { - res.set(f) - } - return res -} - // ParseFeature will parse the string and return the ID of the matching feature. // Will return UNKNOWN if not found. func ParseFeature(s string) FeatureID { @@ -690,7 +548,7 @@ func threadsPerCore() int { if vend == AMD { // Workaround for AMD returning 0, assume 2 if >= Zen 2 // It will be more correct than not. - fam, _, _ := familyModel() + fam, _ := familyModel() _, _, _, d := cpuid(1) if (d&(1<<28)) != 0 && fam >= 23 { return 2 @@ -728,27 +586,14 @@ func logicalCores() int { } } -func familyModel() (family, model, stepping int) { +func familyModel() (int, int) { if maxFunctionID() < 0x1 { - return 0, 0, 0 + return 0, 0 } eax, _, _, _ := cpuid(1) - // If BaseFamily[3:0] is less than Fh then ExtendedFamily[7:0] is reserved and Family is equal to BaseFamily[3:0]. - family = int((eax >> 8) & 0xf) - extFam := family == 0x6 // Intel is 0x6, needs extended model. - if family == 0xf { - // Add ExtFamily - family += int((eax >> 20) & 0xff) - extFam = true - } - // If BaseFamily[3:0] is less than 0Fh then ExtendedModel[3:0] is reserved and Model is equal to BaseModel[3:0]. - model = int((eax >> 4) & 0xf) - if extFam { - // Add ExtModel - model += int((eax >> 12) & 0xf0) - } - stepping = int(eax & 0xf) - return family, model, stepping + family := ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff) + model := ((eax >> 4) & 0xf) + ((eax >> 12) & 0xf0) + return int(family), int(model) } func physicalCores() int { @@ -832,7 +677,6 @@ func (c *CPUInfo) cacheSize() { if maxFunctionID() < 4 { return } - c.Cache.L1I, c.Cache.L1D, c.Cache.L2, c.Cache.L3 = 0, 0, 0, 0 for i := uint32(0); ; i++ { eax, ebx, ecx, _ := cpuidex(4, i) cacheType := eax & 15 @@ -883,14 +727,9 @@ func (c *CPUInfo) cacheSize() { c.Cache.L2 = int(((ecx >> 16) & 0xFFFF) * 1024) // CPUID Fn8000_001D_EAX_x[N:0] Cache Properties - if maxExtendedFunction() < 0x8000001D || !c.Has(TOPEXT) { + if maxExtendedFunction() < 0x8000001D { return } - - // Xen Hypervisor is buggy and returns the same entry no matter ECX value. - // Hack: When we encounter the same entry 100 times we break. - nSame := 0 - var last uint32 for i := uint32(0); i < math.MaxUint32; i++ { eax, ebx, ecx, _ := cpuidex(0x8000001D, i) @@ -906,16 +745,6 @@ func (c *CPUInfo) cacheSize() { return } - // Check for the same value repeated. - comb := eax ^ ebx ^ ecx - if comb == last { - nSame++ - if nSame == 100 { - return - } - } - last = comb - switch level { case 1: switch typ { @@ -940,6 +769,8 @@ func (c *CPUInfo) cacheSize() { } } } + + return } type SGXEPCSection struct { @@ -1000,26 +831,21 @@ func support() flagSet { if mfi < 0x1 { return fs } - family, model, _ := familyModel() + family, model := familyModel() _, _, c, d := cpuid(1) - fs.setIf((d&(1<<0)) != 0, X87) - fs.setIf((d&(1<<8)) != 0, CMPXCHG8) - fs.setIf((d&(1<<11)) != 0, SYSEE) fs.setIf((d&(1<<15)) != 0, CMOV) fs.setIf((d&(1<<23)) != 0, MMX) - fs.setIf((d&(1<<24)) != 0, FXSR) - fs.setIf((d&(1<<25)) != 0, FXSROPT) + fs.setIf((d&(1<<25)) != 0, MMXEXT) fs.setIf((d&(1<<25)) != 0, SSE) fs.setIf((d&(1<<26)) != 0, SSE2) fs.setIf((c&1) != 0, SSE3) fs.setIf((c&(1<<5)) != 0, VMX) - fs.setIf((c&(1<<9)) != 0, SSSE3) - fs.setIf((c&(1<<19)) != 0, SSE4) - fs.setIf((c&(1<<20)) != 0, SSE42) + fs.setIf((c&0x00000200) != 0, SSSE3) + fs.setIf((c&0x00080000) != 0, SSE4) + fs.setIf((c&0x00100000) != 0, SSE42) fs.setIf((c&(1<<25)) != 0, AESNI) fs.setIf((c&(1<<1)) != 0, CLMUL) - fs.setIf(c&(1<<22) != 0, MOVBE) fs.setIf(c&(1<<23) != 0, POPCNT) fs.setIf(c&(1<<30) != 0, RDRAND) @@ -1035,8 +861,6 @@ func support() flagSet { if vend == AMD && (d&(1<<28)) != 0 && mfi >= 4 { fs.setIf(threadsPerCore() > 1, HTT) } - fs.setIf(c&1<<26 != 0, XSAVE) - fs.setIf(c&1<<27 != 0, OSXSAVE) // Check XGETBV/XSAVE (26), OXSAVE (27) and AVX (28) bits const avxCheck = 1<<26 | 1<<27 | 1<<28 if c&avxCheck == avxCheck { @@ -1062,6 +886,7 @@ func support() flagSet { // Check AVX2, AVX2 requires OS support, but BMI1/2 don't. if mfi >= 7 { _, ebx, ecx, edx := cpuidex(7, 0) + eax1, _, _, _ := cpuidex(7, 1) if fs.inSet(AVX) && (ebx&0x00000020) != 0 { fs.set(AVX2) } @@ -1078,38 +903,19 @@ func support() flagSet { fs.setIf(ebx&(1<<18) != 0, RDSEED) fs.setIf(ebx&(1<<19) != 0, ADX) fs.setIf(ebx&(1<<29) != 0, SHA) - // CPUID.(EAX=7, ECX=0).ECX fs.setIf(ecx&(1<<5) != 0, WAITPKG) - fs.setIf(ecx&(1<<7) != 0, CETSS) - fs.setIf(ecx&(1<<8) != 0, GFNI) - fs.setIf(ecx&(1<<9) != 0, VAES) - fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ) - fs.setIf(ecx&(1<<13) != 0, TME) fs.setIf(ecx&(1<<25) != 0, CLDEMOTE) fs.setIf(ecx&(1<<27) != 0, MOVDIRI) fs.setIf(ecx&(1<<28) != 0, MOVDIR64B) fs.setIf(ecx&(1<<29) != 0, ENQCMD) fs.setIf(ecx&(1<<30) != 0, SGXLC) - // CPUID.(EAX=7, ECX=0).EDX - fs.setIf(edx&(1<<11) != 0, RTM_ALWAYS_ABORT) fs.setIf(edx&(1<<14) != 0, SERIALIZE) fs.setIf(edx&(1<<16) != 0, TSXLDTRK) - fs.setIf(edx&(1<<18) != 0, PCONFIG) - fs.setIf(edx&(1<<20) != 0, CETIBT) fs.setIf(edx&(1<<26) != 0, IBPB) fs.setIf(edx&(1<<27) != 0, STIBP) - // CPUID.(EAX=7, ECX=1) - eax1, _, _, _ := cpuidex(7, 1) - fs.setIf(fs.inSet(AVX) && eax1&(1<<4) != 0, AVXVNNI) - fs.setIf(eax1&(1<<10) != 0, MOVSB_ZL) - fs.setIf(eax1&(1<<11) != 0, STOSB_SHORT) - fs.setIf(eax1&(1<<12) != 0, CMPSB_SCADBS_SHORT) - fs.setIf(eax1&(1<<22) != 0, HRESET) - fs.setIf(eax1&(1<<26) != 0, LAM) - // Only detect AVX-512 features if XGETBV is supported if c&((1<<26)|(1<<27)) == (1<<26)|(1<<27) { // Check for OS support @@ -1134,13 +940,15 @@ func support() flagSet { // ecx fs.setIf(ecx&(1<<1) != 0, AVX512VBMI) fs.setIf(ecx&(1<<6) != 0, AVX512VBMI2) + fs.setIf(ecx&(1<<8) != 0, GFNI) + fs.setIf(ecx&(1<<9) != 0, VAES) + fs.setIf(ecx&(1<<10) != 0, VPCLMULQDQ) fs.setIf(ecx&(1<<11) != 0, AVX512VNNI) fs.setIf(ecx&(1<<12) != 0, AVX512BITALG) fs.setIf(ecx&(1<<14) != 0, AVX512VPOPCNTDQ) // edx fs.setIf(edx&(1<<8) != 0, AVX512VP2INTERSECT) fs.setIf(edx&(1<<22) != 0, AMXBF16) - fs.setIf(edx&(1<<23) != 0, AVX512FP16) fs.setIf(edx&(1<<24) != 0, AMXTILE) fs.setIf(edx&(1<<25) != 0, AMXINT8) // eax1 = CPUID.(EAX=7, ECX=1).EAX @@ -1148,91 +956,33 @@ func support() flagSet { } } } - // Processor Extended State Enumeration Sub-leaf (EAX = 0DH, ECX = 1) - // EAX - // Bit 00: XSAVEOPT is available. - // Bit 01: Supports XSAVEC and the compacted form of XRSTOR if set. - // Bit 02: Supports XGETBV with ECX = 1 if set. - // Bit 03: Supports XSAVES/XRSTORS and IA32_XSS if set. - // Bits 31 - 04: Reserved. - // EBX - // Bits 31 - 00: The size in bytes of the XSAVE area containing all states enabled by XCRO | IA32_XSS. - // ECX - // Bits 31 - 00: Reports the supported bits of the lower 32 bits of the IA32_XSS MSR. IA32_XSS[n] can be set to 1 only if ECX[n] is 1. - // EDX? - // Bits 07 - 00: Used for XCR0. Bit 08: PT state. Bit 09: Used for XCR0. Bits 12 - 10: Reserved. Bit 13: HWP state. Bits 31 - 14: Reserved. - if mfi >= 0xd { - if fs.inSet(XSAVE) { - eax, _, _, _ := cpuidex(0xd, 1) - fs.setIf(eax&(1<<0) != 0, XSAVEOPT) - fs.setIf(eax&(1<<1) != 0, XSAVEC) - fs.setIf(eax&(1<<2) != 0, XGETBV1) - fs.setIf(eax&(1<<3) != 0, XSAVES) - } - } + if maxExtendedFunction() >= 0x80000001 { _, _, c, d := cpuid(0x80000001) if (c & (1 << 5)) != 0 { fs.set(LZCNT) fs.set(POPCNT) } - // ECX - fs.setIf((c&(1<<0)) != 0, LAHF) - fs.setIf((c&(1<<2)) != 0, SVM) - fs.setIf((c&(1<<6)) != 0, SSE4A) fs.setIf((c&(1<<10)) != 0, IBS) - fs.setIf((c&(1<<22)) != 0, TOPEXT) - - // EDX - fs.setIf(d&(1<<11) != 0, SYSCALL) + fs.setIf((d&(1<<31)) != 0, AMD3DNOW) + fs.setIf((d&(1<<30)) != 0, AMD3DNOWEXT) + fs.setIf((d&(1<<23)) != 0, MMX) + fs.setIf((d&(1<<22)) != 0, MMXEXT) + fs.setIf((c&(1<<6)) != 0, SSE4A) fs.setIf(d&(1<<20) != 0, NX) - fs.setIf(d&(1<<22) != 0, MMXEXT) - fs.setIf(d&(1<<23) != 0, MMX) - fs.setIf(d&(1<<24) != 0, FXSR) - fs.setIf(d&(1<<25) != 0, FXSROPT) fs.setIf(d&(1<<27) != 0, RDTSCP) - fs.setIf(d&(1<<30) != 0, AMD3DNOWEXT) - fs.setIf(d&(1<<31) != 0, AMD3DNOW) /* XOP and FMA4 use the AVX instruction coding scheme, so they can't be * used unless the OS has AVX support. */ if fs.inSet(AVX) { - fs.setIf((c&(1<<11)) != 0, XOP) - fs.setIf((c&(1<<16)) != 0, FMA4) + fs.setIf((c&0x00000800) != 0, XOP) + fs.setIf((c&0x00010000) != 0, FMA4) } } - if maxExtendedFunction() >= 0x80000007 { - _, b, _, d := cpuid(0x80000007) - fs.setIf((b&(1<<0)) != 0, MCAOVERFLOW) - fs.setIf((b&(1<<1)) != 0, SUCCOR) - fs.setIf((b&(1<<2)) != 0, HWA) - fs.setIf((d&(1<<9)) != 0, CPBOOST) - } - if maxExtendedFunction() >= 0x80000008 { _, b, _, _ := cpuid(0x80000008) fs.setIf((b&(1<<9)) != 0, WBNOINVD) - fs.setIf((b&(1<<8)) != 0, MCOMMIT) - fs.setIf((b&(1<<13)) != 0, INT_WBINVD) - fs.setIf((b&(1<<4)) != 0, RDPRU) - fs.setIf((b&(1<<3)) != 0, INVLPGB) - fs.setIf((b&(1<<1)) != 0, MSRIRC) - fs.setIf((b&(1<<0)) != 0, CLZERO) - } - - if fs.inSet(SVM) && maxExtendedFunction() >= 0x8000000A { - _, _, _, edx := cpuid(0x8000000A) - fs.setIf((edx>>0)&1 == 1, SVMNP) - fs.setIf((edx>>1)&1 == 1, LBRVIRT) - fs.setIf((edx>>2)&1 == 1, SVML) - fs.setIf((edx>>3)&1 == 1, NRIPS) - fs.setIf((edx>>4)&1 == 1, TSCRATEMSR) - fs.setIf((edx>>5)&1 == 1, VMCBCLEAN) - fs.setIf((edx>>6)&1 == 1, SVMFBASID) - fs.setIf((edx>>7)&1 == 1, SVMDA) - fs.setIf((edx>>10)&1 == 1, SVMPF) - fs.setIf((edx>>12)&1 == 1, SVMPFT) } if maxExtendedFunction() >= 0x8000001b && fs.inSet(IBS) { @@ -1247,24 +997,6 @@ func support() flagSet { fs.setIf((eax>>7)&1 == 1, IBSRIPINVALIDCHK) } - if maxExtendedFunction() >= 0x8000001f && vend == AMD { - a, _, _, _ := cpuid(0x8000001f) - fs.setIf((a>>0)&1 == 1, SME) - fs.setIf((a>>1)&1 == 1, SEV) - fs.setIf((a>>2)&1 == 1, MSR_PAGEFLUSH) - fs.setIf((a>>3)&1 == 1, SEV_ES) - fs.setIf((a>>4)&1 == 1, SEV_SNP) - fs.setIf((a>>5)&1 == 1, VMPL) - fs.setIf((a>>10)&1 == 1, SME_COHERENT) - fs.setIf((a>>11)&1 == 1, SEV_64BIT) - fs.setIf((a>>12)&1 == 1, SEV_RESTRICTED) - fs.setIf((a>>13)&1 == 1, SEV_ALTERNATIVE) - fs.setIf((a>>14)&1 == 1, SEV_DEBUGSWAP) - fs.setIf((a>>15)&1 == 1, IBS_PREVENTHOST) - fs.setIf((a>>16)&1 == 1, VTE) - fs.setIf((a>>24)&1 == 1, VMSA_REGPROT) - } - return fs } diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go b/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go index 9a53504..9bf9f77 100644 --- a/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go +++ b/vendor/github.com/klauspost/cpuid/v2/detect_arm64.go @@ -1,7 +1,6 @@ // Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. -//go:build arm64 && !gccgo && !noasm && !appengine -// +build arm64,!gccgo,!noasm,!appengine +//+build arm64,!gccgo,!noasm,!appengine package cpuid diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_ref.go b/vendor/github.com/klauspost/cpuid/v2/detect_ref.go index 9636c2b..e9c8606 100644 --- a/vendor/github.com/klauspost/cpuid/v2/detect_ref.go +++ b/vendor/github.com/klauspost/cpuid/v2/detect_ref.go @@ -1,7 +1,6 @@ // Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. -//go:build (!amd64 && !386 && !arm64) || gccgo || noasm || appengine -// +build !amd64,!386,!arm64 gccgo noasm appengine +//+build !amd64,!386,!arm64 gccgo noasm appengine package cpuid diff --git a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go index c946824..93bc20f 100644 --- a/vendor/github.com/klauspost/cpuid/v2/detect_x86.go +++ b/vendor/github.com/klauspost/cpuid/v2/detect_x86.go @@ -1,7 +1,6 @@ // Copyright (c) 2015 Klaus Post, released under MIT License. See LICENSE file. -//go:build (386 && !gccgo && !noasm && !appengine) || (amd64 && !gccgo && !noasm && !appengine) -// +build 386,!gccgo,!noasm,!appengine amd64,!gccgo,!noasm,!appengine +//+build 386,!gccgo,!noasm,!appengine amd64,!gccgo,!noasm,!appengine package cpuid @@ -24,13 +23,13 @@ func addInfo(c *CPUInfo, safe bool) { c.maxExFunc = maxExtendedFunction() c.BrandName = brandName() c.CacheLine = cacheLine() - c.Family, c.Model, c.Stepping = familyModel() + c.Family, c.Model = familyModel() c.featureSet = support() c.SGX = hasSGX(c.featureSet.inSet(SGX), c.featureSet.inSet(SGXLC)) c.ThreadsPerCore = threadsPerCore() c.LogicalCores = logicalCores() c.PhysicalCores = physicalCores() c.VendorID, c.VendorString = vendorID() + c.Hz = hertz(c.BrandName) c.cacheSize() - c.frequencies() } diff --git a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go index d12e547..0e764f9 100644 --- a/vendor/github.com/klauspost/cpuid/v2/featureid_string.go +++ b/vendor/github.com/klauspost/cpuid/v2/featureid_string.go @@ -24,165 +24,103 @@ func _() { _ = x[AVX512DQ-14] _ = x[AVX512ER-15] _ = x[AVX512F-16] - _ = x[AVX512FP16-17] - _ = x[AVX512IFMA-18] - _ = x[AVX512PF-19] - _ = x[AVX512VBMI-20] - _ = x[AVX512VBMI2-21] - _ = x[AVX512VL-22] - _ = x[AVX512VNNI-23] - _ = x[AVX512VP2INTERSECT-24] - _ = x[AVX512VPOPCNTDQ-25] - _ = x[AVXSLOW-26] - _ = x[AVXVNNI-27] - _ = x[BMI1-28] - _ = x[BMI2-29] - _ = x[CETIBT-30] - _ = x[CETSS-31] - _ = x[CLDEMOTE-32] - _ = x[CLMUL-33] - _ = x[CLZERO-34] - _ = x[CMOV-35] - _ = x[CMPSB_SCADBS_SHORT-36] - _ = x[CMPXCHG8-37] - _ = x[CPBOOST-38] - _ = x[CX16-39] - _ = x[ENQCMD-40] - _ = x[ERMS-41] - _ = x[F16C-42] - _ = x[FMA3-43] - _ = x[FMA4-44] - _ = x[FXSR-45] - _ = x[FXSROPT-46] - _ = x[GFNI-47] - _ = x[HLE-48] - _ = x[HRESET-49] - _ = x[HTT-50] - _ = x[HWA-51] - _ = x[HYPERVISOR-52] - _ = x[IBPB-53] - _ = x[IBS-54] - _ = x[IBSBRNTRGT-55] - _ = x[IBSFETCHSAM-56] - _ = x[IBSFFV-57] - _ = x[IBSOPCNT-58] - _ = x[IBSOPCNTEXT-59] - _ = x[IBSOPSAM-60] - _ = x[IBSRDWROPCNT-61] - _ = x[IBSRIPINVALIDCHK-62] - _ = x[IBS_PREVENTHOST-63] - _ = x[INT_WBINVD-64] - _ = x[INVLPGB-65] - _ = x[LAHF-66] - _ = x[LAM-67] - _ = x[LBRVIRT-68] - _ = x[LZCNT-69] - _ = x[MCAOVERFLOW-70] - _ = x[MCOMMIT-71] - _ = x[MMX-72] - _ = x[MMXEXT-73] - _ = x[MOVBE-74] - _ = x[MOVDIR64B-75] - _ = x[MOVDIRI-76] - _ = x[MOVSB_ZL-77] - _ = x[MPX-78] - _ = x[MSRIRC-79] - _ = x[MSR_PAGEFLUSH-80] - _ = x[NRIPS-81] - _ = x[NX-82] - _ = x[OSXSAVE-83] - _ = x[PCONFIG-84] - _ = x[POPCNT-85] - _ = x[RDPRU-86] - _ = x[RDRAND-87] - _ = x[RDSEED-88] - _ = x[RDTSCP-89] - _ = x[RTM-90] - _ = x[RTM_ALWAYS_ABORT-91] - _ = x[SERIALIZE-92] - _ = x[SEV-93] - _ = x[SEV_64BIT-94] - _ = x[SEV_ALTERNATIVE-95] - _ = x[SEV_DEBUGSWAP-96] - _ = x[SEV_ES-97] - _ = x[SEV_RESTRICTED-98] - _ = x[SEV_SNP-99] - _ = x[SGX-100] - _ = x[SGXLC-101] - _ = x[SHA-102] - _ = x[SME-103] - _ = x[SME_COHERENT-104] - _ = x[SSE-105] - _ = x[SSE2-106] - _ = x[SSE3-107] - _ = x[SSE4-108] - _ = x[SSE42-109] - _ = x[SSE4A-110] - _ = x[SSSE3-111] - _ = x[STIBP-112] - _ = x[STOSB_SHORT-113] - _ = x[SUCCOR-114] - _ = x[SVM-115] - _ = x[SVMDA-116] - _ = x[SVMFBASID-117] - _ = x[SVML-118] - _ = x[SVMNP-119] - _ = x[SVMPF-120] - _ = x[SVMPFT-121] - _ = x[SYSCALL-122] - _ = x[SYSEE-123] - _ = x[TBM-124] - _ = x[TOPEXT-125] - _ = x[TME-126] - _ = x[TSCRATEMSR-127] - _ = x[TSXLDTRK-128] - _ = x[VAES-129] - _ = x[VMCBCLEAN-130] - _ = x[VMPL-131] - _ = x[VMSA_REGPROT-132] - _ = x[VMX-133] - _ = x[VPCLMULQDQ-134] - _ = x[VTE-135] - _ = x[WAITPKG-136] - _ = x[WBNOINVD-137] - _ = x[X87-138] - _ = x[XGETBV1-139] - _ = x[XOP-140] - _ = x[XSAVE-141] - _ = x[XSAVEC-142] - _ = x[XSAVEOPT-143] - _ = x[XSAVES-144] - _ = x[AESARM-145] - _ = x[ARMCPUID-146] - _ = x[ASIMD-147] - _ = x[ASIMDDP-148] - _ = x[ASIMDHP-149] - _ = x[ASIMDRDM-150] - _ = x[ATOMICS-151] - _ = x[CRC32-152] - _ = x[DCPOP-153] - _ = x[EVTSTRM-154] - _ = x[FCMA-155] - _ = x[FP-156] - _ = x[FPHP-157] - _ = x[GPA-158] - _ = x[JSCVT-159] - _ = x[LRCPC-160] - _ = x[PMULL-161] - _ = x[SHA1-162] - _ = x[SHA2-163] - _ = x[SHA3-164] - _ = x[SHA512-165] - _ = x[SM3-166] - _ = x[SM4-167] - _ = x[SVE-168] - _ = x[lastID-169] + _ = x[AVX512IFMA-17] + _ = x[AVX512PF-18] + _ = x[AVX512VBMI-19] + _ = x[AVX512VBMI2-20] + _ = x[AVX512VL-21] + _ = x[AVX512VNNI-22] + _ = x[AVX512VP2INTERSECT-23] + _ = x[AVX512VPOPCNTDQ-24] + _ = x[AVXSLOW-25] + _ = x[BMI1-26] + _ = x[BMI2-27] + _ = x[CLDEMOTE-28] + _ = x[CLMUL-29] + _ = x[CMOV-30] + _ = x[CX16-31] + _ = x[ENQCMD-32] + _ = x[ERMS-33] + _ = x[F16C-34] + _ = x[FMA3-35] + _ = x[FMA4-36] + _ = x[GFNI-37] + _ = x[HLE-38] + _ = x[HTT-39] + _ = x[HYPERVISOR-40] + _ = x[IBPB-41] + _ = x[IBS-42] + _ = x[IBSBRNTRGT-43] + _ = x[IBSFETCHSAM-44] + _ = x[IBSFFV-45] + _ = x[IBSOPCNT-46] + _ = x[IBSOPCNTEXT-47] + _ = x[IBSOPSAM-48] + _ = x[IBSRDWROPCNT-49] + _ = x[IBSRIPINVALIDCHK-50] + _ = x[LZCNT-51] + _ = x[MMX-52] + _ = x[MMXEXT-53] + _ = x[MOVDIR64B-54] + _ = x[MOVDIRI-55] + _ = x[MPX-56] + _ = x[NX-57] + _ = x[POPCNT-58] + _ = x[RDRAND-59] + _ = x[RDSEED-60] + _ = x[RDTSCP-61] + _ = x[RTM-62] + _ = x[SERIALIZE-63] + _ = x[SGX-64] + _ = x[SGXLC-65] + _ = x[SHA-66] + _ = x[SSE-67] + _ = x[SSE2-68] + _ = x[SSE3-69] + _ = x[SSE4-70] + _ = x[SSE42-71] + _ = x[SSE4A-72] + _ = x[SSSE3-73] + _ = x[STIBP-74] + _ = x[TBM-75] + _ = x[TSXLDTRK-76] + _ = x[VAES-77] + _ = x[VMX-78] + _ = x[VPCLMULQDQ-79] + _ = x[WAITPKG-80] + _ = x[WBNOINVD-81] + _ = x[XOP-82] + _ = x[AESARM-83] + _ = x[ARMCPUID-84] + _ = x[ASIMD-85] + _ = x[ASIMDDP-86] + _ = x[ASIMDHP-87] + _ = x[ASIMDRDM-88] + _ = x[ATOMICS-89] + _ = x[CRC32-90] + _ = x[DCPOP-91] + _ = x[EVTSTRM-92] + _ = x[FCMA-93] + _ = x[FP-94] + _ = x[FPHP-95] + _ = x[GPA-96] + _ = x[JSCVT-97] + _ = x[LRCPC-98] + _ = x[PMULL-99] + _ = x[SHA1-100] + _ = x[SHA2-101] + _ = x[SHA3-102] + _ = x[SHA512-103] + _ = x[SM3-104] + _ = x[SM4-105] + _ = x[SVE-106] + _ = x[lastID-107] _ = x[firstID-0] } -const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512FP16AVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXSLOWAVXVNNIBMI1BMI2CETIBTCETSSCLDEMOTECLMULCLZEROCMOVCMPSB_SCADBS_SHORTCMPXCHG8CPBOOSTCX16ENQCMDERMSF16CFMA3FMA4FXSRFXSROPTGFNIHLEHRESETHTTHWAHYPERVISORIBPBIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKIBS_PREVENTHOSTINT_WBINVDINVLPGBLAHFLAMLBRVIRTLZCNTMCAOVERFLOWMCOMMITMMXMMXEXTMOVBEMOVDIR64BMOVDIRIMOVSB_ZLMPXMSRIRCMSR_PAGEFLUSHNRIPSNXOSXSAVEPCONFIGPOPCNTRDPRURDRANDRDSEEDRDTSCPRTMRTM_ALWAYS_ABORTSERIALIZESEVSEV_64BITSEV_ALTERNATIVESEV_DEBUGSWAPSEV_ESSEV_RESTRICTEDSEV_SNPSGXSGXLCSHASMESME_COHERENTSSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPSTOSB_SHORTSUCCORSVMSVMDASVMFBASIDSVMLSVMNPSVMPFSVMPFTSYSCALLSYSEETBMTOPEXTTMETSCRATEMSRTSXLDTRKVAESVMCBCLEANVMPLVMSA_REGPROTVMXVPCLMULQDQVTEWAITPKGWBNOINVDX87XGETBV1XOPXSAVEXSAVECXSAVEOPTXSAVESAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID" +const _FeatureID_name = "firstIDADXAESNIAMD3DNOWAMD3DNOWEXTAMXBF16AMXINT8AMXTILEAVXAVX2AVX512BF16AVX512BITALGAVX512BWAVX512CDAVX512DQAVX512ERAVX512FAVX512IFMAAVX512PFAVX512VBMIAVX512VBMI2AVX512VLAVX512VNNIAVX512VP2INTERSECTAVX512VPOPCNTDQAVXSLOWBMI1BMI2CLDEMOTECLMULCMOVCX16ENQCMDERMSF16CFMA3FMA4GFNIHLEHTTHYPERVISORIBPBIBSIBSBRNTRGTIBSFETCHSAMIBSFFVIBSOPCNTIBSOPCNTEXTIBSOPSAMIBSRDWROPCNTIBSRIPINVALIDCHKLZCNTMMXMMXEXTMOVDIR64BMOVDIRIMPXNXPOPCNTRDRANDRDSEEDRDTSCPRTMSERIALIZESGXSGXLCSHASSESSE2SSE3SSE4SSE42SSE4ASSSE3STIBPTBMTSXLDTRKVAESVMXVPCLMULQDQWAITPKGWBNOINVDXOPAESARMARMCPUIDASIMDASIMDDPASIMDHPASIMDRDMATOMICSCRC32DCPOPEVTSTRMFCMAFPFPHPGPAJSCVTLRCPCPMULLSHA1SHA2SHA3SHA512SM3SM4SVElastID" -var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 58, 62, 72, 84, 92, 100, 108, 116, 123, 133, 143, 151, 161, 172, 180, 190, 208, 223, 230, 237, 241, 245, 251, 256, 264, 269, 275, 279, 297, 305, 312, 316, 322, 326, 330, 334, 338, 342, 349, 353, 356, 362, 365, 368, 378, 382, 385, 395, 406, 412, 420, 431, 439, 451, 467, 482, 492, 499, 503, 506, 513, 518, 529, 536, 539, 545, 550, 559, 566, 574, 577, 583, 596, 601, 603, 610, 617, 623, 628, 634, 640, 646, 649, 665, 674, 677, 686, 701, 714, 720, 734, 741, 744, 749, 752, 755, 767, 770, 774, 778, 782, 787, 792, 797, 802, 813, 819, 822, 827, 836, 840, 845, 850, 856, 863, 868, 871, 877, 880, 890, 898, 902, 911, 915, 927, 930, 940, 943, 950, 958, 961, 968, 971, 976, 982, 990, 996, 1002, 1010, 1015, 1022, 1029, 1037, 1044, 1049, 1054, 1061, 1065, 1067, 1071, 1074, 1079, 1084, 1089, 1093, 1097, 1101, 1107, 1110, 1113, 1116, 1122} +var _FeatureID_index = [...]uint16{0, 7, 10, 15, 23, 34, 41, 48, 55, 58, 62, 72, 84, 92, 100, 108, 116, 123, 133, 141, 151, 162, 170, 180, 198, 213, 220, 224, 228, 236, 241, 245, 249, 255, 259, 263, 267, 271, 275, 278, 281, 291, 295, 298, 308, 319, 325, 333, 344, 352, 364, 380, 385, 388, 394, 403, 410, 413, 415, 421, 427, 433, 439, 442, 451, 454, 459, 462, 465, 469, 473, 477, 482, 487, 492, 497, 500, 508, 512, 515, 525, 532, 540, 543, 549, 557, 562, 569, 576, 584, 591, 596, 601, 608, 612, 614, 618, 621, 626, 631, 636, 640, 644, 648, 654, 657, 660, 663, 669} func (i FeatureID) String() string { if i < 0 || i >= FeatureID(len(_FeatureID_index)-1) { diff --git a/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go index d91d021..8d2cb03 100644 --- a/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go +++ b/vendor/github.com/klauspost/cpuid/v2/os_darwin_arm64.go @@ -2,120 +2,18 @@ package cpuid -import ( - "runtime" - "strings" - - "golang.org/x/sys/unix" -) +import "runtime" func detectOS(c *CPUInfo) bool { - if runtime.GOOS != "ios" { - tryToFillCPUInfoFomSysctl(c) - } // There are no hw.optional sysctl values for the below features on Mac OS 11.0 // to detect their supported state dynamically. Assume the CPU features that // Apple Silicon M1 supports to be available as a minimal set of features // to all Go programs running on darwin/arm64. // TODO: Add more if we know them. c.featureSet.setIf(runtime.GOOS != "ios", AESARM, PMULL, SHA1, SHA2) - + c.PhysicalCores = runtime.NumCPU() + // For now assuming 1 thread per core... + c.ThreadsPerCore = 1 + c.LogicalCores = c.PhysicalCores return true } - -func sysctlGetBool(name string) bool { - value, err := unix.SysctlUint32(name) - if err != nil { - return false - } - return value != 0 -} - -func sysctlGetString(name string) string { - value, err := unix.Sysctl(name) - if err != nil { - return "" - } - return value -} - -func sysctlGetInt(unknown int, names ...string) int { - for _, name := range names { - value, err := unix.SysctlUint32(name) - if err != nil { - continue - } - if value != 0 { - return int(value) - } - } - return unknown -} - -func sysctlGetInt64(unknown int, names ...string) int { - for _, name := range names { - value64, err := unix.SysctlUint64(name) - if err != nil { - continue - } - if int(value64) != unknown { - return int(value64) - } - } - return unknown -} - -func setFeature(c *CPUInfo, name string, feature FeatureID) { - c.featureSet.setIf(sysctlGetBool(name), feature) -} -func tryToFillCPUInfoFomSysctl(c *CPUInfo) { - c.BrandName = sysctlGetString("machdep.cpu.brand_string") - - if len(c.BrandName) != 0 { - c.VendorString = strings.Fields(c.BrandName)[0] - } - - c.PhysicalCores = sysctlGetInt(runtime.NumCPU(), "hw.physicalcpu") - c.ThreadsPerCore = sysctlGetInt(1, "machdep.cpu.thread_count", "kern.num_threads") / - sysctlGetInt(1, "hw.physicalcpu") - c.LogicalCores = sysctlGetInt(runtime.NumCPU(), "machdep.cpu.core_count") - c.Family = sysctlGetInt(0, "machdep.cpu.family", "hw.cpufamily") - c.Model = sysctlGetInt(0, "machdep.cpu.model") - c.CacheLine = sysctlGetInt64(0, "hw.cachelinesize") - c.Cache.L1I = sysctlGetInt64(-1, "hw.l1icachesize") - c.Cache.L1D = sysctlGetInt64(-1, "hw.l1icachesize") - c.Cache.L2 = sysctlGetInt64(-1, "hw.l2cachesize") - c.Cache.L3 = sysctlGetInt64(-1, "hw.l3cachesize") - - // from https://developer.arm.com/downloads/-/exploration-tools/feature-names-for-a-profile - setFeature(c, "hw.optional.arm.FEAT_AES", AESARM) - setFeature(c, "hw.optional.AdvSIMD", ASIMD) - setFeature(c, "hw.optional.arm.FEAT_DotProd", ASIMDDP) - setFeature(c, "hw.optional.arm.FEAT_RDM", ASIMDRDM) - setFeature(c, "hw.optional.FEAT_CRC32", CRC32) - setFeature(c, "hw.optional.arm.FEAT_DPB", DCPOP) - // setFeature(c, "", EVTSTRM) - setFeature(c, "hw.optional.arm.FEAT_FCMA", FCMA) - setFeature(c, "hw.optional.arm.FEAT_FP", FP) - setFeature(c, "hw.optional.arm.FEAT_FP16", FPHP) - setFeature(c, "hw.optional.arm.FEAT_PAuth", GPA) - setFeature(c, "hw.optional.arm.FEAT_JSCVT", JSCVT) - setFeature(c, "hw.optional.arm.FEAT_LRCPC", LRCPC) - setFeature(c, "hw.optional.arm.FEAT_PMULL", PMULL) - setFeature(c, "hw.optional.arm.FEAT_SHA1", SHA1) - setFeature(c, "hw.optional.arm.FEAT_SHA256", SHA2) - setFeature(c, "hw.optional.arm.FEAT_SHA3", SHA3) - setFeature(c, "hw.optional.arm.FEAT_SHA512", SHA512) - // setFeature(c, "", SM3) - // setFeature(c, "", SM4) - setFeature(c, "hw.optional.arm.FEAT_SVE", SVE) - - // from empirical observation - setFeature(c, "hw.optional.AdvSIMD_HPFPCvt", ASIMDHP) - setFeature(c, "hw.optional.armv8_1_atomics", ATOMICS) - setFeature(c, "hw.optional.floatingpoint", FP) - setFeature(c, "hw.optional.armv8_2_sha3", SHA3) - setFeature(c, "hw.optional.armv8_2_sha512", SHA512) - setFeature(c, "hw.optional.armv8_3_compnum", FCMA) - setFeature(c, "hw.optional.armv8_crc32", CRC32) -} diff --git a/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go index 8733ba3..1a951e6 100644 --- a/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go +++ b/vendor/github.com/klauspost/cpuid/v2/os_other_arm64.go @@ -1,7 +1,8 @@ // Copyright (c) 2020 Klaus Post, released under MIT License. See LICENSE file. -//go:build arm64 && !linux && !darwin -// +build arm64,!linux,!darwin +// +build arm64 +// +build !linux +// +build !darwin package cpuid diff --git a/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go index f8f201b..4d0b8b4 100644 --- a/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go +++ b/vendor/github.com/klauspost/cpuid/v2/os_safe_linux_arm64.go @@ -1,7 +1,6 @@ // Copyright (c) 2021 Klaus Post, released under MIT License. See LICENSE file. -//go:build nounsafe -// +build nounsafe +//+build nounsafe package cpuid diff --git a/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go b/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go index 92af622..3298002 100644 --- a/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go +++ b/vendor/github.com/klauspost/cpuid/v2/os_unsafe_linux_arm64.go @@ -1,7 +1,6 @@ // Copyright (c) 2021 Klaus Post, released under MIT License. See LICENSE file. -//go:build !nounsafe -// +build !nounsafe +//+build !nounsafe package cpuid diff --git a/vendor/github.com/klauspost/reedsolomon/.travis.yml b/vendor/github.com/klauspost/reedsolomon/.travis.yml new file mode 100644 index 0000000..fdd619c --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/.travis.yml @@ -0,0 +1,65 @@ +language: go + +os: + - linux + - osx + - windows + +arch: + - amd64 + - arm64 + - ppc64le + - s390x + +go: + - 1.14.x + - 1.15.x + - 1.16.x + - master + +env: + - GO111MODULE=off CGO_ENABLED=0 + +install: + - go get ./... + +script: + - go vet ./... + - go test -cpu=1,2 . + - go test -tags=noasm -cpu=1,2 . + - go build examples/simple-decoder.go + - go build examples/simple-encoder.go + - go build examples/stream-decoder.go + - go build examples/stream-encoder.go + +jobs: + allow_failures: + - go: 'master' + - arch: s390x + fast_finish: true + include: + - stage: other + go: 1.16.x + os: linux + arch: amd64 + script: + - diff <(gofmt -d .) <(printf "") + - diff <(gofmt -d ./examples) <(printf "") + - go get github.com/klauspost/asmfmt&&go install github.com/klauspost/asmfmt/cmd/asmfmt + - diff <(asmfmt -d .) <(printf "") + - CGO_ENABLED=1 go test -cpu=1 -short -race . + - CGO_ENABLED=1 go test -cpu=2 -short -race . + - CGO_ENABLED=1 go test -tags=noasm -cpu=1 -short -race . + - CGO_ENABLED=1 go test -tags=noasm -cpu=4 -short -race . + - CGO_ENABLED=1 go test -no-avx512 -short -race . + - CGO_ENABLED=1 go test -no-avx512 -no-avx2 -short -race . + - CGO_ENABLED=1 go test -no-avx512 -no-avx2 -no-ssse3 -short -race . + - GOOS=linux GOARCH=386 go test -short . + - stage: other + go: 1.15.x + os: linux + arch: amd64 + script: + - go test -no-avx512 + - go test -no-avx512 -no-avx2 + - go test -no-avx512 -no-avx2 -no-ssse3 diff --git a/vendor/github.com/klauspost/reedsolomon/README.md b/vendor/github.com/klauspost/reedsolomon/README.md index c8f1886..ee8f2ae 100644 --- a/vendor/github.com/klauspost/reedsolomon/README.md +++ b/vendor/github.com/klauspost/reedsolomon/README.md @@ -1,5 +1,8 @@ # Reed-Solomon -[![Go Reference](https://pkg.go.dev/badge/github.com/klauspost/reedsolomon.svg)](https://pkg.go.dev/github.com/klauspost/reedsolomon) [![Go](https://github.com/klauspost/reedsolomon/actions/workflows/go.yml/badge.svg)](https://github.com/klauspost/reedsolomon/actions/workflows/go.yml) +[![Go Reference](https://pkg.go.dev/badge/github.com/klauspost/reedsolomon.svg)](https://pkg.go.dev/github.com/klauspost/reedsolomon) [![Build Status][3]][4] + +[3]: https://travis-ci.org/klauspost/reedsolomon.svg?branch=master +[4]: https://travis-ci.org/klauspost/reedsolomon Reed-Solomon Erasure Coding in Go, with speeds exceeding 1GB/s/cpu core implemented in pure Go. @@ -8,12 +11,9 @@ This is a Go port of the [JavaReedSolomon](https://github.com/Backblaze/JavaReed For an introduction on erasure coding, see the post on the [Backblaze blog](https://www.backblaze.com/blog/reed-solomon/). -For encoding high shard counts (>256) a Leopard implementation is used. -For most platforms this performs close to the original Leopard implementation in terms of speed. - Package home: https://github.com/klauspost/reedsolomon -Godoc: https://pkg.go.dev/github.com/klauspost/reedsolomon +Godoc: https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc # Installation To get the package use the standard: @@ -21,18 +21,11 @@ To get the package use the standard: go get -u github.com/klauspost/reedsolomon ``` -Using Go modules is recommended. +Using Go modules recommended. # Changes - -## 2022 - -* Leopard GF16 mode added, for up to 63336 shards. -* [WithJerasureMatrix](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithJerasureMatrix) allows constructing a [Jerasure](https://github.com/tsuraan/Jerasure) compatible matrix. - ## 2021 -* Use `GOAMD64=v4` to enable faster AVX2. * Add progressive shard encoding. * Wider AVX2 loops * Limit concurrency on AVX2, since we are likely memory bound. @@ -40,8 +33,6 @@ Using Go modules is recommended. * Allow disabling inversion cache. * Faster AVX2 encoding. -
- See older changes ## May 2020 @@ -105,8 +96,6 @@ The [`StreamEncoder`](https://godoc.org/github.com/klauspost/reedsolomon#StreamE handles this without modifying the interface. This is a good lesson on why returning interfaces is not a good design. -
- # Usage This section assumes you know the basics of Reed-Solomon encoding. @@ -116,19 +105,23 @@ This package performs the calculation of the parity sets. The usage is therefore First of all, you need to choose your distribution of data and parity shards. A 'good' distribution is very subjective, and will depend a lot on your usage scenario. +A good starting point is above 5 and below 257 data shards (the maximum supported number), +and the number of parity shards to be 2 or above, and below the number of data shards. To create an encoder with 10 data shards (where your data goes) and 3 parity shards (calculated): ```Go enc, err := reedsolomon.New(10, 3) ``` This encoder will work for all parity sets with this distribution of data and parity shards. +The error will only be set if you specify 0 or negative values in any of the parameters, +or if you specify more than 256 data shards. If you will primarily be using it with one shard size it is recommended to use [`WithAutoGoroutines(shardSize)`](https://pkg.go.dev/github.com/klauspost/reedsolomon?tab=doc#WithAutoGoroutines) as an additional parameter. This will attempt to calculate the optimal number of goroutines to use for the best speed. It is not required that all shards are this size. -Then you send and receive data that is a simple slice of byte slices; `[][]byte`. +The you send and receive data is a simple slice of byte slices; `[][]byte`. In the example above, the top slice must have a length of 13. ```Go @@ -194,17 +187,6 @@ If you are only interested in the data shards (for reading purposes) you can cal err := enc.ReconstructData(data) ``` -If you don't need all data shards you can use `ReconstructSome()`: - -```Go - // Delete two data shards - data[3] = nil - data[7] = nil - - // Reconstruct just the shard 3 - err := enc.ReconstructSome(data, []bool{false, false, false, true, false, false, false, false}) -``` - So to sum up reconstruction: * The number of data/parity shards must match the numbers used for encoding. * The order of shards must be the same as used when encoding. @@ -351,8 +333,6 @@ There is no buffering or timeouts/retry specified. If you want to add that, you For complete examples of a streaming encoder and decoder see the [examples folder](https://github.com/klauspost/reedsolomon/tree/master/examples). -GF16 (more than 256 shards) is not supported by the streaming interface. - # Advanced Options You can modify internal options which affects how jobs are split between and processed by goroutines. @@ -366,83 +346,8 @@ Example of how to supply options: enc, err := reedsolomon.New(10, 3, WithMaxGoroutines(25)) ``` -# Leopard Compatible GF16 - -When you encode more than 256 shards the library will switch to a [Leopard-RS](https://github.com/catid/leopard) implementation. - -This allows encoding up to 65536 shards (data+parity) with the following limitations, similar to leopard: - -* The original and recovery data must not exceed 65536 pieces. -* The shard size *must* each be a multiple of 64 bytes. -* Each buffer should have the same number of bytes. -* Even the last shard must be rounded up to the block size. - -| | Regular | Leopard | -|-----------------|---------|---------| -| Encode | ✓ | ✓ | -| EncodeIdx | ✓ | - | -| Verify | ✓ | ✓ | -| Reconstruct | ✓ | ✓ | -| ReconstructData | ✓ | ✓ | -| ReconstructSome | ✓ | ✓ (+) | -| Update | ✓ | - | -| Split | ✓ | ✓ | -| Join | ✓ | ✓ | - -* (+) Same as calling `ReconstructData`. - -The Split/Join functions will help to split an input to the proper sizes. - -Speed can be expected to be `O(N*log(N))`, compared to the `O(N*N)`. -Reconstruction matrix calculation is more time-consuming, -so be sure to include that as part of any benchmark you run. - -For now SSSE3, AVX2 and AVX512 assembly are available on AMD64 platforms. - -Leopard mode currently always runs as a single goroutine, since multiple gorouties doesn't provide any worthwhile speedup. - -## Forcing Leopard GF16 - -The `WithLeopardGF16(true)` can be used to use Leopard GF16 for all operations. -This is *not* compatible with the Leopard library that has a separate GF8 implementation. - -Benchmark Encoding and Reconstructing *1KB* shards with variable number of shards. -For Cauchy matrix the inversion cache is disabled for a more "fair" test. -Speed is total shard size for each operation. Data shard throughput is speed/2. -AVX2 is used. - -| Encoder | Shards | Encode | Recover All | Recover One | -|--------------|-------------|---------------|--------------|--------------| -| Cauchy | 4+4 | 23076.83 MB/s | 3048.86 MB/s | 5620.84 MB/s | -| Cauchy | 8+8 | 15206.87 MB/s | 3041.99 MB/s | 7173.71 MB/s | -| Cauchy | 16+16 | 7427.47 MB/s | 1384.58 MB/s | 6343.85 MB/s | -| Cauchy | 32+32 | 3785.64 MB/s | 557.60 MB/s | 4660.27 MB/s | -| Cauchy | 64+64 | 1911.93 MB/s | 160.54 MB/s | 2864.63 MB/s | -| Cauchy | 128+128 | 963.83 MB/s | 42.81 MB/s | 1597.93 MB/s | -| Leopard GF16 | 4+4 | 18468.32 MB/s | 10.45 MB/s | 10.30 MB/s | -| Leopard GF16 | 8+8 | 10293.79 MB/s | 20.83 MB/s | 20.51 MB/s | -| Leopard GF16 | 16+16 | 12386.04 MB/s | 40.80 MB/s | 40.47 MB/s | -| Leopard GF16 | 32+32 | 7347.35 MB/s | 81.15 MB/s | 79.80 MB/s | -| Leopard GF16 | 64+64 | 8299.63 MB/s | 150.47 MB/s | 154.15 MB/s | -| Leopard GF16 | 128+128 | 5629.04 MB/s | 278.84 MB/s | 289.15 MB/s | -| Leopard GF16 | 256+256 | 6158.66 MB/s | 454.14 MB/s | 506.70 MB/s | -| Leopard GF16 | 512+512 | 4418.58 MB/s | 685.75 MB/s | 801.63 MB/s | -| Leopard GF16 | 1024+1024 | 4778.05 MB/s | 814.51 MB/s | 1080.19 MB/s | -| Leopard GF16 | 2048+2048 | 3417.05 MB/s | 911.64 MB/s | 1179.48 MB/s | -| Leopard GF16 | 4096+4096 | 3209.41 MB/s | 729.13 MB/s | 1135.06 MB/s | -| Leopard GF16 | 8192+8192 | 2034.11 MB/s | 604.52 MB/s | 842.13 MB/s | -| Leopard GF16 | 16384+16384 | 1525.88 MB/s | 486.74 MB/s | 750.01 MB/s | -| Leopard GF16 | 32768+32768 | 1138.67 MB/s | 482.81 MB/s | 712.73 MB/s | - -"Traditional" encoding is faster until somewhere between 16 and 32 shards. -Leopard provides fast encoding in all cases, but shows a significant overhead for reconstruction. - -Calculating the reconstruction matrix takes a significant amount of computation. -With bigger shards that will be smaller. Arguably, fewer shards typically also means bigger shards. -Due to the high shard count caching reconstruction matrices generally isn't feasible for Leopard. # Performance - Performance depends mainly on the number of parity shards. In rough terms, doubling the number of parity shards will double the encoding time. @@ -451,16 +356,27 @@ For reference each shard is 1MB random data, and 16 CPU cores are used for encod | Data | Parity | Go MB/s | SSSE3 MB/s | AVX2 MB/s | |------|--------|---------|------------|-----------| -| 5 | 2 | 20,772 | 66,355 | 108,755 | -| 8 | 8 | 6,815 | 38,338 | 70,516 | -| 10 | 4 | 9,245 | 48,237 | 93,875 | -| 50 | 20 | 2,063 | 12,130 | 22,828 | +| 5 | 2 | 14287 | 66355 | 108755 | +| 8 | 8 | 5569 | 34298 | 70516 | +| 10 | 4 | 6766 | 48237 | 93875 | +| 50 | 20 | 1540 | 12130 | 22090 | The throughput numbers here is the size of the encoded data and parity shards. If `runtime.GOMAXPROCS()` is set to a value higher than 1, the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`. +Example of performance scaling on AMD Ryzen 3950X - 16 physical cores, 32 logical cores, AVX 2. +The example uses 10 blocks with 1MB data each and 4 parity blocks. + +| Threads | Speed | +|---------|------------| +| 1 | 9979 MB/s | +| 2 | 18870 MB/s | +| 4 | 33697 MB/s | +| 8 | 51531 MB/s | +| 16 | 59204 MB/s | + Benchmarking `Reconstruct()` followed by a `Verify()` (=`all`) versus just calling `ReconstructData()` (=`data`) gives the following result: ``` @@ -474,9 +390,22 @@ BenchmarkReconstruct50x20x1M-8 1364.35 4189.79 3.07x BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x ``` -The performance on AVX512 has been accelerated for CPUs when available. +# Performance on AVX512 -## ARM64 NEON +The performance on AVX512 has been accelerated for Intel CPUs. +This gives speedups on a per-core basis typically up to 2x compared to +AVX2 as can be seen in the following table: + +``` +[...] +``` + +This speedup has been achieved by computing multiple parity blocks in parallel as opposed to one after the other. +In doing so it is possible to minimize the memory bandwidth required for loading all data shards. +At the same time the calculations are performed in the 512-bit wide ZMM registers and the surplus of ZMM +registers (32 in total) is used to keep more data around (most notably the matrix coefficients). + +# Performance on ARM64 NEON By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an EC2 m6g.16xlarge (Graviton2) instance (Amazon Linux 2): @@ -491,7 +420,7 @@ BenchmarkGaloisXor1M-64 10000 100322 ns/op 10452.13 MB/s # Performance on ppc64le The performance for ppc64le has been accelerated. -This gives roughly a 10x performance improvement on this architecture as can be seen below: +This gives roughly a 10x performance improvement on this architecture as can been seen below: ``` benchmark old MB/s new MB/s speedup @@ -501,6 +430,9 @@ BenchmarkGaloisXor128K-160 862.02 7905.00 9.17x BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x ``` +# asm2plan9s + +[asm2plan9s](https://github.com/fwessels/asm2plan9s) is used for assembling the AVX2 instructions into their BYTE/WORD/LONG equivalents. # Links * [Backblaze Open Sources Reed-Solomon Erasure Coding Source Code](https://www.backblaze.com/blog/reed-solomon/). @@ -511,7 +443,6 @@ BenchmarkGaloisXor1M-160 784.60 6296.65 8.03x * [reed-solomon-erasure](https://github.com/darrenldl/reed-solomon-erasure). Compatible Rust implementation. * [go-erasure](https://github.com/somethingnew2-0/go-erasure). A similar library using cgo, slower in my tests. * [Screaming Fast Galois Field Arithmetic](http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf). Basis for SSE3 optimizations. -* [Leopard-RS](https://github.com/catid/leopard) C library used as basis for GF16 implementation. # License diff --git a/vendor/github.com/klauspost/reedsolomon/appveyor.yml b/vendor/github.com/klauspost/reedsolomon/appveyor.yml new file mode 100644 index 0000000..9bb067f --- /dev/null +++ b/vendor/github.com/klauspost/reedsolomon/appveyor.yml @@ -0,0 +1,20 @@ +os: Visual Studio 2015 + +platform: x64 + +clone_folder: c:\gopath\src\github.com\klauspost\reedsolomon + +# environment variables +environment: + GOPATH: c:\gopath + +install: + - echo %PATH% + - echo %GOPATH% + - go version + - go env + - go get -d ./... + +build_script: + - go test -v -cpu=2 ./... + - go test -cpu=1,2,4 -short -race ./... diff --git a/vendor/github.com/klauspost/reedsolomon/galois.go b/vendor/github.com/klauspost/reedsolomon/galois.go index 703f209..30e9e03 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois.go +++ b/vendor/github.com/klauspost/reedsolomon/galois.go @@ -6,8 +6,6 @@ package reedsolomon -import "encoding/binary" - const ( // The number of elements in the field. fieldSize = 256 @@ -931,24 +929,3 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) } return dst } - -// xor slices writing to out. -func sliceXorGo(in, out []byte, _ *options) { - for len(out) >= 32 { - inS := in[:32] - v0 := binary.LittleEndian.Uint64(out[:8]) ^ binary.LittleEndian.Uint64(inS[:8]) - v1 := binary.LittleEndian.Uint64(out[8:16]) ^ binary.LittleEndian.Uint64(inS[8:16]) - v2 := binary.LittleEndian.Uint64(out[16:24]) ^ binary.LittleEndian.Uint64(inS[16:24]) - v3 := binary.LittleEndian.Uint64(out[24:32]) ^ binary.LittleEndian.Uint64(inS[24:32]) - binary.LittleEndian.PutUint64(out[:8], v0) - binary.LittleEndian.PutUint64(out[8:16], v1) - binary.LittleEndian.PutUint64(out[16:24], v2) - binary.LittleEndian.PutUint64(out[24:32], v3) - out = out[32:] - in = in[32:] - } - out = out[:len(in)] - for n, input := range in { - out[n] ^= input - } -} diff --git a/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go b/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go index 9a249d2..79207e6 100644 --- a/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galoisAvx512_amd64.go @@ -104,7 +104,7 @@ func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[ // Invoke AVX512 routine for single output row in parallel func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) { done := stop - start - if done <= 0 || len(in) == 0 || len(out) == 0 { + if done <= 0 { return } @@ -139,7 +139,7 @@ func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, // Invoke AVX512 routine for 2 output rows in parallel func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) { done := stop - start - if done <= 0 || len(in) == 0 || len(out) == 0 { + if done <= 0 { return } @@ -174,7 +174,7 @@ func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, // Invoke AVX512 routine for 4 output rows in parallel func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix84 *[matrixSize84]byte) { done := stop - start - if done <= 0 || len(in) == 0 || len(out) == 0 { + if done <= 0 { return } diff --git a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go index ae6f289..d722e31 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_amd64.go @@ -132,215 +132,9 @@ func sliceXor(in, out []byte, o *options) { in = in[done:] out = out[done:] } - } else { - sliceXorGo(in, out, o) - return } out = out[:len(in)] for i := range in { out[i] ^= in[i] } } - -// 4-way butterfly -func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { - if len(work[0]) == 0 { - return - } - - t01 := &multiply256LUT[log_m01] - t23 := &multiply256LUT[log_m23] - t02 := &multiply256LUT[log_m02] - if o.useAVX512 { - if log_m01 == modulus { - if log_m23 == modulus { - if log_m02 == modulus { - ifftDIT4_avx512_7(work, dist*24, t01, t23, t02) - } else { - ifftDIT4_avx512_3(work, dist*24, t01, t23, t02) - } - } else { - if log_m02 == modulus { - ifftDIT4_avx512_5(work, dist*24, t01, t23, t02) - } else { - ifftDIT4_avx512_1(work, dist*24, t01, t23, t02) - } - } - } else { - if log_m23 == modulus { - if log_m02 == modulus { - ifftDIT4_avx512_6(work, dist*24, t01, t23, t02) - } else { - ifftDIT4_avx512_2(work, dist*24, t01, t23, t02) - } - } else { - if log_m02 == modulus { - ifftDIT4_avx512_4(work, dist*24, t01, t23, t02) - } else { - ifftDIT4_avx512_0(work, dist*24, t01, t23, t02) - } - } - } - return - } else if o.useAVX2 { - if log_m01 == modulus { - if log_m23 == modulus { - if log_m02 == modulus { - ifftDIT4_avx2_7(work, dist*24, t01, t23, t02) - } else { - ifftDIT4_avx2_3(work, dist*24, t01, t23, t02) - } - } else { - if log_m02 == modulus { - ifftDIT4_avx2_5(work, dist*24, t01, t23, t02) - } else { - ifftDIT4_avx2_1(work, dist*24, t01, t23, t02) - } - } - } else { - if log_m23 == modulus { - if log_m02 == modulus { - ifftDIT4_avx2_6(work, dist*24, t01, t23, t02) - } else { - ifftDIT4_avx2_2(work, dist*24, t01, t23, t02) - } - } else { - if log_m02 == modulus { - ifftDIT4_avx2_4(work, dist*24, t01, t23, t02) - } else { - ifftDIT4_avx2_0(work, dist*24, t01, t23, t02) - } - } - } - return - } - ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) -} - -func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { - if len(work[0]) == 0 { - return - } - - t01 := &multiply256LUT[log_m01] - t23 := &multiply256LUT[log_m23] - t02 := &multiply256LUT[log_m02] - if o.useAVX512 { - if log_m02 == modulus { - if log_m01 == modulus { - if log_m23 == modulus { - fftDIT4_avx512_7(work, dist*24, t01, t23, t02) - } else { - fftDIT4_avx512_3(work, dist*24, t01, t23, t02) - } - } else { - if log_m23 == modulus { - fftDIT4_avx512_5(work, dist*24, t01, t23, t02) - } else { - fftDIT4_avx512_1(work, dist*24, t01, t23, t02) - } - } - } else { - if log_m01 == modulus { - if log_m23 == modulus { - fftDIT4_avx512_6(work, dist*24, t01, t23, t02) - } else { - fftDIT4_avx512_2(work, dist*24, t01, t23, t02) - } - } else { - if log_m23 == modulus { - fftDIT4_avx512_4(work, dist*24, t01, t23, t02) - } else { - fftDIT4_avx512_0(work, dist*24, t01, t23, t02) - } - } - } - return - } else if o.useAVX2 { - if log_m02 == modulus { - if log_m01 == modulus { - if log_m23 == modulus { - fftDIT4_avx2_7(work, dist*24, t01, t23, t02) - } else { - fftDIT4_avx2_3(work, dist*24, t01, t23, t02) - } - } else { - if log_m23 == modulus { - fftDIT4_avx2_5(work, dist*24, t01, t23, t02) - } else { - fftDIT4_avx2_1(work, dist*24, t01, t23, t02) - } - } - } else { - if log_m01 == modulus { - if log_m23 == modulus { - fftDIT4_avx2_6(work, dist*24, t01, t23, t02) - } else { - fftDIT4_avx2_2(work, dist*24, t01, t23, t02) - } - } else { - if log_m23 == modulus { - fftDIT4_avx2_4(work, dist*24, t01, t23, t02) - } else { - fftDIT4_avx2_0(work, dist*24, t01, t23, t02) - } - } - } - return - } - fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) -} - -// 2-way butterfly forward -func fftDIT2(x, y []byte, log_m ffe, o *options) { - if o.useAVX2 { - if len(x) > 0 { - tmp := &multiply256LUT[log_m] - fftDIT2_avx2(x, y, tmp) - } - } else if o.useSSSE3 { - if len(x) > 0 { - tmp := &multiply256LUT[log_m] - fftDIT2_ssse3(x, y, tmp) - } - } else { - // Reference version: - refMulAdd(x, y, log_m) - sliceXor(x, y, o) - } -} - -// 2-way butterfly -func ifftDIT2(x, y []byte, log_m ffe, o *options) { - if o.useAVX2 { - if len(x) > 0 { - tmp := &multiply256LUT[log_m] - ifftDIT2_avx2(x, y, tmp) - } - } else if o.useSSSE3 { - if len(x) > 0 { - tmp := &multiply256LUT[log_m] - ifftDIT2_ssse3(x, y, tmp) - } - } else { - // Reference version: - sliceXor(x, y, o) - refMulAdd(x, y, log_m) - } -} - -func mulgf16(x, y []byte, log_m ffe, o *options) { - if o.useAVX2 { - if len(x) > 0 { - tmp := &multiply256LUT[log_m] - mulgf16_avx2(x, y, tmp) - } - } else if o.useSSSE3 { - if len(x) > 0 { - tmp := &multiply256LUT[log_m] - mulgf16_ssse3(x, y, tmp) - } - } else { - refMul(x, y, log_m) - } -} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go index 92b67b8..df79a98 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_arm64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_arm64.go @@ -64,33 +64,3 @@ func sliceXor(in, out []byte, o *options) { } } } - -// 4-way butterfly -func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { - ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) -} - -// 4-way butterfly -func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { - fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) -} - -// 2-way butterfly forward -func fftDIT2(x, y []byte, log_m ffe, o *options) { - // Reference version: - refMulAdd(x, y, log_m) - // 64 byte aligned, always full. - galXorNEON(x, y) -} - -// 2-way butterfly -func ifftDIT2(x, y []byte, log_m ffe, o *options) { - // 64 byte aligned, always full. - galXorNEON(x, y) - // Reference version: - refMulAdd(x, y, log_m) -} - -func mulgf16(x, y []byte, log_m ffe, o *options) { - refMul(x, y, log_m) -} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go index 24d6a02..817c7ea 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.go @@ -1,1551 +1,1176 @@ // Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. //go:build !appengine && !noasm && !nogen && gc +// +build !appengine,!noasm,!nogen,gc package reedsolomon -func _dummy_() - // mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x1_64 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x1Xor takes 1 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x1_64Xor takes 1 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x2_64 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x2Xor takes 1 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x2_64Xor takes 1 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x3_64 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x3Xor takes 1 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x3_64Xor takes 1 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x4 takes 1 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x5 takes 1 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x6 takes 1 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x7 takes 1 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x8 takes 1 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x9 takes 1 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x10 takes 1 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1_64 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1Xor takes 2 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x1_64Xor takes 2 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x2_64 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x2Xor takes 2 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x2_64Xor takes 2 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x3_64 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x3Xor takes 2 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x3_64Xor takes 2 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x4 takes 2 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x5 takes 2 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x6 takes 2 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x7 takes 2 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x8 takes 2 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x9 takes 2 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x10 takes 2 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1_64 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1Xor takes 3 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x1_64Xor takes 3 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x2_64 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x2Xor takes 3 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x2_64Xor takes 3 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x3_64 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x3Xor takes 3 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x3_64Xor takes 3 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x4 takes 3 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x5 takes 3 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x6 takes 3 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x7 takes 3 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x8 takes 3 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x9 takes 3 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x10 takes 3 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1_64 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1Xor takes 4 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x1_64Xor takes 4 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x2_64 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x2Xor takes 4 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x2_64Xor takes 4 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x3_64 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x3Xor takes 4 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x3_64Xor takes 4 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x4 takes 4 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x5 takes 4 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x6 takes 4 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x7 takes 4 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x9 takes 4 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x10 takes 4 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1_64 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1Xor takes 5 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x1_64Xor takes 5 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x2_64 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x2Xor takes 5 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x2_64Xor takes 5 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x3_64 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x3Xor takes 5 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x3_64Xor takes 5 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x4 takes 5 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x5 takes 5 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x6 takes 5 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x7 takes 5 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x8 takes 5 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x9 takes 5 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x10 takes 5 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1_64 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1Xor takes 6 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x1_64Xor takes 6 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x2_64 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x2Xor takes 6 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x2_64Xor takes 6 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x3_64 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x3Xor takes 6 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x3_64Xor takes 6 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x4 takes 6 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x5 takes 6 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x6 takes 6 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x7 takes 6 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x8 takes 6 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x9 takes 6 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x10 takes 6 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1_64 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1Xor takes 7 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x1_64Xor takes 7 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x2_64 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x2Xor takes 7 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x2_64Xor takes 7 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x3_64 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x3Xor takes 7 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x3_64Xor takes 7 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x4 takes 7 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x5 takes 7 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x6 takes 7 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x7 takes 7 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x8 takes 7 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x9 takes 7 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x10 takes 7 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1_64 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1Xor takes 8 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x1_64Xor takes 8 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x2_64 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x2Xor takes 8 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x2_64Xor takes 8 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x3_64 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x3Xor takes 8 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x3_64Xor takes 8 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x4 takes 8 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x5 takes 8 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x6 takes 8 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x7 takes 8 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x8 takes 8 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x9 takes 8 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x10 takes 8 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1_64 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1Xor takes 9 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x1_64Xor takes 9 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x2_64 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x2Xor takes 9 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x2_64Xor takes 9 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x3_64 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x3Xor takes 9 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x3_64Xor takes 9 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x4 takes 9 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x5 takes 9 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x6 takes 9 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x7 takes 9 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x8 takes 9 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x9 takes 9 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x10 takes 9 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1_64 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1Xor takes 10 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x1_64Xor takes 10 inputs and produces 1 outputs. -// //go:noescape func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x2_64 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x2Xor takes 10 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x2_64Xor takes 10 inputs and produces 2 outputs. -// //go:noescape func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x3_64 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x3Xor takes 10 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x3_64Xor takes 10 inputs and produces 3 outputs. -// //go:noescape func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x4 takes 10 inputs and produces 4 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs. -// //go:noescape func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x5 takes 10 inputs and produces 5 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs. -// //go:noescape func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x6 takes 10 inputs and produces 6 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs. -// //go:noescape func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x7 takes 10 inputs and produces 7 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs. -// //go:noescape func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x8 takes 10 inputs and produces 8 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs. -// //go:noescape func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x9 takes 10 inputs and produces 9 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs. -// //go:noescape func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x10 takes 10 inputs and produces 10 outputs. // The output is initialized to 0. -// //go:noescape func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. -// //go:noescape func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) - -//go:noescape -func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8) - -//go:noescape -func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8) - -//go:noescape -func mulgf16_avx2(x []byte, y []byte, table *[128]uint8) - -//go:noescape -func ifftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func fftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) - -//go:noescape -func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) - -//go:noescape -func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) - -//go:noescape -func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8) diff --git a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s index 890461e..36e885f 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s +++ b/vendor/github.com/klauspost/reedsolomon/galois_gen_amd64.s @@ -1,23 +1,12 @@ // Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. -//go:build !appengine && !noasm && !nogen && gc +// +build !appengine +// +build !noasm +// +build !nogen +// +build gc #include "textflag.h" -// func _dummy_() -TEXT ·_dummy_(SB), $0 -#ifdef GOAMD64_v4 -#define XOR3WAY(ignore, a, b, dst) \ - VPTERNLOGD $0x96, a, b, dst - -#else -#define XOR3WAY(ignore, a, b, dst) \ - VPXOR a, dst, dst \ - VPXOR b, dst, dst - -#endif - RET - // func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 @@ -130,7 +119,7 @@ mulAvxTwo_1x1_64_end: RET // func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -167,7 +156,8 @@ mulAvxTwo_1x1Xor_loop: VMOVDQU (DX), Y2 VPSHUFB Y4, Y0, Y4 VPSHUFB Y5, Y1, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 1 outputs VMOVDQU Y2, (DX) @@ -182,7 +172,7 @@ mulAvxTwo_1x1Xor_end: RET // func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -229,8 +219,10 @@ mulAvxTwo_1x1_64Xor_loop: VPSHUFB Y7, Y0, Y7 VPSHUFB Y6, Y1, Y6 VPSHUFB Y8, Y1, Y8 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 1 outputs VMOVDQU Y2, (DX) @@ -379,7 +371,7 @@ mulAvxTwo_1x2_64_end: RET // func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -420,11 +412,13 @@ mulAvxTwo_1x2Xor_loop: VMOVDQU (BX), Y4 VPSHUFB Y9, Y0, Y7 VPSHUFB Y10, Y1, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU (DX), Y5 VPSHUFB Y9, Y2, Y7 VPSHUFB Y10, Y3, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 2 outputs VMOVDQU Y4, (BX) @@ -441,7 +435,7 @@ mulAvxTwo_1x2Xor_end: RET // func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -492,16 +486,20 @@ mulAvxTwo_1x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (SI) @@ -675,7 +673,7 @@ mulAvxTwo_1x3_64_end: RET // func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -720,15 +718,18 @@ mulAvxTwo_1x3Xor_loop: VMOVDQU (BX), Y6 VPSHUFB Y12, Y0, Y10 VPSHUFB Y13, Y1, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU (SI), Y7 VPSHUFB Y12, Y2, Y10 VPSHUFB Y13, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU (DX), Y8 VPSHUFB Y12, Y4, Y10 VPSHUFB Y13, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 3 outputs VMOVDQU Y6, (BX) @@ -747,7 +748,7 @@ mulAvxTwo_1x3Xor_end: RET // func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -802,24 +803,30 @@ mulAvxTwo_1x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (SI) @@ -919,7 +926,7 @@ mulAvxTwo_1x4_end: RET // func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -962,25 +969,29 @@ mulAvxTwo_1x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (BX), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (SI) @@ -1088,7 +1099,7 @@ mulAvxTwo_1x5_end: RET // func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -1133,31 +1144,36 @@ mulAvxTwo_1x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU (BX), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (SI) @@ -1276,7 +1292,7 @@ mulAvxTwo_1x6_end: RET // func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -1323,37 +1339,43 @@ mulAvxTwo_1x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU (BX), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (SI) @@ -1483,7 +1505,7 @@ mulAvxTwo_1x7_end: RET // func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -1532,43 +1554,50 @@ mulAvxTwo_1x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU (BX), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (SI) @@ -1709,7 +1738,7 @@ mulAvxTwo_1x8_end: RET // func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -1760,49 +1789,57 @@ mulAvxTwo_1x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU (BX), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (SI) @@ -1954,7 +1991,7 @@ mulAvxTwo_1x9_end: RET // func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -2007,55 +2044,64 @@ mulAvxTwo_1x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU (BX), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (SI) @@ -2218,7 +2264,7 @@ mulAvxTwo_1x10_end: RET // func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -2273,61 +2319,71 @@ mulAvxTwo_1x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU (R14), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU (BX), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs VMOVDQU Y0, (SI) @@ -2360,7 +2416,7 @@ mulAvxTwo_1x10Xor_end: RET // func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2410,7 +2466,8 @@ mulAvxTwo_2x1_loop: VPAND Y5, Y7, Y7 VPSHUFB Y6, Y2, Y6 VPSHUFB Y7, Y3, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 1 outputs VMOVDQU Y4, (BX) @@ -2425,7 +2482,7 @@ mulAvxTwo_2x1_end: RET // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2489,8 +2546,10 @@ mulAvxTwo_2x1_64_loop: VPSHUFB Y9, Y2, Y9 VPSHUFB Y8, Y3, Y8 VPSHUFB Y10, Y3, Y10 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 1 outputs VMOVDQU Y4, (BX) @@ -2506,7 +2565,7 @@ mulAvxTwo_2x1_64_end: RET // func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2547,7 +2606,8 @@ mulAvxTwo_2x1Xor_loop: VMOVDQU (BX), Y4 VPSHUFB Y6, Y0, Y6 VPSHUFB Y7, Y1, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (CX), Y6 @@ -2557,7 +2617,8 @@ mulAvxTwo_2x1Xor_loop: VPAND Y5, Y7, Y7 VPSHUFB Y6, Y2, Y6 VPSHUFB Y7, Y3, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 1 outputs VMOVDQU Y4, (BX) @@ -2572,7 +2633,7 @@ mulAvxTwo_2x1Xor_end: RET // func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2623,8 +2684,10 @@ mulAvxTwo_2x1_64Xor_loop: VPSHUFB Y9, Y0, Y9 VPSHUFB Y8, Y1, Y8 VPSHUFB Y10, Y1, Y10 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (CX), Y7 @@ -2640,8 +2703,10 @@ mulAvxTwo_2x1_64Xor_loop: VPSHUFB Y9, Y2, Y9 VPSHUFB Y8, Y3, Y8 VPSHUFB Y10, Y3, Y10 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 1 outputs VMOVDQU Y4, (BX) @@ -2657,7 +2722,7 @@ mulAvxTwo_2x1_64Xor_end: RET // func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2716,10 +2781,12 @@ mulAvxTwo_2x2_loop: VPAND Y10, Y14, Y14 VPSHUFB Y13, Y4, Y11 VPSHUFB Y14, Y5, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VPSHUFB Y13, Y6, Y11 VPSHUFB Y14, Y7, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 2 outputs VMOVDQU Y8, (SI) @@ -2736,7 +2803,7 @@ mulAvxTwo_2x2_end: RET // func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -2810,16 +2877,20 @@ mulAvxTwo_2x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (DI) @@ -2838,7 +2909,7 @@ mulAvxTwo_2x2_64_end: RET // func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -2885,11 +2956,13 @@ mulAvxTwo_2x2Xor_loop: VMOVDQU (SI), Y8 VPSHUFB Y13, Y0, Y11 VPSHUFB Y14, Y1, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU (BX), Y9 VPSHUFB Y13, Y2, Y11 VPSHUFB Y14, Y3, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (CX), Y13 @@ -2899,10 +2972,12 @@ mulAvxTwo_2x2Xor_loop: VPAND Y10, Y14, Y14 VPSHUFB Y13, Y4, Y11 VPSHUFB Y14, Y5, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VPSHUFB Y13, Y6, Y11 VPSHUFB Y14, Y7, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 2 outputs VMOVDQU Y8, (SI) @@ -2919,7 +2994,7 @@ mulAvxTwo_2x2Xor_end: RET // func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -2972,16 +3047,20 @@ mulAvxTwo_2x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (DX), Y9 @@ -2999,16 +3078,20 @@ mulAvxTwo_2x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (DI) @@ -3027,7 +3110,7 @@ mulAvxTwo_2x2_64Xor_end: RET // func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3091,17 +3174,20 @@ mulAvxTwo_2x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (DI) @@ -3120,7 +3206,7 @@ mulAvxTwo_2x3_end: RET // func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3204,24 +3290,30 @@ mulAvxTwo_2x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (DI) @@ -3243,7 +3335,7 @@ mulAvxTwo_2x3_64_end: RET // func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3286,19 +3378,22 @@ mulAvxTwo_2x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (SI), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (DX), Y6 @@ -3310,17 +3405,20 @@ mulAvxTwo_2x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (DI) @@ -3339,7 +3437,7 @@ mulAvxTwo_2x3Xor_end: RET // func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3396,24 +3494,30 @@ mulAvxTwo_2x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (DX), Y11 @@ -3431,24 +3535,30 @@ mulAvxTwo_2x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (DI) @@ -3470,7 +3580,7 @@ mulAvxTwo_2x3_64Xor_end: RET // func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3541,22 +3651,26 @@ mulAvxTwo_2x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (DI) @@ -3577,7 +3691,7 @@ mulAvxTwo_2x4_end: RET // func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3622,25 +3736,29 @@ mulAvxTwo_2x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (SI), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (DX), Y7 @@ -3652,22 +3770,26 @@ mulAvxTwo_2x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (DI) @@ -3688,7 +3810,7 @@ mulAvxTwo_2x4Xor_end: RET // func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3766,27 +3888,32 @@ mulAvxTwo_2x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (DI) @@ -3809,7 +3936,7 @@ mulAvxTwo_2x5_end: RET // func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -3856,31 +3983,36 @@ mulAvxTwo_2x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU (SI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (DX), Y8 @@ -3892,27 +4024,32 @@ mulAvxTwo_2x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (DI) @@ -3935,7 +4072,7 @@ mulAvxTwo_2x5Xor_end: RET // func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4020,32 +4157,38 @@ mulAvxTwo_2x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (DI) @@ -4070,7 +4213,7 @@ mulAvxTwo_2x6_end: RET // func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4119,37 +4262,43 @@ mulAvxTwo_2x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU (SI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y9 @@ -4161,32 +4310,38 @@ mulAvxTwo_2x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (DI) @@ -4211,7 +4366,7 @@ mulAvxTwo_2x6Xor_end: RET // func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4303,37 +4458,44 @@ mulAvxTwo_2x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (DI) @@ -4360,7 +4522,7 @@ mulAvxTwo_2x7_end: RET // func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4411,43 +4573,50 @@ mulAvxTwo_2x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU (SI), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y10 @@ -4459,37 +4628,44 @@ mulAvxTwo_2x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (DI) @@ -4516,7 +4692,7 @@ mulAvxTwo_2x7Xor_end: RET // func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4615,42 +4791,50 @@ mulAvxTwo_2x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (DI) @@ -4679,7 +4863,7 @@ mulAvxTwo_2x8_end: RET // func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4732,49 +4916,57 @@ mulAvxTwo_2x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU (SI), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y11 @@ -4786,42 +4978,50 @@ mulAvxTwo_2x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (DI) @@ -4850,7 +5050,7 @@ mulAvxTwo_2x8Xor_end: RET // func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -4956,47 +5156,56 @@ mulAvxTwo_2x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (DI) @@ -5027,7 +5236,7 @@ mulAvxTwo_2x9_end: RET // func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -5082,55 +5291,64 @@ mulAvxTwo_2x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU (SI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (DX), Y12 @@ -5142,47 +5360,56 @@ mulAvxTwo_2x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (DI) @@ -5213,7 +5440,7 @@ mulAvxTwo_2x9Xor_end: RET // func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -5326,52 +5553,62 @@ mulAvxTwo_2x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs VMOVDQU Y0, (DI) @@ -5404,7 +5641,7 @@ mulAvxTwo_2x10_end: RET // func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -5461,61 +5698,71 @@ mulAvxTwo_2x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (DX), Y13 @@ -5527,52 +5774,62 @@ mulAvxTwo_2x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs VMOVDQU Y0, (DI) @@ -5605,7 +5862,7 @@ mulAvxTwo_2x10Xor_end: RET // func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -5659,7 +5916,8 @@ mulAvxTwo_3x1_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y2, Y8 VPSHUFB Y9, Y3, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y8 @@ -5669,7 +5927,8 @@ mulAvxTwo_3x1_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y4, Y8 VPSHUFB Y9, Y5, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 1 outputs VMOVDQU Y6, (SI) @@ -5684,7 +5943,7 @@ mulAvxTwo_3x1_end: RET // func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -5750,8 +6009,10 @@ mulAvxTwo_3x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 @@ -5769,8 +6030,10 @@ mulAvxTwo_3x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (DI) @@ -5786,7 +6049,7 @@ mulAvxTwo_3x1_64_end: RET // func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -5831,7 +6094,8 @@ mulAvxTwo_3x1Xor_loop: VMOVDQU (SI), Y6 VPSHUFB Y8, Y0, Y8 VPSHUFB Y9, Y1, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y8 @@ -5841,7 +6105,8 @@ mulAvxTwo_3x1Xor_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y2, Y8 VPSHUFB Y9, Y3, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y8 @@ -5851,7 +6116,8 @@ mulAvxTwo_3x1Xor_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y4, Y8 VPSHUFB Y9, Y5, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 1 outputs VMOVDQU Y6, (SI) @@ -5866,7 +6132,7 @@ mulAvxTwo_3x1Xor_end: RET // func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -5917,8 +6183,10 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 @@ -5936,8 +6204,10 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 @@ -5955,8 +6225,10 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (DI) @@ -5972,7 +6244,7 @@ mulAvxTwo_3x1_64Xor_end: RET // func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6031,12 +6303,14 @@ mulAvxTwo_3x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DX), Y5 @@ -6048,12 +6322,14 @@ mulAvxTwo_3x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R8) @@ -6070,7 +6346,7 @@ mulAvxTwo_3x2_end: RET // func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6146,16 +6422,20 @@ mulAvxTwo_3x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DX), Y9 @@ -6173,16 +6453,20 @@ mulAvxTwo_3x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R8) @@ -6201,7 +6485,7 @@ mulAvxTwo_3x2_64_end: RET // func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6244,13 +6528,15 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -6262,12 +6548,14 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DX), Y5 @@ -6279,12 +6567,14 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R8) @@ -6301,7 +6591,7 @@ mulAvxTwo_3x2Xor_end: RET // func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6356,16 +6646,20 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 @@ -6383,16 +6677,20 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DX), Y9 @@ -6410,16 +6708,20 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R8) @@ -6438,7 +6740,7 @@ mulAvxTwo_3x2_64Xor_end: RET // func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6504,17 +6806,20 @@ mulAvxTwo_3x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DX), Y6 @@ -6526,17 +6831,20 @@ mulAvxTwo_3x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R8) @@ -6555,7 +6863,7 @@ mulAvxTwo_3x3_end: RET // func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6641,24 +6949,30 @@ mulAvxTwo_3x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DX), Y11 @@ -6676,24 +6990,30 @@ mulAvxTwo_3x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R8) @@ -6715,7 +7035,7 @@ mulAvxTwo_3x3_64_end: RET // func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6760,19 +7080,22 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (DI), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -6784,17 +7107,20 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DX), Y6 @@ -6806,17 +7132,20 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R8) @@ -6835,7 +7164,7 @@ mulAvxTwo_3x3Xor_end: RET // func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -6894,24 +7223,30 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 @@ -6929,24 +7264,30 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DX), Y11 @@ -6964,24 +7305,30 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R8) @@ -7003,7 +7350,7 @@ mulAvxTwo_3x3_64Xor_end: RET // func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7076,22 +7423,26 @@ mulAvxTwo_3x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 @@ -7103,22 +7454,26 @@ mulAvxTwo_3x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R8) @@ -7139,7 +7494,7 @@ mulAvxTwo_3x4_end: RET // func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7186,25 +7541,29 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (DI), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -7216,22 +7575,26 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 @@ -7243,22 +7606,26 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R8) @@ -7279,7 +7646,7 @@ mulAvxTwo_3x4Xor_end: RET // func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7359,27 +7726,32 @@ mulAvxTwo_3x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 @@ -7391,27 +7763,32 @@ mulAvxTwo_3x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R8) @@ -7434,7 +7811,7 @@ mulAvxTwo_3x5_end: RET // func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7483,31 +7860,36 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU (DI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -7519,27 +7901,32 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 @@ -7551,27 +7938,32 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R8) @@ -7594,7 +7986,7 @@ mulAvxTwo_3x5Xor_end: RET // func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7681,32 +8073,38 @@ mulAvxTwo_3x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 @@ -7718,32 +8116,38 @@ mulAvxTwo_3x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R8) @@ -7768,7 +8172,7 @@ mulAvxTwo_3x6_end: RET // func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -7819,37 +8223,43 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU (DI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -7861,32 +8271,38 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 @@ -7898,32 +8314,38 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R8) @@ -7948,7 +8370,7 @@ mulAvxTwo_3x6Xor_end: RET // func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8042,37 +8464,44 @@ mulAvxTwo_3x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 @@ -8084,37 +8513,44 @@ mulAvxTwo_3x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R8) @@ -8141,7 +8577,7 @@ mulAvxTwo_3x7_end: RET // func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8194,43 +8630,50 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU (DI), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -8242,37 +8685,44 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 @@ -8284,37 +8734,44 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R8) @@ -8341,7 +8798,7 @@ mulAvxTwo_3x7Xor_end: RET // func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8442,42 +8899,50 @@ mulAvxTwo_3x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y11 @@ -8489,42 +8954,50 @@ mulAvxTwo_3x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (R8) @@ -8553,7 +9026,7 @@ mulAvxTwo_3x8_end: RET // func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -8608,49 +9081,57 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU (DI), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -8662,42 +9143,50 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y11 @@ -8709,42 +9198,50 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (R8) @@ -8773,7 +9270,7 @@ mulAvxTwo_3x8Xor_end: RET // func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -8881,47 +9378,56 @@ mulAvxTwo_3x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y12 @@ -8933,47 +9439,56 @@ mulAvxTwo_3x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (R8) @@ -9004,7 +9519,7 @@ mulAvxTwo_3x9_end: RET // func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -9061,55 +9576,64 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU (R15), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU (DI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -9121,47 +9645,56 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y12 @@ -9173,47 +9706,56 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (R8) @@ -9244,7 +9786,7 @@ mulAvxTwo_3x9Xor_end: RET // func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -9361,52 +9903,62 @@ mulAvxTwo_3x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 @@ -9418,52 +9970,62 @@ mulAvxTwo_3x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs VMOVDQU Y0, (DI) @@ -9496,7 +10058,7 @@ mulAvxTwo_3x10_end: RET // func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -9557,61 +10119,71 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y13 @@ -9623,52 +10195,62 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 @@ -9680,52 +10262,62 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs VMOVDQU Y0, (DI) @@ -9758,7 +10350,7 @@ mulAvxTwo_3x10Xor_end: RET // func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -9816,7 +10408,8 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y2, Y10 VPSHUFB Y11, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 @@ -9826,7 +10419,8 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y4, Y10 VPSHUFB Y11, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y10 @@ -9836,7 +10430,8 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y6, Y10 VPSHUFB Y11, Y7, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 1 outputs VMOVDQU Y8, (DI) @@ -9851,7 +10446,7 @@ mulAvxTwo_4x1_end: RET // func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -9919,8 +10514,10 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -9938,8 +10535,10 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DX), Y6 @@ -9957,8 +10556,10 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R8) @@ -9974,7 +10575,7 @@ mulAvxTwo_4x1_64_end: RET // func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -10023,7 +10624,8 @@ mulAvxTwo_4x1Xor_loop: VMOVDQU (DI), Y8 VPSHUFB Y10, Y0, Y10 VPSHUFB Y11, Y1, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y10 @@ -10033,7 +10635,8 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y2, Y10 VPSHUFB Y11, Y3, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 @@ -10043,7 +10646,8 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y4, Y10 VPSHUFB Y11, Y5, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y10 @@ -10053,7 +10657,8 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y6, Y10 VPSHUFB Y11, Y7, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 1 outputs VMOVDQU Y8, (DI) @@ -10068,7 +10673,7 @@ mulAvxTwo_4x1Xor_end: RET // func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10121,8 +10726,10 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 @@ -10140,8 +10747,10 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -10159,8 +10768,10 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DX), Y6 @@ -10178,8 +10789,10 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R8) @@ -10195,7 +10808,7 @@ mulAvxTwo_4x1_64Xor_end: RET // func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10256,12 +10869,14 @@ mulAvxTwo_4x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -10273,12 +10888,14 @@ mulAvxTwo_4x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DX), Y5 @@ -10290,12 +10907,14 @@ mulAvxTwo_4x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R9) @@ -10312,7 +10931,7 @@ mulAvxTwo_4x2_end: RET // func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10390,16 +11009,20 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -10417,16 +11040,20 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 @@ -10444,16 +11071,20 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R9) @@ -10472,7 +11103,7 @@ mulAvxTwo_4x2_64_end: RET // func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10517,13 +11148,15 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -10535,12 +11168,14 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -10552,12 +11187,14 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DX), Y5 @@ -10569,12 +11206,14 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R9) @@ -10591,7 +11230,7 @@ mulAvxTwo_4x2Xor_end: RET // func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10648,16 +11287,20 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 @@ -10675,16 +11318,20 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -10702,16 +11349,20 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 @@ -10729,16 +11380,20 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R9) @@ -10757,7 +11412,7 @@ mulAvxTwo_4x2_64Xor_end: RET // func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10825,17 +11480,20 @@ mulAvxTwo_4x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -10847,17 +11505,20 @@ mulAvxTwo_4x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y6 @@ -10869,17 +11530,20 @@ mulAvxTwo_4x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R9) @@ -10898,7 +11562,7 @@ mulAvxTwo_4x3_end: RET // func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -10986,24 +11650,30 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -11021,24 +11691,30 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DX), Y11 @@ -11056,24 +11732,30 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R9) @@ -11095,7 +11777,7 @@ mulAvxTwo_4x3_64_end: RET // func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11142,19 +11824,22 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -11166,17 +11851,20 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -11188,17 +11876,20 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y6 @@ -11210,17 +11901,20 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R9) @@ -11239,7 +11933,7 @@ mulAvxTwo_4x3Xor_end: RET // func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11300,24 +11994,30 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 @@ -11335,24 +12035,30 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -11370,24 +12076,30 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DX), Y11 @@ -11405,24 +12117,30 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R9) @@ -11444,7 +12162,7 @@ mulAvxTwo_4x3_64Xor_end: RET // func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11519,22 +12237,26 @@ mulAvxTwo_4x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -11546,22 +12268,26 @@ mulAvxTwo_4x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y7 @@ -11573,22 +12299,26 @@ mulAvxTwo_4x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R9) @@ -11609,7 +12339,7 @@ mulAvxTwo_4x4_end: RET // func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11658,25 +12388,29 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (R8), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -11688,22 +12422,26 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -11715,22 +12453,26 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y7 @@ -11742,22 +12484,26 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R9) @@ -11778,7 +12524,7 @@ mulAvxTwo_4x4Xor_end: RET // func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -11860,27 +12606,32 @@ mulAvxTwo_4x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -11892,27 +12643,32 @@ mulAvxTwo_4x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 @@ -11924,27 +12680,32 @@ mulAvxTwo_4x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R9) @@ -11967,7 +12728,7 @@ mulAvxTwo_4x5_end: RET // func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12018,31 +12779,36 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU (R8), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -12054,27 +12820,32 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -12086,27 +12857,32 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 @@ -12118,27 +12894,32 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R9) @@ -12161,7 +12942,7 @@ mulAvxTwo_4x5Xor_end: RET // func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12250,32 +13031,38 @@ mulAvxTwo_4x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -12287,32 +13074,38 @@ mulAvxTwo_4x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 @@ -12324,32 +13117,38 @@ mulAvxTwo_4x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R9) @@ -12374,7 +13173,7 @@ mulAvxTwo_4x6_end: RET // func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12427,37 +13226,43 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU (R8), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -12469,32 +13274,38 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -12506,32 +13317,38 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 @@ -12543,32 +13360,38 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R9) @@ -12593,7 +13416,7 @@ mulAvxTwo_4x6Xor_end: RET // func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12689,37 +13512,44 @@ mulAvxTwo_4x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -12731,37 +13561,44 @@ mulAvxTwo_4x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 @@ -12773,37 +13610,44 @@ mulAvxTwo_4x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R9) @@ -12830,7 +13674,7 @@ mulAvxTwo_4x7_end: RET // func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -12885,43 +13729,50 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU (R8), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -12933,37 +13784,44 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -12975,37 +13833,44 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 @@ -13017,37 +13882,44 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R9) @@ -13074,7 +13946,7 @@ mulAvxTwo_4x7Xor_end: RET // func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -13177,42 +14049,50 @@ mulAvxTwo_4x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -13224,42 +14104,50 @@ mulAvxTwo_4x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 @@ -13271,42 +14159,50 @@ mulAvxTwo_4x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (R9) @@ -13335,7 +14231,7 @@ mulAvxTwo_4x8_end: RET // func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -13392,49 +14288,57 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU (R15), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU (R8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -13446,42 +14350,50 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -13493,42 +14405,50 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 @@ -13540,42 +14460,50 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (R9) @@ -13604,7 +14532,7 @@ mulAvxTwo_4x8Xor_end: RET // func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -13716,47 +14644,56 @@ mulAvxTwo_4x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y12 @@ -13768,47 +14705,56 @@ mulAvxTwo_4x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y12 @@ -13820,47 +14766,56 @@ mulAvxTwo_4x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (R8) @@ -13891,7 +14846,7 @@ mulAvxTwo_4x9_end: RET // func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -13952,55 +14907,64 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU (R15), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU (DI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y12 @@ -14012,47 +14976,56 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y12 @@ -14064,47 +15037,56 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y12 @@ -14116,47 +15098,56 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs VMOVDQU Y0, (R8) @@ -14187,7 +15178,7 @@ mulAvxTwo_4x9Xor_end: RET // func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -14282,52 +15273,62 @@ mulAvxTwo_4x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -14339,52 +15340,62 @@ mulAvxTwo_4x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y13 @@ -14396,52 +15407,62 @@ mulAvxTwo_4x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R8), R10 @@ -14475,7 +15496,7 @@ mulAvxTwo_4x10_end: RET // func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -14515,70 +15536,80 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 MOVQ 24(R8), R10 VMOVDQU (R10)(R9*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 MOVQ 48(R8), R10 VMOVDQU (R10)(R9*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 MOVQ 72(R8), R10 VMOVDQU (R10)(R9*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 MOVQ 96(R8), R10 VMOVDQU (R10)(R9*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 MOVQ 120(R8), R10 VMOVDQU (R10)(R9*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 MOVQ 144(R8), R10 VMOVDQU (R10)(R9*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 MOVQ 168(R8), R10 VMOVDQU (R10)(R9*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 MOVQ 192(R8), R10 VMOVDQU (R10)(R9*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 MOVQ 216(R8), R10 VMOVDQU (R10)(R9*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -14590,52 +15621,62 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -14647,52 +15688,62 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y13 @@ -14704,52 +15755,62 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R8), R10 @@ -14783,7 +15844,7 @@ mulAvxTwo_4x10Xor_end: RET // func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -14845,7 +15906,8 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y2, Y12 VPSHUFB Y13, Y3, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y12 @@ -14855,7 +15917,8 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y4, Y12 VPSHUFB Y13, Y5, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y12 @@ -14865,7 +15928,8 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y6, Y12 VPSHUFB Y13, Y7, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (CX), Y12 @@ -14875,7 +15939,8 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y8, Y12 VPSHUFB Y13, Y9, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Store 1 outputs VMOVDQU Y10, (R8) @@ -14890,7 +15955,7 @@ mulAvxTwo_5x1_end: RET // func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -14960,8 +16025,10 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -14979,8 +16046,10 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -14998,8 +16067,10 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (DX), Y6 @@ -15017,8 +16088,10 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R9) @@ -15034,7 +16107,7 @@ mulAvxTwo_5x1_64_end: RET // func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -15087,7 +16160,8 @@ mulAvxTwo_5x1Xor_loop: VMOVDQU (R8), Y10 VPSHUFB Y12, Y0, Y12 VPSHUFB Y13, Y1, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y12 @@ -15097,7 +16171,8 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y2, Y12 VPSHUFB Y13, Y3, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y12 @@ -15107,7 +16182,8 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y4, Y12 VPSHUFB Y13, Y5, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y12 @@ -15117,7 +16193,8 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y6, Y12 VPSHUFB Y13, Y7, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (CX), Y12 @@ -15127,7 +16204,8 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y8, Y12 VPSHUFB Y13, Y9, Y13 - XOR3WAY( $0x00, Y12, Y13, Y10) + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 // Store 1 outputs VMOVDQU Y10, (R8) @@ -15142,7 +16220,7 @@ mulAvxTwo_5x1Xor_end: RET // func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -15197,8 +16275,10 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 @@ -15216,8 +16296,10 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -15235,8 +16317,10 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -15254,8 +16338,10 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (DX), Y6 @@ -15273,8 +16359,10 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R9) @@ -15290,7 +16378,7 @@ mulAvxTwo_5x1_64Xor_end: RET // func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -15353,12 +16441,14 @@ mulAvxTwo_5x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -15370,12 +16460,14 @@ mulAvxTwo_5x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -15387,12 +16479,14 @@ mulAvxTwo_5x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (DX), Y5 @@ -15404,12 +16498,14 @@ mulAvxTwo_5x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R10) @@ -15426,7 +16522,7 @@ mulAvxTwo_5x2_end: RET // func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -15506,16 +16602,20 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -15533,16 +16633,20 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -15560,16 +16664,20 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (DX), Y9 @@ -15587,16 +16695,20 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R10) @@ -15615,7 +16727,7 @@ mulAvxTwo_5x2_64_end: RET // func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -15662,13 +16774,15 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -15680,12 +16794,14 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -15697,12 +16813,14 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -15714,12 +16832,14 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (DX), Y5 @@ -15731,12 +16851,14 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R10) @@ -15753,7 +16875,7 @@ mulAvxTwo_5x2Xor_end: RET // func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -15812,16 +16934,20 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 @@ -15839,16 +16965,20 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -15866,16 +16996,20 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -15893,16 +17027,20 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (DX), Y9 @@ -15920,16 +17058,20 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R10) @@ -15948,7 +17090,7 @@ mulAvxTwo_5x2_64Xor_end: RET // func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16018,17 +17160,20 @@ mulAvxTwo_5x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -16040,17 +17185,20 @@ mulAvxTwo_5x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -16062,17 +17210,20 @@ mulAvxTwo_5x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (DX), Y6 @@ -16084,17 +17235,20 @@ mulAvxTwo_5x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R10) @@ -16113,7 +17267,7 @@ mulAvxTwo_5x3_end: RET // func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16203,24 +17357,30 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -16238,24 +17398,30 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -16273,24 +17439,30 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (DX), Y11 @@ -16308,24 +17480,30 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R10) @@ -16347,7 +17525,7 @@ mulAvxTwo_5x3_64_end: RET // func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16396,19 +17574,22 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -16420,17 +17601,20 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -16442,17 +17626,20 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -16464,17 +17651,20 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (DX), Y6 @@ -16486,17 +17676,20 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R10) @@ -16515,7 +17708,7 @@ mulAvxTwo_5x3Xor_end: RET // func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16578,24 +17771,30 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 @@ -16613,24 +17812,30 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -16648,24 +17853,30 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -16683,24 +17894,30 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (DX), Y11 @@ -16718,24 +17935,30 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R10) @@ -16757,7 +17980,7 @@ mulAvxTwo_5x3_64Xor_end: RET // func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -16834,22 +18057,26 @@ mulAvxTwo_5x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -16861,22 +18088,26 @@ mulAvxTwo_5x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -16888,22 +18119,26 @@ mulAvxTwo_5x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (DX), Y7 @@ -16915,22 +18150,26 @@ mulAvxTwo_5x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R10) @@ -16951,7 +18190,7 @@ mulAvxTwo_5x4_end: RET // func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17002,25 +18241,29 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -17032,22 +18275,26 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -17059,22 +18306,26 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -17086,22 +18337,26 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (DX), Y7 @@ -17113,22 +18368,26 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R10) @@ -17149,7 +18408,7 @@ mulAvxTwo_5x4Xor_end: RET // func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17233,27 +18492,32 @@ mulAvxTwo_5x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -17265,27 +18529,32 @@ mulAvxTwo_5x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -17297,27 +18566,32 @@ mulAvxTwo_5x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y8 @@ -17329,27 +18603,32 @@ mulAvxTwo_5x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R10) @@ -17372,7 +18651,7 @@ mulAvxTwo_5x5_end: RET // func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17425,31 +18704,36 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU (R9), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -17461,27 +18745,32 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -17493,27 +18782,32 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -17525,27 +18819,32 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y8 @@ -17557,27 +18856,32 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R10) @@ -17600,7 +18904,7 @@ mulAvxTwo_5x5Xor_end: RET // func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17691,32 +18995,38 @@ mulAvxTwo_5x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -17728,32 +19038,38 @@ mulAvxTwo_5x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -17765,32 +19081,38 @@ mulAvxTwo_5x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y9 @@ -17802,32 +19124,38 @@ mulAvxTwo_5x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R10) @@ -17852,7 +19180,7 @@ mulAvxTwo_5x6_end: RET // func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -17907,37 +19235,43 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU (R9), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -17949,32 +19283,38 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -17986,32 +19326,38 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -18023,32 +19369,38 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y9 @@ -18060,32 +19412,38 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R10) @@ -18110,7 +19468,7 @@ mulAvxTwo_5x6Xor_end: RET // func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -18208,37 +19566,44 @@ mulAvxTwo_5x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -18250,37 +19615,44 @@ mulAvxTwo_5x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -18292,37 +19664,44 @@ mulAvxTwo_5x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (DX), Y10 @@ -18334,37 +19713,44 @@ mulAvxTwo_5x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R10) @@ -18391,7 +19777,7 @@ mulAvxTwo_5x7_end: RET // func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -18448,43 +19834,50 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU (R15), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU (R9), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -18496,37 +19889,44 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -18538,37 +19938,44 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -18580,37 +19987,44 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (DX), Y10 @@ -18622,37 +20036,44 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R10) @@ -18679,7 +20100,7 @@ mulAvxTwo_5x7Xor_end: RET // func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -18786,42 +20207,50 @@ mulAvxTwo_5x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (SI), Y11 @@ -18833,42 +20262,50 @@ mulAvxTwo_5x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DI), Y11 @@ -18880,42 +20317,50 @@ mulAvxTwo_5x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (AX), Y11 @@ -18927,42 +20372,50 @@ mulAvxTwo_5x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (R9) @@ -18991,7 +20444,7 @@ mulAvxTwo_5x8_end: RET // func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -19052,49 +20505,57 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU (R15), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU (R8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (BX), Y11 @@ -19106,42 +20567,50 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (SI), Y11 @@ -19153,42 +20622,50 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DI), Y11 @@ -19200,42 +20677,50 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (AX), Y11 @@ -19247,42 +20732,50 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs VMOVDQU Y0, (R9) @@ -19311,7 +20804,7 @@ mulAvxTwo_5x8Xor_end: RET // func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -19403,47 +20896,56 @@ mulAvxTwo_5x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -19455,47 +20957,56 @@ mulAvxTwo_5x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -19507,47 +21018,56 @@ mulAvxTwo_5x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y12 @@ -19559,47 +21079,56 @@ mulAvxTwo_5x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R9), R11 @@ -19631,7 +21160,7 @@ mulAvxTwo_5x9_end: RET // func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -19673,63 +21202,72 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -19741,47 +21279,56 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -19793,47 +21340,56 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -19845,47 +21401,56 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y12 @@ -19897,47 +21462,56 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R9), R11 @@ -19969,7 +21543,7 @@ mulAvxTwo_5x9Xor_end: RET // func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -20066,52 +21640,62 @@ mulAvxTwo_5x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -20123,52 +21707,62 @@ mulAvxTwo_5x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -20180,52 +21774,62 @@ mulAvxTwo_5x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y13 @@ -20237,52 +21841,62 @@ mulAvxTwo_5x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R9), R11 @@ -20316,7 +21930,7 @@ mulAvxTwo_5x10_end: RET // func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -20358,70 +21972,80 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 MOVQ 216(R9), R11 VMOVDQU (R11)(R10*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -20433,52 +22057,62 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -20490,52 +22124,62 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -20547,52 +22191,62 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y13 @@ -20604,52 +22258,62 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R9), R11 @@ -20683,7 +22347,7 @@ mulAvxTwo_5x10Xor_end: RET // func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -20749,7 +22413,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y2, Y14 VPSHUFB Y15, Y3, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y14 @@ -20759,7 +22424,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y4, Y14 VPSHUFB Y15, Y5, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y14 @@ -20769,7 +22435,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y6, Y14 VPSHUFB Y15, Y7, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y14 @@ -20779,7 +22446,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y8, Y14 VPSHUFB Y15, Y9, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (CX), Y14 @@ -20789,7 +22457,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y10, Y14 VPSHUFB Y15, Y11, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Store 1 outputs VMOVDQU Y12, (R9) @@ -20804,7 +22473,7 @@ mulAvxTwo_6x1_end: RET // func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -20876,8 +22545,10 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -20895,8 +22566,10 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -20914,8 +22587,10 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -20933,8 +22608,10 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (DX), Y6 @@ -20952,8 +22629,10 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R10) @@ -20969,7 +22648,7 @@ mulAvxTwo_6x1_64_end: RET // func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88 // Loading all tables to registers // Destination kept in GP registers @@ -21026,7 +22705,8 @@ mulAvxTwo_6x1Xor_loop: VMOVDQU (R9), Y12 VPSHUFB Y14, Y0, Y14 VPSHUFB Y15, Y1, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y14 @@ -21036,7 +22716,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y2, Y14 VPSHUFB Y15, Y3, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y14 @@ -21046,7 +22727,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y4, Y14 VPSHUFB Y15, Y5, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y14 @@ -21056,7 +22738,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y6, Y14 VPSHUFB Y15, Y7, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y14 @@ -21066,7 +22749,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y8, Y14 VPSHUFB Y15, Y9, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (CX), Y14 @@ -21076,7 +22760,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y10, Y14 VPSHUFB Y15, Y11, Y15 - XOR3WAY( $0x00, Y14, Y15, Y12) + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 // Store 1 outputs VMOVDQU Y12, (R9) @@ -21091,7 +22776,7 @@ mulAvxTwo_6x1Xor_end: RET // func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -21148,8 +22833,10 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 @@ -21167,8 +22854,10 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -21186,8 +22875,10 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -21205,8 +22896,10 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -21224,8 +22917,10 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (DX), Y6 @@ -21243,8 +22938,10 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R10) @@ -21260,7 +22957,7 @@ mulAvxTwo_6x1_64Xor_end: RET // func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -21325,12 +23022,14 @@ mulAvxTwo_6x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -21342,12 +23041,14 @@ mulAvxTwo_6x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -21359,12 +23060,14 @@ mulAvxTwo_6x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -21376,12 +23079,14 @@ mulAvxTwo_6x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (DX), Y5 @@ -21393,12 +23098,14 @@ mulAvxTwo_6x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R11) @@ -21415,7 +23122,7 @@ mulAvxTwo_6x2_end: RET // func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -21497,16 +23204,20 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -21524,16 +23235,20 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -21551,16 +23266,20 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -21578,16 +23297,20 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (DX), Y9 @@ -21605,16 +23328,20 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R11) @@ -21633,7 +23360,7 @@ mulAvxTwo_6x2_64_end: RET // func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -21682,13 +23409,15 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -21700,12 +23429,14 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -21717,12 +23448,14 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -21734,12 +23467,14 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -21751,12 +23486,14 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (DX), Y5 @@ -21768,12 +23505,14 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R11) @@ -21790,7 +23529,7 @@ mulAvxTwo_6x2Xor_end: RET // func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -21851,16 +23590,20 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 @@ -21878,16 +23621,20 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -21905,16 +23652,20 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -21932,16 +23683,20 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -21959,16 +23714,20 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (DX), Y9 @@ -21986,16 +23745,20 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R11) @@ -22014,7 +23777,7 @@ mulAvxTwo_6x2_64Xor_end: RET // func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -22086,17 +23849,20 @@ mulAvxTwo_6x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -22108,17 +23874,20 @@ mulAvxTwo_6x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -22130,17 +23899,20 @@ mulAvxTwo_6x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 @@ -22152,17 +23924,20 @@ mulAvxTwo_6x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y6 @@ -22174,17 +23949,20 @@ mulAvxTwo_6x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R11) @@ -22203,7 +23981,7 @@ mulAvxTwo_6x3_end: RET // func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -22295,24 +24073,30 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -22330,24 +24114,30 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -22365,24 +24155,30 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 @@ -22400,24 +24196,30 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (DX), Y11 @@ -22435,24 +24237,30 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R11) @@ -22474,7 +24282,7 @@ mulAvxTwo_6x3_64_end: RET // func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -22525,19 +24333,22 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -22549,17 +24360,20 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -22571,17 +24385,20 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -22593,17 +24410,20 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 @@ -22615,17 +24435,20 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y6 @@ -22637,17 +24460,20 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R11) @@ -22666,7 +24492,7 @@ mulAvxTwo_6x3Xor_end: RET // func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -22731,24 +24557,30 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 @@ -22766,24 +24598,30 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -22801,24 +24639,30 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -22836,24 +24680,30 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 @@ -22871,24 +24721,30 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (DX), Y11 @@ -22906,24 +24762,30 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R11) @@ -22945,7 +24807,7 @@ mulAvxTwo_6x3_64Xor_end: RET // func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23024,22 +24886,26 @@ mulAvxTwo_6x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -23051,22 +24917,26 @@ mulAvxTwo_6x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -23078,22 +24948,26 @@ mulAvxTwo_6x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 @@ -23105,22 +24979,26 @@ mulAvxTwo_6x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 @@ -23132,22 +25010,26 @@ mulAvxTwo_6x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R11) @@ -23168,7 +25050,7 @@ mulAvxTwo_6x4_end: RET // func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23221,25 +25103,29 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -23251,22 +25137,26 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -23278,22 +25168,26 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -23305,22 +25199,26 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 @@ -23332,22 +25230,26 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 @@ -23359,22 +25261,26 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R11) @@ -23395,7 +25301,7 @@ mulAvxTwo_6x4Xor_end: RET // func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23481,27 +25387,32 @@ mulAvxTwo_6x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -23513,27 +25424,32 @@ mulAvxTwo_6x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -23545,27 +25461,32 @@ mulAvxTwo_6x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 @@ -23577,27 +25498,32 @@ mulAvxTwo_6x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 @@ -23609,27 +25535,32 @@ mulAvxTwo_6x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R11) @@ -23652,7 +25583,7 @@ mulAvxTwo_6x5_end: RET // func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -23707,31 +25638,36 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -23743,27 +25679,32 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -23775,27 +25716,32 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -23807,27 +25753,32 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 @@ -23839,27 +25790,32 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 @@ -23871,27 +25827,32 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R11) @@ -23914,7 +25875,7 @@ mulAvxTwo_6x5Xor_end: RET // func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -24007,32 +25968,38 @@ mulAvxTwo_6x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -24044,32 +26011,38 @@ mulAvxTwo_6x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -24081,32 +26054,38 @@ mulAvxTwo_6x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 @@ -24118,32 +26097,38 @@ mulAvxTwo_6x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 @@ -24155,32 +26140,38 @@ mulAvxTwo_6x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R11) @@ -24205,7 +26196,7 @@ mulAvxTwo_6x6_end: RET // func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -24262,37 +26253,43 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU (R15), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU (R10), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -24304,32 +26301,38 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -24341,32 +26344,38 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -24378,32 +26387,38 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 @@ -24415,32 +26430,38 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 @@ -24452,32 +26473,38 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R11) @@ -24502,7 +26529,7 @@ mulAvxTwo_6x6Xor_end: RET // func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -24604,37 +26631,44 @@ mulAvxTwo_6x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 @@ -24646,37 +26680,44 @@ mulAvxTwo_6x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 @@ -24688,37 +26729,44 @@ mulAvxTwo_6x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 @@ -24730,37 +26778,44 @@ mulAvxTwo_6x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 @@ -24772,37 +26827,44 @@ mulAvxTwo_6x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R10) @@ -24829,7 +26891,7 @@ mulAvxTwo_6x7_end: RET // func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -24890,43 +26952,50 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU (R15), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU (R9), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y10 @@ -24938,37 +27007,44 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 @@ -24980,37 +27056,44 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 @@ -25022,37 +27105,44 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 @@ -25064,37 +27154,44 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 @@ -25106,37 +27203,44 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs VMOVDQU Y0, (R10) @@ -25163,7 +27267,7 @@ mulAvxTwo_6x7Xor_end: RET // func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -25252,42 +27356,50 @@ mulAvxTwo_6x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -25299,42 +27411,50 @@ mulAvxTwo_6x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -25346,42 +27466,50 @@ mulAvxTwo_6x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -25393,42 +27521,50 @@ mulAvxTwo_6x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y11 @@ -25440,42 +27576,50 @@ mulAvxTwo_6x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R10), R12 @@ -25505,7 +27649,7 @@ mulAvxTwo_6x8_end: RET // func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -25549,56 +27693,64 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -25610,42 +27762,50 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -25657,42 +27817,50 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -25704,42 +27872,50 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -25751,42 +27927,50 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y11 @@ -25798,42 +27982,50 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R10), R12 @@ -25863,7 +28055,7 @@ mulAvxTwo_6x8Xor_end: RET // func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -25957,47 +28149,56 @@ mulAvxTwo_6x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -26009,47 +28210,56 @@ mulAvxTwo_6x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -26061,47 +28271,56 @@ mulAvxTwo_6x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -26113,47 +28332,56 @@ mulAvxTwo_6x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 @@ -26165,47 +28393,56 @@ mulAvxTwo_6x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R10), R12 @@ -26237,7 +28474,7 @@ mulAvxTwo_6x9_end: RET // func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -26281,63 +28518,72 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -26349,47 +28595,56 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -26401,47 +28656,56 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -26453,47 +28717,56 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -26505,47 +28778,56 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 @@ -26557,47 +28839,56 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R10), R12 @@ -26629,7 +28920,7 @@ mulAvxTwo_6x9Xor_end: RET // func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -26728,52 +29019,62 @@ mulAvxTwo_6x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -26785,52 +29086,62 @@ mulAvxTwo_6x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -26842,52 +29153,62 @@ mulAvxTwo_6x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -26899,52 +29220,62 @@ mulAvxTwo_6x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 @@ -26956,52 +29287,62 @@ mulAvxTwo_6x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R10), R12 @@ -27035,7 +29376,7 @@ mulAvxTwo_6x10_end: RET // func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -27079,70 +29420,80 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 MOVQ 216(R10), R12 VMOVDQU (R12)(R11*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -27154,52 +29505,62 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -27211,52 +29572,62 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -27268,52 +29639,62 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -27325,52 +29706,62 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 @@ -27382,52 +29773,62 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R10), R12 @@ -27461,7 +29862,7 @@ mulAvxTwo_6x10Xor_end: RET // func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -27521,7 +29922,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 @@ -27533,7 +29935,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 @@ -27545,7 +29948,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 @@ -27557,7 +29961,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 @@ -27569,7 +29974,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (DX), Y4 @@ -27581,7 +29987,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R11) @@ -27596,7 +30003,7 @@ mulAvxTwo_7x1_end: RET // func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -27670,8 +30077,10 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -27689,8 +30098,10 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -27708,8 +30119,10 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -27727,8 +30140,10 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 @@ -27746,8 +30161,10 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (DX), Y6 @@ -27765,8 +30182,10 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R11) @@ -27782,7 +30201,7 @@ mulAvxTwo_7x1_64_end: RET // func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -27831,7 +30250,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 @@ -27843,7 +30263,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 @@ -27855,7 +30276,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 @@ -27867,7 +30289,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 @@ -27879,7 +30302,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 @@ -27891,7 +30315,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (DX), Y4 @@ -27903,7 +30328,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R11) @@ -27918,7 +30344,7 @@ mulAvxTwo_7x1Xor_end: RET // func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -27977,8 +30403,10 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 @@ -27996,8 +30424,10 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -28015,8 +30445,10 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -28034,8 +30466,10 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -28053,8 +30487,10 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 @@ -28072,8 +30508,10 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (DX), Y6 @@ -28091,8 +30529,10 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R11) @@ -28108,7 +30548,7 @@ mulAvxTwo_7x1_64Xor_end: RET // func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -28175,12 +30615,14 @@ mulAvxTwo_7x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -28192,12 +30634,14 @@ mulAvxTwo_7x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -28209,12 +30653,14 @@ mulAvxTwo_7x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -28226,12 +30672,14 @@ mulAvxTwo_7x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 @@ -28243,12 +30691,14 @@ mulAvxTwo_7x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (DX), Y5 @@ -28260,12 +30710,14 @@ mulAvxTwo_7x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R12) @@ -28282,7 +30734,7 @@ mulAvxTwo_7x2_end: RET // func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -28366,16 +30818,20 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -28393,16 +30849,20 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -28420,16 +30880,20 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -28447,16 +30911,20 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 @@ -28474,16 +30942,20 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (DX), Y9 @@ -28501,16 +30973,20 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R12) @@ -28529,7 +31005,7 @@ mulAvxTwo_7x2_64_end: RET // func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -28580,13 +31056,15 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -28598,12 +31076,14 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -28615,12 +31095,14 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -28632,12 +31114,14 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -28649,12 +31133,14 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 @@ -28666,12 +31152,14 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (DX), Y5 @@ -28683,12 +31171,14 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R12) @@ -28705,7 +31195,7 @@ mulAvxTwo_7x2Xor_end: RET // func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -28768,16 +31258,20 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 @@ -28795,16 +31289,20 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -28822,16 +31320,20 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -28849,16 +31351,20 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -28876,16 +31382,20 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 @@ -28903,16 +31413,20 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (DX), Y9 @@ -28930,16 +31444,20 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R12) @@ -28958,7 +31476,7 @@ mulAvxTwo_7x2_64Xor_end: RET // func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -29032,17 +31550,20 @@ mulAvxTwo_7x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -29054,17 +31575,20 @@ mulAvxTwo_7x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -29076,17 +31600,20 @@ mulAvxTwo_7x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 @@ -29098,17 +31625,20 @@ mulAvxTwo_7x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 @@ -29120,17 +31650,20 @@ mulAvxTwo_7x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (DX), Y6 @@ -29142,17 +31675,20 @@ mulAvxTwo_7x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R12) @@ -29171,7 +31707,7 @@ mulAvxTwo_7x3_end: RET // func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -29265,24 +31801,30 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -29300,24 +31842,30 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -29335,24 +31883,30 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 @@ -29370,24 +31924,30 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 @@ -29405,24 +31965,30 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (DX), Y11 @@ -29440,24 +32006,30 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R12) @@ -29479,7 +32051,7 @@ mulAvxTwo_7x3_64_end: RET // func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -29532,19 +32104,22 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -29556,17 +32131,20 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -29578,17 +32156,20 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -29600,17 +32181,20 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 @@ -29622,17 +32206,20 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 @@ -29644,17 +32231,20 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (DX), Y6 @@ -29666,17 +32256,20 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R12) @@ -29695,7 +32288,7 @@ mulAvxTwo_7x3Xor_end: RET // func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -29762,24 +32355,30 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 @@ -29797,24 +32396,30 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -29832,24 +32437,30 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -29867,24 +32478,30 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 @@ -29902,24 +32519,30 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 @@ -29937,24 +32560,30 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (DX), Y11 @@ -29972,24 +32601,30 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R12) @@ -30011,7 +32646,7 @@ mulAvxTwo_7x3_64Xor_end: RET // func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -30092,22 +32727,26 @@ mulAvxTwo_7x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -30119,22 +32758,26 @@ mulAvxTwo_7x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -30146,22 +32789,26 @@ mulAvxTwo_7x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 @@ -30173,22 +32820,26 @@ mulAvxTwo_7x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 @@ -30200,22 +32851,26 @@ mulAvxTwo_7x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y7 @@ -30227,22 +32882,26 @@ mulAvxTwo_7x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R12) @@ -30263,7 +32922,7 @@ mulAvxTwo_7x4_end: RET // func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -30318,25 +32977,29 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -30348,22 +33011,26 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -30375,22 +33042,26 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -30402,22 +33073,26 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 @@ -30429,22 +33104,26 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 @@ -30456,22 +33135,26 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y7 @@ -30483,22 +33166,26 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R12) @@ -30519,7 +33206,7 @@ mulAvxTwo_7x4Xor_end: RET // func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -30607,27 +33294,32 @@ mulAvxTwo_7x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -30639,27 +33331,32 @@ mulAvxTwo_7x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -30671,27 +33368,32 @@ mulAvxTwo_7x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 @@ -30703,27 +33405,32 @@ mulAvxTwo_7x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 @@ -30735,27 +33442,32 @@ mulAvxTwo_7x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 @@ -30767,27 +33479,32 @@ mulAvxTwo_7x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R12) @@ -30810,7 +33527,7 @@ mulAvxTwo_7x5_end: RET // func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -30867,31 +33584,36 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU (R15), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -30903,27 +33625,32 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -30935,27 +33662,32 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -30967,27 +33699,32 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 @@ -30999,27 +33736,32 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 @@ -31031,27 +33773,32 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 @@ -31063,27 +33810,32 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R12) @@ -31106,7 +33858,7 @@ mulAvxTwo_7x5Xor_end: RET // func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -31203,32 +33955,38 @@ mulAvxTwo_7x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 @@ -31240,32 +33998,38 @@ mulAvxTwo_7x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 @@ -31277,32 +34041,38 @@ mulAvxTwo_7x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 @@ -31314,32 +34084,38 @@ mulAvxTwo_7x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 @@ -31351,32 +34127,38 @@ mulAvxTwo_7x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 @@ -31388,32 +34170,38 @@ mulAvxTwo_7x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R11) @@ -31438,7 +34226,7 @@ mulAvxTwo_7x6_end: RET // func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -31499,37 +34287,43 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU (R15), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU (R10), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y9 @@ -31541,32 +34335,38 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 @@ -31578,32 +34378,38 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 @@ -31615,32 +34421,38 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 @@ -31652,32 +34464,38 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 @@ -31689,32 +34507,38 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 @@ -31726,32 +34550,38 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs VMOVDQU Y0, (R11) @@ -31776,7 +34606,7 @@ mulAvxTwo_7x6Xor_end: RET // func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -31862,37 +34692,44 @@ mulAvxTwo_7x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -31904,37 +34741,44 @@ mulAvxTwo_7x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -31946,37 +34790,44 @@ mulAvxTwo_7x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 @@ -31988,37 +34839,44 @@ mulAvxTwo_7x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 @@ -32030,37 +34888,44 @@ mulAvxTwo_7x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 @@ -32072,37 +34937,44 @@ mulAvxTwo_7x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R11), R13 @@ -32130,7 +35002,7 @@ mulAvxTwo_7x7_end: RET // func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -32176,49 +35048,56 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -32230,37 +35109,44 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -32272,37 +35158,44 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -32314,37 +35207,44 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 @@ -32356,37 +35256,44 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 @@ -32398,37 +35305,44 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 @@ -32440,37 +35354,44 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R11), R13 @@ -32498,7 +35419,7 @@ mulAvxTwo_7x7Xor_end: RET // func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -32589,42 +35510,50 @@ mulAvxTwo_7x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -32636,42 +35565,50 @@ mulAvxTwo_7x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -32683,42 +35620,50 @@ mulAvxTwo_7x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -32730,42 +35675,50 @@ mulAvxTwo_7x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 @@ -32777,42 +35730,50 @@ mulAvxTwo_7x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 @@ -32824,42 +35785,50 @@ mulAvxTwo_7x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R11), R13 @@ -32889,7 +35858,7 @@ mulAvxTwo_7x8_end: RET // func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -32935,56 +35904,64 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -32996,42 +35973,50 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -33043,42 +36028,50 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -33090,42 +36083,50 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -33137,42 +36138,50 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 @@ -33184,42 +36193,50 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 @@ -33231,42 +36248,50 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R11), R13 @@ -33296,7 +36321,7 @@ mulAvxTwo_7x8Xor_end: RET // func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -33392,47 +36417,56 @@ mulAvxTwo_7x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -33444,47 +36478,56 @@ mulAvxTwo_7x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -33496,47 +36539,56 @@ mulAvxTwo_7x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -33548,47 +36600,56 @@ mulAvxTwo_7x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 @@ -33600,47 +36661,56 @@ mulAvxTwo_7x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y12 @@ -33652,47 +36722,56 @@ mulAvxTwo_7x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R11), R13 @@ -33724,7 +36803,7 @@ mulAvxTwo_7x9_end: RET // func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -33770,63 +36849,72 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -33838,47 +36926,56 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -33890,47 +36987,56 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -33942,47 +37048,56 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -33994,47 +37109,56 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 @@ -34046,47 +37170,56 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y12 @@ -34098,47 +37231,56 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R11), R13 @@ -34170,7 +37312,7 @@ mulAvxTwo_7x9Xor_end: RET // func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -34271,52 +37413,62 @@ mulAvxTwo_7x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -34328,52 +37480,62 @@ mulAvxTwo_7x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -34385,52 +37547,62 @@ mulAvxTwo_7x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -34442,52 +37614,62 @@ mulAvxTwo_7x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 @@ -34499,52 +37681,62 @@ mulAvxTwo_7x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 @@ -34556,52 +37748,62 @@ mulAvxTwo_7x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R11), R13 @@ -34635,7 +37837,7 @@ mulAvxTwo_7x10_end: RET // func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -34681,70 +37883,80 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 MOVQ 216(R11), R13 VMOVDQU (R13)(R12*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -34756,52 +37968,62 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -34813,52 +38035,62 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -34870,52 +38102,62 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -34927,52 +38169,62 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 @@ -34984,52 +38236,62 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 @@ -35041,52 +38303,62 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R11), R13 @@ -35120,7 +38392,7 @@ mulAvxTwo_7x10Xor_end: RET // func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -35182,7 +38454,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 @@ -35194,7 +38467,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 @@ -35206,7 +38480,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 @@ -35218,7 +38493,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 @@ -35230,7 +38506,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 @@ -35242,7 +38519,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (DX), Y4 @@ -35254,7 +38532,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R12) @@ -35269,7 +38548,7 @@ mulAvxTwo_8x1_end: RET // func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -35345,8 +38624,10 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -35364,8 +38645,10 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -35383,8 +38666,10 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -35402,8 +38687,10 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 @@ -35421,8 +38708,10 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 @@ -35440,8 +38729,10 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (DX), Y6 @@ -35459,8 +38750,10 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R12) @@ -35476,7 +38769,7 @@ mulAvxTwo_8x1_64_end: RET // func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -35527,7 +38820,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 @@ -35539,7 +38833,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 @@ -35551,7 +38846,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 @@ -35563,7 +38859,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 @@ -35575,7 +38872,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 @@ -35587,7 +38885,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 @@ -35599,7 +38898,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (DX), Y4 @@ -35611,7 +38911,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R12) @@ -35626,7 +38927,7 @@ mulAvxTwo_8x1Xor_end: RET // func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -35687,8 +38988,10 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 @@ -35706,8 +39009,10 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -35725,8 +39030,10 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -35744,8 +39051,10 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -35763,8 +39072,10 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 @@ -35782,8 +39093,10 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 @@ -35801,8 +39114,10 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (DX), Y6 @@ -35820,8 +39135,10 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R12) @@ -35837,7 +39154,7 @@ mulAvxTwo_8x1_64Xor_end: RET // func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -35906,12 +39223,14 @@ mulAvxTwo_8x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -35923,12 +39242,14 @@ mulAvxTwo_8x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -35940,12 +39261,14 @@ mulAvxTwo_8x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -35957,12 +39280,14 @@ mulAvxTwo_8x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 @@ -35974,12 +39299,14 @@ mulAvxTwo_8x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 @@ -35991,12 +39318,14 @@ mulAvxTwo_8x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y5 @@ -36008,12 +39337,14 @@ mulAvxTwo_8x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R13) @@ -36030,7 +39361,7 @@ mulAvxTwo_8x2_end: RET // func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -36116,16 +39447,20 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -36143,16 +39478,20 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -36170,16 +39509,20 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -36197,16 +39540,20 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 @@ -36224,16 +39571,20 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 @@ -36251,16 +39602,20 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (DX), Y9 @@ -36278,16 +39633,20 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R13) @@ -36306,7 +39665,7 @@ mulAvxTwo_8x2_64_end: RET // func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -36359,13 +39718,15 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -36377,12 +39738,14 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -36394,12 +39757,14 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -36411,12 +39776,14 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -36428,12 +39795,14 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 @@ -36445,12 +39814,14 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 @@ -36462,12 +39833,14 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y5 @@ -36479,12 +39852,14 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R13) @@ -36501,7 +39876,7 @@ mulAvxTwo_8x2Xor_end: RET // func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -36566,16 +39941,20 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 @@ -36593,16 +39972,20 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -36620,16 +40003,20 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -36647,16 +40034,20 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -36674,16 +40065,20 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 @@ -36701,16 +40096,20 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 @@ -36728,16 +40127,20 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (DX), Y9 @@ -36755,16 +40158,20 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R13) @@ -36783,7 +40190,7 @@ mulAvxTwo_8x2_64Xor_end: RET // func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -36859,17 +40266,20 @@ mulAvxTwo_8x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -36881,17 +40291,20 @@ mulAvxTwo_8x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -36903,17 +40316,20 @@ mulAvxTwo_8x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 @@ -36925,17 +40341,20 @@ mulAvxTwo_8x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 @@ -36947,17 +40366,20 @@ mulAvxTwo_8x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 @@ -36969,17 +40391,20 @@ mulAvxTwo_8x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y6 @@ -36991,17 +40416,20 @@ mulAvxTwo_8x3_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R13) @@ -37020,7 +40448,7 @@ mulAvxTwo_8x3_end: RET // func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x3_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -37116,24 +40544,30 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -37151,24 +40585,30 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -37186,24 +40626,30 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 @@ -37221,24 +40667,30 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 @@ -37256,24 +40708,30 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 @@ -37291,24 +40749,30 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (DX), Y11 @@ -37326,24 +40790,30 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R13) @@ -37365,7 +40835,7 @@ mulAvxTwo_8x3_64_end: RET // func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -37420,19 +40890,22 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -37444,17 +40917,20 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -37466,17 +40942,20 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -37488,17 +40967,20 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 @@ -37510,17 +40992,20 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 @@ -37532,17 +41017,20 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 @@ -37554,17 +41042,20 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y6 @@ -37576,17 +41067,20 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R13) @@ -37605,7 +41099,7 @@ mulAvxTwo_8x3Xor_end: RET // func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -37674,24 +41168,30 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 @@ -37709,24 +41209,30 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -37744,24 +41250,30 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -37779,24 +41291,30 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 @@ -37814,24 +41332,30 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 @@ -37849,24 +41373,30 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 @@ -37884,24 +41414,30 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (DX), Y11 @@ -37919,24 +41455,30 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R13) @@ -37958,7 +41500,7 @@ mulAvxTwo_8x3_64Xor_end: RET // func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -38041,22 +41583,26 @@ mulAvxTwo_8x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -38068,22 +41614,26 @@ mulAvxTwo_8x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -38095,22 +41645,26 @@ mulAvxTwo_8x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 @@ -38122,22 +41676,26 @@ mulAvxTwo_8x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 @@ -38149,22 +41707,26 @@ mulAvxTwo_8x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 @@ -38176,22 +41738,26 @@ mulAvxTwo_8x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 @@ -38203,22 +41769,26 @@ mulAvxTwo_8x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R13) @@ -38239,7 +41809,7 @@ mulAvxTwo_8x4_end: RET // func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -38296,25 +41866,29 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R15), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -38326,22 +41900,26 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -38353,22 +41931,26 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -38380,22 +41962,26 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 @@ -38407,22 +41993,26 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 @@ -38434,22 +42024,26 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 @@ -38461,22 +42055,26 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 @@ -38488,22 +42086,26 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R13) @@ -38524,7 +42126,7 @@ mulAvxTwo_8x4Xor_end: RET // func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -38616,27 +42218,32 @@ mulAvxTwo_8x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 @@ -38648,27 +42255,32 @@ mulAvxTwo_8x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 @@ -38680,27 +42292,32 @@ mulAvxTwo_8x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 @@ -38712,27 +42329,32 @@ mulAvxTwo_8x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 @@ -38744,27 +42366,32 @@ mulAvxTwo_8x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 @@ -38776,27 +42403,32 @@ mulAvxTwo_8x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 @@ -38808,27 +42440,32 @@ mulAvxTwo_8x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R12) @@ -38851,7 +42488,7 @@ mulAvxTwo_8x5_end: RET // func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -38912,31 +42549,36 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU (R15), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (BX), Y8 @@ -38948,27 +42590,32 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 @@ -38980,27 +42627,32 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 @@ -39012,27 +42664,32 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 @@ -39044,27 +42701,32 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 @@ -39076,27 +42738,32 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 @@ -39108,27 +42775,32 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 @@ -39140,27 +42812,32 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs VMOVDQU Y0, (R12) @@ -39183,7 +42860,7 @@ mulAvxTwo_8x5Xor_end: RET // func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -39266,32 +42943,38 @@ mulAvxTwo_8x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -39303,32 +42986,38 @@ mulAvxTwo_8x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -39340,32 +43029,38 @@ mulAvxTwo_8x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 @@ -39377,32 +43072,38 @@ mulAvxTwo_8x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 @@ -39414,32 +43115,38 @@ mulAvxTwo_8x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 @@ -39451,32 +43158,38 @@ mulAvxTwo_8x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 @@ -39488,32 +43201,38 @@ mulAvxTwo_8x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R12), R14 @@ -39539,7 +43258,7 @@ mulAvxTwo_8x6_end: RET // func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -39587,42 +43306,48 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -39634,32 +43359,38 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -39671,32 +43402,38 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -39708,32 +43445,38 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 @@ -39745,32 +43488,38 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 @@ -39782,32 +43531,38 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 @@ -39819,32 +43574,38 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 @@ -39856,32 +43617,38 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R12), R14 @@ -39907,7 +43674,7 @@ mulAvxTwo_8x6Xor_end: RET // func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -39995,37 +43762,44 @@ mulAvxTwo_8x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -40037,37 +43811,44 @@ mulAvxTwo_8x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -40079,37 +43860,44 @@ mulAvxTwo_8x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 @@ -40121,37 +43909,44 @@ mulAvxTwo_8x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 @@ -40163,37 +43958,44 @@ mulAvxTwo_8x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 @@ -40205,37 +44007,44 @@ mulAvxTwo_8x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 @@ -40247,37 +44056,44 @@ mulAvxTwo_8x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R12), R14 @@ -40305,7 +44121,7 @@ mulAvxTwo_8x7_end: RET // func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -40353,49 +44169,56 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -40407,37 +44230,44 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -40449,37 +44279,44 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -40491,37 +44328,44 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 @@ -40533,37 +44377,44 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 @@ -40575,37 +44426,44 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 @@ -40617,37 +44475,44 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 @@ -40659,37 +44524,44 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R12), R14 @@ -40717,7 +44589,7 @@ mulAvxTwo_8x7Xor_end: RET // func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -40810,42 +44682,50 @@ mulAvxTwo_8x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -40857,42 +44737,50 @@ mulAvxTwo_8x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -40904,42 +44792,50 @@ mulAvxTwo_8x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -40951,42 +44847,50 @@ mulAvxTwo_8x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 @@ -40998,42 +44902,50 @@ mulAvxTwo_8x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 @@ -41045,42 +44957,50 @@ mulAvxTwo_8x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 @@ -41092,42 +45012,50 @@ mulAvxTwo_8x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R12), R14 @@ -41157,7 +45085,7 @@ mulAvxTwo_8x8_end: RET // func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -41205,56 +45133,64 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -41266,42 +45202,50 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -41313,42 +45257,50 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -41360,42 +45312,50 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -41407,42 +45367,50 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 @@ -41454,42 +45422,50 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 @@ -41501,42 +45477,50 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 @@ -41548,42 +45532,50 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R12), R14 @@ -41613,7 +45605,7 @@ mulAvxTwo_8x8Xor_end: RET // func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -41711,47 +45703,56 @@ mulAvxTwo_8x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -41763,47 +45764,56 @@ mulAvxTwo_8x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -41815,47 +45825,56 @@ mulAvxTwo_8x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -41867,47 +45886,56 @@ mulAvxTwo_8x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 @@ -41919,47 +45947,56 @@ mulAvxTwo_8x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 @@ -41971,47 +46008,56 @@ mulAvxTwo_8x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 @@ -42023,47 +46069,56 @@ mulAvxTwo_8x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R12), R14 @@ -42095,7 +46150,7 @@ mulAvxTwo_8x9_end: RET // func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -42143,63 +46198,72 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -42211,47 +46275,56 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -42263,47 +46336,56 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -42315,47 +46397,56 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -42367,47 +46458,56 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 @@ -42419,47 +46519,56 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 @@ -42471,47 +46580,56 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 @@ -42523,47 +46641,56 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R12), R14 @@ -42595,7 +46722,7 @@ mulAvxTwo_8x9Xor_end: RET // func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -42698,52 +46825,62 @@ mulAvxTwo_8x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -42755,52 +46892,62 @@ mulAvxTwo_8x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -42812,52 +46959,62 @@ mulAvxTwo_8x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -42869,52 +47026,62 @@ mulAvxTwo_8x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 @@ -42926,52 +47093,62 @@ mulAvxTwo_8x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 @@ -42983,52 +47160,62 @@ mulAvxTwo_8x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 @@ -43040,52 +47227,62 @@ mulAvxTwo_8x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R12), R14 @@ -43119,7 +47316,7 @@ mulAvxTwo_8x10_end: RET // func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -43167,70 +47364,80 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 MOVQ 216(R12), R14 VMOVDQU (R14)(R13*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -43242,52 +47449,62 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -43299,52 +47516,62 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -43356,52 +47583,62 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -43413,52 +47650,62 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 @@ -43470,52 +47717,62 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 @@ -43527,52 +47784,62 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 @@ -43584,52 +47851,62 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R12), R14 @@ -43663,7 +47940,7 @@ mulAvxTwo_8x10Xor_end: RET // func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -43727,7 +48004,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 @@ -43739,7 +48017,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 @@ -43751,7 +48030,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 @@ -43763,7 +48043,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 @@ -43775,7 +48056,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 @@ -43787,7 +48069,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 @@ -43799,7 +48082,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (DX), Y4 @@ -43811,7 +48095,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R13) @@ -43826,7 +48111,7 @@ mulAvxTwo_9x1_end: RET // func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -43904,8 +48189,10 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -43923,8 +48210,10 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -43942,8 +48231,10 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -43961,8 +48252,10 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 @@ -43980,8 +48273,10 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 @@ -43999,8 +48294,10 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 @@ -44018,8 +48315,10 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (DX), Y6 @@ -44037,8 +48336,10 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R13) @@ -44054,7 +48355,7 @@ mulAvxTwo_9x1_64_end: RET // func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -44107,7 +48408,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 @@ -44119,7 +48421,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 @@ -44131,7 +48434,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 @@ -44143,7 +48447,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 @@ -44155,7 +48460,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 @@ -44167,7 +48473,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 @@ -44179,7 +48486,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 @@ -44191,7 +48499,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (DX), Y4 @@ -44203,7 +48512,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R13) @@ -44218,7 +48528,7 @@ mulAvxTwo_9x1Xor_end: RET // func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -44281,8 +48591,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 @@ -44300,8 +48612,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -44319,8 +48633,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -44338,8 +48654,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -44357,8 +48675,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 @@ -44376,8 +48696,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 @@ -44395,8 +48717,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 @@ -44414,8 +48738,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (DX), Y6 @@ -44433,8 +48759,10 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R13) @@ -44450,7 +48778,7 @@ mulAvxTwo_9x1_64Xor_end: RET // func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -44521,12 +48849,14 @@ mulAvxTwo_9x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -44538,12 +48868,14 @@ mulAvxTwo_9x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -44555,12 +48887,14 @@ mulAvxTwo_9x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -44572,12 +48906,14 @@ mulAvxTwo_9x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 @@ -44589,12 +48925,14 @@ mulAvxTwo_9x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 @@ -44606,12 +48944,14 @@ mulAvxTwo_9x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 @@ -44623,12 +48963,14 @@ mulAvxTwo_9x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y5 @@ -44640,12 +48982,14 @@ mulAvxTwo_9x2_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R14) @@ -44662,7 +49006,7 @@ mulAvxTwo_9x2_end: RET // func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x2_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -44750,16 +49094,20 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -44777,16 +49125,20 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -44804,16 +49156,20 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -44831,16 +49187,20 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 @@ -44858,16 +49218,20 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 @@ -44885,16 +49249,20 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 @@ -44912,16 +49280,20 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (DX), Y9 @@ -44939,16 +49311,20 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R14) @@ -44967,7 +49343,7 @@ mulAvxTwo_9x2_64_end: RET // func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x2Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -45022,13 +49398,15 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -45040,12 +49418,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -45057,12 +49437,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -45074,12 +49456,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -45091,12 +49475,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 @@ -45108,12 +49494,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 @@ -45125,12 +49513,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 @@ -45142,12 +49532,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y5 @@ -45159,12 +49551,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R14) @@ -45181,7 +49575,7 @@ mulAvxTwo_9x2Xor_end: RET // func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -45248,16 +49642,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 @@ -45275,16 +49673,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -45302,16 +49704,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -45329,16 +49735,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -45356,16 +49766,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 @@ -45383,16 +49797,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 @@ -45410,16 +49828,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 @@ -45437,16 +49859,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (DX), Y9 @@ -45464,16 +49890,20 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R14) @@ -45492,7 +49922,7 @@ mulAvxTwo_9x2_64Xor_end: RET // func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -45570,17 +50000,20 @@ mulAvxTwo_9x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -45592,17 +50025,20 @@ mulAvxTwo_9x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -45614,17 +50050,20 @@ mulAvxTwo_9x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 @@ -45636,17 +50075,20 @@ mulAvxTwo_9x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 @@ -45658,17 +50100,20 @@ mulAvxTwo_9x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 @@ -45680,17 +50125,20 @@ mulAvxTwo_9x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y6 @@ -45702,17 +50150,20 @@ mulAvxTwo_9x3_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y6 @@ -45724,17 +50175,20 @@ mulAvxTwo_9x3_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R14) @@ -45753,7 +50207,7 @@ mulAvxTwo_9x3_end: RET // func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x3_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -45851,24 +50305,30 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -45886,24 +50346,30 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -45921,24 +50387,30 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 @@ -45956,24 +50428,30 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 @@ -45991,24 +50469,30 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 @@ -46026,24 +50510,30 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R12), Y11 @@ -46061,24 +50551,30 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 @@ -46096,24 +50592,30 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R14) @@ -46135,7 +50637,7 @@ mulAvxTwo_9x3_64_end: RET // func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -46192,19 +50694,22 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R15), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -46216,17 +50721,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 @@ -46238,17 +50746,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 @@ -46260,17 +50771,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 @@ -46282,17 +50796,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 @@ -46304,17 +50821,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 @@ -46326,17 +50846,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y6 @@ -46348,17 +50871,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y6 @@ -46370,17 +50896,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R14) @@ -46399,7 +50928,7 @@ mulAvxTwo_9x3Xor_end: RET // func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -46470,24 +50999,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 @@ -46505,24 +51040,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 @@ -46540,24 +51081,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 @@ -46575,24 +51122,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 @@ -46610,24 +51163,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 @@ -46645,24 +51204,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 @@ -46680,24 +51245,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R12), Y11 @@ -46715,24 +51286,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 @@ -46750,24 +51327,30 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R14) @@ -46789,7 +51372,7 @@ mulAvxTwo_9x3_64Xor_end: RET // func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -46876,22 +51459,26 @@ mulAvxTwo_9x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (SI), Y7 @@ -46903,22 +51490,26 @@ mulAvxTwo_9x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DI), Y7 @@ -46930,22 +51521,26 @@ mulAvxTwo_9x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R8), Y7 @@ -46957,22 +51552,26 @@ mulAvxTwo_9x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R9), Y7 @@ -46984,22 +51583,26 @@ mulAvxTwo_9x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R10), Y7 @@ -47011,22 +51614,26 @@ mulAvxTwo_9x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R11), Y7 @@ -47038,22 +51645,26 @@ mulAvxTwo_9x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (AX), Y7 @@ -47065,22 +51676,26 @@ mulAvxTwo_9x4_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R13) @@ -47101,7 +51716,7 @@ mulAvxTwo_9x4_end: RET // func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -47162,25 +51777,29 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU (R15), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (BX), Y7 @@ -47192,22 +51811,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (SI), Y7 @@ -47219,22 +51842,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DI), Y7 @@ -47246,22 +51873,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R8), Y7 @@ -47273,22 +51904,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R9), Y7 @@ -47300,22 +51935,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R10), Y7 @@ -47327,22 +51966,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R11), Y7 @@ -47354,22 +51997,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (AX), Y7 @@ -47381,22 +52028,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs VMOVDQU Y0, (R13) @@ -47417,7 +52068,7 @@ mulAvxTwo_9x4Xor_end: RET // func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -47497,27 +52148,32 @@ mulAvxTwo_9x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -47529,27 +52185,32 @@ mulAvxTwo_9x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -47561,27 +52222,32 @@ mulAvxTwo_9x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 @@ -47593,27 +52259,32 @@ mulAvxTwo_9x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 @@ -47625,27 +52296,32 @@ mulAvxTwo_9x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 @@ -47657,27 +52333,32 @@ mulAvxTwo_9x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 @@ -47689,27 +52370,32 @@ mulAvxTwo_9x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (DX), Y8 @@ -47721,27 +52407,32 @@ mulAvxTwo_9x5_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs MOVQ (R13), R15 @@ -47765,7 +52456,7 @@ mulAvxTwo_9x5_end: RET // func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -47815,35 +52506,40 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -47855,27 +52551,32 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -47887,27 +52588,32 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -47919,27 +52625,32 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 @@ -47951,27 +52662,32 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 @@ -47983,27 +52699,32 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 @@ -48015,27 +52736,32 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 @@ -48047,27 +52773,32 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (DX), Y8 @@ -48079,27 +52810,32 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs MOVQ (R13), R15 @@ -48123,7 +52859,7 @@ mulAvxTwo_9x5Xor_end: RET // func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -48208,32 +52944,38 @@ mulAvxTwo_9x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -48245,32 +52987,38 @@ mulAvxTwo_9x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -48282,32 +53030,38 @@ mulAvxTwo_9x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 @@ -48319,32 +53073,38 @@ mulAvxTwo_9x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 @@ -48356,32 +53116,38 @@ mulAvxTwo_9x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 @@ -48393,32 +53159,38 @@ mulAvxTwo_9x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 @@ -48430,32 +53202,38 @@ mulAvxTwo_9x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 @@ -48467,32 +53245,38 @@ mulAvxTwo_9x6_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R13), R15 @@ -48518,7 +53302,7 @@ mulAvxTwo_9x6_end: RET // func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -48568,42 +53352,48 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -48615,32 +53405,38 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -48652,32 +53448,38 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -48689,32 +53491,38 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 @@ -48726,32 +53534,38 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 @@ -48763,32 +53577,38 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 @@ -48800,32 +53620,38 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 @@ -48837,32 +53663,38 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 @@ -48874,32 +53706,38 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R13), R15 @@ -48925,7 +53763,7 @@ mulAvxTwo_9x6Xor_end: RET // func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -49015,37 +53853,44 @@ mulAvxTwo_9x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -49057,37 +53902,44 @@ mulAvxTwo_9x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -49099,37 +53951,44 @@ mulAvxTwo_9x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 @@ -49141,37 +54000,44 @@ mulAvxTwo_9x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 @@ -49183,37 +54049,44 @@ mulAvxTwo_9x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 @@ -49225,37 +54098,44 @@ mulAvxTwo_9x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 @@ -49267,37 +54147,44 @@ mulAvxTwo_9x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 @@ -49309,37 +54196,44 @@ mulAvxTwo_9x7_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R13), R15 @@ -49367,7 +54261,7 @@ mulAvxTwo_9x7_end: RET // func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -49417,49 +54311,56 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -49471,37 +54372,44 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -49513,37 +54421,44 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -49555,37 +54470,44 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 @@ -49597,37 +54519,44 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 @@ -49639,37 +54568,44 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 @@ -49681,37 +54617,44 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 @@ -49723,37 +54666,44 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 @@ -49765,37 +54715,44 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R13), R15 @@ -49823,7 +54780,7 @@ mulAvxTwo_9x7Xor_end: RET // func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -49918,42 +54875,50 @@ mulAvxTwo_9x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -49965,42 +54930,50 @@ mulAvxTwo_9x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -50012,42 +54985,50 @@ mulAvxTwo_9x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -50059,42 +55040,50 @@ mulAvxTwo_9x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 @@ -50106,42 +55095,50 @@ mulAvxTwo_9x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 @@ -50153,42 +55150,50 @@ mulAvxTwo_9x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 @@ -50200,42 +55205,50 @@ mulAvxTwo_9x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 @@ -50247,42 +55260,50 @@ mulAvxTwo_9x8_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R13), R15 @@ -50312,7 +55333,7 @@ mulAvxTwo_9x8_end: RET // func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -50362,56 +55383,64 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -50423,42 +55452,50 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -50470,42 +55507,50 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -50517,42 +55562,50 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -50564,42 +55617,50 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 @@ -50611,42 +55672,50 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 @@ -50658,42 +55727,50 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 @@ -50705,42 +55782,50 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 @@ -50752,42 +55837,50 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R13), R15 @@ -50817,7 +55910,7 @@ mulAvxTwo_9x8Xor_end: RET // func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -50917,47 +56010,56 @@ mulAvxTwo_9x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -50969,47 +56071,56 @@ mulAvxTwo_9x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -51021,47 +56132,56 @@ mulAvxTwo_9x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -51073,47 +56193,56 @@ mulAvxTwo_9x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 @@ -51125,47 +56254,56 @@ mulAvxTwo_9x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 @@ -51177,47 +56315,56 @@ mulAvxTwo_9x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 @@ -51229,47 +56376,56 @@ mulAvxTwo_9x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 @@ -51281,47 +56437,56 @@ mulAvxTwo_9x9_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R13), R15 @@ -51353,7 +56518,7 @@ mulAvxTwo_9x9_end: RET // func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -51403,63 +56568,72 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -51471,47 +56645,56 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -51523,47 +56706,56 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -51575,47 +56767,56 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -51627,47 +56828,56 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 @@ -51679,47 +56889,56 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 @@ -51731,47 +56950,56 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 @@ -51783,47 +57011,56 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 @@ -51835,47 +57072,56 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R13), R15 @@ -51907,7 +57153,7 @@ mulAvxTwo_9x9Xor_end: RET // func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -52012,52 +57258,62 @@ mulAvxTwo_9x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -52069,52 +57325,62 @@ mulAvxTwo_9x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -52126,52 +57392,62 @@ mulAvxTwo_9x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -52183,52 +57459,62 @@ mulAvxTwo_9x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 @@ -52240,52 +57526,62 @@ mulAvxTwo_9x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 @@ -52297,52 +57593,62 @@ mulAvxTwo_9x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 @@ -52354,52 +57660,62 @@ mulAvxTwo_9x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 @@ -52411,52 +57727,62 @@ mulAvxTwo_9x10_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R13), R15 @@ -52490,7 +57816,7 @@ mulAvxTwo_9x10_end: RET // func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept on stack @@ -52540,70 +57866,80 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 MOVQ 216(R13), R15 VMOVDQU (R15)(R14*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -52615,52 +57951,62 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -52672,52 +58018,62 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -52729,52 +58085,62 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -52786,52 +58152,62 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 @@ -52843,52 +58219,62 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 @@ -52900,52 +58286,62 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 @@ -52957,52 +58353,62 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 @@ -53014,52 +58420,62 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R13), R15 @@ -53093,7 +58509,7 @@ mulAvxTwo_9x10Xor_end: RET // func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -53159,7 +58575,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 @@ -53171,7 +58588,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 @@ -53183,7 +58601,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 @@ -53195,7 +58614,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 @@ -53207,7 +58627,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 @@ -53219,7 +58640,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 @@ -53231,7 +58653,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (R13), Y4 @@ -53243,7 +58666,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 9 to 1 outputs VMOVDQU (DX), Y4 @@ -53255,7 +58679,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 608(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R14) @@ -53270,7 +58695,7 @@ mulAvxTwo_10x1_end: RET // func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x1_64(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -53350,8 +58775,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -53369,8 +58796,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -53388,8 +58817,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -53407,8 +58838,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 @@ -53426,8 +58859,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 @@ -53445,8 +58880,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 @@ -53464,8 +58901,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R13), Y6 @@ -53483,8 +58922,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 9 to 1 outputs VMOVDQU (DX), Y6 @@ -53502,8 +58943,10 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R14) @@ -53519,7 +58962,7 @@ mulAvxTwo_10x1_64_end: RET // func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -53574,7 +59017,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 @@ -53586,7 +59030,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 @@ -53598,7 +59043,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 @@ -53610,7 +59056,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 @@ -53622,7 +59069,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 @@ -53634,7 +59082,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 @@ -53646,7 +59095,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 @@ -53658,7 +59108,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (R13), Y4 @@ -53670,7 +59121,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Load and process 32 bytes from input 9 to 1 outputs VMOVDQU (DX), Y4 @@ -53682,7 +59134,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 608(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - XOR3WAY( $0x00, Y2, Y3, Y0) + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 // Store 1 outputs VMOVDQU Y0, (R14) @@ -53697,7 +59150,7 @@ mulAvxTwo_10x1Xor_end: RET // func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88 // Loading no tables to registers // Destination kept in GP registers @@ -53762,8 +59215,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 @@ -53781,8 +59236,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 @@ -53800,8 +59257,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 @@ -53819,8 +59278,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 @@ -53838,8 +59299,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 @@ -53857,8 +59320,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 @@ -53876,8 +59341,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 @@ -53895,8 +59362,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R13), Y6 @@ -53914,8 +59383,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 9 to 1 outputs VMOVDQU (DX), Y6 @@ -53933,8 +59404,10 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 // Store 1 outputs VMOVDQU Y0, (R14) @@ -53950,7 +59423,7 @@ mulAvxTwo_10x1_64Xor_end: RET // func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -54023,12 +59496,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -54040,12 +59515,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -54057,12 +59534,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -54074,12 +59553,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 @@ -54091,12 +59572,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 @@ -54108,12 +59591,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 @@ -54125,12 +59610,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y5 @@ -54142,12 +59629,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y5 @@ -54159,12 +59648,14 @@ mulAvxTwo_10x2_loop: VMOVDQU 1184(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 1216(CX), Y3 VMOVDQU 1248(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R15) @@ -54181,7 +59672,7 @@ mulAvxTwo_10x2_end: RET // func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x2_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -54271,16 +59762,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -54298,16 +59793,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -54325,16 +59824,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -54352,16 +59855,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 @@ -54379,16 +59886,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 @@ -54406,16 +59917,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 @@ -54433,16 +59948,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R13), Y9 @@ -54460,16 +59979,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (DX), Y9 @@ -54487,16 +60010,20 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R15) @@ -54515,7 +60042,7 @@ mulAvxTwo_10x2_64_end: RET // func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -54572,13 +60099,15 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -54590,12 +60119,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 @@ -54607,12 +60138,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 @@ -54624,12 +60157,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 @@ -54641,12 +60176,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 @@ -54658,12 +60195,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 @@ -54675,12 +60214,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 @@ -54692,12 +60233,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y5 @@ -54709,12 +60252,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Load and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y5 @@ -54726,12 +60271,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 1184(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y0) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 VMOVDQU 1216(CX), Y3 VMOVDQU 1248(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - XOR3WAY( $0x00, Y3, Y4, Y1) + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 // Store 2 outputs VMOVDQU Y0, (R15) @@ -54748,7 +60295,7 @@ mulAvxTwo_10x2Xor_end: RET // func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -54817,16 +60364,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 @@ -54844,16 +60395,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 @@ -54871,16 +60426,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 @@ -54898,16 +60457,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 @@ -54925,16 +60488,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 @@ -54952,16 +60519,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 @@ -54979,16 +60550,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 @@ -55006,16 +60581,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R13), Y9 @@ -55033,16 +60612,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (DX), Y9 @@ -55060,16 +60643,20 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 // Store 2 outputs VMOVDQU Y0, (R15) @@ -55088,7 +60675,7 @@ mulAvxTwo_10x2_64Xor_end: RET // func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -55170,17 +60757,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y6 @@ -55192,17 +60782,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y6 @@ -55214,17 +60807,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y6 @@ -55236,17 +60832,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y6 @@ -55258,17 +60857,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R10), Y6 @@ -55280,17 +60882,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y6 @@ -55302,17 +60907,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y6 @@ -55324,17 +60932,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y6 @@ -55346,17 +60957,20 @@ mulAvxTwo_10x3_loop: VMOVDQU 1760(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1792(CX), Y4 VMOVDQU 1824(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1856(CX), Y4 VMOVDQU 1888(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R14) @@ -55375,7 +60989,7 @@ mulAvxTwo_10x3_end: RET // func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x3_64(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -55479,24 +61093,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 @@ -55514,24 +61134,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 @@ -55549,24 +61175,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 @@ -55584,24 +61216,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 @@ -55619,24 +61257,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 @@ -55654,24 +61298,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 @@ -55689,24 +61339,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 @@ -55724,24 +61380,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 @@ -55759,24 +61421,30 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R14) @@ -55798,7 +61466,7 @@ mulAvxTwo_10x3_64_end: RET // func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -55859,19 +61527,22 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU (R15), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (BX), Y6 @@ -55883,17 +61554,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y6 @@ -55905,17 +61579,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y6 @@ -55927,17 +61604,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y6 @@ -55949,17 +61629,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y6 @@ -55971,17 +61654,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R10), Y6 @@ -55993,17 +61679,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y6 @@ -56015,17 +61704,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y6 @@ -56037,17 +61729,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y6 @@ -56059,17 +61754,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1760(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y0) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 VMOVDQU 1792(CX), Y4 VMOVDQU 1824(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y1) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 VMOVDQU 1856(CX), Y4 VMOVDQU 1888(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - XOR3WAY( $0x00, Y4, Y5, Y2) + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 // Store 3 outputs VMOVDQU Y0, (R14) @@ -56088,7 +61786,7 @@ mulAvxTwo_10x3Xor_end: RET // func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88 // Loading no tables to registers // Destination kept in GP registers @@ -56165,24 +61863,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 @@ -56200,24 +61904,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 @@ -56235,24 +61945,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 @@ -56270,24 +61986,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 @@ -56305,24 +62027,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 @@ -56340,24 +62068,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 @@ -56375,24 +62109,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 @@ -56410,24 +62150,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 @@ -56445,24 +62191,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 @@ -56480,24 +62232,30 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 3 outputs VMOVDQU Y0, (R14) @@ -56519,7 +62277,7 @@ mulAvxTwo_10x3_64Xor_end: RET // func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -56596,22 +62354,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -56623,22 +62385,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -56650,22 +62416,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 @@ -56677,22 +62447,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 @@ -56704,22 +62478,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 @@ -56731,22 +62509,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 @@ -56758,22 +62540,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y7 @@ -56785,22 +62571,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 @@ -56812,22 +62602,26 @@ mulAvxTwo_10x4_loop: VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs MOVQ (R14), BP @@ -56849,7 +62643,7 @@ mulAvxTwo_10x4_end: RET // func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -56901,28 +62695,32 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -56934,22 +62732,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 @@ -56961,22 +62763,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 @@ -56988,22 +62794,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 @@ -57015,22 +62825,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 @@ -57042,22 +62856,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 @@ -57069,22 +62887,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 @@ -57096,22 +62918,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y7 @@ -57123,22 +62949,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 @@ -57150,22 +62980,26 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y0) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y1) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y2) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y5, Y6, Y3) + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 // Store 4 outputs MOVQ (R14), BP @@ -57187,7 +63021,7 @@ mulAvxTwo_10x4Xor_end: RET // func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -57269,27 +63103,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -57301,27 +63140,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -57333,27 +63177,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 @@ -57365,27 +63214,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 @@ -57397,27 +63251,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 @@ -57429,27 +63288,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 @@ -57461,27 +63325,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 @@ -57493,27 +63362,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 @@ -57525,27 +63399,32 @@ mulAvxTwo_10x5_loop: VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs MOVQ (R14), BP @@ -57569,7 +63448,7 @@ mulAvxTwo_10x5_end: RET // func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -57621,35 +63500,40 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -57661,27 +63545,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 @@ -57693,27 +63582,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 @@ -57725,27 +63619,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 @@ -57757,27 +63656,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 @@ -57789,27 +63693,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 @@ -57821,27 +63730,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 @@ -57853,27 +63767,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 @@ -57885,27 +63804,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 @@ -57917,27 +63841,32 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y0) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y1) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y2) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y3) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - XOR3WAY( $0x00, Y6, Y7, Y4) + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 // Store 5 outputs MOVQ (R14), BP @@ -57961,7 +63890,7 @@ mulAvxTwo_10x5Xor_end: RET // func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -58048,32 +63977,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -58085,32 +64020,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -58122,32 +64063,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 @@ -58159,32 +64106,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 @@ -58196,32 +64149,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 @@ -58233,32 +64192,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 @@ -58270,32 +64235,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 @@ -58307,32 +64278,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 @@ -58344,32 +64321,38 @@ mulAvxTwo_10x6_loop: VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R14), BP @@ -58395,7 +64378,7 @@ mulAvxTwo_10x6_end: RET // func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -58447,42 +64430,48 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -58494,32 +64483,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 @@ -58531,32 +64526,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 @@ -58568,32 +64569,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 @@ -58605,32 +64612,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 @@ -58642,32 +64655,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 @@ -58679,32 +64698,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 @@ -58716,32 +64741,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 @@ -58753,32 +64784,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 @@ -58790,32 +64827,38 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y0) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y1) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y2) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y3) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y4) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - XOR3WAY( $0x00, Y7, Y8, Y5) + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 // Store 6 outputs MOVQ (R14), BP @@ -58841,7 +64884,7 @@ mulAvxTwo_10x6Xor_end: RET // func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -58933,37 +64976,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -58975,37 +65025,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -59017,37 +65074,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 @@ -59059,37 +65123,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 @@ -59101,37 +65172,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 @@ -59143,37 +65221,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 @@ -59185,37 +65270,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 @@ -59227,37 +65319,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 @@ -59269,37 +65368,44 @@ mulAvxTwo_10x7_loop: VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R14), BP @@ -59327,7 +65433,7 @@ mulAvxTwo_10x7_end: RET // func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -59379,49 +65485,56 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -59433,37 +65546,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 @@ -59475,37 +65595,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 @@ -59517,37 +65644,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 @@ -59559,37 +65693,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 @@ -59601,37 +65742,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 @@ -59643,37 +65791,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 @@ -59685,37 +65840,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 @@ -59727,37 +65889,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 @@ -59769,37 +65938,44 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y0) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y1) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y2) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y3) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y4) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y5) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - XOR3WAY( $0x00, Y8, Y9, Y6) + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 // Store 7 outputs MOVQ (R14), BP @@ -59827,7 +66003,7 @@ mulAvxTwo_10x7Xor_end: RET // func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -59924,42 +66100,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -59971,42 +66155,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -60018,42 +66210,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -60065,42 +66265,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 @@ -60112,42 +66320,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 @@ -60159,42 +66375,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 @@ -60206,42 +66430,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 @@ -60253,42 +66485,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 @@ -60300,42 +66540,50 @@ mulAvxTwo_10x8_loop: VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R14), BP @@ -60365,7 +66613,7 @@ mulAvxTwo_10x8_end: RET // func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -60417,56 +66665,64 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -60478,42 +66734,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 @@ -60525,42 +66789,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 @@ -60572,42 +66844,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 @@ -60619,42 +66899,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 @@ -60666,42 +66954,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 @@ -60713,42 +67009,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 @@ -60760,42 +67064,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 @@ -60807,42 +67119,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 @@ -60854,42 +67174,50 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y0) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y1) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y2) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y3) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y4) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y5) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y6) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y9, Y10, Y7) + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 // Store 8 outputs MOVQ (R14), BP @@ -60919,7 +67247,7 @@ mulAvxTwo_10x8Xor_end: RET // func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -61021,47 +67349,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -61073,47 +67410,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -61125,47 +67471,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -61177,47 +67532,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 @@ -61229,47 +67593,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 @@ -61281,47 +67654,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 @@ -61333,47 +67715,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 @@ -61385,47 +67776,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 @@ -61437,47 +67837,56 @@ mulAvxTwo_10x9_loop: VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R14), BP @@ -61509,7 +67918,7 @@ mulAvxTwo_10x9_end: RET // func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -61561,63 +67970,72 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -61629,47 +68047,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 @@ -61681,47 +68108,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 @@ -61733,47 +68169,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 @@ -61785,47 +68230,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 @@ -61837,47 +68291,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 @@ -61889,47 +68352,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 @@ -61941,47 +68413,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 @@ -61993,47 +68474,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 @@ -62045,47 +68535,56 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y0) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y1) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y2) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y3) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y4) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y5) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y6) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y7) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - XOR3WAY( $0x00, Y10, Y11, Y8) + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 // Store 9 outputs MOVQ (R14), BP @@ -62117,7 +68616,7 @@ mulAvxTwo_10x9Xor_end: RET // func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -62224,52 +68723,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -62281,52 +68790,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -62338,52 +68857,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -62395,52 +68924,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 @@ -62452,52 +68991,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 @@ -62509,52 +69058,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 @@ -62566,52 +69125,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 @@ -62623,52 +69192,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 @@ -62680,52 +69259,62 @@ mulAvxTwo_10x10_loop: VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R14), BP @@ -62759,7 +69348,7 @@ mulAvxTwo_10x10_end: RET // func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +// Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 // Loading no tables to registers // Destination kept on stack @@ -62811,70 +69400,80 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 MOVQ 216(R14), BP VMOVDQU (BP)(R15*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -62886,52 +69485,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 @@ -62943,52 +69552,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 @@ -63000,52 +69619,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 @@ -63057,52 +69686,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 @@ -63114,52 +69753,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 @@ -63171,52 +69820,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 @@ -63228,52 +69887,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 @@ -63285,52 +69954,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 @@ -63342,52 +70021,62 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y0) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y1) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y2) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y3) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y4) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y5) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y6) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y7) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y8) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - XOR3WAY( $0x00, Y11, Y12, Y9) + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 // Store 10 outputs MOVQ (R14), BP @@ -63419,4191 +70108,3 @@ mulAvxTwo_10x10Xor_loop: mulAvxTwo_10x10Xor_end: RET - -// func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT2_avx2(SB), NOSPLIT, $0-56 - MOVQ table+48(FP), AX - VBROADCASTI128 (AX), Y0 - VBROADCASTI128 64(AX), Y1 - VBROADCASTI128 16(AX), Y2 - VBROADCASTI128 80(AX), Y3 - VBROADCASTI128 32(AX), Y4 - VBROADCASTI128 96(AX), Y5 - VBROADCASTI128 48(AX), Y6 - VBROADCASTI128 112(AX), Y7 - MOVQ x_len+8(FP), AX - MOVQ x_base+0(FP), CX - MOVQ y_base+24(FP), DX - MOVQ $0x0000000f, BX - MOVQ BX, X8 - VPBROADCASTB X8, Y8 - -loop: - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y12 - VPXOR Y11, Y9, Y11 - VPXOR Y12, Y10, Y12 - VMOVDQU Y11, (DX) - VMOVDQU Y12, 32(DX) - VPSRLQ $0x04, Y11, Y13 - VPAND Y8, Y11, Y11 - VPAND Y8, Y13, Y13 - VPSHUFB Y11, Y0, Y14 - VPSHUFB Y11, Y1, Y11 - VPSHUFB Y13, Y2, Y15 - VPSHUFB Y13, Y3, Y13 - VPXOR Y14, Y15, Y14 - VPXOR Y11, Y13, Y11 - VPAND Y12, Y8, Y13 - VPSRLQ $0x04, Y12, Y12 - VPAND Y8, Y12, Y12 - VPSHUFB Y13, Y4, Y15 - VPSHUFB Y13, Y5, Y13 - VPXOR Y14, Y15, Y14 - VPXOR Y11, Y13, Y11 - VPSHUFB Y12, Y6, Y15 - VPSHUFB Y12, Y7, Y13 - XOR3WAY( $0x00, Y14, Y15, Y9) - XOR3WAY( $0x00, Y11, Y13, Y10) - VMOVDQU Y9, (CX) - VMOVDQU Y10, 32(CX) - ADDQ $0x40, CX - ADDQ $0x40, DX - SUBQ $0x40, AX - JNZ loop - VZEROUPPER - RET - -// func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT2_avx2(SB), NOSPLIT, $0-56 - MOVQ table+48(FP), AX - VBROADCASTI128 (AX), Y0 - VBROADCASTI128 64(AX), Y1 - VBROADCASTI128 16(AX), Y2 - VBROADCASTI128 80(AX), Y3 - VBROADCASTI128 32(AX), Y4 - VBROADCASTI128 96(AX), Y5 - VBROADCASTI128 48(AX), Y6 - VBROADCASTI128 112(AX), Y7 - MOVQ x_len+8(FP), AX - MOVQ x_base+0(FP), CX - MOVQ y_base+24(FP), DX - MOVQ $0x0000000f, BX - MOVQ BX, X8 - VPBROADCASTB X8, Y8 - -loop: - VMOVDQU (CX), Y9 - VMOVDQU 32(CX), Y10 - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y12 - VPSRLQ $0x04, Y11, Y13 - VPAND Y8, Y11, Y11 - VPAND Y8, Y13, Y13 - VPSHUFB Y11, Y0, Y14 - VPSHUFB Y11, Y1, Y11 - VPSHUFB Y13, Y2, Y15 - VPSHUFB Y13, Y3, Y13 - VPXOR Y14, Y15, Y14 - VPXOR Y11, Y13, Y11 - VPAND Y12, Y8, Y13 - VPSRLQ $0x04, Y12, Y12 - VPAND Y8, Y12, Y12 - VPSHUFB Y13, Y4, Y15 - VPSHUFB Y13, Y5, Y13 - VPXOR Y14, Y15, Y14 - VPXOR Y11, Y13, Y11 - VPSHUFB Y12, Y6, Y15 - VPSHUFB Y12, Y7, Y13 - XOR3WAY( $0x00, Y14, Y15, Y9) - XOR3WAY( $0x00, Y11, Y13, Y10) - VMOVDQU Y9, (CX) - VMOVDQU Y10, 32(CX) - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y12 - VPXOR Y11, Y9, Y11 - VPXOR Y12, Y10, Y12 - VMOVDQU Y11, (DX) - VMOVDQU Y12, 32(DX) - ADDQ $0x40, CX - ADDQ $0x40, DX - SUBQ $0x40, AX - JNZ loop - VZEROUPPER - RET - -// func mulgf16_avx2(x []byte, y []byte, table *[128]uint8) -// Requires: AVX, AVX2, SSE2 -TEXT ·mulgf16_avx2(SB), NOSPLIT, $0-56 - MOVQ table+48(FP), AX - VBROADCASTI128 (AX), Y0 - VBROADCASTI128 64(AX), Y1 - VBROADCASTI128 16(AX), Y2 - VBROADCASTI128 80(AX), Y3 - VBROADCASTI128 32(AX), Y4 - VBROADCASTI128 96(AX), Y5 - VBROADCASTI128 48(AX), Y6 - VBROADCASTI128 112(AX), Y7 - MOVQ x_len+8(FP), AX - MOVQ x_base+0(FP), CX - MOVQ y_base+24(FP), DX - MOVQ $0x0000000f, BX - MOVQ BX, X8 - VPBROADCASTB X8, Y8 - -loop: - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y10 - VPSRLQ $0x04, Y9, Y11 - VPAND Y8, Y9, Y9 - VPAND Y8, Y11, Y11 - VPSHUFB Y9, Y0, Y12 - VPSHUFB Y9, Y1, Y9 - VPSHUFB Y11, Y2, Y13 - VPSHUFB Y11, Y3, Y11 - VPXOR Y12, Y13, Y12 - VPXOR Y9, Y11, Y9 - VPAND Y10, Y8, Y11 - VPSRLQ $0x04, Y10, Y10 - VPAND Y8, Y10, Y10 - VPSHUFB Y11, Y4, Y13 - VPSHUFB Y11, Y5, Y11 - VPXOR Y12, Y13, Y12 - VPXOR Y9, Y11, Y9 - VPSHUFB Y10, Y6, Y13 - VPSHUFB Y10, Y7, Y11 - VPXOR Y12, Y13, Y12 - VPXOR Y9, Y11, Y9 - VMOVDQU Y12, (CX) - VMOVDQU Y9, 32(CX) - ADDQ $0x40, CX - ADDQ $0x40, DX - SUBQ $0x40, AX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx512_0(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), DX - VBROADCASTI128 (DX), Y1 - VBROADCASTI128 64(DX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(DX), Y1 - VBROADCASTI128 80(DX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(DX), Y1 - VBROADCASTI128 96(DX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(DX), Y1 - VBROADCASTI128 112(DX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z24 - VMOVAPS Z0, Z25 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z26 - VMOVAPS Z0, Z27 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z28 - VMOVAPS Z0, Z29 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z30 - VMOVAPS Z0, Z31 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), DX - MOVQ 8(DX), BX - XORQ SI, SI - MOVQ (DX)(SI*1), DI - ADDQ AX, SI - MOVQ (DX)(SI*1), R8 - ADDQ AX, SI - MOVQ (DX)(SI*1), R9 - ADDQ AX, SI - MOVQ (DX)(SI*1), AX - -loop: - VMOVDQU (DI), Y1 - VMOVDQU 32(DI), Y2 - VMOVDQU (R8), Y3 - VMOVDQU 32(R8), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VPSRLQ $0x04, Y3, Y6 - VPAND Y0, Y3, Y5 - VPAND Y0, Y6, Y6 - VPSHUFB Y5, Y24, Y7 - VPSHUFB Y5, Y25, Y5 - VPSHUFB Y6, Y26, Y8 - VPSHUFB Y6, Y27, Y6 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y6, Y5 - VPAND Y4, Y0, Y6 - VPSRLQ $0x04, Y4, Y8 - VPAND Y0, Y8, Y8 - VPSHUFB Y6, Y28, Y9 - VPSHUFB Y6, Y29, Y6 - VPXOR Y7, Y9, Y7 - VPXOR Y5, Y6, Y5 - VPSHUFB Y8, Y30, Y9 - VPSHUFB Y8, Y31, Y6 - VPTERNLOGD $0x96, Y7, Y9, Y1 - VPTERNLOGD $0x96, Y5, Y6, Y2 - VMOVDQU (R9), Y5 - VMOVDQU 32(R9), Y6 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y5 - VPTERNLOGD $0x96, Y9, Y10, Y6 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y3 - VPTERNLOGD $0x96, Y9, Y10, Y4 - VMOVDQU Y1, (DI) - VMOVDQU Y2, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y3, (R8) - VMOVDQU Y4, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y5, (R9) - VMOVDQU Y6, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, BX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx512_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx512_0(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), DX - VBROADCASTI128 (DX), Y1 - VBROADCASTI128 64(DX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(DX), Y1 - VBROADCASTI128 80(DX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(DX), Y1 - VBROADCASTI128 96(DX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(DX), Y1 - VBROADCASTI128 112(DX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z24 - VMOVAPS Z0, Z25 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z26 - VMOVAPS Z0, Z27 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z28 - VMOVAPS Z0, Z29 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z30 - VMOVAPS Z0, Z31 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), DX - MOVQ 8(DX), BX - XORQ SI, SI - MOVQ (DX)(SI*1), DI - ADDQ AX, SI - MOVQ (DX)(SI*1), R8 - ADDQ AX, SI - MOVQ (DX)(SI*1), R9 - ADDQ AX, SI - MOVQ (DX)(SI*1), AX - -loop: - VMOVDQU (DI), Y1 - VMOVDQU 32(DI), Y2 - VMOVDQU (R9), Y5 - VMOVDQU 32(R9), Y6 - VMOVDQU (R8), Y3 - VMOVDQU 32(R8), Y4 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y3 - VPTERNLOGD $0x96, Y9, Y10, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y3, Y10 - VPAND Y0, Y3, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y24, Y11 - VPSHUFB Y9, Y25, Y9 - VPSHUFB Y10, Y26, Y12 - VPSHUFB Y10, Y27, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y4, Y0, Y10 - VPSRLQ $0x04, Y4, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y28, Y13 - VPSHUFB Y10, Y29, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y30, Y13 - VPSHUFB Y12, Y31, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (DI) - VMOVDQU Y2, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y3, (R8) - VMOVDQU Y4, 32(R8) - ADDQ $0x40, R8 - VPSRLQ $0x04, Y7, Y2 - VPAND Y0, Y7, Y1 - VPAND Y0, Y2, Y2 - VBROADCASTI128 (CX), Y3 - VBROADCASTI128 64(CX), Y4 - VPSHUFB Y1, Y3, Y3 - VPSHUFB Y1, Y4, Y1 - VBROADCASTI128 16(CX), Y4 - VBROADCASTI128 80(CX), Y9 - VPSHUFB Y2, Y4, Y4 - VPSHUFB Y2, Y9, Y2 - VPXOR Y3, Y4, Y3 - VPXOR Y1, Y2, Y1 - VPAND Y8, Y0, Y2 - VPSRLQ $0x04, Y8, Y4 - VPAND Y0, Y4, Y4 - VBROADCASTI128 32(CX), Y9 - VBROADCASTI128 96(CX), Y10 - VPSHUFB Y2, Y9, Y9 - VPSHUFB Y2, Y10, Y2 - VPXOR Y3, Y9, Y3 - VPXOR Y1, Y2, Y1 - VBROADCASTI128 48(CX), Y9 - VBROADCASTI128 112(CX), Y2 - VPSHUFB Y4, Y9, Y9 - VPSHUFB Y4, Y2, Y2 - VPTERNLOGD $0x96, Y3, Y9, Y5 - VPTERNLOGD $0x96, Y1, Y2, Y6 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R9) - VMOVDQU Y6, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, BX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx512_1(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), CX - VBROADCASTI128 (CX), Y1 - VBROADCASTI128 64(CX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(CX), Y1 - VBROADCASTI128 80(CX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(CX), Y1 - VBROADCASTI128 96(CX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(CX), Y1 - VBROADCASTI128 112(CX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z24 - VMOVAPS Z0, Z25 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z26 - VMOVAPS Z0, Z27 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z28 - VMOVAPS Z0, Z29 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z30 - VMOVAPS Z0, Z31 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y24, Y11 - VPSHUFB Y9, Y25, Y9 - VPSHUFB Y10, Y26, Y12 - VPSHUFB Y10, Y27, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y28, Y13 - VPSHUFB Y10, Y29, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y30, Y13 - VPSHUFB Y12, Y31, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y5 - VPTERNLOGD $0x96, Y9, Y10, Y6 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y3 - VPTERNLOGD $0x96, Y9, Y10, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx512_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx512_1(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), DX - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - VBROADCASTI128 (CX), Y1 - VBROADCASTI128 64(CX), Y0 - VMOVAPS Z1, Z24 - VMOVAPS Z0, Z25 - VBROADCASTI128 16(CX), Y1 - VBROADCASTI128 80(CX), Y0 - VMOVAPS Z1, Z26 - VMOVAPS Z0, Z27 - VBROADCASTI128 32(CX), Y1 - VBROADCASTI128 96(CX), Y0 - VMOVAPS Z1, Z28 - VMOVAPS Z0, Z29 - VBROADCASTI128 48(CX), Y1 - VBROADCASTI128 112(CX), Y0 - VMOVAPS Z1, Z30 - VMOVAPS Z0, Z31 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y3, Y10 - VPAND Y0, Y3, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y4, Y0, Y10 - VPSRLQ $0x04, Y4, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VPSRLQ $0x04, Y7, Y2 - VPAND Y0, Y7, Y1 - VPAND Y0, Y2, Y2 - VPSHUFB Y1, Y24, Y3 - VPSHUFB Y1, Y25, Y1 - VPSHUFB Y2, Y26, Y4 - VPSHUFB Y2, Y27, Y2 - VPXOR Y3, Y4, Y3 - VPXOR Y1, Y2, Y1 - VPAND Y8, Y0, Y2 - VPSRLQ $0x04, Y8, Y4 - VPAND Y0, Y4, Y4 - VPSHUFB Y2, Y28, Y9 - VPSHUFB Y2, Y29, Y2 - VPXOR Y3, Y9, Y3 - VPXOR Y1, Y2, Y1 - VPSHUFB Y4, Y30, Y9 - VPSHUFB Y4, Y31, Y2 - VPTERNLOGD $0x96, Y3, Y9, Y5 - VPTERNLOGD $0x96, Y1, Y2, Y6 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx512_2(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), CX - VBROADCASTI128 (CX), Y1 - VBROADCASTI128 64(CX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(CX), Y1 - VBROADCASTI128 80(CX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(CX), Y1 - VBROADCASTI128 96(CX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(CX), Y1 - VBROADCASTI128 112(CX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z24 - VMOVAPS Z0, Z25 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z26 - VMOVAPS Z0, Z27 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z28 - VMOVAPS Z0, Z29 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z30 - VMOVAPS Z0, Z31 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VPSRLQ $0x04, Y3, Y6 - VPAND Y0, Y3, Y5 - VPAND Y0, Y6, Y6 - VPSHUFB Y5, Y24, Y7 - VPSHUFB Y5, Y25, Y5 - VPSHUFB Y6, Y26, Y8 - VPSHUFB Y6, Y27, Y6 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y6, Y5 - VPAND Y4, Y0, Y6 - VPSRLQ $0x04, Y4, Y8 - VPAND Y0, Y8, Y8 - VPSHUFB Y6, Y28, Y9 - VPSHUFB Y6, Y29, Y6 - VPXOR Y7, Y9, Y7 - VPXOR Y5, Y6, Y5 - VPSHUFB Y8, Y30, Y9 - VPSHUFB Y8, Y31, Y6 - VPTERNLOGD $0x96, Y7, Y9, Y1 - VPTERNLOGD $0x96, Y5, Y6, Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y3 - VPTERNLOGD $0x96, Y9, Y10, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx512_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx512_2(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), CX - VBROADCASTI128 (CX), Y1 - VBROADCASTI128 64(CX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(CX), Y1 - VBROADCASTI128 80(CX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(CX), Y1 - VBROADCASTI128 96(CX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(CX), Y1 - VBROADCASTI128 112(CX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z24 - VMOVAPS Z0, Z25 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z26 - VMOVAPS Z0, Z27 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z28 - VMOVAPS Z0, Z29 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z30 - VMOVAPS Z0, Z31 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y3 - VPTERNLOGD $0x96, Y9, Y10, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VPSRLQ $0x04, Y7, Y2 - VPAND Y0, Y7, Y1 - VPAND Y0, Y2, Y2 - VPSHUFB Y1, Y24, Y3 - VPSHUFB Y1, Y25, Y1 - VPSHUFB Y2, Y26, Y4 - VPSHUFB Y2, Y27, Y2 - VPXOR Y3, Y4, Y3 - VPXOR Y1, Y2, Y1 - VPAND Y8, Y0, Y2 - VPSRLQ $0x04, Y8, Y4 - VPAND Y0, Y4, Y4 - VPSHUFB Y2, Y28, Y9 - VPSHUFB Y2, Y29, Y2 - VPXOR Y3, Y9, Y3 - VPXOR Y1, Y2, Y1 - VPSHUFB Y4, Y30, Y9 - VPSHUFB Y4, Y31, Y2 - VPTERNLOGD $0x96, Y3, Y9, Y5 - VPTERNLOGD $0x96, Y1, Y2, Y6 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx512_3(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), AX - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y3 - VPTERNLOGD $0x96, Y9, Y10, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx512_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx512_3(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), CX - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VPSRLQ $0x04, Y7, Y2 - VPAND Y0, Y7, Y1 - VPAND Y0, Y2, Y2 - VPSHUFB Y1, Y16, Y3 - VPSHUFB Y1, Y17, Y1 - VPSHUFB Y2, Y18, Y4 - VPSHUFB Y2, Y19, Y2 - VPXOR Y3, Y4, Y3 - VPXOR Y1, Y2, Y1 - VPAND Y8, Y0, Y2 - VPSRLQ $0x04, Y8, Y4 - VPAND Y0, Y4, Y4 - VPSHUFB Y2, Y20, Y9 - VPSHUFB Y2, Y21, Y2 - VPXOR Y3, Y9, Y3 - VPXOR Y1, Y2, Y1 - VPSHUFB Y4, Y22, Y9 - VPSHUFB Y4, Y23, Y2 - VPTERNLOGD $0x96, Y3, Y9, Y5 - VPTERNLOGD $0x96, Y1, Y2, Y6 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx512_4(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), DX - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - VBROADCASTI128 (CX), Y1 - VBROADCASTI128 64(CX), Y0 - VMOVAPS Z1, Z24 - VMOVAPS Z0, Z25 - VBROADCASTI128 16(CX), Y1 - VBROADCASTI128 80(CX), Y0 - VMOVAPS Z1, Z26 - VMOVAPS Z0, Z27 - VBROADCASTI128 32(CX), Y1 - VBROADCASTI128 96(CX), Y0 - VMOVAPS Z1, Z28 - VMOVAPS Z0, Z29 - VBROADCASTI128 48(CX), Y1 - VBROADCASTI128 112(CX), Y0 - VMOVAPS Z1, Z30 - VMOVAPS Z0, Z31 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VPSRLQ $0x04, Y3, Y6 - VPAND Y0, Y3, Y5 - VPAND Y0, Y6, Y6 - VPSHUFB Y5, Y16, Y7 - VPSHUFB Y5, Y17, Y5 - VPSHUFB Y6, Y18, Y8 - VPSHUFB Y6, Y19, Y6 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y6, Y5 - VPAND Y4, Y0, Y6 - VPSRLQ $0x04, Y4, Y8 - VPAND Y0, Y8, Y8 - VPSHUFB Y6, Y20, Y9 - VPSHUFB Y6, Y21, Y6 - VPXOR Y7, Y9, Y7 - VPXOR Y5, Y6, Y5 - VPSHUFB Y8, Y22, Y9 - VPSHUFB Y8, Y23, Y6 - VPTERNLOGD $0x96, Y7, Y9, Y1 - VPTERNLOGD $0x96, Y5, Y6, Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y24, Y11 - VPSHUFB Y9, Y25, Y9 - VPSHUFB Y10, Y26, Y12 - VPSHUFB Y10, Y27, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y28, Y13 - VPSHUFB Y10, Y29, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y30, Y13 - VPSHUFB Y12, Y31, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y5 - VPTERNLOGD $0x96, Y9, Y10, Y6 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx512_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx512_4(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), CX - VBROADCASTI128 (CX), Y1 - VBROADCASTI128 64(CX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(CX), Y1 - VBROADCASTI128 80(CX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(CX), Y1 - VBROADCASTI128 96(CX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(CX), Y1 - VBROADCASTI128 112(CX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z24 - VMOVAPS Z0, Z25 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z26 - VMOVAPS Z0, Z27 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z28 - VMOVAPS Z0, Z29 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z30 - VMOVAPS Z0, Z31 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y3 - VPTERNLOGD $0x96, Y9, Y10, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y3, Y10 - VPAND Y0, Y3, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y24, Y11 - VPSHUFB Y9, Y25, Y9 - VPSHUFB Y10, Y26, Y12 - VPSHUFB Y10, Y27, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y4, Y0, Y10 - VPSRLQ $0x04, Y4, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y28, Y13 - VPSHUFB Y10, Y29, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y30, Y13 - VPSHUFB Y12, Y31, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx512_5(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), CX - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y5 - VPTERNLOGD $0x96, Y9, Y10, Y6 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx512_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx512_5(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), CX - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y3, Y10 - VPAND Y0, Y3, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y4, Y0, Y10 - VPSRLQ $0x04, Y4, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx512_6(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), CX - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VPSRLQ $0x04, Y3, Y6 - VPAND Y0, Y3, Y5 - VPAND Y0, Y6, Y6 - VPSHUFB Y5, Y16, Y7 - VPSHUFB Y5, Y17, Y5 - VPSHUFB Y6, Y18, Y8 - VPSHUFB Y6, Y19, Y6 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y6, Y5 - VPAND Y4, Y0, Y6 - VPSRLQ $0x04, Y4, Y8 - VPAND Y0, Y8, Y8 - VPSHUFB Y6, Y20, Y9 - VPSHUFB Y6, Y21, Y6 - VPXOR Y7, Y9, Y7 - VPXOR Y5, Y6, Y5 - VPSHUFB Y8, Y22, Y9 - VPSHUFB Y8, Y23, Y6 - VPTERNLOGD $0x96, Y7, Y9, Y1 - VPTERNLOGD $0x96, Y5, Y6, Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx512_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx512_6(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), AX - VBROADCASTI128 (AX), Y1 - VBROADCASTI128 64(AX), Y0 - VMOVAPS Z1, Z16 - VMOVAPS Z0, Z17 - VBROADCASTI128 16(AX), Y1 - VBROADCASTI128 80(AX), Y0 - VMOVAPS Z1, Z18 - VMOVAPS Z0, Z19 - VBROADCASTI128 32(AX), Y1 - VBROADCASTI128 96(AX), Y0 - VMOVAPS Z1, Z20 - VMOVAPS Z0, Z21 - VBROADCASTI128 48(AX), Y1 - VBROADCASTI128 112(AX), Y0 - VMOVAPS Z1, Z22 - VMOVAPS Z0, Z23 - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y1 - VMOVDQU 32(SI), Y2 - VMOVDQU (R8), Y5 - VMOVDQU 32(R8), Y6 - VMOVDQU (DI), Y3 - VMOVDQU 32(DI), Y4 - VMOVDQU (AX), Y7 - VMOVDQU 32(AX), Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y1 - VPTERNLOGD $0x96, Y9, Y10, Y2 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VPSHUFB Y9, Y16, Y11 - VPSHUFB Y9, Y17, Y9 - VPSHUFB Y10, Y18, Y12 - VPSHUFB Y10, Y19, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VPSHUFB Y10, Y20, Y13 - VPSHUFB Y10, Y21, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VPSHUFB Y12, Y22, Y13 - VPSHUFB Y12, Y23, Y10 - VPTERNLOGD $0x96, Y11, Y13, Y3 - VPTERNLOGD $0x96, Y9, Y10, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (SI) - VMOVDQU Y2, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y3, (DI) - VMOVDQU Y4, 32(DI) - ADDQ $0x40, DI - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R8) - VMOVDQU Y6, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y7, (AX) - VMOVDQU Y8, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, SSE2 -TEXT ·ifftDIT4_avx512_7(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), AX - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y0 - VMOVDQU 32(SI), Y1 - VMOVDQU (DI), Y2 - VMOVDQU 32(DI), Y3 - VPXOR Y0, Y2, Y2 - VPXOR Y1, Y3, Y3 - VMOVDQU (R8), Y4 - VMOVDQU 32(R8), Y5 - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y7 - VPXOR Y4, Y6, Y6 - VPXOR Y5, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VMOVDQU Y0, (SI) - VMOVDQU Y1, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y2, (DI) - VMOVDQU Y3, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y4, (R8) - VMOVDQU Y5, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y6, (AX) - VMOVDQU Y7, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx512_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, SSE2 -TEXT ·fftDIT4_avx512_7(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), AX - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y0 - VMOVDQU 32(SI), Y1 - VMOVDQU (R8), Y4 - VMOVDQU 32(R8), Y5 - VMOVDQU (DI), Y2 - VMOVDQU 32(DI), Y3 - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y0, Y2, Y2 - VPXOR Y1, Y3, Y3 - VMOVDQU Y0, (SI) - VMOVDQU Y1, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y2, (DI) - VMOVDQU Y3, 32(DI) - ADDQ $0x40, DI - VPXOR Y4, Y6, Y6 - VPXOR Y5, Y7, Y7 - VMOVDQU Y4, (R8) - VMOVDQU Y5, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y6, (AX) - VMOVDQU Y7, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx2_0(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), DX - MOVQ $0x0000000f, BX - MOVQ BX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), BX - MOVQ work_base+0(FP), SI - MOVQ 8(SI), DI - XORQ R8, R8 - MOVQ (SI)(R8*1), R9 - ADDQ BX, R8 - MOVQ (SI)(R8*1), R10 - ADDQ BX, R8 - MOVQ (SI)(R8*1), R11 - ADDQ BX, R8 - MOVQ (SI)(R8*1), BX - -loop: - VMOVDQU (R9), Y1 - VMOVDQU 32(R9), Y2 - VMOVDQU (R10), Y3 - VMOVDQU 32(R10), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VPSRLQ $0x04, Y3, Y6 - VPAND Y0, Y3, Y5 - VPAND Y0, Y6, Y6 - VBROADCASTI128 (AX), Y7 - VBROADCASTI128 64(AX), Y8 - VPSHUFB Y5, Y7, Y7 - VPSHUFB Y5, Y8, Y5 - VBROADCASTI128 16(AX), Y8 - VBROADCASTI128 80(AX), Y9 - VPSHUFB Y6, Y8, Y8 - VPSHUFB Y6, Y9, Y6 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y6, Y5 - VPAND Y4, Y0, Y6 - VPSRLQ $0x04, Y4, Y8 - VPAND Y0, Y8, Y8 - VBROADCASTI128 32(AX), Y9 - VBROADCASTI128 96(AX), Y10 - VPSHUFB Y6, Y9, Y9 - VPSHUFB Y6, Y10, Y6 - VPXOR Y7, Y9, Y7 - VPXOR Y5, Y6, Y5 - VBROADCASTI128 48(AX), Y9 - VBROADCASTI128 112(AX), Y6 - VPSHUFB Y8, Y9, Y9 - VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y7, Y9, Y1) - XOR3WAY( $0x00, Y5, Y6, Y2) - VMOVDQU (R11), Y5 - VMOVDQU 32(R11), Y6 - VMOVDQU (BX), Y7 - VMOVDQU 32(BX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y5) - XOR3WAY( $0x00, Y9, Y10, Y6) - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (DX), Y11 - VBROADCASTI128 64(DX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(DX), Y12 - VBROADCASTI128 80(DX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(DX), Y13 - VBROADCASTI128 96(DX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(DX), Y13 - VBROADCASTI128 112(DX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (DX), Y11 - VBROADCASTI128 64(DX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(DX), Y12 - VBROADCASTI128 80(DX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(DX), Y13 - VBROADCASTI128 96(DX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(DX), Y13 - VBROADCASTI128 112(DX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y3) - XOR3WAY( $0x00, Y9, Y10, Y4) - VMOVDQU Y1, (R9) - VMOVDQU Y2, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y3, (R10) - VMOVDQU Y4, 32(R10) - ADDQ $0x40, R10 - VMOVDQU Y5, (R11) - VMOVDQU Y6, 32(R11) - ADDQ $0x40, R11 - VMOVDQU Y7, (BX) - VMOVDQU Y8, 32(BX) - ADDQ $0x40, BX - SUBQ $0x40, DI - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx2_0(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx2_0(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), DX - MOVQ $0x0000000f, BX - MOVQ BX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), BX - MOVQ work_base+0(FP), SI - MOVQ 8(SI), DI - XORQ R8, R8 - MOVQ (SI)(R8*1), R9 - ADDQ BX, R8 - MOVQ (SI)(R8*1), R10 - ADDQ BX, R8 - MOVQ (SI)(R8*1), R11 - ADDQ BX, R8 - MOVQ (SI)(R8*1), BX - -loop: - VMOVDQU (R9), Y1 - VMOVDQU 32(R9), Y2 - VMOVDQU (R11), Y5 - VMOVDQU 32(R11), Y6 - VMOVDQU (R10), Y3 - VMOVDQU 32(R10), Y4 - VMOVDQU (BX), Y7 - VMOVDQU 32(BX), Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (DX), Y11 - VBROADCASTI128 64(DX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(DX), Y12 - VBROADCASTI128 80(DX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(DX), Y13 - VBROADCASTI128 96(DX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(DX), Y13 - VBROADCASTI128 112(DX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (DX), Y11 - VBROADCASTI128 64(DX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(DX), Y12 - VBROADCASTI128 80(DX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(DX), Y13 - VBROADCASTI128 96(DX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(DX), Y13 - VBROADCASTI128 112(DX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y3) - XOR3WAY( $0x00, Y9, Y10, Y4) - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y3, Y10 - VPAND Y0, Y3, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y4, Y0, Y10 - VPSRLQ $0x04, Y4, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (R9) - VMOVDQU Y2, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y3, (R10) - VMOVDQU Y4, 32(R10) - ADDQ $0x40, R10 - VPSRLQ $0x04, Y7, Y2 - VPAND Y0, Y7, Y1 - VPAND Y0, Y2, Y2 - VBROADCASTI128 (CX), Y3 - VBROADCASTI128 64(CX), Y4 - VPSHUFB Y1, Y3, Y3 - VPSHUFB Y1, Y4, Y1 - VBROADCASTI128 16(CX), Y4 - VBROADCASTI128 80(CX), Y9 - VPSHUFB Y2, Y4, Y4 - VPSHUFB Y2, Y9, Y2 - VPXOR Y3, Y4, Y3 - VPXOR Y1, Y2, Y1 - VPAND Y8, Y0, Y2 - VPSRLQ $0x04, Y8, Y4 - VPAND Y0, Y4, Y4 - VBROADCASTI128 32(CX), Y9 - VBROADCASTI128 96(CX), Y10 - VPSHUFB Y2, Y9, Y9 - VPSHUFB Y2, Y10, Y2 - VPXOR Y3, Y9, Y3 - VPXOR Y1, Y2, Y1 - VBROADCASTI128 48(CX), Y9 - VBROADCASTI128 112(CX), Y2 - VPSHUFB Y4, Y9, Y9 - VPSHUFB Y4, Y2, Y2 - XOR3WAY( $0x00, Y3, Y9, Y5) - XOR3WAY( $0x00, Y1, Y2, Y6) - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R11) - VMOVDQU Y6, 32(R11) - ADDQ $0x40, R11 - VMOVDQU Y7, (BX) - VMOVDQU Y8, 32(BX) - ADDQ $0x40, BX - SUBQ $0x40, DI - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx2_1(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), CX - MOVQ $0x0000000f, DX - MOVQ DX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), DX - MOVQ work_base+0(FP), BX - MOVQ 8(BX), SI - XORQ DI, DI - MOVQ (BX)(DI*1), R8 - ADDQ DX, DI - MOVQ (BX)(DI*1), R9 - ADDQ DX, DI - MOVQ (BX)(DI*1), R10 - ADDQ DX, DI - MOVQ (BX)(DI*1), DX - -loop: - VMOVDQU (R8), Y1 - VMOVDQU 32(R8), Y2 - VMOVDQU (R9), Y3 - VMOVDQU 32(R9), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU (R10), Y5 - VMOVDQU 32(R10), Y6 - VMOVDQU (DX), Y7 - VMOVDQU 32(DX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y5) - XOR3WAY( $0x00, Y9, Y10, Y6) - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y3) - XOR3WAY( $0x00, Y9, Y10, Y4) - VMOVDQU Y1, (R8) - VMOVDQU Y2, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y3, (R9) - VMOVDQU Y4, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y5, (R10) - VMOVDQU Y6, 32(R10) - ADDQ $0x40, R10 - VMOVDQU Y7, (DX) - VMOVDQU Y8, 32(DX) - ADDQ $0x40, DX - SUBQ $0x40, SI - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx2_1(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx2_1(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), DX - MOVQ $0x0000000f, DX - MOVQ DX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), DX - MOVQ work_base+0(FP), BX - MOVQ 8(BX), SI - XORQ DI, DI - MOVQ (BX)(DI*1), R8 - ADDQ DX, DI - MOVQ (BX)(DI*1), R9 - ADDQ DX, DI - MOVQ (BX)(DI*1), R10 - ADDQ DX, DI - MOVQ (BX)(DI*1), DX - -loop: - VMOVDQU (R8), Y1 - VMOVDQU 32(R8), Y2 - VMOVDQU (R10), Y5 - VMOVDQU 32(R10), Y6 - VMOVDQU (R9), Y3 - VMOVDQU 32(R9), Y4 - VMOVDQU (DX), Y7 - VMOVDQU 32(DX), Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y3, Y10 - VPAND Y0, Y3, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y4, Y0, Y10 - VPSRLQ $0x04, Y4, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (R8) - VMOVDQU Y2, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y3, (R9) - VMOVDQU Y4, 32(R9) - ADDQ $0x40, R9 - VPSRLQ $0x04, Y7, Y2 - VPAND Y0, Y7, Y1 - VPAND Y0, Y2, Y2 - VBROADCASTI128 (CX), Y3 - VBROADCASTI128 64(CX), Y4 - VPSHUFB Y1, Y3, Y3 - VPSHUFB Y1, Y4, Y1 - VBROADCASTI128 16(CX), Y4 - VBROADCASTI128 80(CX), Y9 - VPSHUFB Y2, Y4, Y4 - VPSHUFB Y2, Y9, Y2 - VPXOR Y3, Y4, Y3 - VPXOR Y1, Y2, Y1 - VPAND Y8, Y0, Y2 - VPSRLQ $0x04, Y8, Y4 - VPAND Y0, Y4, Y4 - VBROADCASTI128 32(CX), Y9 - VBROADCASTI128 96(CX), Y10 - VPSHUFB Y2, Y9, Y9 - VPSHUFB Y2, Y10, Y2 - VPXOR Y3, Y9, Y3 - VPXOR Y1, Y2, Y1 - VBROADCASTI128 48(CX), Y9 - VBROADCASTI128 112(CX), Y2 - VPSHUFB Y4, Y9, Y9 - VPSHUFB Y4, Y2, Y2 - XOR3WAY( $0x00, Y3, Y9, Y5) - XOR3WAY( $0x00, Y1, Y2, Y6) - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R10) - VMOVDQU Y6, 32(R10) - ADDQ $0x40, R10 - VMOVDQU Y7, (DX) - VMOVDQU Y8, 32(DX) - ADDQ $0x40, DX - SUBQ $0x40, SI - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx2_2(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), CX - MOVQ $0x0000000f, DX - MOVQ DX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), DX - MOVQ work_base+0(FP), BX - MOVQ 8(BX), SI - XORQ DI, DI - MOVQ (BX)(DI*1), R8 - ADDQ DX, DI - MOVQ (BX)(DI*1), R9 - ADDQ DX, DI - MOVQ (BX)(DI*1), R10 - ADDQ DX, DI - MOVQ (BX)(DI*1), DX - -loop: - VMOVDQU (R8), Y1 - VMOVDQU 32(R8), Y2 - VMOVDQU (R9), Y3 - VMOVDQU 32(R9), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VPSRLQ $0x04, Y3, Y6 - VPAND Y0, Y3, Y5 - VPAND Y0, Y6, Y6 - VBROADCASTI128 (AX), Y7 - VBROADCASTI128 64(AX), Y8 - VPSHUFB Y5, Y7, Y7 - VPSHUFB Y5, Y8, Y5 - VBROADCASTI128 16(AX), Y8 - VBROADCASTI128 80(AX), Y9 - VPSHUFB Y6, Y8, Y8 - VPSHUFB Y6, Y9, Y6 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y6, Y5 - VPAND Y4, Y0, Y6 - VPSRLQ $0x04, Y4, Y8 - VPAND Y0, Y8, Y8 - VBROADCASTI128 32(AX), Y9 - VBROADCASTI128 96(AX), Y10 - VPSHUFB Y6, Y9, Y9 - VPSHUFB Y6, Y10, Y6 - VPXOR Y7, Y9, Y7 - VPXOR Y5, Y6, Y5 - VBROADCASTI128 48(AX), Y9 - VBROADCASTI128 112(AX), Y6 - VPSHUFB Y8, Y9, Y9 - VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y7, Y9, Y1) - XOR3WAY( $0x00, Y5, Y6, Y2) - VMOVDQU (R10), Y5 - VMOVDQU 32(R10), Y6 - VMOVDQU (DX), Y7 - VMOVDQU 32(DX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y3) - XOR3WAY( $0x00, Y9, Y10, Y4) - VMOVDQU Y1, (R8) - VMOVDQU Y2, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y3, (R9) - VMOVDQU Y4, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y5, (R10) - VMOVDQU Y6, 32(R10) - ADDQ $0x40, R10 - VMOVDQU Y7, (DX) - VMOVDQU Y8, 32(DX) - ADDQ $0x40, DX - SUBQ $0x40, SI - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx2_2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx2_2(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), CX - MOVQ $0x0000000f, DX - MOVQ DX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), DX - MOVQ work_base+0(FP), BX - MOVQ 8(BX), SI - XORQ DI, DI - MOVQ (BX)(DI*1), R8 - ADDQ DX, DI - MOVQ (BX)(DI*1), R9 - ADDQ DX, DI - MOVQ (BX)(DI*1), R10 - ADDQ DX, DI - MOVQ (BX)(DI*1), DX - -loop: - VMOVDQU (R8), Y1 - VMOVDQU 32(R8), Y2 - VMOVDQU (R10), Y5 - VMOVDQU 32(R10), Y6 - VMOVDQU (R9), Y3 - VMOVDQU 32(R9), Y4 - VMOVDQU (DX), Y7 - VMOVDQU 32(DX), Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y3) - XOR3WAY( $0x00, Y9, Y10, Y4) - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (R8) - VMOVDQU Y2, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y3, (R9) - VMOVDQU Y4, 32(R9) - ADDQ $0x40, R9 - VPSRLQ $0x04, Y7, Y2 - VPAND Y0, Y7, Y1 - VPAND Y0, Y2, Y2 - VBROADCASTI128 (AX), Y3 - VBROADCASTI128 64(AX), Y4 - VPSHUFB Y1, Y3, Y3 - VPSHUFB Y1, Y4, Y1 - VBROADCASTI128 16(AX), Y4 - VBROADCASTI128 80(AX), Y9 - VPSHUFB Y2, Y4, Y4 - VPSHUFB Y2, Y9, Y2 - VPXOR Y3, Y4, Y3 - VPXOR Y1, Y2, Y1 - VPAND Y8, Y0, Y2 - VPSRLQ $0x04, Y8, Y4 - VPAND Y0, Y4, Y4 - VBROADCASTI128 32(AX), Y9 - VBROADCASTI128 96(AX), Y10 - VPSHUFB Y2, Y9, Y9 - VPSHUFB Y2, Y10, Y2 - VPXOR Y3, Y9, Y3 - VPXOR Y1, Y2, Y1 - VBROADCASTI128 48(AX), Y9 - VBROADCASTI128 112(AX), Y2 - VPSHUFB Y4, Y9, Y9 - VPSHUFB Y4, Y2, Y2 - XOR3WAY( $0x00, Y3, Y9, Y5) - XOR3WAY( $0x00, Y1, Y2, Y6) - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R10) - VMOVDQU Y6, 32(R10) - ADDQ $0x40, R10 - VMOVDQU Y7, (DX) - VMOVDQU Y8, 32(DX) - ADDQ $0x40, DX - SUBQ $0x40, SI - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx2_3(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), AX - MOVQ $0x0000000f, CX - MOVQ CX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), CX - MOVQ work_base+0(FP), DX - MOVQ 8(DX), BX - XORQ SI, SI - MOVQ (DX)(SI*1), DI - ADDQ CX, SI - MOVQ (DX)(SI*1), R8 - ADDQ CX, SI - MOVQ (DX)(SI*1), R9 - ADDQ CX, SI - MOVQ (DX)(SI*1), CX - -loop: - VMOVDQU (DI), Y1 - VMOVDQU 32(DI), Y2 - VMOVDQU (R8), Y3 - VMOVDQU 32(R8), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU (R9), Y5 - VMOVDQU 32(R9), Y6 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y3) - XOR3WAY( $0x00, Y9, Y10, Y4) - VMOVDQU Y1, (DI) - VMOVDQU Y2, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y3, (R8) - VMOVDQU Y4, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y5, (R9) - VMOVDQU Y6, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y7, (CX) - VMOVDQU Y8, 32(CX) - ADDQ $0x40, CX - SUBQ $0x40, BX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx2_3(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx2_3(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), CX - MOVQ $0x0000000f, CX - MOVQ CX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), CX - MOVQ work_base+0(FP), DX - MOVQ 8(DX), BX - XORQ SI, SI - MOVQ (DX)(SI*1), DI - ADDQ CX, SI - MOVQ (DX)(SI*1), R8 - ADDQ CX, SI - MOVQ (DX)(SI*1), R9 - ADDQ CX, SI - MOVQ (DX)(SI*1), CX - -loop: - VMOVDQU (DI), Y1 - VMOVDQU 32(DI), Y2 - VMOVDQU (R9), Y5 - VMOVDQU 32(R9), Y6 - VMOVDQU (R8), Y3 - VMOVDQU 32(R8), Y4 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (DI) - VMOVDQU Y2, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y3, (R8) - VMOVDQU Y4, 32(R8) - ADDQ $0x40, R8 - VPSRLQ $0x04, Y7, Y2 - VPAND Y0, Y7, Y1 - VPAND Y0, Y2, Y2 - VBROADCASTI128 (AX), Y3 - VBROADCASTI128 64(AX), Y4 - VPSHUFB Y1, Y3, Y3 - VPSHUFB Y1, Y4, Y1 - VBROADCASTI128 16(AX), Y4 - VBROADCASTI128 80(AX), Y9 - VPSHUFB Y2, Y4, Y4 - VPSHUFB Y2, Y9, Y2 - VPXOR Y3, Y4, Y3 - VPXOR Y1, Y2, Y1 - VPAND Y8, Y0, Y2 - VPSRLQ $0x04, Y8, Y4 - VPAND Y0, Y4, Y4 - VBROADCASTI128 32(AX), Y9 - VBROADCASTI128 96(AX), Y10 - VPSHUFB Y2, Y9, Y9 - VPSHUFB Y2, Y10, Y2 - VPXOR Y3, Y9, Y3 - VPXOR Y1, Y2, Y1 - VBROADCASTI128 48(AX), Y9 - VBROADCASTI128 112(AX), Y2 - VPSHUFB Y4, Y9, Y9 - VPSHUFB Y4, Y2, Y2 - XOR3WAY( $0x00, Y3, Y9, Y5) - XOR3WAY( $0x00, Y1, Y2, Y6) - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R9) - VMOVDQU Y6, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y7, (CX) - VMOVDQU Y8, 32(CX) - ADDQ $0x40, CX - SUBQ $0x40, BX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx2_4(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), DX - MOVQ $0x0000000f, DX - MOVQ DX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), DX - MOVQ work_base+0(FP), BX - MOVQ 8(BX), SI - XORQ DI, DI - MOVQ (BX)(DI*1), R8 - ADDQ DX, DI - MOVQ (BX)(DI*1), R9 - ADDQ DX, DI - MOVQ (BX)(DI*1), R10 - ADDQ DX, DI - MOVQ (BX)(DI*1), DX - -loop: - VMOVDQU (R8), Y1 - VMOVDQU 32(R8), Y2 - VMOVDQU (R9), Y3 - VMOVDQU 32(R9), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VPSRLQ $0x04, Y3, Y6 - VPAND Y0, Y3, Y5 - VPAND Y0, Y6, Y6 - VBROADCASTI128 (AX), Y7 - VBROADCASTI128 64(AX), Y8 - VPSHUFB Y5, Y7, Y7 - VPSHUFB Y5, Y8, Y5 - VBROADCASTI128 16(AX), Y8 - VBROADCASTI128 80(AX), Y9 - VPSHUFB Y6, Y8, Y8 - VPSHUFB Y6, Y9, Y6 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y6, Y5 - VPAND Y4, Y0, Y6 - VPSRLQ $0x04, Y4, Y8 - VPAND Y0, Y8, Y8 - VBROADCASTI128 32(AX), Y9 - VBROADCASTI128 96(AX), Y10 - VPSHUFB Y6, Y9, Y9 - VPSHUFB Y6, Y10, Y6 - VPXOR Y7, Y9, Y7 - VPXOR Y5, Y6, Y5 - VBROADCASTI128 48(AX), Y9 - VBROADCASTI128 112(AX), Y6 - VPSHUFB Y8, Y9, Y9 - VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y7, Y9, Y1) - XOR3WAY( $0x00, Y5, Y6, Y2) - VMOVDQU (R10), Y5 - VMOVDQU 32(R10), Y6 - VMOVDQU (DX), Y7 - VMOVDQU 32(DX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y5) - XOR3WAY( $0x00, Y9, Y10, Y6) - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VMOVDQU Y1, (R8) - VMOVDQU Y2, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y3, (R9) - VMOVDQU Y4, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y5, (R10) - VMOVDQU Y6, 32(R10) - ADDQ $0x40, R10 - VMOVDQU Y7, (DX) - VMOVDQU Y8, 32(DX) - ADDQ $0x40, DX - SUBQ $0x40, SI - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx2_4(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx2_4(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), CX - MOVQ $0x0000000f, DX - MOVQ DX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), DX - MOVQ work_base+0(FP), BX - MOVQ 8(BX), SI - XORQ DI, DI - MOVQ (BX)(DI*1), R8 - ADDQ DX, DI - MOVQ (BX)(DI*1), R9 - ADDQ DX, DI - MOVQ (BX)(DI*1), R10 - ADDQ DX, DI - MOVQ (BX)(DI*1), DX - -loop: - VMOVDQU (R8), Y1 - VMOVDQU 32(R8), Y2 - VMOVDQU (R10), Y5 - VMOVDQU 32(R10), Y6 - VMOVDQU (R9), Y3 - VMOVDQU 32(R9), Y4 - VMOVDQU (DX), Y7 - VMOVDQU 32(DX), Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (CX), Y11 - VBROADCASTI128 64(CX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(CX), Y12 - VBROADCASTI128 80(CX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(CX), Y13 - VBROADCASTI128 96(CX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(CX), Y13 - VBROADCASTI128 112(CX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y3) - XOR3WAY( $0x00, Y9, Y10, Y4) - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y3, Y10 - VPAND Y0, Y3, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y4, Y0, Y10 - VPSRLQ $0x04, Y4, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (R8) - VMOVDQU Y2, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y3, (R9) - VMOVDQU Y4, 32(R9) - ADDQ $0x40, R9 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R10) - VMOVDQU Y6, 32(R10) - ADDQ $0x40, R10 - VMOVDQU Y7, (DX) - VMOVDQU Y8, 32(DX) - ADDQ $0x40, DX - SUBQ $0x40, SI - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx2_5(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), CX - MOVQ $0x0000000f, CX - MOVQ CX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), CX - MOVQ work_base+0(FP), DX - MOVQ 8(DX), BX - XORQ SI, SI - MOVQ (DX)(SI*1), DI - ADDQ CX, SI - MOVQ (DX)(SI*1), R8 - ADDQ CX, SI - MOVQ (DX)(SI*1), R9 - ADDQ CX, SI - MOVQ (DX)(SI*1), CX - -loop: - VMOVDQU (DI), Y1 - VMOVDQU 32(DI), Y2 - VMOVDQU (R8), Y3 - VMOVDQU 32(R8), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU (R9), Y5 - VMOVDQU 32(R9), Y6 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y5) - XOR3WAY( $0x00, Y9, Y10, Y6) - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VMOVDQU Y1, (DI) - VMOVDQU Y2, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y3, (R8) - VMOVDQU Y4, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y5, (R9) - VMOVDQU Y6, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y7, (CX) - VMOVDQU Y8, 32(CX) - ADDQ $0x40, CX - SUBQ $0x40, BX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx2_5(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx2_5(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), CX - MOVQ $0x0000000f, CX - MOVQ CX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), CX - MOVQ work_base+0(FP), DX - MOVQ 8(DX), BX - XORQ SI, SI - MOVQ (DX)(SI*1), DI - ADDQ CX, SI - MOVQ (DX)(SI*1), R8 - ADDQ CX, SI - MOVQ (DX)(SI*1), R9 - ADDQ CX, SI - MOVQ (DX)(SI*1), CX - -loop: - VMOVDQU (DI), Y1 - VMOVDQU 32(DI), Y2 - VMOVDQU (R9), Y5 - VMOVDQU 32(R9), Y6 - VMOVDQU (R8), Y3 - VMOVDQU 32(R8), Y4 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPSRLQ $0x04, Y3, Y10 - VPAND Y0, Y3, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y4, Y0, Y10 - VPSRLQ $0x04, Y4, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (DI) - VMOVDQU Y2, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y3, (R8) - VMOVDQU Y4, 32(R8) - ADDQ $0x40, R8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R9) - VMOVDQU Y6, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y7, (CX) - VMOVDQU Y8, 32(CX) - ADDQ $0x40, CX - SUBQ $0x40, BX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·ifftDIT4_avx2_6(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), CX - MOVQ table02+48(FP), CX - MOVQ $0x0000000f, CX - MOVQ CX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), CX - MOVQ work_base+0(FP), DX - MOVQ 8(DX), BX - XORQ SI, SI - MOVQ (DX)(SI*1), DI - ADDQ CX, SI - MOVQ (DX)(SI*1), R8 - ADDQ CX, SI - MOVQ (DX)(SI*1), R9 - ADDQ CX, SI - MOVQ (DX)(SI*1), CX - -loop: - VMOVDQU (DI), Y1 - VMOVDQU 32(DI), Y2 - VMOVDQU (R8), Y3 - VMOVDQU 32(R8), Y4 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VPSRLQ $0x04, Y3, Y6 - VPAND Y0, Y3, Y5 - VPAND Y0, Y6, Y6 - VBROADCASTI128 (AX), Y7 - VBROADCASTI128 64(AX), Y8 - VPSHUFB Y5, Y7, Y7 - VPSHUFB Y5, Y8, Y5 - VBROADCASTI128 16(AX), Y8 - VBROADCASTI128 80(AX), Y9 - VPSHUFB Y6, Y8, Y8 - VPSHUFB Y6, Y9, Y6 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y6, Y5 - VPAND Y4, Y0, Y6 - VPSRLQ $0x04, Y4, Y8 - VPAND Y0, Y8, Y8 - VBROADCASTI128 32(AX), Y9 - VBROADCASTI128 96(AX), Y10 - VPSHUFB Y6, Y9, Y9 - VPSHUFB Y6, Y10, Y6 - VPXOR Y7, Y9, Y7 - VPXOR Y5, Y6, Y5 - VBROADCASTI128 48(AX), Y9 - VBROADCASTI128 112(AX), Y6 - VPSHUFB Y8, Y9, Y9 - VPSHUFB Y8, Y6, Y6 - XOR3WAY( $0x00, Y7, Y9, Y1) - XOR3WAY( $0x00, Y5, Y6, Y2) - VMOVDQU (R9), Y5 - VMOVDQU 32(R9), Y6 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VMOVDQU Y1, (DI) - VMOVDQU Y2, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y3, (R8) - VMOVDQU Y4, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y5, (R9) - VMOVDQU Y6, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y7, (CX) - VMOVDQU Y8, 32(CX) - ADDQ $0x40, CX - SUBQ $0x40, BX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx2_6(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 -TEXT ·fftDIT4_avx2_6(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), AX - MOVQ $0x0000000f, CX - MOVQ CX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), CX - MOVQ work_base+0(FP), DX - MOVQ 8(DX), BX - XORQ SI, SI - MOVQ (DX)(SI*1), DI - ADDQ CX, SI - MOVQ (DX)(SI*1), R8 - ADDQ CX, SI - MOVQ (DX)(SI*1), R9 - ADDQ CX, SI - MOVQ (DX)(SI*1), CX - -loop: - VMOVDQU (DI), Y1 - VMOVDQU 32(DI), Y2 - VMOVDQU (R9), Y5 - VMOVDQU 32(R9), Y6 - VMOVDQU (R8), Y3 - VMOVDQU 32(R8), Y4 - VMOVDQU (CX), Y7 - VMOVDQU 32(CX), Y8 - VPSRLQ $0x04, Y5, Y10 - VPAND Y0, Y5, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y6, Y0, Y10 - VPSRLQ $0x04, Y6, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y1) - XOR3WAY( $0x00, Y9, Y10, Y2) - VPSRLQ $0x04, Y7, Y10 - VPAND Y0, Y7, Y9 - VPAND Y0, Y10, Y10 - VBROADCASTI128 (AX), Y11 - VBROADCASTI128 64(AX), Y12 - VPSHUFB Y9, Y11, Y11 - VPSHUFB Y9, Y12, Y9 - VBROADCASTI128 16(AX), Y12 - VBROADCASTI128 80(AX), Y13 - VPSHUFB Y10, Y12, Y12 - VPSHUFB Y10, Y13, Y10 - VPXOR Y11, Y12, Y11 - VPXOR Y9, Y10, Y9 - VPAND Y8, Y0, Y10 - VPSRLQ $0x04, Y8, Y12 - VPAND Y0, Y12, Y12 - VBROADCASTI128 32(AX), Y13 - VBROADCASTI128 96(AX), Y14 - VPSHUFB Y10, Y13, Y13 - VPSHUFB Y10, Y14, Y10 - VPXOR Y11, Y13, Y11 - VPXOR Y9, Y10, Y9 - VBROADCASTI128 48(AX), Y13 - VBROADCASTI128 112(AX), Y10 - VPSHUFB Y12, Y13, Y13 - VPSHUFB Y12, Y10, Y10 - XOR3WAY( $0x00, Y11, Y13, Y3) - XOR3WAY( $0x00, Y9, Y10, Y4) - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y4, Y8, Y8 - VPXOR Y1, Y3, Y3 - VPXOR Y2, Y4, Y4 - VMOVDQU Y1, (DI) - VMOVDQU Y2, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y3, (R8) - VMOVDQU Y4, 32(R8) - ADDQ $0x40, R8 - VPXOR Y5, Y7, Y7 - VPXOR Y6, Y8, Y8 - VMOVDQU Y5, (R9) - VMOVDQU Y6, 32(R9) - ADDQ $0x40, R9 - VMOVDQU Y7, (CX) - VMOVDQU Y8, 32(CX) - ADDQ $0x40, CX - SUBQ $0x40, BX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, SSE2 -TEXT ·ifftDIT4_avx2_7(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), AX - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y0 - VMOVDQU 32(SI), Y1 - VMOVDQU (DI), Y2 - VMOVDQU 32(DI), Y3 - VPXOR Y0, Y2, Y2 - VPXOR Y1, Y3, Y3 - VMOVDQU (R8), Y4 - VMOVDQU 32(R8), Y5 - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y7 - VPXOR Y4, Y6, Y6 - VPXOR Y5, Y7, Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VMOVDQU Y0, (SI) - VMOVDQU Y1, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y2, (DI) - VMOVDQU Y3, 32(DI) - ADDQ $0x40, DI - VMOVDQU Y4, (R8) - VMOVDQU Y5, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y6, (AX) - VMOVDQU Y7, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func fftDIT4_avx2_7(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8) -// Requires: AVX, AVX2, SSE2 -TEXT ·fftDIT4_avx2_7(SB), NOSPLIT, $0-56 - // dist must be multiplied by 24 (size of slice header) - MOVQ table01+32(FP), AX - MOVQ table23+40(FP), AX - MOVQ table02+48(FP), AX - MOVQ $0x0000000f, AX - MOVQ AX, X0 - VPBROADCASTB X0, Y0 - MOVQ dist+24(FP), AX - MOVQ work_base+0(FP), CX - MOVQ 8(CX), DX - XORQ BX, BX - MOVQ (CX)(BX*1), SI - ADDQ AX, BX - MOVQ (CX)(BX*1), DI - ADDQ AX, BX - MOVQ (CX)(BX*1), R8 - ADDQ AX, BX - MOVQ (CX)(BX*1), AX - -loop: - VMOVDQU (SI), Y0 - VMOVDQU 32(SI), Y1 - VMOVDQU (R8), Y4 - VMOVDQU 32(R8), Y5 - VMOVDQU (DI), Y2 - VMOVDQU 32(DI), Y3 - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y7 - VPXOR Y0, Y4, Y4 - VPXOR Y1, Y5, Y5 - VPXOR Y2, Y6, Y6 - VPXOR Y3, Y7, Y7 - VPXOR Y0, Y2, Y2 - VPXOR Y1, Y3, Y3 - VMOVDQU Y0, (SI) - VMOVDQU Y1, 32(SI) - ADDQ $0x40, SI - VMOVDQU Y2, (DI) - VMOVDQU Y3, 32(DI) - ADDQ $0x40, DI - VPXOR Y4, Y6, Y6 - VPXOR Y5, Y7, Y7 - VMOVDQU Y4, (R8) - VMOVDQU Y5, 32(R8) - ADDQ $0x40, R8 - VMOVDQU Y6, (AX) - VMOVDQU Y7, 32(AX) - ADDQ $0x40, AX - SUBQ $0x40, DX - JNZ loop - VZEROUPPER - RET - -// func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) -// Requires: SSE, SSE2, SSSE3 -TEXT ·ifftDIT2_ssse3(SB), NOSPLIT, $0-56 - MOVQ table+48(FP), AX - MOVUPS (AX), X0 - MOVUPS 64(AX), X1 - MOVUPS 16(AX), X2 - MOVUPS 80(AX), X3 - MOVUPS 32(AX), X4 - MOVUPS 96(AX), X5 - XORPS X6, X6 - MOVQ $0x0000000f, CX - MOVQ CX, X7 - PSHUFB X6, X7 - MOVQ x_len+8(FP), CX - MOVQ x_base+0(FP), DX - MOVQ y_base+24(FP), BX - -loop: - MOVUPS (DX), X6 - MOVUPS 32(DX), X8 - MOVUPS (BX), X9 - MOVUPS 32(BX), X10 - PXOR X6, X9 - PXOR X8, X10 - MOVUPS X9, (BX) - MOVUPS X10, 32(BX) - MOVAPS X9, X11 - PSRLQ $0x04, X11 - MOVAPS X9, X9 - PAND X7, X9 - PAND X7, X11 - MOVUPS X0, X12 - MOVUPS X1, X13 - PSHUFB X9, X12 - PSHUFB X9, X13 - MOVUPS X2, X9 - MOVUPS X3, X14 - PSHUFB X11, X9 - PSHUFB X11, X14 - PXOR X9, X12 - PXOR X14, X13 - MOVAPS X10, X9 - MOVAPS X10, X10 - PAND X7, X9 - PSRLQ $0x04, X10 - PAND X7, X10 - MOVUPS X4, X11 - MOVUPS X5, X14 - PSHUFB X9, X11 - PSHUFB X9, X14 - PXOR X11, X12 - PXOR X14, X13 - MOVUPS 48(AX), X11 - MOVUPS 112(AX), X14 - PSHUFB X10, X11 - PSHUFB X10, X14 - PXOR X11, X12 - PXOR X14, X13 - PXOR X12, X6 - PXOR X13, X8 - MOVUPS X6, (DX) - MOVUPS X8, 32(DX) - MOVUPS 16(DX), X6 - MOVUPS 48(DX), X8 - MOVUPS 16(BX), X9 - MOVUPS 48(BX), X10 - PXOR X6, X9 - PXOR X8, X10 - MOVUPS X9, 16(BX) - MOVUPS X10, 48(BX) - MOVAPS X9, X11 - PSRLQ $0x04, X11 - MOVAPS X9, X9 - PAND X7, X9 - PAND X7, X11 - MOVUPS X0, X12 - MOVUPS X1, X13 - PSHUFB X9, X12 - PSHUFB X9, X13 - MOVUPS X2, X9 - MOVUPS X3, X14 - PSHUFB X11, X9 - PSHUFB X11, X14 - PXOR X9, X12 - PXOR X14, X13 - MOVAPS X10, X9 - MOVAPS X10, X10 - PAND X7, X9 - PSRLQ $0x04, X10 - PAND X7, X10 - MOVUPS X4, X11 - MOVUPS X5, X14 - PSHUFB X9, X11 - PSHUFB X9, X14 - PXOR X11, X12 - PXOR X14, X13 - MOVUPS 48(AX), X11 - MOVUPS 112(AX), X14 - PSHUFB X10, X11 - PSHUFB X10, X14 - PXOR X11, X12 - PXOR X14, X13 - PXOR X12, X6 - PXOR X13, X8 - MOVUPS X6, 16(DX) - MOVUPS X8, 48(DX) - ADDQ $0x40, DX - ADDQ $0x40, BX - SUBQ $0x40, CX - JNZ loop - RET - -// func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) -// Requires: SSE, SSE2, SSSE3 -TEXT ·fftDIT2_ssse3(SB), NOSPLIT, $0-56 - MOVQ table+48(FP), AX - MOVUPS (AX), X0 - MOVUPS 64(AX), X1 - MOVUPS 16(AX), X2 - MOVUPS 80(AX), X3 - MOVUPS 32(AX), X4 - MOVUPS 96(AX), X5 - XORPS X6, X6 - MOVQ $0x0000000f, CX - MOVQ CX, X7 - PSHUFB X6, X7 - MOVQ x_len+8(FP), CX - MOVQ x_base+0(FP), DX - MOVQ y_base+24(FP), BX - -loop: - MOVUPS (BX), X9 - MOVUPS 32(BX), X10 - MOVAPS X9, X8 - PSRLQ $0x04, X8 - MOVAPS X9, X6 - PAND X7, X6 - PAND X7, X8 - MOVUPS X0, X11 - MOVUPS X1, X12 - PSHUFB X6, X11 - PSHUFB X6, X12 - MOVUPS X2, X6 - MOVUPS X3, X13 - PSHUFB X8, X6 - PSHUFB X8, X13 - PXOR X6, X11 - PXOR X13, X12 - MOVAPS X10, X6 - MOVAPS X10, X8 - PAND X7, X6 - PSRLQ $0x04, X8 - PAND X7, X8 - MOVUPS X4, X13 - MOVUPS X5, X14 - PSHUFB X6, X13 - PSHUFB X6, X14 - PXOR X13, X11 - PXOR X14, X12 - MOVUPS 48(AX), X13 - MOVUPS 112(AX), X14 - PSHUFB X8, X13 - PSHUFB X8, X14 - PXOR X13, X11 - PXOR X14, X12 - MOVUPS (DX), X6 - MOVUPS 32(DX), X8 - PXOR X11, X6 - PXOR X12, X8 - MOVUPS X6, (DX) - MOVUPS X8, 32(DX) - PXOR X6, X9 - PXOR X8, X10 - MOVUPS X9, (BX) - MOVUPS X10, 32(BX) - MOVUPS 16(BX), X9 - MOVUPS 48(BX), X10 - MOVAPS X9, X8 - PSRLQ $0x04, X8 - MOVAPS X9, X6 - PAND X7, X6 - PAND X7, X8 - MOVUPS X0, X11 - MOVUPS X1, X12 - PSHUFB X6, X11 - PSHUFB X6, X12 - MOVUPS X2, X6 - MOVUPS X3, X13 - PSHUFB X8, X6 - PSHUFB X8, X13 - PXOR X6, X11 - PXOR X13, X12 - MOVAPS X10, X6 - MOVAPS X10, X8 - PAND X7, X6 - PSRLQ $0x04, X8 - PAND X7, X8 - MOVUPS X4, X13 - MOVUPS X5, X14 - PSHUFB X6, X13 - PSHUFB X6, X14 - PXOR X13, X11 - PXOR X14, X12 - MOVUPS 48(AX), X13 - MOVUPS 112(AX), X14 - PSHUFB X8, X13 - PSHUFB X8, X14 - PXOR X13, X11 - PXOR X14, X12 - MOVUPS 16(DX), X6 - MOVUPS 48(DX), X8 - PXOR X11, X6 - PXOR X12, X8 - MOVUPS X6, 16(DX) - MOVUPS X8, 48(DX) - PXOR X6, X9 - PXOR X8, X10 - MOVUPS X9, 16(BX) - MOVUPS X10, 48(BX) - ADDQ $0x40, DX - ADDQ $0x40, BX - SUBQ $0x40, CX - JNZ loop - RET - -// func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8) -// Requires: SSE, SSE2, SSSE3 -TEXT ·mulgf16_ssse3(SB), NOSPLIT, $0-56 - MOVQ table+48(FP), AX - MOVUPS (AX), X0 - MOVUPS 64(AX), X1 - MOVUPS 16(AX), X2 - MOVUPS 80(AX), X3 - MOVUPS 32(AX), X4 - MOVUPS 96(AX), X5 - MOVUPS 48(AX), X6 - MOVUPS 112(AX), X7 - MOVQ x_len+8(FP), AX - MOVQ x_base+0(FP), CX - MOVQ y_base+24(FP), DX - XORPS X8, X8 - MOVQ $0x0000000f, BX - MOVQ BX, X9 - PSHUFB X8, X9 - -loop: - MOVUPS (DX), X8 - MOVUPS 32(DX), X10 - MOVAPS X8, X11 - PSRLQ $0x04, X11 - MOVAPS X8, X8 - PAND X9, X8 - PAND X9, X11 - MOVUPS X0, X12 - MOVUPS X1, X13 - PSHUFB X8, X12 - PSHUFB X8, X13 - MOVUPS X2, X8 - MOVUPS X3, X14 - PSHUFB X11, X8 - PSHUFB X11, X14 - PXOR X8, X12 - PXOR X14, X13 - MOVAPS X10, X8 - MOVAPS X10, X10 - PAND X9, X8 - PSRLQ $0x04, X10 - PAND X9, X10 - MOVUPS X4, X11 - MOVUPS X5, X14 - PSHUFB X8, X11 - PSHUFB X8, X14 - PXOR X11, X12 - PXOR X14, X13 - MOVUPS X6, X11 - MOVUPS X7, X14 - PSHUFB X10, X11 - PSHUFB X10, X14 - PXOR X11, X12 - PXOR X14, X13 - MOVUPS X12, (CX) - MOVUPS X13, 32(CX) - MOVUPS 16(DX), X8 - MOVUPS 48(DX), X10 - MOVAPS X8, X11 - PSRLQ $0x04, X11 - MOVAPS X8, X8 - PAND X9, X8 - PAND X9, X11 - MOVUPS X0, X12 - MOVUPS X1, X13 - PSHUFB X8, X12 - PSHUFB X8, X13 - MOVUPS X2, X8 - MOVUPS X3, X14 - PSHUFB X11, X8 - PSHUFB X11, X14 - PXOR X8, X12 - PXOR X14, X13 - MOVAPS X10, X8 - MOVAPS X10, X10 - PAND X9, X8 - PSRLQ $0x04, X10 - PAND X9, X10 - MOVUPS X4, X11 - MOVUPS X5, X14 - PSHUFB X8, X11 - PSHUFB X8, X14 - PXOR X11, X12 - PXOR X14, X13 - MOVUPS X6, X11 - MOVUPS X7, X14 - PSHUFB X10, X11 - PSHUFB X10, X14 - PXOR X11, X12 - PXOR X14, X13 - MOVUPS X12, 16(CX) - MOVUPS X13, 48(CX) - ADDQ $0x40, CX - ADDQ $0x40, DX - SUBQ $0x40, AX - JNZ loop - RET diff --git a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go index 47e24d7..7ef78f8 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_noasm.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_noasm.go @@ -7,6 +7,8 @@ package reedsolomon +import "encoding/binary" + func galMulSlice(c byte, in, out []byte, o *options) { out = out[:len(in)] if c == 1 { @@ -32,38 +34,25 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { } // simple slice xor -func sliceXor(in, out []byte, o *options) { - sliceXorGo(in, out, o) +func sliceXor(in, out []byte, _ *options) { + for len(out) >= 32 { + inS := in[:32] + v0 := binary.LittleEndian.Uint64(out[:]) ^ binary.LittleEndian.Uint64(inS[:]) + v1 := binary.LittleEndian.Uint64(out[8:]) ^ binary.LittleEndian.Uint64(inS[8:]) + v2 := binary.LittleEndian.Uint64(out[16:]) ^ binary.LittleEndian.Uint64(inS[16:]) + v3 := binary.LittleEndian.Uint64(out[24:]) ^ binary.LittleEndian.Uint64(inS[24:]) + binary.LittleEndian.PutUint64(out[:], v0) + binary.LittleEndian.PutUint64(out[8:], v1) + binary.LittleEndian.PutUint64(out[16:], v2) + binary.LittleEndian.PutUint64(out[24:], v3) + out = out[32:] + in = in[32:] + } + for n, input := range in { + out[n] ^= input + } } func init() { defaultOptions.useAVX512 = false } - -// 4-way butterfly -func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { - ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) -} - -// 4-way butterfly -func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { - fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) -} - -// 2-way butterfly forward -func fftDIT2(x, y []byte, log_m ffe, o *options) { - // Reference version: - refMulAdd(x, y, log_m) - sliceXorGo(x, y, o) -} - -// 2-way butterfly inverse -func ifftDIT2(x, y []byte, log_m ffe, o *options) { - // Reference version: - sliceXorGo(x, y, o) - refMulAdd(x, y, log_m) -} - -func mulgf16(x, y []byte, log_m ffe, o *options) { - refMul(x, y, log_m) -} diff --git a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go index 415828a..52e8c23 100644 --- a/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go +++ b/vendor/github.com/klauspost/reedsolomon/galois_ppc64le.go @@ -72,31 +72,3 @@ func sliceXor(in, out []byte, o *options) { out[n] ^= input } } - -// 4-way butterfly -func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { - ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) -} - -// 4-way butterfly -func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { - fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) -} - -// 2-way butterfly forward -func fftDIT2(x, y []byte, log_m ffe, o *options) { - // Reference version: - refMulAdd(x, y, log_m) - sliceXor(x, y, o) -} - -// 2-way butterfly inverse -func ifftDIT2(x, y []byte, log_m ffe, o *options) { - // Reference version: - sliceXor(x, y, o) - refMulAdd(x, y, log_m) -} - -func mulgf16(x, y []byte, log_m ffe, o *options) { - refMul(x, y, log_m) -} diff --git a/vendor/github.com/klauspost/reedsolomon/options.go b/vendor/github.com/klauspost/reedsolomon/options.go index 83dd2cd..26269eb 100644 --- a/vendor/github.com/klauspost/reedsolomon/options.go +++ b/vendor/github.com/klauspost/reedsolomon/options.go @@ -16,13 +16,10 @@ type options struct { perRound int useAVX512, useAVX2, useSSSE3, useSSE2 bool - useJerasureMatrix bool usePAR1Matrix bool useCauchy bool fastOneParity bool inversionCache bool - customMatrix [][]byte - withLeopard *bool // stream options concReads bool @@ -133,57 +130,36 @@ func WithStreamBlockSize(n int) Option { } } -// WithSSSE3 allows to enable/disable SSSE3 instructions. -// If not set, SSSE3 will be turned on or off automatically based on CPU ID information. -func WithSSSE3(enabled bool) Option { +func withSSSE3(enabled bool) Option { return func(o *options) { o.useSSSE3 = enabled } } -// WithAVX2 allows to enable/disable AVX2 instructions. -// If not set, AVX2 will be turned on or off automatically based on CPU ID information. -func WithAVX2(enabled bool) Option { +func withAVX2(enabled bool) Option { return func(o *options) { o.useAVX2 = enabled } } -// WithSSE2 allows to enable/disable SSE2 instructions. -// If not set, SSE2 will be turned on or off automatically based on CPU ID information. -func WithSSE2(enabled bool) Option { +func withSSE2(enabled bool) Option { return func(o *options) { o.useSSE2 = enabled } } -// WithAVX512 allows to enable/disable AVX512 instructions. -// If not set, AVX512 will be turned on or off automatically based on CPU ID information. -func WithAVX512(enabled bool) Option { +func withAVX512(enabled bool) Option { return func(o *options) { o.useAVX512 = enabled } } -// WithJerasureMatrix causes the encoder to build the Reed-Solomon-Vandermonde -// matrix in the same way as done by the Jerasure library. -// The first row and column of the coding matrix only contains 1's in this method -// so the first parity chunk is always equal to XOR of all data chunks. -func WithJerasureMatrix() Option { - return func(o *options) { - o.useJerasureMatrix = true - o.usePAR1Matrix = false - o.useCauchy = false - } -} - // WithPAR1Matrix causes the encoder to build the matrix how PARv1 // does. Note that the method they use is buggy, and may lead to cases // where recovery is impossible, even if there are enough parity // shards. func WithPAR1Matrix() Option { return func(o *options) { - o.useJerasureMatrix = false o.usePAR1Matrix = true o.useCauchy = false } @@ -195,9 +171,8 @@ func WithPAR1Matrix() Option { // but will result in slightly faster start-up time. func WithCauchyMatrix() Option { return func(o *options) { - o.useJerasureMatrix = false - o.usePAR1Matrix = false o.useCauchy = true + o.usePAR1Matrix = false } } @@ -209,26 +184,3 @@ func WithFastOneParityMatrix() Option { o.fastOneParity = true } } - -// WithCustomMatrix causes the encoder to use the manually specified matrix. -// customMatrix represents only the parity chunks. -// customMatrix must have at least ParityShards rows and DataShards columns. -// It can be used for interoperability with libraries which generate -// the matrix differently or to implement more complex coding schemes like LRC -// (locally reconstructible codes). -func WithCustomMatrix(customMatrix [][]byte) Option { - return func(o *options) { - o.customMatrix = customMatrix - } -} - -// WithLeopardGF16 will always use leopard GF16 for encoding, -// even when there is less than 256 shards. -// This will likely improve reconstruction time for some setups. -// This is not compatible with Leopard output for <= 256 shards. -// Note that Leopard places certain restrictions on use see other documentation. -func WithLeopardGF16(enabled bool) Option { - return func(o *options) { - o.withLeopard = &enabled - } -} diff --git a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go index a2e5886..8382e56 100644 --- a/vendor/github.com/klauspost/reedsolomon/reedsolomon.go +++ b/vendor/github.com/klauspost/reedsolomon/reedsolomon.go @@ -8,6 +8,7 @@ // Package reedsolomon enables Erasure Coding in Go // // For usage and examples, see https://github.com/klauspost/reedsolomon +// package reedsolomon import ( @@ -76,24 +77,6 @@ type Encoder interface { // calling the Verify function is likely to fail. ReconstructData(shards [][]byte) error - // ReconstructSome will recreate only requested data shards, if possible. - // - // Given a list of shards, some of which contain data, fills in the - // data shards indicated by true values in the "required" parameter. - // The length of "required" array must be equal to DataShards. - // - // The length of "shards" array must be equal to Shards. - // You indicate that a shard is missing by setting it to nil or zero-length. - // If a shard is zero-length but has sufficient capacity, that memory will - // be used, otherwise a new []byte will be allocated. - // - // If there are too few shards to reconstruct the missing - // ones, ErrTooFewShards will be returned. - // - // As the reconstructed shard set may contain missing parity shards, - // calling the Verify function is likely to fail. - ReconstructSome(shards [][]byte, required []bool) error - // Update parity is use for change a few data shards and update it's parity. // Input 'newDatashards' containing data shards changed. // Input 'shards' containing old data shards (if data shard not changed, it can be nil) and old parity shards. @@ -125,22 +108,6 @@ type Encoder interface { Join(dst io.Writer, shards [][]byte, outSize int) error } -// Extensions is an optional interface. -// All returned instances will support this interface. -type Extensions interface { - // ShardSizeMultiple will return the size the shard sizes must be a multiple of. - ShardSizeMultiple() int - - // DataShards will return the number of data shards. - DataShards() int - - // ParityShards will return the number of parity shards. - ParityShards() int - - // TotalShards will return the total number of shards. - TotalShards() int -} - const ( avx2CodeGenMinSize = 64 avx2CodeGenMinShards = 3 @@ -154,9 +121,9 @@ const ( // distribution of datashards and parity shards. // Construct if using New() type reedSolomon struct { - dataShards int // Number of data shards, should not be modified. - parityShards int // Number of parity shards, should not be modified. - totalShards int // Total number of shards. Calculated, and should not be modified. + DataShards int // Number of data shards, should not be modified. + ParityShards int // Number of parity shards, should not be modified. + Shards int // Total number of shards. Calculated, and should not be modified. m matrix tree *inversionTree parity [][]byte @@ -164,24 +131,6 @@ type reedSolomon struct { mPool sync.Pool } -var _ = Extensions(&reedSolomon{}) - -func (r *reedSolomon) ShardSizeMultiple() int { - return 1 -} - -func (r *reedSolomon) DataShards() int { - return r.dataShards -} - -func (r *reedSolomon) ParityShards() int { - return r.parityShards -} - -func (r *reedSolomon) TotalShards() int { - return r.parityShards -} - // ErrInvShardNum will be returned by New, if you attempt to create // an Encoder with less than one data shard or less than zero parity // shards. @@ -192,9 +141,6 @@ var ErrInvShardNum = errors.New("cannot create Encoder with less than one data s // GF(2^8). var ErrMaxShardNum = errors.New("cannot create Encoder with more than 256 data+parity shards") -// ErrNotSupported is returned when an operation is not supported. -var ErrNotSupported = errors.New("operation not supported") - // buildMatrix creates the matrix to use for encoding, given the // number of data shards and the number of total shards. // @@ -227,87 +173,6 @@ func buildMatrix(dataShards, totalShards int) (matrix, error) { return vm.Multiply(topInv) } -// buildMatrixJerasure creates the same encoding matrix as Jerasure library -// -// The top square of the matrix is guaranteed to be an identity -// matrix, which means that the data shards are unchanged after -// encoding. -func buildMatrixJerasure(dataShards, totalShards int) (matrix, error) { - // Start with a Vandermonde matrix. This matrix would work, - // in theory, but doesn't have the property that the data - // shards are unchanged after encoding. - vm, err := vandermonde(totalShards, dataShards) - if err != nil { - return nil, err - } - - // Jerasure does this: - // first row is always 100..00 - vm[0][0] = 1 - for i := 1; i < dataShards; i++ { - vm[0][i] = 0 - } - // last row is always 000..01 - for i := 0; i < dataShards-1; i++ { - vm[totalShards-1][i] = 0 - } - vm[totalShards-1][dataShards-1] = 1 - - for i := 0; i < dataShards; i++ { - // Find the row where i'th col is not 0 - r := i - for ; r < totalShards && vm[r][i] == 0; r++ { - } - if r != i { - // Swap it with i'th row if not already - t := vm[r] - vm[r] = vm[i] - vm[i] = t - } - // Multiply by the inverted matrix (same as vm.Multiply(vm[0:dataShards].Invert())) - if vm[i][i] != 1 { - // Make vm[i][i] = 1 by dividing the column by vm[i][i] - tmp := galDivide(1, vm[i][i]) - for j := 0; j < totalShards; j++ { - vm[j][i] = galMultiply(vm[j][i], tmp) - } - } - for j := 0; j < dataShards; j++ { - // Make vm[i][j] = 0 where j != i by adding vm[i][j]*vm[.][i] to each column - tmp := vm[i][j] - if j != i && tmp != 0 { - for r := 0; r < totalShards; r++ { - vm[r][j] = galAdd(vm[r][j], galMultiply(tmp, vm[r][i])) - } - } - } - } - - // Make vm[dataShards] row all ones - divide each column j by vm[dataShards][j] - for j := 0; j < dataShards; j++ { - tmp := vm[dataShards][j] - if tmp != 1 { - tmp = galDivide(1, tmp) - for i := dataShards; i < totalShards; i++ { - vm[i][j] = galMultiply(vm[i][j], tmp) - } - } - } - - // Make vm[dataShards...totalShards-1][0] column all ones - divide each row - for i := dataShards + 1; i < totalShards; i++ { - tmp := vm[i][0] - if tmp != 1 { - tmp = galDivide(1, tmp) - for j := 0; j < dataShards; j++ { - vm[i][j] = galMultiply(vm[i][j], tmp) - } - } - } - - return vm, nil -} - // buildMatrixPAR1 creates the matrix to use for encoding according to // the PARv1 spec, given the number of data shards and the number of // total shards. Note that the method they use is buggy, and may lead @@ -387,70 +252,41 @@ func buildXorMatrix(dataShards, totalShards int) (matrix, error) { // New creates a new encoder and initializes it to // the number of data shards and parity shards that // you want to use. You can reuse this encoder. -// Note that the maximum number of total shards is 65536, with some -// restrictions for a total larger than 256: -// -// - Shard sizes must be multiple of 64 -// - The methods Join/Split/Update/EncodeIdx are not supported -// +// Note that the maximum number of total shards is 256. // If no options are supplied, default options are used. func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { - o := defaultOptions - for _, opt := range opts { - opt(&o) - } - - if (dataShards+parityShards > 256 && o.withLeopard == nil) || - (o.withLeopard != nil && *o.withLeopard == true && parityShards > 0) { - return newFF16(dataShards, parityShards, o) - } - if dataShards+parityShards > 256 { - return nil, ErrMaxShardNum - } - r := reedSolomon{ - dataShards: dataShards, - parityShards: parityShards, - totalShards: dataShards + parityShards, - o: o, + DataShards: dataShards, + ParityShards: parityShards, + Shards: dataShards + parityShards, + o: defaultOptions, } + for _, opt := range opts { + opt(&r.o) + } if dataShards <= 0 || parityShards < 0 { return nil, ErrInvShardNum } + if dataShards+parityShards > 256 { + return nil, ErrMaxShardNum + } + if parityShards == 0 { return &r, nil } var err error switch { - case r.o.customMatrix != nil: - if len(r.o.customMatrix) < parityShards { - return nil, errors.New("coding matrix must contain at least parityShards rows") - } - r.m = make([][]byte, r.totalShards) - for i := 0; i < dataShards; i++ { - r.m[i] = make([]byte, dataShards) - r.m[i][i] = 1 - } - for k, row := range r.o.customMatrix { - if len(row) < dataShards { - return nil, errors.New("coding matrix must contain at least dataShards columns") - } - r.m[dataShards+k] = make([]byte, dataShards) - copy(r.m[dataShards+k], row) - } case r.o.fastOneParity && parityShards == 1: - r.m, err = buildXorMatrix(dataShards, r.totalShards) + r.m, err = buildXorMatrix(dataShards, r.Shards) case r.o.useCauchy: - r.m, err = buildMatrixCauchy(dataShards, r.totalShards) + r.m, err = buildMatrixCauchy(dataShards, r.Shards) case r.o.usePAR1Matrix: - r.m, err = buildMatrixPAR1(dataShards, r.totalShards) - case r.o.useJerasureMatrix: - r.m, err = buildMatrixJerasure(dataShards, r.totalShards) + r.m, err = buildMatrixPAR1(dataShards, r.Shards) default: - r.m, err = buildMatrix(dataShards, r.totalShards) + r.m, err = buildMatrix(dataShards, r.Shards) } if err != nil { return nil, err @@ -548,7 +384,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { } if avx2CodeGen && r.o.useAVX2 { - sz := r.dataShards * r.parityShards * 2 * 32 + sz := r.DataShards * r.ParityShards * 2 * 32 r.mPool.New = func() interface{} { return make([]byte, sz) } @@ -568,7 +404,7 @@ var ErrTooFewShards = errors.New("too few shards given") // The parity shards will always be overwritten and the data shards // will remain the same. func (r *reedSolomon) Encode(shards [][]byte) error { - if len(shards) != r.totalShards { + if len(shards) != r.Shards { return ErrTooFewShards } @@ -578,10 +414,10 @@ func (r *reedSolomon) Encode(shards [][]byte) error { } // Get the slice of output buffers. - output := shards[r.dataShards:] + output := shards[r.DataShards:] // Do the coding. - r.codeSomeShards(r.parity, shards[0:r.dataShards], output[:r.parityShards], len(shards[0])) + r.codeSomeShards(r.parity, shards[0:r.DataShards], output[:r.ParityShards], len(shards[0])) return nil } @@ -590,13 +426,13 @@ func (r *reedSolomon) Encode(shards [][]byte) error { // Data shards should only be delivered once. There is no check for this. // The parity shards will always be updated and the data shards will remain the unchanged. func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) error { - if len(parity) != r.parityShards { + if len(parity) != r.ParityShards { return ErrTooFewShards } if len(parity) == 0 { return nil } - if idx < 0 || idx >= r.dataShards { + if idx < 0 || idx >= r.DataShards { return ErrInvShardNum } err := checkShards(parity, false) @@ -615,7 +451,7 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro for start < len(dataShard) { in := dataShard[start:end] - for iRow := 0; iRow < r.parityShards; iRow++ { + for iRow := 0; iRow < r.ParityShards; iRow++ { galMulSliceXor(r.parity[iRow][idx], in, parity[iRow][start:end], &r.o) } start = end @@ -631,11 +467,11 @@ func (r *reedSolomon) EncodeIdx(dataShard []byte, idx int, parity [][]byte) erro var ErrInvalidInput = errors.New("invalid input") func (r *reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error { - if len(shards) != r.totalShards { + if len(shards) != r.Shards { return ErrTooFewShards } - if len(newDatashards) != r.dataShards { + if len(newDatashards) != r.DataShards { return ErrTooFewShards } @@ -654,7 +490,7 @@ func (r *reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error { return ErrInvalidInput } } - for _, p := range shards[r.dataShards:] { + for _, p := range shards[r.DataShards:] { if p == nil { return ErrInvalidInput } @@ -663,10 +499,10 @@ func (r *reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error { shardSize := shardSize(shards) // Get the slice of output buffers. - output := shards[r.dataShards:] + output := shards[r.DataShards:] // Do the coding. - r.updateParityShards(r.parity, shards[0:r.dataShards], newDatashards[0:r.dataShards], output, r.parityShards, shardSize) + r.updateParityShards(r.parity, shards[0:r.DataShards], newDatashards[0:r.DataShards], output, r.ParityShards, shardSize) return nil } @@ -680,7 +516,7 @@ func (r *reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outpu return } - for c := 0; c < r.dataShards; c++ { + for c := 0; c < r.DataShards; c++ { in := newinputs[c] if in == nil { continue @@ -707,7 +543,7 @@ func (r *reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outp } wg.Add(1) go func(start, stop int) { - for c := 0; c < r.dataShards; c++ { + for c := 0; c < r.DataShards; c++ { in := newinputs[c] if in == nil { continue @@ -729,7 +565,7 @@ func (r *reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outp // Verify returns true if the parity shards contain the right data. // The data is the same format as Encode. No data is modified. func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { - if len(shards) != r.totalShards { + if len(shards) != r.Shards { return false, ErrTooFewShards } err := checkShards(shards, false) @@ -738,10 +574,10 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { } // Slice of buffers being checked. - toCheck := shards[r.dataShards:] + toCheck := shards[r.DataShards:] // Do the checking. - return r.checkSomeShards(r.parity, shards[:r.dataShards], toCheck[:r.parityShards], len(shards[0])), nil + return r.checkSomeShards(r.parity, shards[:r.DataShards], toCheck[:r.ParityShards], len(shards[0])), nil } func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { @@ -751,11 +587,11 @@ func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { } // Multiplies a subset of rows from a coding matrix by a full set of -// input totalShards to produce some output totalShards. +// input shards to produce some output shards. // 'matrixRows' is The rows from the matrix to use. // 'inputs' An array of byte arrays, each of which is one input shard. // The number of inputs used is determined by the length of each matrix row. -// outputs Byte arrays where the computed totalShards are stored. +// outputs Byte arrays where the computed shards are stored. // The number of outputs computed, and the // number of matrix rows used, is determined by // outputCount, which is the number of outputs to compute. @@ -1132,7 +968,7 @@ func shardSize(shards [][]byte) int { // Given a list of shards, some of which contain data, fills in the // ones that don't have data. // -// The length of the array must be equal to shards. +// The length of the array must be equal to Shards. // You indicate that a shard is missing by setting it to nil or zero-length. // If a shard is zero-length but has sufficient capacity, that memory will // be used, otherwise a new []byte will be allocated. @@ -1143,7 +979,7 @@ func shardSize(shards [][]byte) int { // The reconstructed shard set is complete, but integrity is not verified. // Use the Verify function to check if data set is ok. func (r *reedSolomon) Reconstruct(shards [][]byte) error { - return r.reconstruct(shards, false, nil) + return r.reconstruct(shards, false) } // ReconstructData will recreate any missing data shards, if possible. @@ -1151,7 +987,7 @@ func (r *reedSolomon) Reconstruct(shards [][]byte) error { // Given a list of shards, some of which contain data, fills in the // data shards that don't have data. // -// The length of the array must be equal to shards. +// The length of the array must be equal to Shards. // You indicate that a shard is missing by setting it to nil or zero-length. // If a shard is zero-length but has sufficient capacity, that memory will // be used, otherwise a new []byte will be allocated. @@ -1162,39 +998,19 @@ func (r *reedSolomon) Reconstruct(shards [][]byte) error { // As the reconstructed shard set may contain missing parity shards, // calling the Verify function is likely to fail. func (r *reedSolomon) ReconstructData(shards [][]byte) error { - return r.reconstruct(shards, true, nil) + return r.reconstruct(shards, true) } -// ReconstructSome will recreate only requested data shards, if possible. +// reconstruct will recreate the missing data shards, and unless +// dataOnly is true, also the missing parity shards // -// Given a list of shards, some of which contain data, fills in the -// data shards indicated by true values in the "required" parameter. -// The length of "required" array must be equal to dataShards. -// -// The length of "shards" array must be equal to shards. -// You indicate that a shard is missing by setting it to nil or zero-length. -// If a shard is zero-length but has sufficient capacity, that memory will -// be used, otherwise a new []byte will be allocated. +// The length of the array must be equal to Shards. +// You indicate that a shard is missing by setting it to nil. // // If there are too few shards to reconstruct the missing // ones, ErrTooFewShards will be returned. -// -// As the reconstructed shard set may contain missing parity shards, -// calling the Verify function is likely to fail. -func (r *reedSolomon) ReconstructSome(shards [][]byte, required []bool) error { - return r.reconstruct(shards, true, required) -} - -// reconstruct will recreate the missing data totalShards, and unless -// dataOnly is true, also the missing parity totalShards -// -// The length of "shards" array must be equal to totalShards. -// You indicate that a shard is missing by setting it to nil. -// -// If there are too few totalShards to reconstruct the missing -// ones, ErrTooFewShards will be returned. -func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []bool) error { - if len(shards) != r.totalShards || required != nil && len(required) < r.dataShards { +func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error { + if len(shards) != r.Shards { return ErrTooFewShards } // Check arguments. @@ -1209,26 +1025,22 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []boo // nothing to do. numberPresent := 0 dataPresent := 0 - missingRequired := 0 - for i := 0; i < r.totalShards; i++ { + for i := 0; i < r.Shards; i++ { if len(shards[i]) != 0 { numberPresent++ - if i < r.dataShards { + if i < r.DataShards { dataPresent++ } - } else if required != nil && required[i] { - missingRequired++ } } - if numberPresent == r.totalShards || dataOnly && dataPresent == r.dataShards || - required != nil && missingRequired == 0 { - // Cool. All of the shards have data. We don't + if numberPresent == r.Shards || dataOnly && dataPresent == r.DataShards { + // Cool. All of the shards data data. We don't // need to do anything. return nil } // More complete sanity check - if numberPresent < r.dataShards { + if numberPresent < r.DataShards { return ErrTooFewShards } @@ -1239,11 +1051,11 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []boo // // Also, create an array of indices of the valid rows we do have // and the invalid rows we don't have up until we have enough valid rows. - subShards := make([][]byte, r.dataShards) - validIndices := make([]int, r.dataShards) + subShards := make([][]byte, r.DataShards) + validIndices := make([]int, r.DataShards) invalidIndices := make([]int, 0) subMatrixRow := 0 - for matrixRow := 0; matrixRow < r.totalShards && subMatrixRow < r.dataShards; matrixRow++ { + for matrixRow := 0; matrixRow < r.Shards && subMatrixRow < r.DataShards; matrixRow++ { if len(shards[matrixRow]) != 0 { subShards[subMatrixRow] = shards[matrixRow] validIndices[subMatrixRow] = matrixRow @@ -1265,9 +1077,9 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []boo // shards that we have and build a square matrix. This // matrix could be used to generate the shards that we have // from the original data. - subMatrix, _ := newMatrix(r.dataShards, r.dataShards) + subMatrix, _ := newMatrix(r.DataShards, r.DataShards) for subMatrixRow, validIndex := range validIndices { - for c := 0; c < r.dataShards; c++ { + for c := 0; c < r.DataShards; c++ { subMatrix[subMatrixRow][c] = r.m[validIndex][c] } } @@ -1283,7 +1095,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []boo // Cache the inverted matrix in the tree for future use keyed on the // indices of the invalid rows. - err = r.tree.InsertInvertedMatrix(invalidIndices, dataDecodeMatrix, r.totalShards) + err = r.tree.InsertInvertedMatrix(invalidIndices, dataDecodeMatrix, r.Shards) if err != nil { return err } @@ -1294,12 +1106,12 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []boo // The input to the coding is all of the shards we actually // have, and the output is the missing data shards. The computation // is done using the special decode matrix we just built. - outputs := make([][]byte, r.parityShards) - matrixRows := make([][]byte, r.parityShards) + outputs := make([][]byte, r.ParityShards) + matrixRows := make([][]byte, r.ParityShards) outputCount := 0 - for iShard := 0; iShard < r.dataShards; iShard++ { - if len(shards[iShard]) == 0 && (required == nil || required[iShard]) { + for iShard := 0; iShard < r.DataShards; iShard++ { + if len(shards[iShard]) == 0 { if cap(shards[iShard]) >= shardSize { shards[iShard] = shards[iShard][0:shardSize] } else { @@ -1324,19 +1136,19 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool, required []boo // any that we just calculated. The output is whichever of the // data shards were missing. outputCount = 0 - for iShard := r.dataShards; iShard < r.totalShards; iShard++ { - if len(shards[iShard]) == 0 && (required == nil || required[iShard]) { + for iShard := r.DataShards; iShard < r.Shards; iShard++ { + if len(shards[iShard]) == 0 { if cap(shards[iShard]) >= shardSize { shards[iShard] = shards[iShard][0:shardSize] } else { shards[iShard] = make([]byte, shardSize) } outputs[outputCount] = shards[iShard] - matrixRows[outputCount] = r.parity[iShard-r.dataShards] + matrixRows[outputCount] = r.parity[iShard-r.DataShards] outputCount++ } } - r.codeSomeShards(matrixRows, shards[:r.dataShards], outputs[:outputCount], shardSize) + r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], shardSize) return nil } @@ -1362,7 +1174,7 @@ func (r *reedSolomon) Split(data []byte) ([][]byte, error) { } dataLen := len(data) // Calculate number of bytes per data shard. - perShard := (len(data) + r.dataShards - 1) / r.dataShards + perShard := (len(data) + r.DataShards - 1) / r.DataShards if cap(data) > len(data) { data = data[:cap(data)] @@ -1370,20 +1182,20 @@ func (r *reedSolomon) Split(data []byte) ([][]byte, error) { // Only allocate memory if necessary var padding []byte - if len(data) < (r.totalShards * perShard) { + if len(data) < (r.Shards * perShard) { // calculate maximum number of full shards in `data` slice fullShards := len(data) / perShard - padding = make([]byte, r.totalShards*perShard-perShard*fullShards) + padding = make([]byte, r.Shards*perShard-perShard*fullShards) copy(padding, data[perShard*fullShards:]) data = data[0 : perShard*fullShards] } else { - for i := dataLen; i < dataLen+r.dataShards; i++ { + for i := dataLen; i < dataLen+r.DataShards; i++ { data[i] = 0 } } // Split into equal-length shards. - dst := make([][]byte, r.totalShards) + dst := make([][]byte, r.Shards) i := 0 for ; i < len(dst) && len(data) >= perShard; i++ { dst[i] = data[:perShard:perShard] @@ -1412,10 +1224,10 @@ var ErrReconstructRequired = errors.New("reconstruction required as one or more // If one or more required data shards are nil, ErrReconstructRequired will be returned. func (r *reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error { // Do we have enough shards? - if len(shards) < r.dataShards { + if len(shards) < r.DataShards { return ErrTooFewShards } - shards = shards[:r.dataShards] + shards = shards[:r.DataShards] // Do we have enough data? size := 0 diff --git a/vendor/github.com/klauspost/reedsolomon/streaming.go b/vendor/github.com/klauspost/reedsolomon/streaming.go index e3aaf00..d048ba0 100644 --- a/vendor/github.com/klauspost/reedsolomon/streaming.go +++ b/vendor/github.com/klauspost/reedsolomon/streaming.go @@ -147,10 +147,6 @@ type rsStream struct { // you want to use. You can reuse this encoder. // Note that the maximum number of data shards is 256. func NewStream(dataShards, parityShards int, o ...Option) (StreamEncoder, error) { - if dataShards+parityShards > 256 { - return nil, ErrMaxShardNum - } - r := rsStream{o: defaultOptions} for _, opt := range o { opt(&r.o) @@ -223,18 +219,18 @@ func (r *rsStream) createSlice() [][]byte { // will be returned. If a parity writer returns an error, a // StreamWriteError will be returned. func (r *rsStream) Encode(data []io.Reader, parity []io.Writer) error { - if len(data) != r.r.dataShards { + if len(data) != r.r.DataShards { return ErrTooFewShards } - if len(parity) != r.r.parityShards { + if len(parity) != r.r.ParityShards { return ErrTooFewShards } all := r.createSlice() defer r.blockPool.Put(all) - in := all[:r.r.dataShards] - out := all[r.r.dataShards:] + in := all[:r.r.DataShards] + out := all[r.r.DataShards:] read := 0 for { @@ -429,7 +425,7 @@ func cWriteShards(out []io.Writer, in [][]byte) error { // If a shard stream returns an error, a StreamReadError type error // will be returned. func (r *rsStream) Verify(shards []io.Reader) (bool, error) { - if len(shards) != r.r.totalShards { + if len(shards) != r.r.Shards { return false, ErrTooFewShards } @@ -476,10 +472,10 @@ var ErrReconstructMismatch = errors.New("valid shards and fill shards are mutual // However its integrity is not automatically verified. // Use the Verify function to check in case the data set is complete. func (r *rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error { - if len(valid) != r.r.totalShards { + if len(valid) != r.r.Shards { return ErrTooFewShards } - if len(fill) != r.r.totalShards { + if len(fill) != r.r.Shards { return ErrTooFewShards } @@ -490,7 +486,7 @@ func (r *rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error { if valid[i] != nil && fill[i] != nil { return ErrReconstructMismatch } - if i >= r.r.dataShards && fill[i] != nil { + if i >= r.r.DataShards && fill[i] != nil { reconDataOnly = false } } @@ -534,12 +530,12 @@ func (r *rsStream) Reconstruct(valid []io.Reader, fill []io.Writer) error { // If the total data size is less than outSize, ErrShortData will be returned. func (r *rsStream) Join(dst io.Writer, shards []io.Reader, outSize int64) error { // Do we have enough shards? - if len(shards) < r.r.dataShards { + if len(shards) < r.r.DataShards { return ErrTooFewShards } // Trim off parity shards if any - shards = shards[:r.r.dataShards] + shards = shards[:r.r.DataShards] for i := range shards { if shards[i] == nil { return StreamReadError{Err: ErrShardNoData, Stream: i} @@ -575,7 +571,7 @@ func (r *rsStream) Split(data io.Reader, dst []io.Writer, size int64) error { if size == 0 { return ErrShortData } - if len(dst) != r.r.dataShards { + if len(dst) != r.r.DataShards { return ErrInvShardNum } @@ -586,10 +582,10 @@ func (r *rsStream) Split(data io.Reader, dst []io.Writer, size int64) error { } // Calculate number of bytes per shard. - perShard := (size + int64(r.r.dataShards) - 1) / int64(r.r.dataShards) + perShard := (size + int64(r.r.DataShards) - 1) / int64(r.r.DataShards) // Pad data to r.Shards*perShard. - padding := make([]byte, (int64(r.r.totalShards)*perShard)-size) + padding := make([]byte, (int64(r.r.Shards)*perShard)-size) data = io.MultiReader(data, bytes.NewBuffer(padding)) // Split into equal-length shards and copy. diff --git a/vendor/golang.org/x/crypto/AUTHORS b/vendor/golang.org/x/crypto/AUTHORS new file mode 100644 index 0000000..2b00ddb --- /dev/null +++ b/vendor/golang.org/x/crypto/AUTHORS @@ -0,0 +1,3 @@ +# This source code refers to The Go Authors for copyright purposes. +# The master list of authors is in the main Go distribution, +# visible at https://tip.golang.org/AUTHORS. diff --git a/vendor/golang.org/x/crypto/CONTRIBUTORS b/vendor/golang.org/x/crypto/CONTRIBUTORS new file mode 100644 index 0000000..1fbd3e9 --- /dev/null +++ b/vendor/golang.org/x/crypto/CONTRIBUTORS @@ -0,0 +1,3 @@ +# This source code was written by the Go contributors. +# The master list of contributors is in the main Go distribution, +# visible at https://tip.golang.org/CONTRIBUTORS. diff --git a/vendor/golang.org/x/crypto/argon2/argon2.go b/vendor/golang.org/x/crypto/argon2/argon2.go index 29f0a2d..b423fea 100644 --- a/vendor/golang.org/x/crypto/argon2/argon2.go +++ b/vendor/golang.org/x/crypto/argon2/argon2.go @@ -11,7 +11,8 @@ // If you aren't sure which function you need, use Argon2id (IDKey) and // the parameter recommendations for your scenario. // -// # Argon2i +// +// Argon2i // // Argon2i (implemented by Key) is the side-channel resistant version of Argon2. // It uses data-independent memory access, which is preferred for password @@ -20,7 +21,8 @@ // parameters (taken from [2]) for non-interactive operations are time=3 and to // use the maximum available memory. // -// # Argon2id +// +// Argon2id // // Argon2id (implemented by IDKey) is a hybrid version of Argon2 combining // Argon2i and Argon2d. It uses data-independent memory access for the first @@ -57,7 +59,7 @@ const ( // For example, you can get a derived key for e.g. AES-256 (which needs a // 32-byte key) by doing: // -// key := argon2.Key([]byte("some password"), salt, 3, 32*1024, 4, 32) +// key := argon2.Key([]byte("some password"), salt, 3, 32*1024, 4, 32) // // The draft RFC recommends[2] time=3, and memory=32*1024 is a sensible number. // If using that amount of memory (32 MB) is not possible in some contexts then @@ -81,7 +83,7 @@ func Key(password, salt []byte, time, memory uint32, threads uint8, keyLen uint3 // For example, you can get a derived key for e.g. AES-256 (which needs a // 32-byte key) by doing: // -// key := argon2.IDKey([]byte("some password"), salt, 1, 64*1024, 4, 32) +// key := argon2.IDKey([]byte("some password"), salt, 1, 64*1024, 4, 32) // // The draft RFC recommends[2] time=1, and memory=64*1024 is a sensible number. // If using that amount of memory (64 MB) is not possible in some contexts then diff --git a/vendor/golang.org/x/crypto/internal/subtle/aliasing.go b/vendor/golang.org/x/crypto/internal/subtle/aliasing.go new file mode 100644 index 0000000..4fad24f --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/subtle/aliasing.go @@ -0,0 +1,33 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !purego +// +build !purego + +// Package subtle implements functions that are often useful in cryptographic +// code but require careful thought to use correctly. +package subtle // import "golang.org/x/crypto/internal/subtle" + +import "unsafe" + +// AnyOverlap reports whether x and y share memory at any (not necessarily +// corresponding) index. The memory beyond the slice length is ignored. +func AnyOverlap(x, y []byte) bool { + return len(x) > 0 && len(y) > 0 && + uintptr(unsafe.Pointer(&x[0])) <= uintptr(unsafe.Pointer(&y[len(y)-1])) && + uintptr(unsafe.Pointer(&y[0])) <= uintptr(unsafe.Pointer(&x[len(x)-1])) +} + +// InexactOverlap reports whether x and y share memory at any non-corresponding +// index. The memory beyond the slice length is ignored. Note that x and y can +// have different lengths and still not have any inexact overlap. +// +// InexactOverlap can be used to implement the requirements of the crypto/cipher +// AEAD, Block, BlockMode and Stream interfaces. +func InexactOverlap(x, y []byte) bool { + if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] { + return false + } + return AnyOverlap(x, y) +} diff --git a/vendor/golang.org/x/crypto/internal/subtle/aliasing_purego.go b/vendor/golang.org/x/crypto/internal/subtle/aliasing_purego.go new file mode 100644 index 0000000..80ccbed --- /dev/null +++ b/vendor/golang.org/x/crypto/internal/subtle/aliasing_purego.go @@ -0,0 +1,36 @@ +// Copyright 2018 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build purego +// +build purego + +// Package subtle implements functions that are often useful in cryptographic +// code but require careful thought to use correctly. +package subtle // import "golang.org/x/crypto/internal/subtle" + +// This is the Google App Engine standard variant based on reflect +// because the unsafe package and cgo are disallowed. + +import "reflect" + +// AnyOverlap reports whether x and y share memory at any (not necessarily +// corresponding) index. The memory beyond the slice length is ignored. +func AnyOverlap(x, y []byte) bool { + return len(x) > 0 && len(y) > 0 && + reflect.ValueOf(&x[0]).Pointer() <= reflect.ValueOf(&y[len(y)-1]).Pointer() && + reflect.ValueOf(&y[0]).Pointer() <= reflect.ValueOf(&x[len(x)-1]).Pointer() +} + +// InexactOverlap reports whether x and y share memory at any non-corresponding +// index. The memory beyond the slice length is ignored. Note that x and y can +// have different lengths and still not have any inexact overlap. +// +// InexactOverlap can be used to implement the requirements of the crypto/cipher +// AEAD, Block, BlockMode and Stream interfaces. +func InexactOverlap(x, y []byte) bool { + if len(x) == 0 || len(y) == 0 || &x[0] == &y[0] { + return false + } + return AnyOverlap(x, y) +} diff --git a/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go b/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go index 904b57e..593f653 100644 --- a/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go +++ b/vendor/golang.org/x/crypto/pbkdf2/pbkdf2.go @@ -32,7 +32,7 @@ import ( // can get a derived key for e.g. AES-256 (which needs a 32-byte key) by // doing: // -// dk := pbkdf2.Key([]byte("some password"), salt, 4096, 32, sha1.New) +// dk := pbkdf2.Key([]byte("some password"), salt, 4096, 32, sha1.New) // // Remember to get a good random salt. At least 8 bytes is recommended by the // RFC. diff --git a/vendor/golang.org/x/crypto/salsa20/salsa20.go b/vendor/golang.org/x/crypto/salsa20/salsa20.go index 8f4f896..6f9bb10 100644 --- a/vendor/golang.org/x/crypto/salsa20/salsa20.go +++ b/vendor/golang.org/x/crypto/salsa20/salsa20.go @@ -24,7 +24,7 @@ package salsa20 // import "golang.org/x/crypto/salsa20" // TODO(agl): implement XORKeyStream12 and XORKeyStream8 - the reduced round variants of Salsa20. import ( - "golang.org/x/crypto/internal/alias" + "golang.org/x/crypto/internal/subtle" "golang.org/x/crypto/salsa20/salsa" ) @@ -35,7 +35,7 @@ func XORKeyStream(out, in []byte, nonce []byte, key *[32]byte) { if len(out) < len(in) { panic("salsa20: output smaller than input") } - if alias.InexactOverlap(out[:len(in)], in) { + if subtle.InexactOverlap(out[:len(in)], in) { panic("salsa20: invalid buffer overlap") } diff --git a/vendor/golang.org/x/crypto/scrypt/scrypt.go b/vendor/golang.org/x/crypto/scrypt/scrypt.go index c971a99..bbe4494 100644 --- a/vendor/golang.org/x/crypto/scrypt/scrypt.go +++ b/vendor/golang.org/x/crypto/scrypt/scrypt.go @@ -186,7 +186,7 @@ func smix(b []byte, r, N int, v, xy []uint32) { // For example, you can get a derived key for e.g. AES-256 (which needs a // 32-byte key) by doing: // -// dk, err := scrypt.Key([]byte("some password"), salt, 32768, 8, 1, 32) +// dk, err := scrypt.Key([]byte("some password"), salt, 32768, 8, 1, 32) // // The recommended parameters for interactive logins as of 2017 are N=32768, r=8 // and p=1. The parameters N, r, and p should be increased as memory latency and diff --git a/vendor/golang.org/x/crypto/sha3/doc.go b/vendor/golang.org/x/crypto/sha3/doc.go index decd8cf..c2fef30 100644 --- a/vendor/golang.org/x/crypto/sha3/doc.go +++ b/vendor/golang.org/x/crypto/sha3/doc.go @@ -8,7 +8,8 @@ // Both types of hash function use the "sponge" construction and the Keccak // permutation. For a detailed specification see http://keccak.noekeon.org/ // -// # Guidance +// +// Guidance // // If you aren't sure what function you need, use SHAKE256 with at least 64 // bytes of output. The SHAKE instances are faster than the SHA3 instances; @@ -18,7 +19,8 @@ // secret key to the input, hash with SHAKE256 and read at least 32 bytes of // output. // -// # Security strengths +// +// Security strengths // // The SHA3-x (x equals 224, 256, 384, or 512) functions have a security // strength against preimage attacks of x bits. Since they only produce "x" @@ -29,7 +31,8 @@ // is used. Requesting more than 64 or 32 bytes of output, respectively, does // not increase the collision-resistance of the SHAKE functions. // -// # The sponge construction +// +// The sponge construction // // A sponge builds a pseudo-random function from a public pseudo-random // permutation, by applying the permutation to a state of "rate + capacity" @@ -47,7 +50,8 @@ // Since the KeccakF-1600 permutation is 1600 bits (200 bytes) wide, this means // that the security strength of a sponge instance is equal to (1600 - bitrate) / 2. // -// # Recommendations +// +// Recommendations // // The SHAKE functions are recommended for most new uses. They can produce // output of arbitrary length. SHAKE256, with an output length of at least diff --git a/vendor/golang.org/x/crypto/sha3/sha3_s390x.go b/vendor/golang.org/x/crypto/sha3/sha3_s390x.go index 63a3edb..4fcfc92 100644 --- a/vendor/golang.org/x/crypto/sha3/sha3_s390x.go +++ b/vendor/golang.org/x/crypto/sha3/sha3_s390x.go @@ -34,13 +34,11 @@ const ( // kimd is a wrapper for the 'compute intermediate message digest' instruction. // src must be a multiple of the rate for the given function code. -// //go:noescape func kimd(function code, chain *[200]byte, src []byte) // klmd is a wrapper for the 'compute last message digest' instruction. // src padding is handled by the instruction. -// //go:noescape func klmd(function code, chain *[200]byte, dst, src []byte) diff --git a/vendor/golang.org/x/sys/AUTHORS b/vendor/golang.org/x/sys/AUTHORS new file mode 100644 index 0000000..15167cd --- /dev/null +++ b/vendor/golang.org/x/sys/AUTHORS @@ -0,0 +1,3 @@ +# This source code refers to The Go Authors for copyright purposes. +# The master list of authors is in the main Go distribution, +# visible at http://tip.golang.org/AUTHORS. diff --git a/vendor/golang.org/x/sys/CONTRIBUTORS b/vendor/golang.org/x/sys/CONTRIBUTORS new file mode 100644 index 0000000..1c4577e --- /dev/null +++ b/vendor/golang.org/x/sys/CONTRIBUTORS @@ -0,0 +1,3 @@ +# This source code was written by the Go contributors. +# The master list of contributors is in the main Go distribution, +# visible at http://tip.golang.org/CONTRIBUTORS. diff --git a/vendor/golang.org/x/sys/cpu/byteorder.go b/vendor/golang.org/x/sys/cpu/byteorder.go index 271055b..dcbb14e 100644 --- a/vendor/golang.org/x/sys/cpu/byteorder.go +++ b/vendor/golang.org/x/sys/cpu/byteorder.go @@ -46,7 +46,6 @@ func hostByteOrder() byteOrder { case "386", "amd64", "amd64p32", "alpha", "arm", "arm64", - "loong64", "mipsle", "mips64le", "mips64p32le", "nios2", "ppc64le", diff --git a/vendor/golang.org/x/sys/cpu/cpu.go b/vendor/golang.org/x/sys/cpu/cpu.go index 83f112c..b56886f 100644 --- a/vendor/golang.org/x/sys/cpu/cpu.go +++ b/vendor/golang.org/x/sys/cpu/cpu.go @@ -106,8 +106,8 @@ var ARM64 struct { // ARM contains the supported CPU features of the current ARM (32-bit) platform. // All feature flags are false if: -// 1. the current platform is not arm, or -// 2. the current operating system is not Linux. +// 1. the current platform is not arm, or +// 2. the current operating system is not Linux. var ARM struct { _ CacheLinePad HasSWP bool // SWP instruction support diff --git a/vendor/golang.org/x/sys/cpu/cpu_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_arm64.go index f3eb993..87dd5e3 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_arm64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_arm64.go @@ -6,10 +6,7 @@ package cpu import "runtime" -// cacheLineSize is used to prevent false sharing of cache lines. -// We choose 128 because Apple Silicon, a.k.a. M1, has 128-byte cache line size. -// It doesn't cost much and is much more future-proof. -const cacheLineSize = 128 +const cacheLineSize = 64 func initOptions() { options = []option{ @@ -44,10 +41,13 @@ func archInit() { switch runtime.GOOS { case "freebsd": readARM64Registers() - case "linux", "netbsd", "openbsd": + case "linux", "netbsd": doinit() default: - // Many platforms don't seem to allow reading these registers. + // Most platforms don't seem to allow reading these registers. + // + // OpenBSD: + // See https://golang.org/issue/31746 setMinimalFeatures() } } diff --git a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c index a4605e6..e363c7d 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c +++ b/vendor/golang.org/x/sys/cpu/cpu_gccgo_x86.c @@ -7,7 +7,6 @@ #include #include -#include // Need to wrap __get_cpuid_count because it's declared as static. int @@ -18,21 +17,27 @@ gccgoGetCpuidCount(uint32_t leaf, uint32_t subleaf, return __get_cpuid_count(leaf, subleaf, eax, ebx, ecx, edx); } -#pragma GCC diagnostic ignored "-Wunknown-pragmas" -#pragma GCC push_options -#pragma GCC target("xsave") -#pragma clang attribute push (__attribute__((target("xsave"))), apply_to=function) - // xgetbv reads the contents of an XCR (Extended Control Register) // specified in the ECX register into registers EDX:EAX. // Currently, the only supported value for XCR is 0. +// +// TODO: Replace with a better alternative: +// +// #include +// +// #pragma GCC target("xsave") +// +// void gccgoXgetbv(uint32_t *eax, uint32_t *edx) { +// unsigned long long x = _xgetbv(0); +// *eax = x & 0xffffffff; +// *edx = (x >> 32) & 0xffffffff; +// } +// +// Note that _xgetbv is defined starting with GCC 8. void gccgoXgetbv(uint32_t *eax, uint32_t *edx) { - uint64_t v = _xgetbv(0); - *eax = v & 0xffffffff; - *edx = v >> 32; + __asm(" xorl %%ecx, %%ecx\n" + " xgetbv" + : "=a"(*eax), "=d"(*edx)); } - -#pragma clang attribute pop -#pragma GCC pop_options diff --git a/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go b/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go index f3cde12..f8c484f 100644 --- a/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go +++ b/vendor/golang.org/x/sys/cpu/cpu_other_arm64.go @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -//go:build !linux && !netbsd && !openbsd && arm64 -// +build !linux,!netbsd,!openbsd,arm64 +//go:build !linux && !netbsd && arm64 +// +build !linux,!netbsd,arm64 package cpu diff --git a/vendor/golang.org/x/sys/unix/endian_little.go b/vendor/golang.org/x/sys/unix/endian_little.go index b0f2bc4..4362f47 100644 --- a/vendor/golang.org/x/sys/unix/endian_little.go +++ b/vendor/golang.org/x/sys/unix/endian_little.go @@ -2,8 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. // -//go:build 386 || amd64 || amd64p32 || alpha || arm || arm64 || loong64 || mipsle || mips64le || mips64p32le || nios2 || ppc64le || riscv || riscv64 || sh -// +build 386 amd64 amd64p32 alpha arm arm64 loong64 mipsle mips64le mips64p32le nios2 ppc64le riscv riscv64 sh +//go:build 386 || amd64 || amd64p32 || alpha || arm || arm64 || mipsle || mips64le || mips64p32le || nios2 || ppc64le || riscv || riscv64 || sh +// +build 386 amd64 amd64p32 alpha arm arm64 mipsle mips64le mips64p32le nios2 ppc64le riscv riscv64 sh package unix diff --git a/vendor/golang.org/x/sys/unix/errors_freebsd_386.go b/vendor/golang.org/x/sys/unix/errors_freebsd_386.go new file mode 100644 index 0000000..761db66 --- /dev/null +++ b/vendor/golang.org/x/sys/unix/errors_freebsd_386.go @@ -0,0 +1,233 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Constants that were deprecated or moved to enums in the FreeBSD headers. Keep +// them here for backwards compatibility. + +package unix + +const ( + DLT_HHDLC = 0x79 + IFF_SMART = 0x20 + IFT_1822 = 0x2 + IFT_A12MPPSWITCH = 0x82 + IFT_AAL2 = 0xbb + IFT_AAL5 = 0x31 + IFT_ADSL = 0x5e + IFT_AFLANE8023 = 0x3b + IFT_AFLANE8025 = 0x3c + IFT_ARAP = 0x58 + IFT_ARCNET = 0x23 + IFT_ARCNETPLUS = 0x24 + IFT_ASYNC = 0x54 + IFT_ATM = 0x25 + IFT_ATMDXI = 0x69 + IFT_ATMFUNI = 0x6a + IFT_ATMIMA = 0x6b + IFT_ATMLOGICAL = 0x50 + IFT_ATMRADIO = 0xbd + IFT_ATMSUBINTERFACE = 0x86 + IFT_ATMVCIENDPT = 0xc2 + IFT_ATMVIRTUAL = 0x95 + IFT_BGPPOLICYACCOUNTING = 0xa2 + IFT_BSC = 0x53 + IFT_CCTEMUL = 0x3d + IFT_CEPT = 0x13 + IFT_CES = 0x85 + IFT_CHANNEL = 0x46 + IFT_CNR = 0x55 + IFT_COFFEE = 0x84 + IFT_COMPOSITELINK = 0x9b + IFT_DCN = 0x8d + IFT_DIGITALPOWERLINE = 0x8a + IFT_DIGITALWRAPPEROVERHEADCHANNEL = 0xba + IFT_DLSW = 0x4a + IFT_DOCSCABLEDOWNSTREAM = 0x80 + IFT_DOCSCABLEMACLAYER = 0x7f + IFT_DOCSCABLEUPSTREAM = 0x81 + IFT_DS0 = 0x51 + IFT_DS0BUNDLE = 0x52 + IFT_DS1FDL = 0xaa + IFT_DS3 = 0x1e + IFT_DTM = 0x8c + IFT_DVBASILN = 0xac + IFT_DVBASIOUT = 0xad + IFT_DVBRCCDOWNSTREAM = 0x93 + IFT_DVBRCCMACLAYER = 0x92 + IFT_DVBRCCUPSTREAM = 0x94 + IFT_ENC = 0xf4 + IFT_EON = 0x19 + IFT_EPLRS = 0x57 + IFT_ESCON = 0x49 + IFT_ETHER = 0x6 + IFT_FAITH = 0xf2 + IFT_FAST = 0x7d + IFT_FASTETHER = 0x3e + IFT_FASTETHERFX = 0x45 + IFT_FDDI = 0xf + IFT_FIBRECHANNEL = 0x38 + IFT_FRAMERELAYINTERCONNECT = 0x3a + IFT_FRAMERELAYMPI = 0x5c + IFT_FRDLCIENDPT = 0xc1 + IFT_FRELAY = 0x20 + IFT_FRELAYDCE = 0x2c + IFT_FRF16MFRBUNDLE = 0xa3 + IFT_FRFORWARD = 0x9e + IFT_G703AT2MB = 0x43 + IFT_G703AT64K = 0x42 + IFT_GIF = 0xf0 + IFT_GIGABITETHERNET = 0x75 + IFT_GR303IDT = 0xb2 + IFT_GR303RDT = 0xb1 + IFT_H323GATEKEEPER = 0xa4 + IFT_H323PROXY = 0xa5 + IFT_HDH1822 = 0x3 + IFT_HDLC = 0x76 + IFT_HDSL2 = 0xa8 + IFT_HIPERLAN2 = 0xb7 + IFT_HIPPI = 0x2f + IFT_HIPPIINTERFACE = 0x39 + IFT_HOSTPAD = 0x5a + IFT_HSSI = 0x2e + IFT_HY = 0xe + IFT_IBM370PARCHAN = 0x48 + IFT_IDSL = 0x9a + IFT_IEEE80211 = 0x47 + IFT_IEEE80212 = 0x37 + IFT_IEEE8023ADLAG = 0xa1 + IFT_IFGSN = 0x91 + IFT_IMT = 0xbe + IFT_INTERLEAVE = 0x7c + IFT_IP = 0x7e + IFT_IPFORWARD = 0x8e + IFT_IPOVERATM = 0x72 + IFT_IPOVERCDLC = 0x6d + IFT_IPOVERCLAW = 0x6e + IFT_IPSWITCH = 0x4e + IFT_IPXIP = 0xf9 + IFT_ISDN = 0x3f + IFT_ISDNBASIC = 0x14 + IFT_ISDNPRIMARY = 0x15 + IFT_ISDNS = 0x4b + IFT_ISDNU = 0x4c + IFT_ISO88022LLC = 0x29 + IFT_ISO88023 = 0x7 + IFT_ISO88024 = 0x8 + IFT_ISO88025 = 0x9 + IFT_ISO88025CRFPINT = 0x62 + IFT_ISO88025DTR = 0x56 + IFT_ISO88025FIBER = 0x73 + IFT_ISO88026 = 0xa + IFT_ISUP = 0xb3 + IFT_L3IPXVLAN = 0x89 + IFT_LAPB = 0x10 + IFT_LAPD = 0x4d + IFT_LAPF = 0x77 + IFT_LOCALTALK = 0x2a + IFT_LOOP = 0x18 + IFT_MEDIAMAILOVERIP = 0x8b + IFT_MFSIGLINK = 0xa7 + IFT_MIOX25 = 0x26 + IFT_MODEM = 0x30 + IFT_MPC = 0x71 + IFT_MPLS = 0xa6 + IFT_MPLSTUNNEL = 0x96 + IFT_MSDSL = 0x8f + IFT_MVL = 0xbf + IFT_MYRINET = 0x63 + IFT_NFAS = 0xaf + IFT_NSIP = 0x1b + IFT_OPTICALCHANNEL = 0xc3 + IFT_OPTICALTRANSPORT = 0xc4 + IFT_OTHER = 0x1 + IFT_P10 = 0xc + IFT_P80 = 0xd + IFT_PARA = 0x22 + IFT_PFLOG = 0xf6 + IFT_PFSYNC = 0xf7 + IFT_PLC = 0xae + IFT_POS = 0xab + IFT_PPPMULTILINKBUNDLE = 0x6c + IFT_PROPBWAP2MP = 0xb8 + IFT_PROPCNLS = 0x59 + IFT_PROPDOCSWIRELESSDOWNSTREAM = 0xb5 + IFT_PROPDOCSWIRELESSMACLAYER = 0xb4 + IFT_PROPDOCSWIRELESSUPSTREAM = 0xb6 + IFT_PROPMUX = 0x36 + IFT_PROPWIRELESSP2P = 0x9d + IFT_PTPSERIAL = 0x16 + IFT_PVC = 0xf1 + IFT_QLLC = 0x44 + IFT_RADIOMAC = 0xbc + IFT_RADSL = 0x5f + IFT_REACHDSL = 0xc0 + IFT_RFC1483 = 0x9f + IFT_RS232 = 0x21 + IFT_RSRB = 0x4f + IFT_SDLC = 0x11 + IFT_SDSL = 0x60 + IFT_SHDSL = 0xa9 + IFT_SIP = 0x1f + IFT_SLIP = 0x1c + IFT_SMDSDXI = 0x2b + IFT_SMDSICIP = 0x34 + IFT_SONET = 0x27 + IFT_SONETOVERHEADCHANNEL = 0xb9 + IFT_SONETPATH = 0x32 + IFT_SONETVT = 0x33 + IFT_SRP = 0x97 + IFT_SS7SIGLINK = 0x9c + IFT_STACKTOSTACK = 0x6f + IFT_STARLAN = 0xb + IFT_STF = 0xd7 + IFT_T1 = 0x12 + IFT_TDLC = 0x74 + IFT_TERMPAD = 0x5b + IFT_TR008 = 0xb0 + IFT_TRANSPHDLC = 0x7b + IFT_TUNNEL = 0x83 + IFT_ULTRA = 0x1d + IFT_USB = 0xa0 + IFT_V11 = 0x40 + IFT_V35 = 0x2d + IFT_V36 = 0x41 + IFT_V37 = 0x78 + IFT_VDSL = 0x61 + IFT_VIRTUALIPADDRESS = 0x70 + IFT_VOICEEM = 0x64 + IFT_VOICEENCAP = 0x67 + IFT_VOICEFXO = 0x65 + IFT_VOICEFXS = 0x66 + IFT_VOICEOVERATM = 0x98 + IFT_VOICEOVERFRAMERELAY = 0x99 + IFT_VOICEOVERIP = 0x68 + IFT_X213 = 0x5d + IFT_X25 = 0x5 + IFT_X25DDN = 0x4 + IFT_X25HUNTGROUP = 0x7a + IFT_X25MLP = 0x79 + IFT_X25PLE = 0x28 + IFT_XETHER = 0x1a + IPPROTO_MAXID = 0x34 + IPV6_FAITH = 0x1d + IPV6_MIN_MEMBERSHIPS = 0x1f + IP_FAITH = 0x16 + IP_MAX_SOURCE_FILTER = 0x400 + IP_MIN_MEMBERSHIPS = 0x1f + MAP_NORESERVE = 0x40 + MAP_RENAME = 0x20 + NET_RT_MAXID = 0x6 + RTF_PRCLONING = 0x10000 + RTM_OLDADD = 0x9 + RTM_OLDDEL = 0xa + RT_CACHING_CONTEXT = 0x1 + RT_NORTREF = 0x2 + SIOCADDRT = 0x8030720a + SIOCALIFADDR = 0x8118691b + SIOCDELRT = 0x8030720b + SIOCDLIFADDR = 0x8118691d + SIOCGLIFADDR = 0xc118691c + SIOCGLIFPHYADDR = 0xc118694b + SIOCSLIFPHYADDR = 0x8118694a +) diff --git a/vendor/golang.org/x/sys/unix/errors_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/errors_freebsd_amd64.go new file mode 100644 index 0000000..070f44b --- /dev/null +++ b/vendor/golang.org/x/sys/unix/errors_freebsd_amd64.go @@ -0,0 +1,233 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Constants that were deprecated or moved to enums in the FreeBSD headers. Keep +// them here for backwards compatibility. + +package unix + +const ( + DLT_HHDLC = 0x79 + IFF_SMART = 0x20 + IFT_1822 = 0x2 + IFT_A12MPPSWITCH = 0x82 + IFT_AAL2 = 0xbb + IFT_AAL5 = 0x31 + IFT_ADSL = 0x5e + IFT_AFLANE8023 = 0x3b + IFT_AFLANE8025 = 0x3c + IFT_ARAP = 0x58 + IFT_ARCNET = 0x23 + IFT_ARCNETPLUS = 0x24 + IFT_ASYNC = 0x54 + IFT_ATM = 0x25 + IFT_ATMDXI = 0x69 + IFT_ATMFUNI = 0x6a + IFT_ATMIMA = 0x6b + IFT_ATMLOGICAL = 0x50 + IFT_ATMRADIO = 0xbd + IFT_ATMSUBINTERFACE = 0x86 + IFT_ATMVCIENDPT = 0xc2 + IFT_ATMVIRTUAL = 0x95 + IFT_BGPPOLICYACCOUNTING = 0xa2 + IFT_BSC = 0x53 + IFT_CCTEMUL = 0x3d + IFT_CEPT = 0x13 + IFT_CES = 0x85 + IFT_CHANNEL = 0x46 + IFT_CNR = 0x55 + IFT_COFFEE = 0x84 + IFT_COMPOSITELINK = 0x9b + IFT_DCN = 0x8d + IFT_DIGITALPOWERLINE = 0x8a + IFT_DIGITALWRAPPEROVERHEADCHANNEL = 0xba + IFT_DLSW = 0x4a + IFT_DOCSCABLEDOWNSTREAM = 0x80 + IFT_DOCSCABLEMACLAYER = 0x7f + IFT_DOCSCABLEUPSTREAM = 0x81 + IFT_DS0 = 0x51 + IFT_DS0BUNDLE = 0x52 + IFT_DS1FDL = 0xaa + IFT_DS3 = 0x1e + IFT_DTM = 0x8c + IFT_DVBASILN = 0xac + IFT_DVBASIOUT = 0xad + IFT_DVBRCCDOWNSTREAM = 0x93 + IFT_DVBRCCMACLAYER = 0x92 + IFT_DVBRCCUPSTREAM = 0x94 + IFT_ENC = 0xf4 + IFT_EON = 0x19 + IFT_EPLRS = 0x57 + IFT_ESCON = 0x49 + IFT_ETHER = 0x6 + IFT_FAITH = 0xf2 + IFT_FAST = 0x7d + IFT_FASTETHER = 0x3e + IFT_FASTETHERFX = 0x45 + IFT_FDDI = 0xf + IFT_FIBRECHANNEL = 0x38 + IFT_FRAMERELAYINTERCONNECT = 0x3a + IFT_FRAMERELAYMPI = 0x5c + IFT_FRDLCIENDPT = 0xc1 + IFT_FRELAY = 0x20 + IFT_FRELAYDCE = 0x2c + IFT_FRF16MFRBUNDLE = 0xa3 + IFT_FRFORWARD = 0x9e + IFT_G703AT2MB = 0x43 + IFT_G703AT64K = 0x42 + IFT_GIF = 0xf0 + IFT_GIGABITETHERNET = 0x75 + IFT_GR303IDT = 0xb2 + IFT_GR303RDT = 0xb1 + IFT_H323GATEKEEPER = 0xa4 + IFT_H323PROXY = 0xa5 + IFT_HDH1822 = 0x3 + IFT_HDLC = 0x76 + IFT_HDSL2 = 0xa8 + IFT_HIPERLAN2 = 0xb7 + IFT_HIPPI = 0x2f + IFT_HIPPIINTERFACE = 0x39 + IFT_HOSTPAD = 0x5a + IFT_HSSI = 0x2e + IFT_HY = 0xe + IFT_IBM370PARCHAN = 0x48 + IFT_IDSL = 0x9a + IFT_IEEE80211 = 0x47 + IFT_IEEE80212 = 0x37 + IFT_IEEE8023ADLAG = 0xa1 + IFT_IFGSN = 0x91 + IFT_IMT = 0xbe + IFT_INTERLEAVE = 0x7c + IFT_IP = 0x7e + IFT_IPFORWARD = 0x8e + IFT_IPOVERATM = 0x72 + IFT_IPOVERCDLC = 0x6d + IFT_IPOVERCLAW = 0x6e + IFT_IPSWITCH = 0x4e + IFT_IPXIP = 0xf9 + IFT_ISDN = 0x3f + IFT_ISDNBASIC = 0x14 + IFT_ISDNPRIMARY = 0x15 + IFT_ISDNS = 0x4b + IFT_ISDNU = 0x4c + IFT_ISO88022LLC = 0x29 + IFT_ISO88023 = 0x7 + IFT_ISO88024 = 0x8 + IFT_ISO88025 = 0x9 + IFT_ISO88025CRFPINT = 0x62 + IFT_ISO88025DTR = 0x56 + IFT_ISO88025FIBER = 0x73 + IFT_ISO88026 = 0xa + IFT_ISUP = 0xb3 + IFT_L3IPXVLAN = 0x89 + IFT_LAPB = 0x10 + IFT_LAPD = 0x4d + IFT_LAPF = 0x77 + IFT_LOCALTALK = 0x2a + IFT_LOOP = 0x18 + IFT_MEDIAMAILOVERIP = 0x8b + IFT_MFSIGLINK = 0xa7 + IFT_MIOX25 = 0x26 + IFT_MODEM = 0x30 + IFT_MPC = 0x71 + IFT_MPLS = 0xa6 + IFT_MPLSTUNNEL = 0x96 + IFT_MSDSL = 0x8f + IFT_MVL = 0xbf + IFT_MYRINET = 0x63 + IFT_NFAS = 0xaf + IFT_NSIP = 0x1b + IFT_OPTICALCHANNEL = 0xc3 + IFT_OPTICALTRANSPORT = 0xc4 + IFT_OTHER = 0x1 + IFT_P10 = 0xc + IFT_P80 = 0xd + IFT_PARA = 0x22 + IFT_PFLOG = 0xf6 + IFT_PFSYNC = 0xf7 + IFT_PLC = 0xae + IFT_POS = 0xab + IFT_PPPMULTILINKBUNDLE = 0x6c + IFT_PROPBWAP2MP = 0xb8 + IFT_PROPCNLS = 0x59 + IFT_PROPDOCSWIRELESSDOWNSTREAM = 0xb5 + IFT_PROPDOCSWIRELESSMACLAYER = 0xb4 + IFT_PROPDOCSWIRELESSUPSTREAM = 0xb6 + IFT_PROPMUX = 0x36 + IFT_PROPWIRELESSP2P = 0x9d + IFT_PTPSERIAL = 0x16 + IFT_PVC = 0xf1 + IFT_QLLC = 0x44 + IFT_RADIOMAC = 0xbc + IFT_RADSL = 0x5f + IFT_REACHDSL = 0xc0 + IFT_RFC1483 = 0x9f + IFT_RS232 = 0x21 + IFT_RSRB = 0x4f + IFT_SDLC = 0x11 + IFT_SDSL = 0x60 + IFT_SHDSL = 0xa9 + IFT_SIP = 0x1f + IFT_SLIP = 0x1c + IFT_SMDSDXI = 0x2b + IFT_SMDSICIP = 0x34 + IFT_SONET = 0x27 + IFT_SONETOVERHEADCHANNEL = 0xb9 + IFT_SONETPATH = 0x32 + IFT_SONETVT = 0x33 + IFT_SRP = 0x97 + IFT_SS7SIGLINK = 0x9c + IFT_STACKTOSTACK = 0x6f + IFT_STARLAN = 0xb + IFT_STF = 0xd7 + IFT_T1 = 0x12 + IFT_TDLC = 0x74 + IFT_TERMPAD = 0x5b + IFT_TR008 = 0xb0 + IFT_TRANSPHDLC = 0x7b + IFT_TUNNEL = 0x83 + IFT_ULTRA = 0x1d + IFT_USB = 0xa0 + IFT_V11 = 0x40 + IFT_V35 = 0x2d + IFT_V36 = 0x41 + IFT_V37 = 0x78 + IFT_VDSL = 0x61 + IFT_VIRTUALIPADDRESS = 0x70 + IFT_VOICEEM = 0x64 + IFT_VOICEENCAP = 0x67 + IFT_VOICEFXO = 0x65 + IFT_VOICEFXS = 0x66 + IFT_VOICEOVERATM = 0x98 + IFT_VOICEOVERFRAMERELAY = 0x99 + IFT_VOICEOVERIP = 0x68 + IFT_X213 = 0x5d + IFT_X25 = 0x5 + IFT_X25DDN = 0x4 + IFT_X25HUNTGROUP = 0x7a + IFT_X25MLP = 0x79 + IFT_X25PLE = 0x28 + IFT_XETHER = 0x1a + IPPROTO_MAXID = 0x34 + IPV6_FAITH = 0x1d + IPV6_MIN_MEMBERSHIPS = 0x1f + IP_FAITH = 0x16 + IP_MAX_SOURCE_FILTER = 0x400 + IP_MIN_MEMBERSHIPS = 0x1f + MAP_NORESERVE = 0x40 + MAP_RENAME = 0x20 + NET_RT_MAXID = 0x6 + RTF_PRCLONING = 0x10000 + RTM_OLDADD = 0x9 + RTM_OLDDEL = 0xa + RT_CACHING_CONTEXT = 0x1 + RT_NORTREF = 0x2 + SIOCADDRT = 0x8040720a + SIOCALIFADDR = 0x8118691b + SIOCDELRT = 0x8040720b + SIOCDLIFADDR = 0x8118691d + SIOCGLIFADDR = 0xc118691c + SIOCGLIFPHYADDR = 0xc118694b + SIOCSLIFPHYADDR = 0x8118694a +) diff --git a/vendor/golang.org/x/sys/unix/errors_freebsd_arm.go b/vendor/golang.org/x/sys/unix/errors_freebsd_arm.go new file mode 100644 index 0000000..856dca3 --- /dev/null +++ b/vendor/golang.org/x/sys/unix/errors_freebsd_arm.go @@ -0,0 +1,226 @@ +// Copyright 2017 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package unix + +const ( + IFT_1822 = 0x2 + IFT_A12MPPSWITCH = 0x82 + IFT_AAL2 = 0xbb + IFT_AAL5 = 0x31 + IFT_ADSL = 0x5e + IFT_AFLANE8023 = 0x3b + IFT_AFLANE8025 = 0x3c + IFT_ARAP = 0x58 + IFT_ARCNET = 0x23 + IFT_ARCNETPLUS = 0x24 + IFT_ASYNC = 0x54 + IFT_ATM = 0x25 + IFT_ATMDXI = 0x69 + IFT_ATMFUNI = 0x6a + IFT_ATMIMA = 0x6b + IFT_ATMLOGICAL = 0x50 + IFT_ATMRADIO = 0xbd + IFT_ATMSUBINTERFACE = 0x86 + IFT_ATMVCIENDPT = 0xc2 + IFT_ATMVIRTUAL = 0x95 + IFT_BGPPOLICYACCOUNTING = 0xa2 + IFT_BSC = 0x53 + IFT_CCTEMUL = 0x3d + IFT_CEPT = 0x13 + IFT_CES = 0x85 + IFT_CHANNEL = 0x46 + IFT_CNR = 0x55 + IFT_COFFEE = 0x84 + IFT_COMPOSITELINK = 0x9b + IFT_DCN = 0x8d + IFT_DIGITALPOWERLINE = 0x8a + IFT_DIGITALWRAPPEROVERHEADCHANNEL = 0xba + IFT_DLSW = 0x4a + IFT_DOCSCABLEDOWNSTREAM = 0x80 + IFT_DOCSCABLEMACLAYER = 0x7f + IFT_DOCSCABLEUPSTREAM = 0x81 + IFT_DS0 = 0x51 + IFT_DS0BUNDLE = 0x52 + IFT_DS1FDL = 0xaa + IFT_DS3 = 0x1e + IFT_DTM = 0x8c + IFT_DVBASILN = 0xac + IFT_DVBASIOUT = 0xad + IFT_DVBRCCDOWNSTREAM = 0x93 + IFT_DVBRCCMACLAYER = 0x92 + IFT_DVBRCCUPSTREAM = 0x94 + IFT_ENC = 0xf4 + IFT_EON = 0x19 + IFT_EPLRS = 0x57 + IFT_ESCON = 0x49 + IFT_ETHER = 0x6 + IFT_FAST = 0x7d + IFT_FASTETHER = 0x3e + IFT_FASTETHERFX = 0x45 + IFT_FDDI = 0xf + IFT_FIBRECHANNEL = 0x38 + IFT_FRAMERELAYINTERCONNECT = 0x3a + IFT_FRAMERELAYMPI = 0x5c + IFT_FRDLCIENDPT = 0xc1 + IFT_FRELAY = 0x20 + IFT_FRELAYDCE = 0x2c + IFT_FRF16MFRBUNDLE = 0xa3 + IFT_FRFORWARD = 0x9e + IFT_G703AT2MB = 0x43 + IFT_G703AT64K = 0x42 + IFT_GIF = 0xf0 + IFT_GIGABITETHERNET = 0x75 + IFT_GR303IDT = 0xb2 + IFT_GR303RDT = 0xb1 + IFT_H323GATEKEEPER = 0xa4 + IFT_H323PROXY = 0xa5 + IFT_HDH1822 = 0x3 + IFT_HDLC = 0x76 + IFT_HDSL2 = 0xa8 + IFT_HIPERLAN2 = 0xb7 + IFT_HIPPI = 0x2f + IFT_HIPPIINTERFACE = 0x39 + IFT_HOSTPAD = 0x5a + IFT_HSSI = 0x2e + IFT_HY = 0xe + IFT_IBM370PARCHAN = 0x48 + IFT_IDSL = 0x9a + IFT_IEEE80211 = 0x47 + IFT_IEEE80212 = 0x37 + IFT_IEEE8023ADLAG = 0xa1 + IFT_IFGSN = 0x91 + IFT_IMT = 0xbe + IFT_INTERLEAVE = 0x7c + IFT_IP = 0x7e + IFT_IPFORWARD = 0x8e + IFT_IPOVERATM = 0x72 + IFT_IPOVERCDLC = 0x6d + IFT_IPOVERCLAW = 0x6e + IFT_IPSWITCH = 0x4e + IFT_ISDN = 0x3f + IFT_ISDNBASIC = 0x14 + IFT_ISDNPRIMARY = 0x15 + IFT_ISDNS = 0x4b + IFT_ISDNU = 0x4c + IFT_ISO88022LLC = 0x29 + IFT_ISO88023 = 0x7 + IFT_ISO88024 = 0x8 + IFT_ISO88025 = 0x9 + IFT_ISO88025CRFPINT = 0x62 + IFT_ISO88025DTR = 0x56 + IFT_ISO88025FIBER = 0x73 + IFT_ISO88026 = 0xa + IFT_ISUP = 0xb3 + IFT_L3IPXVLAN = 0x89 + IFT_LAPB = 0x10 + IFT_LAPD = 0x4d + IFT_LAPF = 0x77 + IFT_LOCALTALK = 0x2a + IFT_LOOP = 0x18 + IFT_MEDIAMAILOVERIP = 0x8b + IFT_MFSIGLINK = 0xa7 + IFT_MIOX25 = 0x26 + IFT_MODEM = 0x30 + IFT_MPC = 0x71 + IFT_MPLS = 0xa6 + IFT_MPLSTUNNEL = 0x96 + IFT_MSDSL = 0x8f + IFT_MVL = 0xbf + IFT_MYRINET = 0x63 + IFT_NFAS = 0xaf + IFT_NSIP = 0x1b + IFT_OPTICALCHANNEL = 0xc3 + IFT_OPTICALTRANSPORT = 0xc4 + IFT_OTHER = 0x1 + IFT_P10 = 0xc + IFT_P80 = 0xd + IFT_PARA = 0x22 + IFT_PFLOG = 0xf6 + IFT_PFSYNC = 0xf7 + IFT_PLC = 0xae + IFT_POS = 0xab + IFT_PPPMULTILINKBUNDLE = 0x6c + IFT_PROPBWAP2MP = 0xb8 + IFT_PROPCNLS = 0x59 + IFT_PROPDOCSWIRELESSDOWNSTREAM = 0xb5 + IFT_PROPDOCSWIRELESSMACLAYER = 0xb4 + IFT_PROPDOCSWIRELESSUPSTREAM = 0xb6 + IFT_PROPMUX = 0x36 + IFT_PROPWIRELESSP2P = 0x9d + IFT_PTPSERIAL = 0x16 + IFT_PVC = 0xf1 + IFT_QLLC = 0x44 + IFT_RADIOMAC = 0xbc + IFT_RADSL = 0x5f + IFT_REACHDSL = 0xc0 + IFT_RFC1483 = 0x9f + IFT_RS232 = 0x21 + IFT_RSRB = 0x4f + IFT_SDLC = 0x11 + IFT_SDSL = 0x60 + IFT_SHDSL = 0xa9 + IFT_SIP = 0x1f + IFT_SLIP = 0x1c + IFT_SMDSDXI = 0x2b + IFT_SMDSICIP = 0x34 + IFT_SONET = 0x27 + IFT_SONETOVERHEADCHANNEL = 0xb9 + IFT_SONETPATH = 0x32 + IFT_SONETVT = 0x33 + IFT_SRP = 0x97 + IFT_SS7SIGLINK = 0x9c + IFT_STACKTOSTACK = 0x6f + IFT_STARLAN = 0xb + IFT_STF = 0xd7 + IFT_T1 = 0x12 + IFT_TDLC = 0x74 + IFT_TERMPAD = 0x5b + IFT_TR008 = 0xb0 + IFT_TRANSPHDLC = 0x7b + IFT_TUNNEL = 0x83 + IFT_ULTRA = 0x1d + IFT_USB = 0xa0 + IFT_V11 = 0x40 + IFT_V35 = 0x2d + IFT_V36 = 0x41 + IFT_V37 = 0x78 + IFT_VDSL = 0x61 + IFT_VIRTUALIPADDRESS = 0x70 + IFT_VOICEEM = 0x64 + IFT_VOICEENCAP = 0x67 + IFT_VOICEFXO = 0x65 + IFT_VOICEFXS = 0x66 + IFT_VOICEOVERATM = 0x98 + IFT_VOICEOVERFRAMERELAY = 0x99 + IFT_VOICEOVERIP = 0x68 + IFT_X213 = 0x5d + IFT_X25 = 0x5 + IFT_X25DDN = 0x4 + IFT_X25HUNTGROUP = 0x7a + IFT_X25MLP = 0x79 + IFT_X25PLE = 0x28 + IFT_XETHER = 0x1a + + // missing constants on FreeBSD-11.1-RELEASE, copied from old values in ztypes_freebsd_arm.go + IFF_SMART = 0x20 + IFT_FAITH = 0xf2 + IFT_IPXIP = 0xf9 + IPPROTO_MAXID = 0x34 + IPV6_FAITH = 0x1d + IP_FAITH = 0x16 + MAP_NORESERVE = 0x40 + MAP_RENAME = 0x20 + NET_RT_MAXID = 0x6 + RTF_PRCLONING = 0x10000 + RTM_OLDADD = 0x9 + RTM_OLDDEL = 0xa + SIOCADDRT = 0x8030720a + SIOCALIFADDR = 0x8118691b + SIOCDELRT = 0x8030720b + SIOCDLIFADDR = 0x8118691d + SIOCGLIFADDR = 0xc118691c + SIOCGLIFPHYADDR = 0xc118694b + SIOCSLIFPHYADDR = 0x8118694a +) diff --git a/vendor/golang.org/x/sys/unix/errors_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/errors_freebsd_arm64.go new file mode 100644 index 0000000..946dcf3 --- /dev/null +++ b/vendor/golang.org/x/sys/unix/errors_freebsd_arm64.go @@ -0,0 +1,17 @@ +// Copyright 2020 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Constants that were deprecated or moved to enums in the FreeBSD headers. Keep +// them here for backwards compatibility. + +package unix + +const ( + DLT_HHDLC = 0x79 + IPV6_MIN_MEMBERSHIPS = 0x1f + IP_MAX_SOURCE_FILTER = 0x400 + IP_MIN_MEMBERSHIPS = 0x1f + RT_CACHING_CONTEXT = 0x1 + RT_NORTREF = 0x2 +) diff --git a/vendor/golang.org/x/sys/unix/ifreq_linux.go b/vendor/golang.org/x/sys/unix/ifreq_linux.go index 15721a5..934af31 100644 --- a/vendor/golang.org/x/sys/unix/ifreq_linux.go +++ b/vendor/golang.org/x/sys/unix/ifreq_linux.go @@ -8,6 +8,7 @@ package unix import ( + "bytes" "unsafe" ) @@ -44,7 +45,13 @@ func NewIfreq(name string) (*Ifreq, error) { // Name returns the interface name associated with the Ifreq. func (ifr *Ifreq) Name() string { - return ByteSliceToString(ifr.raw.Ifrn[:]) + // BytePtrToString requires a NULL terminator or the program may crash. If + // one is not present, just return the empty string. + if !bytes.Contains(ifr.raw.Ifrn[:], []byte{0x00}) { + return "" + } + + return BytePtrToString(&ifr.raw.Ifrn[0]) } // According to netdevice(7), only AF_INET addresses are returned for numerous diff --git a/vendor/golang.org/x/sys/unix/ioctl_linux.go b/vendor/golang.org/x/sys/unix/ioctl_linux.go index 0d12c08..884430b 100644 --- a/vendor/golang.org/x/sys/unix/ioctl_linux.go +++ b/vendor/golang.org/x/sys/unix/ioctl_linux.go @@ -4,7 +4,9 @@ package unix -import "unsafe" +import ( + "unsafe" +) // IoctlRetInt performs an ioctl operation specified by req on a device // associated with opened file descriptor fd, and returns a non-negative @@ -215,19 +217,3 @@ func IoctlKCMAttach(fd int, info KCMAttach) error { func IoctlKCMUnattach(fd int, info KCMUnattach) error { return ioctlPtr(fd, SIOCKCMUNATTACH, unsafe.Pointer(&info)) } - -// IoctlLoopGetStatus64 gets the status of the loop device associated with the -// file descriptor fd using the LOOP_GET_STATUS64 operation. -func IoctlLoopGetStatus64(fd int) (*LoopInfo64, error) { - var value LoopInfo64 - if err := ioctlPtr(fd, LOOP_GET_STATUS64, unsafe.Pointer(&value)); err != nil { - return nil, err - } - return &value, nil -} - -// IoctlLoopSetStatus64 sets the status of the loop device associated with the -// file descriptor fd using the LOOP_SET_STATUS64 operation. -func IoctlLoopSetStatus64(fd int, value *LoopInfo64) error { - return ioctlPtr(fd, LOOP_SET_STATUS64, unsafe.Pointer(value)) -} diff --git a/vendor/golang.org/x/sys/unix/mkall.sh b/vendor/golang.org/x/sys/unix/mkall.sh index 3b2335d..ee73623 100644 --- a/vendor/golang.org/x/sys/unix/mkall.sh +++ b/vendor/golang.org/x/sys/unix/mkall.sh @@ -73,12 +73,12 @@ aix_ppc64) darwin_amd64) mkerrors="$mkerrors -m64" mktypes="GOARCH=$GOARCH go tool cgo -godefs" - mkasm="go run mkasm.go" + mkasm="go run mkasm_darwin.go" ;; darwin_arm64) mkerrors="$mkerrors -m64" mktypes="GOARCH=$GOARCH go tool cgo -godefs" - mkasm="go run mkasm.go" + mkasm="go run mkasm_darwin.go" ;; dragonfly_amd64) mkerrors="$mkerrors -m64" @@ -89,30 +89,25 @@ dragonfly_amd64) freebsd_386) mkerrors="$mkerrors -m32" mksyscall="go run mksyscall.go -l32" - mksysnum="go run mksysnum.go 'https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12'" + mksysnum="go run mksysnum.go 'https://svn.freebsd.org/base/stable/11/sys/kern/syscalls.master'" mktypes="GOARCH=$GOARCH go tool cgo -godefs" ;; freebsd_amd64) mkerrors="$mkerrors -m64" - mksysnum="go run mksysnum.go 'https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12'" + mksysnum="go run mksysnum.go 'https://svn.freebsd.org/base/stable/11/sys/kern/syscalls.master'" mktypes="GOARCH=$GOARCH go tool cgo -godefs" ;; freebsd_arm) mkerrors="$mkerrors" mksyscall="go run mksyscall.go -l32 -arm" - mksysnum="go run mksysnum.go 'https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12'" + mksysnum="go run mksysnum.go 'https://svn.freebsd.org/base/stable/11/sys/kern/syscalls.master'" # Let the type of C char be signed for making the bare syscall # API consistent across platforms. mktypes="GOARCH=$GOARCH go tool cgo -godefs -- -fsigned-char" ;; freebsd_arm64) mkerrors="$mkerrors -m64" - mksysnum="go run mksysnum.go 'https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12'" - mktypes="GOARCH=$GOARCH go tool cgo -godefs -- -fsigned-char" - ;; -freebsd_riscv64) - mkerrors="$mkerrors -m64" - mksysnum="go run mksysnum.go 'https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12'" + mksysnum="go run mksysnum.go 'https://svn.freebsd.org/base/stable/11/sys/kern/syscalls.master'" mktypes="GOARCH=$GOARCH go tool cgo -godefs -- -fsigned-char" ;; netbsd_386) @@ -142,33 +137,33 @@ netbsd_arm64) mktypes="GOARCH=$GOARCH go tool cgo -godefs" ;; openbsd_386) - mkasm="go run mkasm.go" mkerrors="$mkerrors -m32" - mksyscall="go run mksyscall.go -l32 -openbsd -libc" + mksyscall="go run mksyscall.go -l32 -openbsd" mksysctl="go run mksysctl_openbsd.go" + mksysnum="go run mksysnum.go 'https://cvsweb.openbsd.org/cgi-bin/cvsweb/~checkout~/src/sys/kern/syscalls.master'" mktypes="GOARCH=$GOARCH go tool cgo -godefs" ;; openbsd_amd64) - mkasm="go run mkasm.go" mkerrors="$mkerrors -m64" - mksyscall="go run mksyscall.go -openbsd -libc" + mksyscall="go run mksyscall.go -openbsd" mksysctl="go run mksysctl_openbsd.go" + mksysnum="go run mksysnum.go 'https://cvsweb.openbsd.org/cgi-bin/cvsweb/~checkout~/src/sys/kern/syscalls.master'" mktypes="GOARCH=$GOARCH go tool cgo -godefs" ;; openbsd_arm) - mkasm="go run mkasm.go" mkerrors="$mkerrors" - mksyscall="go run mksyscall.go -l32 -openbsd -arm -libc" + mksyscall="go run mksyscall.go -l32 -openbsd -arm" mksysctl="go run mksysctl_openbsd.go" + mksysnum="go run mksysnum.go 'https://cvsweb.openbsd.org/cgi-bin/cvsweb/~checkout~/src/sys/kern/syscalls.master'" # Let the type of C char be signed for making the bare syscall # API consistent across platforms. mktypes="GOARCH=$GOARCH go tool cgo -godefs -- -fsigned-char" ;; openbsd_arm64) - mkasm="go run mkasm.go" mkerrors="$mkerrors -m64" - mksyscall="go run mksyscall.go -openbsd -libc" + mksyscall="go run mksyscall.go -openbsd" mksysctl="go run mksysctl_openbsd.go" + mksysnum="go run mksysnum.go 'https://cvsweb.openbsd.org/cgi-bin/cvsweb/~checkout~/src/sys/kern/syscalls.master'" # Let the type of C char be signed for making the bare syscall # API consistent across platforms. mktypes="GOARCH=$GOARCH go tool cgo -godefs -- -fsigned-char" @@ -232,5 +227,5 @@ esac if [ -n "$mksysctl" ]; then echo "$mksysctl |gofmt >$zsysctl"; fi if [ -n "$mksysnum" ]; then echo "$mksysnum |gofmt >zsysnum_$GOOSARCH.go"; fi if [ -n "$mktypes" ]; then echo "$mktypes types_$GOOS.go | go run mkpost.go > ztypes_$GOOSARCH.go"; fi - if [ -n "$mkasm" ]; then echo "$mkasm $GOOS $GOARCH"; fi + if [ -n "$mkasm" ]; then echo "$mkasm $GOARCH"; fi ) | $run diff --git a/vendor/golang.org/x/sys/unix/mkerrors.sh b/vendor/golang.org/x/sys/unix/mkerrors.sh index 2ab44aa..d888fb7 100644 --- a/vendor/golang.org/x/sys/unix/mkerrors.sh +++ b/vendor/golang.org/x/sys/unix/mkerrors.sh @@ -128,7 +128,6 @@ includes_FreeBSD=' #include #include #include -#include #include #include #include @@ -203,7 +202,6 @@ struct ltchars { #include #include #include -#include #include #include #include @@ -297,10 +295,6 @@ struct ltchars { #define SOL_NETLINK 270 #endif -#ifndef SOL_SMC -#define SOL_SMC 286 -#endif - #ifdef SOL_BLUETOOTH // SPARC includes this in /usr/include/sparc64-linux-gnu/bits/socket.h // but it is already in bluetooth_linux.go @@ -535,7 +529,7 @@ ccflags="$@" $2 ~ /^(MS|MNT|MOUNT|UMOUNT)_/ || $2 ~ /^NS_GET_/ || $2 ~ /^TUN(SET|GET|ATTACH|DETACH)/ || - $2 ~ /^(O|F|[ES]?FD|NAME|S|PTRACE|PT|PIOD|TFD)_/ || + $2 ~ /^(O|F|[ES]?FD|NAME|S|PTRACE|PT|TFD)_/ || $2 ~ /^KEXEC_/ || $2 ~ /^LINUX_REBOOT_CMD_/ || $2 ~ /^LINUX_REBOOT_MAGIC[12]$/ || @@ -559,7 +553,6 @@ ccflags="$@" $2 ~ /^CLONE_[A-Z_]+/ || $2 !~ /^(BPF_TIMEVAL|BPF_FIB_LOOKUP_[A-Z]+)$/ && $2 ~ /^(BPF|DLT)_/ || - $2 ~ /^AUDIT_/ || $2 ~ /^(CLOCK|TIMER)_/ || $2 ~ /^CAN_/ || $2 ~ /^CAP_/ || @@ -582,6 +575,7 @@ ccflags="$@" $2 ~ /^SEEK_/ || $2 ~ /^SPLICE_/ || $2 ~ /^SYNC_FILE_RANGE_/ || + $2 !~ /^AUDIT_RECORD_MAGIC/ && $2 !~ /IOC_MAGIC/ && $2 ~ /^[A-Z][A-Z0-9_]+_MAGIC2?$/ || $2 ~ /^(VM|VMADDR)_/ || diff --git a/vendor/golang.org/x/sys/unix/str.go b/vendor/golang.org/x/sys/unix/str.go new file mode 100644 index 0000000..8ba89ed --- /dev/null +++ b/vendor/golang.org/x/sys/unix/str.go @@ -0,0 +1,27 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build aix || darwin || dragonfly || freebsd || linux || netbsd || openbsd || solaris +// +build aix darwin dragonfly freebsd linux netbsd openbsd solaris + +package unix + +func itoa(val int) string { // do it here rather than with fmt to avoid dependency + if val < 0 { + return "-" + uitoa(uint(-val)) + } + return uitoa(uint(val)) +} + +func uitoa(val uint) string { + var buf [32]byte // big enough for int64 + i := len(buf) - 1 + for val >= 10 { + buf[i] = byte(val%10 + '0') + i-- + val /= 10 + } + buf[i] = byte(val + '0') + return string(buf[i:]) +} diff --git a/vendor/golang.org/x/sys/unix/syscall.go b/vendor/golang.org/x/sys/unix/syscall.go index 9916e5e..649fa87 100644 --- a/vendor/golang.org/x/sys/unix/syscall.go +++ b/vendor/golang.org/x/sys/unix/syscall.go @@ -29,6 +29,8 @@ import ( "bytes" "strings" "unsafe" + + "golang.org/x/sys/internal/unsafeheader" ) // ByteSliceFromString returns a NUL-terminated slice of bytes @@ -80,7 +82,12 @@ func BytePtrToString(p *byte) string { ptr = unsafe.Pointer(uintptr(ptr) + 1) } - s := unsafe.Slice((*byte)(unsafe.Pointer(p)), n) + var s []byte + h := (*unsafeheader.Slice)(unsafe.Pointer(&s)) + h.Data = unsafe.Pointer(p) + h.Len = n + h.Cap = n + return string(s) } diff --git a/vendor/golang.org/x/sys/unix/syscall_aix.go b/vendor/golang.org/x/sys/unix/syscall_aix.go index 2db1b51..f2a114f 100644 --- a/vendor/golang.org/x/sys/unix/syscall_aix.go +++ b/vendor/golang.org/x/sys/unix/syscall_aix.go @@ -37,7 +37,6 @@ func Creat(path string, mode uint32) (fd int, err error) { } //sys utimes(path string, times *[2]Timeval) (err error) - func Utimes(path string, tv []Timeval) error { if len(tv) != 2 { return EINVAL @@ -46,7 +45,6 @@ func Utimes(path string, tv []Timeval) error { } //sys utimensat(dirfd int, path string, times *[2]Timespec, flag int) (err error) - func UtimesNano(path string, ts []Timespec) error { if len(ts) != 2 { return EINVAL @@ -217,63 +215,14 @@ func Accept(fd int) (nfd int, sa Sockaddr, err error) { return } -func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) { - var msg Msghdr - msg.Name = (*byte)(unsafe.Pointer(rsa)) - msg.Namelen = uint32(SizeofSockaddrAny) - var dummy byte - if len(oob) > 0 { - // receive at least one normal byte - if emptyIovecs(iov) { - var iova [1]Iovec - iova[0].Base = &dummy - iova[0].SetLen(1) - iov = iova[:] - } - msg.Control = (*byte)(unsafe.Pointer(&oob[0])) - msg.SetControllen(len(oob)) - } - if len(iov) > 0 { - msg.Iov = &iov[0] - msg.SetIovlen(len(iov)) - } - if n, err = recvmsg(fd, &msg, flags); n == -1 { - return - } - oobn = int(msg.Controllen) - recvflags = int(msg.Flags) - return +func recvmsgRaw(fd int, p, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) { + // Recvmsg not implemented on AIX + return -1, -1, -1, ENOSYS } -func sendmsgN(fd int, iov []Iovec, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) { - var msg Msghdr - msg.Name = (*byte)(unsafe.Pointer(ptr)) - msg.Namelen = uint32(salen) - var dummy byte - var empty bool - if len(oob) > 0 { - // send at least one normal byte - empty = emptyIovecs(iov) - if empty { - var iova [1]Iovec - iova[0].Base = &dummy - iova[0].SetLen(1) - iov = iova[:] - } - msg.Control = (*byte)(unsafe.Pointer(&oob[0])) - msg.SetControllen(len(oob)) - } - if len(iov) > 0 { - msg.Iov = &iov[0] - msg.SetIovlen(len(iov)) - } - if n, err = sendmsg(fd, &msg, flags); err != nil { - return 0, err - } - if len(oob) > 0 && empty { - n = 0 - } - return n, nil +func sendmsgN(fd int, p, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) { + // SendmsgN not implemented on AIX + return -1, ENOSYS } func anyToSockaddr(fd int, rsa *RawSockaddrAny) (Sockaddr, error) { @@ -351,13 +300,11 @@ func direntNamlen(buf []byte) (uint64, bool) { } //sys getdirent(fd int, buf []byte) (n int, err error) - func Getdents(fd int, buf []byte) (n int, err error) { return getdirent(fd, buf) } //sys wait4(pid Pid_t, status *_C_int, options int, rusage *Rusage) (wpid Pid_t, err error) - func Wait4(pid int, wstatus *WaitStatus, options int, rusage *Rusage) (wpid int, err error) { var status _C_int var r Pid_t @@ -425,7 +372,6 @@ func (w WaitStatus) TrapCause() int { return -1 } //sys fcntl(fd int, cmd int, arg int) (val int, err error) //sys fsyncRange(fd int, how int, start int64, length int64) (err error) = fsync_range - func Fsync(fd int) error { return fsyncRange(fd, O_SYNC, 0, 0) } @@ -590,7 +536,6 @@ func Poll(fds []PollFd, timeout int) (n int, err error) { //sys Getsystemcfg(label int) (n uint64) //sys umount(target string) (err error) - func Unmount(target string, flags int) (err error) { if flags != 0 { // AIX doesn't have any flags for umount. diff --git a/vendor/golang.org/x/sys/unix/syscall_bsd.go b/vendor/golang.org/x/sys/unix/syscall_bsd.go index eda4267..9c87c5f 100644 --- a/vendor/golang.org/x/sys/unix/syscall_bsd.go +++ b/vendor/golang.org/x/sys/unix/syscall_bsd.go @@ -325,26 +325,27 @@ func GetsockoptString(fd, level, opt int) (string, error) { //sys sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) (err error) //sys recvmsg(s int, msg *Msghdr, flags int) (n int, err error) -func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) { +func recvmsgRaw(fd int, p, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) { var msg Msghdr msg.Name = (*byte)(unsafe.Pointer(rsa)) msg.Namelen = uint32(SizeofSockaddrAny) + var iov Iovec + if len(p) > 0 { + iov.Base = (*byte)(unsafe.Pointer(&p[0])) + iov.SetLen(len(p)) + } var dummy byte if len(oob) > 0 { // receive at least one normal byte - if emptyIovecs(iov) { - var iova [1]Iovec - iova[0].Base = &dummy - iova[0].SetLen(1) - iov = iova[:] + if len(p) == 0 { + iov.Base = &dummy + iov.SetLen(1) } msg.Control = (*byte)(unsafe.Pointer(&oob[0])) msg.SetControllen(len(oob)) } - if len(iov) > 0 { - msg.Iov = &iov[0] - msg.SetIovlen(len(iov)) - } + msg.Iov = &iov + msg.Iovlen = 1 if n, err = recvmsg(fd, &msg, flags); err != nil { return } @@ -355,32 +356,31 @@ func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) //sys sendmsg(s int, msg *Msghdr, flags int) (n int, err error) -func sendmsgN(fd int, iov []Iovec, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) { +func sendmsgN(fd int, p, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) { var msg Msghdr msg.Name = (*byte)(unsafe.Pointer(ptr)) msg.Namelen = uint32(salen) + var iov Iovec + if len(p) > 0 { + iov.Base = (*byte)(unsafe.Pointer(&p[0])) + iov.SetLen(len(p)) + } var dummy byte - var empty bool if len(oob) > 0 { // send at least one normal byte - empty = emptyIovecs(iov) - if empty { - var iova [1]Iovec - iova[0].Base = &dummy - iova[0].SetLen(1) - iov = iova[:] + if len(p) == 0 { + iov.Base = &dummy + iov.SetLen(1) } msg.Control = (*byte)(unsafe.Pointer(&oob[0])) msg.SetControllen(len(oob)) } - if len(iov) > 0 { - msg.Iov = &iov[0] - msg.SetIovlen(len(iov)) - } + msg.Iov = &iov + msg.Iovlen = 1 if n, err = sendmsg(fd, &msg, flags); err != nil { return 0, err } - if len(oob) > 0 && empty { + if len(oob) > 0 && len(p) == 0 { n = 0 } return n, nil diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.1_13.go b/vendor/golang.org/x/sys/unix/syscall_darwin.1_13.go index 1259f6d..1596426 100644 --- a/vendor/golang.org/x/sys/unix/syscall_darwin.1_13.go +++ b/vendor/golang.org/x/sys/unix/syscall_darwin.1_13.go @@ -7,7 +7,11 @@ package unix -import "unsafe" +import ( + "unsafe" + + "golang.org/x/sys/internal/unsafeheader" +) //sys closedir(dir uintptr) (err error) //sys readdir_r(dir uintptr, entry *Dirent, result **Dirent) (res Errno) @@ -82,7 +86,11 @@ func Getdirentries(fd int, buf []byte, basep *uintptr) (n int, err error) { } // Copy entry into return buffer. - s := unsafe.Slice((*byte)(unsafe.Pointer(&entry)), reclen) + var s []byte + hdr := (*unsafeheader.Slice)(unsafe.Pointer(&s)) + hdr.Data = unsafe.Pointer(&entry) + hdr.Cap = reclen + hdr.Len = reclen copy(buf, s) buf = buf[reclen:] diff --git a/vendor/golang.org/x/sys/unix/syscall_darwin.go b/vendor/golang.org/x/sys/unix/syscall_darwin.go index 4f87f16..09a25c6 100644 --- a/vendor/golang.org/x/sys/unix/syscall_darwin.go +++ b/vendor/golang.org/x/sys/unix/syscall_darwin.go @@ -393,13 +393,6 @@ func GetsockoptXucred(fd, level, opt int) (*Xucred, error) { return x, err } -func GetsockoptTCPConnectionInfo(fd, level, opt int) (*TCPConnectionInfo, error) { - var value TCPConnectionInfo - vallen := _Socklen(SizeofTCPConnectionInfo) - err := getsockopt(fd, level, opt, unsafe.Pointer(&value), &vallen) - return &value, err -} - func SysctlKinfoProc(name string, args ...int) (*KinfoProc, error) { mib, err := sysctlmib(name, args...) if err != nil { @@ -511,7 +504,6 @@ func SysctlKinfoProcSlice(name string, args ...int) ([]KinfoProc, error) { //sys Mkdirat(dirfd int, path string, mode uint32) (err error) //sys Mkfifo(path string, mode uint32) (err error) //sys Mknod(path string, mode uint32, dev int) (err error) -//sys Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) //sys Open(path string, mode int, perm uint32) (fd int, err error) //sys Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) //sys Pathconf(path string, name int) (val int, err error) @@ -580,6 +572,7 @@ func SysctlKinfoProcSlice(name string, args ...int) ([]KinfoProc, error) { // Nfssvc // Getfh // Quotactl +// Mount // Csops // Waitid // Add_profil diff --git a/vendor/golang.org/x/sys/unix/syscall_dragonfly.go b/vendor/golang.org/x/sys/unix/syscall_dragonfly.go index 61c0d0d..c61e274 100644 --- a/vendor/golang.org/x/sys/unix/syscall_dragonfly.go +++ b/vendor/golang.org/x/sys/unix/syscall_dragonfly.go @@ -125,13 +125,11 @@ func Pipe2(p []int, flags int) (err error) { } //sys extpread(fd int, p []byte, flags int, offset int64) (n int, err error) - func pread(fd int, p []byte, offset int64) (n int, err error) { return extpread(fd, p, 0, offset) } //sys extpwrite(fd int, p []byte, flags int, offset int64) (n int, err error) - func pwrite(fd int, p []byte, offset int64) (n int, err error) { return extpwrite(fd, p, 0, offset) } diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd.go b/vendor/golang.org/x/sys/unix/syscall_freebsd.go index de7c23e..6f6c510 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd.go @@ -17,12 +17,25 @@ import ( "unsafe" ) +const ( + SYS_FSTAT_FREEBSD12 = 551 // { int fstat(int fd, _Out_ struct stat *sb); } + SYS_FSTATAT_FREEBSD12 = 552 // { int fstatat(int fd, _In_z_ char *path, \ + SYS_GETDIRENTRIES_FREEBSD12 = 554 // { ssize_t getdirentries(int fd, \ + SYS_STATFS_FREEBSD12 = 555 // { int statfs(_In_z_ char *path, \ + SYS_FSTATFS_FREEBSD12 = 556 // { int fstatfs(int fd, \ + SYS_GETFSSTAT_FREEBSD12 = 557 // { int getfsstat( \ + SYS_MKNODAT_FREEBSD12 = 559 // { int mknodat(int fd, _In_z_ char *path, \ +) + // See https://www.freebsd.org/doc/en_US.ISO8859-1/books/porters-handbook/versions.html. var ( osreldateOnce sync.Once osreldate uint32 ) +// INO64_FIRST from /usr/src/lib/libc/sys/compat-ino64.h +const _ino64First = 1200031 + func supportsABI(ver uint32) bool { osreldateOnce.Do(func() { osreldate, _ = SysctlUint32("kern.osreldate") }) return osreldate >= ver @@ -146,18 +159,38 @@ func Accept4(fd, flags int) (nfd int, sa Sockaddr, err error) { func Getfsstat(buf []Statfs_t, flags int) (n int, err error) { var ( - _p0 unsafe.Pointer - bufsize uintptr + _p0 unsafe.Pointer + bufsize uintptr + oldBuf []statfs_freebsd11_t + needsConvert bool ) + if len(buf) > 0 { - _p0 = unsafe.Pointer(&buf[0]) - bufsize = unsafe.Sizeof(Statfs_t{}) * uintptr(len(buf)) + if supportsABI(_ino64First) { + _p0 = unsafe.Pointer(&buf[0]) + bufsize = unsafe.Sizeof(Statfs_t{}) * uintptr(len(buf)) + } else { + n := len(buf) + oldBuf = make([]statfs_freebsd11_t, n) + _p0 = unsafe.Pointer(&oldBuf[0]) + bufsize = unsafe.Sizeof(statfs_freebsd11_t{}) * uintptr(n) + needsConvert = true + } } - r0, _, e1 := Syscall(SYS_GETFSSTAT, uintptr(_p0), bufsize, uintptr(flags)) + var sysno uintptr = SYS_GETFSSTAT + if supportsABI(_ino64First) { + sysno = SYS_GETFSSTAT_FREEBSD12 + } + r0, _, e1 := Syscall(sysno, uintptr(_p0), bufsize, uintptr(flags)) n = int(r0) if e1 != 0 { err = e1 } + if e1 == 0 && needsConvert { + for i := range oldBuf { + buf[i].convertFrom(&oldBuf[i]) + } + } return } @@ -212,11 +245,87 @@ func Uname(uname *Utsname) error { } func Stat(path string, st *Stat_t) (err error) { - return Fstatat(AT_FDCWD, path, st, 0) + var oldStat stat_freebsd11_t + if supportsABI(_ino64First) { + return fstatat_freebsd12(AT_FDCWD, path, st, 0) + } + err = stat(path, &oldStat) + if err != nil { + return err + } + + st.convertFrom(&oldStat) + return nil } func Lstat(path string, st *Stat_t) (err error) { - return Fstatat(AT_FDCWD, path, st, AT_SYMLINK_NOFOLLOW) + var oldStat stat_freebsd11_t + if supportsABI(_ino64First) { + return fstatat_freebsd12(AT_FDCWD, path, st, AT_SYMLINK_NOFOLLOW) + } + err = lstat(path, &oldStat) + if err != nil { + return err + } + + st.convertFrom(&oldStat) + return nil +} + +func Fstat(fd int, st *Stat_t) (err error) { + var oldStat stat_freebsd11_t + if supportsABI(_ino64First) { + return fstat_freebsd12(fd, st) + } + err = fstat(fd, &oldStat) + if err != nil { + return err + } + + st.convertFrom(&oldStat) + return nil +} + +func Fstatat(fd int, path string, st *Stat_t, flags int) (err error) { + var oldStat stat_freebsd11_t + if supportsABI(_ino64First) { + return fstatat_freebsd12(fd, path, st, flags) + } + err = fstatat(fd, path, &oldStat, flags) + if err != nil { + return err + } + + st.convertFrom(&oldStat) + return nil +} + +func Statfs(path string, st *Statfs_t) (err error) { + var oldStatfs statfs_freebsd11_t + if supportsABI(_ino64First) { + return statfs_freebsd12(path, st) + } + err = statfs(path, &oldStatfs) + if err != nil { + return err + } + + st.convertFrom(&oldStatfs) + return nil +} + +func Fstatfs(fd int, st *Statfs_t) (err error) { + var oldStatfs statfs_freebsd11_t + if supportsABI(_ino64First) { + return fstatfs_freebsd12(fd, st) + } + err = fstatfs(fd, &oldStatfs) + if err != nil { + return err + } + + st.convertFrom(&oldStatfs) + return nil } func Getdents(fd int, buf []byte) (n int, err error) { @@ -224,25 +333,162 @@ func Getdents(fd int, buf []byte) (n int, err error) { } func Getdirentries(fd int, buf []byte, basep *uintptr) (n int, err error) { - if basep == nil || unsafe.Sizeof(*basep) == 8 { - return getdirentries(fd, buf, (*uint64)(unsafe.Pointer(basep))) + if supportsABI(_ino64First) { + if basep == nil || unsafe.Sizeof(*basep) == 8 { + return getdirentries_freebsd12(fd, buf, (*uint64)(unsafe.Pointer(basep))) + } + // The freebsd12 syscall needs a 64-bit base. On 32-bit machines + // we can't just use the basep passed in. See #32498. + var base uint64 = uint64(*basep) + n, err = getdirentries_freebsd12(fd, buf, &base) + *basep = uintptr(base) + if base>>32 != 0 { + // We can't stuff the base back into a uintptr, so any + // future calls would be suspect. Generate an error. + // EIO is allowed by getdirentries. + err = EIO + } + return } - // The syscall needs a 64-bit base. On 32-bit machines - // we can't just use the basep passed in. See #32498. - var base uint64 = uint64(*basep) - n, err = getdirentries(fd, buf, &base) - *basep = uintptr(base) - if base>>32 != 0 { - // We can't stuff the base back into a uintptr, so any - // future calls would be suspect. Generate an error. - // EIO is allowed by getdirentries. - err = EIO + + // The old syscall entries are smaller than the new. Use 1/4 of the original + // buffer size rounded up to DIRBLKSIZ (see /usr/src/lib/libc/sys/getdirentries.c). + oldBufLen := roundup(len(buf)/4, _dirblksiz) + oldBuf := make([]byte, oldBufLen) + n, err = getdirentries(fd, oldBuf, basep) + if err == nil && n > 0 { + n = convertFromDirents11(buf, oldBuf[:n]) } return } func Mknod(path string, mode uint32, dev uint64) (err error) { - return Mknodat(AT_FDCWD, path, mode, dev) + var oldDev int + if supportsABI(_ino64First) { + return mknodat_freebsd12(AT_FDCWD, path, mode, dev) + } + oldDev = int(dev) + return mknod(path, mode, oldDev) +} + +func Mknodat(fd int, path string, mode uint32, dev uint64) (err error) { + var oldDev int + if supportsABI(_ino64First) { + return mknodat_freebsd12(fd, path, mode, dev) + } + oldDev = int(dev) + return mknodat(fd, path, mode, oldDev) +} + +// round x to the nearest multiple of y, larger or equal to x. +// +// from /usr/include/sys/param.h Macros for counting and rounding. +// #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) +func roundup(x, y int) int { + return ((x + y - 1) / y) * y +} + +func (s *Stat_t) convertFrom(old *stat_freebsd11_t) { + *s = Stat_t{ + Dev: uint64(old.Dev), + Ino: uint64(old.Ino), + Nlink: uint64(old.Nlink), + Mode: old.Mode, + Uid: old.Uid, + Gid: old.Gid, + Rdev: uint64(old.Rdev), + Atim: old.Atim, + Mtim: old.Mtim, + Ctim: old.Ctim, + Btim: old.Btim, + Size: old.Size, + Blocks: old.Blocks, + Blksize: old.Blksize, + Flags: old.Flags, + Gen: uint64(old.Gen), + } +} + +func (s *Statfs_t) convertFrom(old *statfs_freebsd11_t) { + *s = Statfs_t{ + Version: _statfsVersion, + Type: old.Type, + Flags: old.Flags, + Bsize: old.Bsize, + Iosize: old.Iosize, + Blocks: old.Blocks, + Bfree: old.Bfree, + Bavail: old.Bavail, + Files: old.Files, + Ffree: old.Ffree, + Syncwrites: old.Syncwrites, + Asyncwrites: old.Asyncwrites, + Syncreads: old.Syncreads, + Asyncreads: old.Asyncreads, + // Spare + Namemax: old.Namemax, + Owner: old.Owner, + Fsid: old.Fsid, + // Charspare + // Fstypename + // Mntfromname + // Mntonname + } + + sl := old.Fstypename[:] + n := clen(*(*[]byte)(unsafe.Pointer(&sl))) + copy(s.Fstypename[:], old.Fstypename[:n]) + + sl = old.Mntfromname[:] + n = clen(*(*[]byte)(unsafe.Pointer(&sl))) + copy(s.Mntfromname[:], old.Mntfromname[:n]) + + sl = old.Mntonname[:] + n = clen(*(*[]byte)(unsafe.Pointer(&sl))) + copy(s.Mntonname[:], old.Mntonname[:n]) +} + +func convertFromDirents11(buf []byte, old []byte) int { + const ( + fixedSize = int(unsafe.Offsetof(Dirent{}.Name)) + oldFixedSize = int(unsafe.Offsetof(dirent_freebsd11{}.Name)) + ) + + dstPos := 0 + srcPos := 0 + for dstPos+fixedSize < len(buf) && srcPos+oldFixedSize < len(old) { + var dstDirent Dirent + var srcDirent dirent_freebsd11 + + // If multiple direntries are written, sometimes when we reach the final one, + // we may have cap of old less than size of dirent_freebsd11. + copy((*[unsafe.Sizeof(srcDirent)]byte)(unsafe.Pointer(&srcDirent))[:], old[srcPos:]) + + reclen := roundup(fixedSize+int(srcDirent.Namlen)+1, 8) + if dstPos+reclen > len(buf) { + break + } + + dstDirent.Fileno = uint64(srcDirent.Fileno) + dstDirent.Off = 0 + dstDirent.Reclen = uint16(reclen) + dstDirent.Type = srcDirent.Type + dstDirent.Pad0 = 0 + dstDirent.Namlen = uint16(srcDirent.Namlen) + dstDirent.Pad1 = 0 + + copy(dstDirent.Name[:], srcDirent.Name[:srcDirent.Namlen]) + copy(buf[dstPos:], (*[unsafe.Sizeof(dstDirent)]byte)(unsafe.Pointer(&dstDirent))[:]) + padding := buf[dstPos+fixedSize+int(dstDirent.Namlen) : dstPos+reclen] + for i := range padding { + padding[i] = 0 + } + + dstPos += int(dstDirent.Reclen) + srcPos += int(srcDirent.Reclen) + } + + return dstPos } func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) { @@ -255,31 +501,31 @@ func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err e //sys ptrace(request int, pid int, addr uintptr, data int) (err error) func PtraceAttach(pid int) (err error) { - return ptrace(PT_ATTACH, pid, 0, 0) + return ptrace(PTRACE_ATTACH, pid, 0, 0) } func PtraceCont(pid int, signal int) (err error) { - return ptrace(PT_CONTINUE, pid, 1, signal) + return ptrace(PTRACE_CONT, pid, 1, signal) } func PtraceDetach(pid int) (err error) { - return ptrace(PT_DETACH, pid, 1, 0) + return ptrace(PTRACE_DETACH, pid, 1, 0) } func PtraceGetFpRegs(pid int, fpregsout *FpReg) (err error) { - return ptrace(PT_GETFPREGS, pid, uintptr(unsafe.Pointer(fpregsout)), 0) + return ptrace(PTRACE_GETFPREGS, pid, uintptr(unsafe.Pointer(fpregsout)), 0) } func PtraceGetRegs(pid int, regsout *Reg) (err error) { - return ptrace(PT_GETREGS, pid, uintptr(unsafe.Pointer(regsout)), 0) + return ptrace(PTRACE_GETREGS, pid, uintptr(unsafe.Pointer(regsout)), 0) } func PtraceLwpEvents(pid int, enable int) (err error) { - return ptrace(PT_LWP_EVENTS, pid, 0, enable) + return ptrace(PTRACE_LWPEVENTS, pid, 0, enable) } func PtraceLwpInfo(pid int, info uintptr) (err error) { - return ptrace(PT_LWPINFO, pid, info, int(unsafe.Sizeof(PtraceLwpInfoStruct{}))) + return ptrace(PTRACE_LWPINFO, pid, info, int(unsafe.Sizeof(PtraceLwpInfoStruct{}))) } func PtracePeekData(pid int, addr uintptr, out []byte) (count int, err error) { @@ -299,11 +545,11 @@ func PtracePokeText(pid int, addr uintptr, data []byte) (count int, err error) { } func PtraceSetRegs(pid int, regs *Reg) (err error) { - return ptrace(PT_SETREGS, pid, uintptr(unsafe.Pointer(regs)), 0) + return ptrace(PTRACE_SETREGS, pid, uintptr(unsafe.Pointer(regs)), 0) } func PtraceSingleStep(pid int) (err error) { - return ptrace(PT_STEP, pid, 1, 0) + return ptrace(PTRACE_SINGLESTEP, pid, 1, 0) } /* @@ -345,12 +591,16 @@ func PtraceSingleStep(pid int) (err error) { //sys Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) //sys Flock(fd int, how int) (err error) //sys Fpathconf(fd int, name int) (val int, err error) -//sys Fstat(fd int, stat *Stat_t) (err error) -//sys Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) -//sys Fstatfs(fd int, stat *Statfs_t) (err error) +//sys fstat(fd int, stat *stat_freebsd11_t) (err error) +//sys fstat_freebsd12(fd int, stat *Stat_t) (err error) +//sys fstatat(fd int, path string, stat *stat_freebsd11_t, flags int) (err error) +//sys fstatat_freebsd12(fd int, path string, stat *Stat_t, flags int) (err error) +//sys fstatfs(fd int, stat *statfs_freebsd11_t) (err error) +//sys fstatfs_freebsd12(fd int, stat *Statfs_t) (err error) //sys Fsync(fd int) (err error) //sys Ftruncate(fd int, length int64) (err error) -//sys getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) +//sys getdirentries(fd int, buf []byte, basep *uintptr) (n int, err error) +//sys getdirentries_freebsd12(fd int, buf []byte, basep *uint64) (n int, err error) //sys Getdtablesize() (size int) //sysnb Getegid() (egid int) //sysnb Geteuid() (uid int) @@ -372,10 +622,13 @@ func PtraceSingleStep(pid int) (err error) { //sys Link(path string, link string) (err error) //sys Linkat(pathfd int, path string, linkfd int, link string, flags int) (err error) //sys Listen(s int, backlog int) (err error) +//sys lstat(path string, stat *stat_freebsd11_t) (err error) //sys Mkdir(path string, mode uint32) (err error) //sys Mkdirat(dirfd int, path string, mode uint32) (err error) //sys Mkfifo(path string, mode uint32) (err error) -//sys Mknodat(fd int, path string, mode uint32, dev uint64) (err error) +//sys mknod(path string, mode uint32, dev int) (err error) +//sys mknodat(fd int, path string, mode uint32, dev int) (err error) +//sys mknodat_freebsd12(fd int, path string, mode uint32, dev uint64) (err error) //sys Nanosleep(time *Timespec, leftover *Timespec) (err error) //sys Open(path string, mode int, perm uint32) (fd int, err error) //sys Openat(fdat int, path string, mode int, perm uint32) (fd int, err error) @@ -405,7 +658,9 @@ func PtraceSingleStep(pid int) (err error) { //sysnb Setsid() (pid int, err error) //sysnb Settimeofday(tp *Timeval) (err error) //sysnb Setuid(uid int) (err error) -//sys Statfs(path string, stat *Statfs_t) (err error) +//sys stat(path string, stat *stat_freebsd11_t) (err error) +//sys statfs(path string, stat *statfs_freebsd11_t) (err error) +//sys statfs_freebsd12(path string, stat *Statfs_t) (err error) //sys Symlink(path string, link string) (err error) //sys Symlinkat(oldpath string, newdirfd int, newpath string) (err error) //sys Sync() (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go index b11ede8..342fc32 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_386.go @@ -57,11 +57,11 @@ func sendfile(outfd int, infd int, offset *int64, count int) (written int, err e func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err syscall.Errno) func PtraceGetFsBase(pid int, fsbase *int64) (err error) { - return ptrace(PT_GETFSBASE, pid, uintptr(unsafe.Pointer(fsbase)), 0) + return ptrace(PTRACE_GETFSBASE, pid, uintptr(unsafe.Pointer(fsbase)), 0) } func PtraceIO(req int, pid int, addr uintptr, out []byte, countin int) (count int, err error) { - ioDesc := PtraceIoDesc{Op: int32(req), Offs: uintptr(unsafe.Pointer(addr)), Addr: uintptr(unsafe.Pointer(&out[0])), Len: uint32(countin)} - err = ptrace(PT_IO, pid, uintptr(unsafe.Pointer(&ioDesc)), 0) + ioDesc := PtraceIoDesc{Op: int32(req), Offs: (*byte)(unsafe.Pointer(addr)), Addr: (*byte)(unsafe.Pointer(&out[0])), Len: uint32(countin)} + err = ptrace(PTRACE_IO, pid, uintptr(unsafe.Pointer(&ioDesc)), 0) return int(ioDesc.Len), err } diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go index 9ed8eec..a32d5aa 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_amd64.go @@ -57,11 +57,11 @@ func sendfile(outfd int, infd int, offset *int64, count int) (written int, err e func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err syscall.Errno) func PtraceGetFsBase(pid int, fsbase *int64) (err error) { - return ptrace(PT_GETFSBASE, pid, uintptr(unsafe.Pointer(fsbase)), 0) + return ptrace(PTRACE_GETFSBASE, pid, uintptr(unsafe.Pointer(fsbase)), 0) } func PtraceIO(req int, pid int, addr uintptr, out []byte, countin int) (count int, err error) { - ioDesc := PtraceIoDesc{Op: int32(req), Offs: uintptr(unsafe.Pointer(addr)), Addr: uintptr(unsafe.Pointer(&out[0])), Len: uint64(countin)} - err = ptrace(PT_IO, pid, uintptr(unsafe.Pointer(&ioDesc)), 0) + ioDesc := PtraceIoDesc{Op: int32(req), Offs: (*byte)(unsafe.Pointer(addr)), Addr: (*byte)(unsafe.Pointer(&out[0])), Len: uint64(countin)} + err = ptrace(PTRACE_IO, pid, uintptr(unsafe.Pointer(&ioDesc)), 0) return int(ioDesc.Len), err } diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go index f8ac982..1e36d39 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm.go @@ -57,7 +57,7 @@ func sendfile(outfd int, infd int, offset *int64, count int) (written int, err e func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err syscall.Errno) func PtraceIO(req int, pid int, addr uintptr, out []byte, countin int) (count int, err error) { - ioDesc := PtraceIoDesc{Op: int32(req), Offs: uintptr(unsafe.Pointer(addr)), Addr: uintptr(unsafe.Pointer(&out[0])), Len: uint32(countin)} - err = ptrace(PT_IO, pid, uintptr(unsafe.Pointer(&ioDesc)), 0) + ioDesc := PtraceIoDesc{Op: int32(req), Offs: (*byte)(unsafe.Pointer(addr)), Addr: (*byte)(unsafe.Pointer(&out[0])), Len: uint32(countin)} + err = ptrace(PTRACE_IO, pid, uintptr(unsafe.Pointer(&ioDesc)), 0) return int(ioDesc.Len), err } diff --git a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go index 8e93203..a09a153 100644 --- a/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/syscall_freebsd_arm64.go @@ -57,7 +57,7 @@ func sendfile(outfd int, infd int, offset *int64, count int) (written int, err e func Syscall9(num, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err syscall.Errno) func PtraceIO(req int, pid int, addr uintptr, out []byte, countin int) (count int, err error) { - ioDesc := PtraceIoDesc{Op: int32(req), Offs: uintptr(unsafe.Pointer(addr)), Addr: uintptr(unsafe.Pointer(&out[0])), Len: uint64(countin)} - err = ptrace(PT_IO, pid, uintptr(unsafe.Pointer(&ioDesc)), 0) + ioDesc := PtraceIoDesc{Op: int32(req), Offs: (*byte)(unsafe.Pointer(addr)), Addr: (*byte)(unsafe.Pointer(&out[0])), Len: uint64(countin)} + err = ptrace(PTRACE_IO, pid, uintptr(unsafe.Pointer(&ioDesc)), 0) return int(ioDesc.Len), err } diff --git a/vendor/golang.org/x/sys/unix/syscall_illumos.go b/vendor/golang.org/x/sys/unix/syscall_illumos.go index e48244a..8d5f294 100644 --- a/vendor/golang.org/x/sys/unix/syscall_illumos.go +++ b/vendor/golang.org/x/sys/unix/syscall_illumos.go @@ -20,9 +20,10 @@ func bytes2iovec(bs [][]byte) []Iovec { for i, b := range bs { iovecs[i].SetLen(len(b)) if len(b) > 0 { - iovecs[i].Base = &b[0] + // somehow Iovec.Base on illumos is (*int8), not (*byte) + iovecs[i].Base = (*int8)(unsafe.Pointer(&b[0])) } else { - iovecs[i].Base = (*byte)(unsafe.Pointer(&_zero)) + iovecs[i].Base = (*int8)(unsafe.Pointer(&_zero)) } } return iovecs diff --git a/vendor/golang.org/x/sys/unix/syscall_linux.go b/vendor/golang.org/x/sys/unix/syscall_linux.go index 4714691..eeae6db 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux.go @@ -13,7 +13,6 @@ package unix import ( "encoding/binary" - "strconv" "syscall" "time" "unsafe" @@ -234,7 +233,7 @@ func Futimesat(dirfd int, path string, tv []Timeval) error { func Futimes(fd int, tv []Timeval) (err error) { // Believe it or not, this is the best we can do on Linux // (and is what glibc does). - return Utimes("/proc/self/fd/"+strconv.Itoa(fd), tv) + return Utimes("/proc/self/fd/"+itoa(fd), tv) } const ImplementsGetwd = true @@ -513,24 +512,24 @@ func (sa *SockaddrL2) sockaddr() (unsafe.Pointer, _Socklen, error) { // // Server example: // -// fd, _ := Socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM) -// _ = unix.Bind(fd, &unix.SockaddrRFCOMM{ -// Channel: 1, -// Addr: [6]uint8{0, 0, 0, 0, 0, 0}, // BDADDR_ANY or 00:00:00:00:00:00 -// }) -// _ = Listen(fd, 1) -// nfd, sa, _ := Accept(fd) -// fmt.Printf("conn addr=%v fd=%d", sa.(*unix.SockaddrRFCOMM).Addr, nfd) -// Read(nfd, buf) +// fd, _ := Socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM) +// _ = unix.Bind(fd, &unix.SockaddrRFCOMM{ +// Channel: 1, +// Addr: [6]uint8{0, 0, 0, 0, 0, 0}, // BDADDR_ANY or 00:00:00:00:00:00 +// }) +// _ = Listen(fd, 1) +// nfd, sa, _ := Accept(fd) +// fmt.Printf("conn addr=%v fd=%d", sa.(*unix.SockaddrRFCOMM).Addr, nfd) +// Read(nfd, buf) // // Client example: // -// fd, _ := Socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM) -// _ = Connect(fd, &SockaddrRFCOMM{ -// Channel: 1, -// Addr: [6]byte{0x11, 0x22, 0x33, 0xaa, 0xbb, 0xcc}, // CC:BB:AA:33:22:11 -// }) -// Write(fd, []byte(`hello`)) +// fd, _ := Socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM) +// _ = Connect(fd, &SockaddrRFCOMM{ +// Channel: 1, +// Addr: [6]byte{0x11, 0x22, 0x33, 0xaa, 0xbb, 0xcc}, // CC:BB:AA:33:22:11 +// }) +// Write(fd, []byte(`hello`)) type SockaddrRFCOMM struct { // Addr represents a bluetooth address, byte ordering is little-endian. Addr [6]uint8 @@ -557,12 +556,12 @@ func (sa *SockaddrRFCOMM) sockaddr() (unsafe.Pointer, _Socklen, error) { // The SockaddrCAN struct must be bound to the socket file descriptor // using Bind before the CAN socket can be used. // -// // Read one raw CAN frame -// fd, _ := Socket(AF_CAN, SOCK_RAW, CAN_RAW) -// addr := &SockaddrCAN{Ifindex: index} -// Bind(fd, addr) -// frame := make([]byte, 16) -// Read(fd, frame) +// // Read one raw CAN frame +// fd, _ := Socket(AF_CAN, SOCK_RAW, CAN_RAW) +// addr := &SockaddrCAN{Ifindex: index} +// Bind(fd, addr) +// frame := make([]byte, 16) +// Read(fd, frame) // // The full SocketCAN documentation can be found in the linux kernel // archives at: https://www.kernel.org/doc/Documentation/networking/can.txt @@ -633,13 +632,13 @@ func (sa *SockaddrCANJ1939) sockaddr() (unsafe.Pointer, _Socklen, error) { // Here is an example of using an AF_ALG socket with SHA1 hashing. // The initial socket setup process is as follows: // -// // Open a socket to perform SHA1 hashing. -// fd, _ := unix.Socket(unix.AF_ALG, unix.SOCK_SEQPACKET, 0) -// addr := &unix.SockaddrALG{Type: "hash", Name: "sha1"} -// unix.Bind(fd, addr) -// // Note: unix.Accept does not work at this time; must invoke accept() -// // manually using unix.Syscall. -// hashfd, _, _ := unix.Syscall(unix.SYS_ACCEPT, uintptr(fd), 0, 0) +// // Open a socket to perform SHA1 hashing. +// fd, _ := unix.Socket(unix.AF_ALG, unix.SOCK_SEQPACKET, 0) +// addr := &unix.SockaddrALG{Type: "hash", Name: "sha1"} +// unix.Bind(fd, addr) +// // Note: unix.Accept does not work at this time; must invoke accept() +// // manually using unix.Syscall. +// hashfd, _, _ := unix.Syscall(unix.SYS_ACCEPT, uintptr(fd), 0, 0) // // Once a file descriptor has been returned from Accept, it may be used to // perform SHA1 hashing. The descriptor is not safe for concurrent use, but @@ -648,39 +647,39 @@ func (sa *SockaddrCANJ1939) sockaddr() (unsafe.Pointer, _Socklen, error) { // When hashing a small byte slice or string, a single Write and Read may // be used: // -// // Assume hashfd is already configured using the setup process. -// hash := os.NewFile(hashfd, "sha1") -// // Hash an input string and read the results. Each Write discards -// // previous hash state. Read always reads the current state. -// b := make([]byte, 20) -// for i := 0; i < 2; i++ { -// io.WriteString(hash, "Hello, world.") -// hash.Read(b) -// fmt.Println(hex.EncodeToString(b)) -// } -// // Output: -// // 2ae01472317d1935a84797ec1983ae243fc6aa28 -// // 2ae01472317d1935a84797ec1983ae243fc6aa28 +// // Assume hashfd is already configured using the setup process. +// hash := os.NewFile(hashfd, "sha1") +// // Hash an input string and read the results. Each Write discards +// // previous hash state. Read always reads the current state. +// b := make([]byte, 20) +// for i := 0; i < 2; i++ { +// io.WriteString(hash, "Hello, world.") +// hash.Read(b) +// fmt.Println(hex.EncodeToString(b)) +// } +// // Output: +// // 2ae01472317d1935a84797ec1983ae243fc6aa28 +// // 2ae01472317d1935a84797ec1983ae243fc6aa28 // // For hashing larger byte slices, or byte streams such as those read from // a file or socket, use Sendto with MSG_MORE to instruct the kernel to update // the hash digest instead of creating a new one for a given chunk and finalizing it. // -// // Assume hashfd and addr are already configured using the setup process. -// hash := os.NewFile(hashfd, "sha1") -// // Hash the contents of a file. -// f, _ := os.Open("/tmp/linux-4.10-rc7.tar.xz") -// b := make([]byte, 4096) -// for { -// n, err := f.Read(b) -// if err == io.EOF { -// break -// } -// unix.Sendto(hashfd, b[:n], unix.MSG_MORE, addr) -// } -// hash.Read(b) -// fmt.Println(hex.EncodeToString(b)) -// // Output: 85cdcad0c06eef66f805ecce353bec9accbeecc5 +// // Assume hashfd and addr are already configured using the setup process. +// hash := os.NewFile(hashfd, "sha1") +// // Hash the contents of a file. +// f, _ := os.Open("/tmp/linux-4.10-rc7.tar.xz") +// b := make([]byte, 4096) +// for { +// n, err := f.Read(b) +// if err == io.EOF { +// break +// } +// unix.Sendto(hashfd, b[:n], unix.MSG_MORE, addr) +// } +// hash.Read(b) +// fmt.Println(hex.EncodeToString(b)) +// // Output: 85cdcad0c06eef66f805ecce353bec9accbeecc5 // // For more information, see: http://www.chronox.de/crypto-API/crypto/userspace-if.html. type SockaddrALG struct { @@ -1500,13 +1499,18 @@ func KeyctlRestrictKeyring(ringid int, keyType string, restriction string) error //sys keyctlRestrictKeyringByType(cmd int, arg2 int, keyType string, restriction string) (err error) = SYS_KEYCTL //sys keyctlRestrictKeyring(cmd int, arg2 int) (err error) = SYS_KEYCTL -func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) { +func recvmsgRaw(fd int, p, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) { var msg Msghdr msg.Name = (*byte)(unsafe.Pointer(rsa)) msg.Namelen = uint32(SizeofSockaddrAny) + var iov Iovec + if len(p) > 0 { + iov.Base = &p[0] + iov.SetLen(len(p)) + } var dummy byte if len(oob) > 0 { - if emptyIovecs(iov) { + if len(p) == 0 { var sockType int sockType, err = GetsockoptInt(fd, SOL_SOCKET, SO_TYPE) if err != nil { @@ -1514,19 +1518,15 @@ func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) } // receive at least one normal byte if sockType != SOCK_DGRAM { - var iova [1]Iovec - iova[0].Base = &dummy - iova[0].SetLen(1) - iov = iova[:] + iov.Base = &dummy + iov.SetLen(1) } } msg.Control = &oob[0] msg.SetControllen(len(oob)) } - if len(iov) > 0 { - msg.Iov = &iov[0] - msg.SetIovlen(len(iov)) - } + msg.Iov = &iov + msg.Iovlen = 1 if n, err = recvmsg(fd, &msg, flags); err != nil { return } @@ -1535,15 +1535,18 @@ func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) return } -func sendmsgN(fd int, iov []Iovec, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) { +func sendmsgN(fd int, p, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) { var msg Msghdr msg.Name = (*byte)(ptr) msg.Namelen = uint32(salen) + var iov Iovec + if len(p) > 0 { + iov.Base = &p[0] + iov.SetLen(len(p)) + } var dummy byte - var empty bool if len(oob) > 0 { - empty = emptyIovecs(iov) - if empty { + if len(p) == 0 { var sockType int sockType, err = GetsockoptInt(fd, SOL_SOCKET, SO_TYPE) if err != nil { @@ -1551,22 +1554,19 @@ func sendmsgN(fd int, iov []Iovec, oob []byte, ptr unsafe.Pointer, salen _Sockle } // send at least one normal byte if sockType != SOCK_DGRAM { - var iova [1]Iovec - iova[0].Base = &dummy - iova[0].SetLen(1) + iov.Base = &dummy + iov.SetLen(1) } } msg.Control = &oob[0] msg.SetControllen(len(oob)) } - if len(iov) > 0 { - msg.Iov = &iov[0] - msg.SetIovlen(len(iov)) - } + msg.Iov = &iov + msg.Iovlen = 1 if n, err = sendmsg(fd, &msg, flags); err != nil { return 0, err } - if len(oob) > 0 && empty { + if len(oob) > 0 && len(p) == 0 { n = 0 } return n, nil @@ -1829,9 +1829,6 @@ func Dup2(oldfd, newfd int) error { //sys Fremovexattr(fd int, attr string) (err error) //sys Fsetxattr(fd int, attr string, dest []byte, flags int) (err error) //sys Fsync(fd int) (err error) -//sys Fsmount(fd int, flags int, mountAttrs int) (fsfd int, err error) -//sys Fsopen(fsName string, flags int) (fd int, err error) -//sys Fspick(dirfd int, pathName string, flags int) (fd int, err error) //sys Getdents(fd int, buf []byte) (n int, err error) = SYS_GETDENTS64 //sysnb Getpgid(pid int) (pgid int, err error) @@ -1892,28 +1889,17 @@ func PrctlRetInt(option int, arg2 uintptr, arg3 uintptr, arg4 uintptr, arg5 uint return int(ret), nil } +// issue 1435. +// On linux Setuid and Setgid only affects the current thread, not the process. +// This does not match what most callers expect so we must return an error +// here rather than letting the caller think that the call succeeded. + func Setuid(uid int) (err error) { - return syscall.Setuid(uid) + return EOPNOTSUPP } -func Setgid(gid int) (err error) { - return syscall.Setgid(gid) -} - -func Setreuid(ruid, euid int) (err error) { - return syscall.Setreuid(ruid, euid) -} - -func Setregid(rgid, egid int) (err error) { - return syscall.Setregid(rgid, egid) -} - -func Setresuid(ruid, euid, suid int) (err error) { - return syscall.Setresuid(ruid, euid, suid) -} - -func Setresgid(rgid, egid, sgid int) (err error) { - return syscall.Setresgid(rgid, egid, sgid) +func Setgid(uid int) (err error) { + return EOPNOTSUPP } // SetfsgidRetGid sets fsgid for current thread and returns previous fsgid set. @@ -2200,7 +2186,7 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { gid = Getgid() } - if uint32(gid) == st.Gid || isGroupMember(int(st.Gid)) { + if uint32(gid) == st.Gid || isGroupMember(gid) { fmode = (st.Mode >> 3) & 7 } else { fmode = st.Mode & 7 diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_386.go b/vendor/golang.org/x/sys/unix/syscall_linux_386.go index ff5b589..518e476 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_386.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_386.go @@ -41,6 +41,10 @@ func setTimeval(sec, usec int64) Timeval { //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) = SYS_SENDFILE64 //sys setfsgid(gid int) (prev int, err error) = SYS_SETFSGID32 //sys setfsuid(uid int) (prev int, err error) = SYS_SETFSUID32 +//sysnb Setregid(rgid int, egid int) (err error) = SYS_SETREGID32 +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) = SYS_SETRESGID32 +//sysnb Setresuid(ruid int, euid int, suid int) (err error) = SYS_SETRESUID32 +//sysnb Setreuid(ruid int, euid int) (err error) = SYS_SETREUID32 //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int, err error) //sys Stat(path string, stat *Stat_t) (err error) = SYS_STAT64 //sys SyncFileRange(fd int, off int64, n int64, flags int) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go index 9b27035..f5e9d6b 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_amd64.go @@ -46,7 +46,11 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) //sysnb Setrlimit(resource int, rlim *Rlimit) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int64, err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm.go index 856ad1d..c1a7778 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm.go @@ -62,6 +62,10 @@ func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { //sys Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err error) = SYS__NEWSELECT //sys setfsgid(gid int) (prev int, err error) = SYS_SETFSGID32 //sys setfsuid(uid int) (prev int, err error) = SYS_SETFSUID32 +//sysnb Setregid(rgid int, egid int) (err error) = SYS_SETREGID32 +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) = SYS_SETRESGID32 +//sysnb Setresuid(ruid int, euid int, suid int) (err error) = SYS_SETRESUID32 +//sysnb Setreuid(ruid int, euid int) (err error) = SYS_SETREUID32 //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int, err error) //sys Stat(path string, stat *Stat_t) (err error) = SYS_STAT64 diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go index 6422704..d83e2c6 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_arm64.go @@ -39,7 +39,11 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) //sysnb setrlimit(resource int, rlim *Rlimit) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int64, err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go b/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go index bfef09a..98a2660 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_mips64x.go @@ -37,7 +37,11 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) //sysnb Setrlimit(resource int, rlim *Rlimit) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int64, err error) //sys Statfs(path string, buf *Statfs_t) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go b/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go index ab30250..b8a18c0 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_mipsx.go @@ -32,6 +32,10 @@ func Syscall9(trap, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) = SYS_SENDFILE64 //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int, err error) //sys SyncFileRange(fd int, off int64, n int64, flags int) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go b/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go index eac1cf1..4ed9e67 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_ppc.go @@ -34,6 +34,10 @@ import ( //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) = SYS_SENDFILE64 //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int, err error) //sys Stat(path string, stat *Stat_t) (err error) = SYS_STAT64 diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go b/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go index 4df5661..db63d38 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_ppc64x.go @@ -34,7 +34,11 @@ package unix //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) //sysnb Setrlimit(resource int, rlim *Rlimit) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int64, err error) //sys Stat(path string, stat *Stat_t) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go index 5f4243d..8ff7adb 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_riscv64.go @@ -22,7 +22,6 @@ import "unsafe" //sysnb Getrlimit(resource int, rlim *Rlimit) (err error) //sysnb Getuid() (uid int) //sys Listen(s int, n int) (err error) -//sys MemfdSecret(flags int) (fd int, err error) //sys pread(fd int, p []byte, offset int64) (n int, err error) = SYS_PREAD64 //sys pwrite(fd int, p []byte, offset int64) (n int, err error) = SYS_PWRITE64 //sys Seek(fd int, offset int64, whence int) (off int64, err error) = SYS_LSEEK @@ -38,7 +37,11 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) //sysnb Setrlimit(resource int, rlim *Rlimit) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int64, err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go b/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go index d0a7d40..6fcf277 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_s390x.go @@ -34,7 +34,11 @@ import ( //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) //sysnb Setrlimit(resource int, rlim *Rlimit) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int64, err error) //sys Stat(path string, stat *Stat_t) (err error) //sys Statfs(path string, buf *Statfs_t) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go b/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go index f5c793b..02a45d9 100644 --- a/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/syscall_linux_sparc64.go @@ -31,7 +31,11 @@ package unix //sys sendfile(outfd int, infd int, offset *int64, count int) (written int, err error) //sys setfsgid(gid int) (prev int, err error) //sys setfsuid(uid int) (prev int, err error) +//sysnb Setregid(rgid int, egid int) (err error) +//sysnb Setresgid(rgid int, egid int, sgid int) (err error) +//sysnb Setresuid(ruid int, euid int, suid int) (err error) //sysnb Setrlimit(resource int, rlim *Rlimit) (err error) +//sysnb Setreuid(ruid int, euid int) (err error) //sys Shutdown(fd int, how int) (err error) //sys Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int64, err error) //sys Stat(path string, stat *Stat_t) (err error) diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd.go b/vendor/golang.org/x/sys/unix/syscall_openbsd.go index 78daceb..15d637d 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd.go @@ -81,7 +81,6 @@ func Pipe(p []int) (err error) { } //sysnb pipe2(p *[2]_C_int, flags int) (err error) - func Pipe2(p []int, flags int) error { if len(p) != 2 { return EINVAL @@ -96,7 +95,6 @@ func Pipe2(p []int, flags int) error { } //sys Getdents(fd int, buf []byte) (n int, err error) - func Getdirentries(fd int, buf []byte, basep *uintptr) (n int, err error) { n, err = Getdents(fd, buf) if err != nil || basep == nil { diff --git a/vendor/golang.org/x/sys/unix/syscall_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/syscall_openbsd_mips64.go index 1378489..30f2853 100644 --- a/vendor/golang.org/x/sys/unix/syscall_openbsd_mips64.go +++ b/vendor/golang.org/x/sys/unix/syscall_openbsd_mips64.go @@ -26,10 +26,6 @@ func (msghdr *Msghdr) SetControllen(length int) { msghdr.Controllen = uint32(length) } -func (msghdr *Msghdr) SetIovlen(length int) { - msghdr.Iovlen = uint32(length) -} - func (cmsg *Cmsghdr) SetLen(length int) { cmsg.Len = uint32(length) } diff --git a/vendor/golang.org/x/sys/unix/syscall_solaris.go b/vendor/golang.org/x/sys/unix/syscall_solaris.go index 8c6f409..5c2003c 100644 --- a/vendor/golang.org/x/sys/unix/syscall_solaris.go +++ b/vendor/golang.org/x/sys/unix/syscall_solaris.go @@ -451,25 +451,26 @@ func Accept(fd int) (nfd int, sa Sockaddr, err error) { //sys recvmsg(s int, msg *Msghdr, flags int) (n int, err error) = libsocket.__xnet_recvmsg -func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) { +func recvmsgRaw(fd int, p, oob []byte, flags int, rsa *RawSockaddrAny) (n, oobn int, recvflags int, err error) { var msg Msghdr msg.Name = (*byte)(unsafe.Pointer(rsa)) msg.Namelen = uint32(SizeofSockaddrAny) - var dummy byte + var iov Iovec + if len(p) > 0 { + iov.Base = (*int8)(unsafe.Pointer(&p[0])) + iov.SetLen(len(p)) + } + var dummy int8 if len(oob) > 0 { // receive at least one normal byte - if emptyIovecs(iov) { - var iova [1]Iovec - iova[0].Base = &dummy - iova[0].SetLen(1) - iov = iova[:] + if len(p) == 0 { + iov.Base = &dummy + iov.SetLen(1) } msg.Accrightslen = int32(len(oob)) } - if len(iov) > 0 { - msg.Iov = &iov[0] - msg.SetIovlen(len(iov)) - } + msg.Iov = &iov + msg.Iovlen = 1 if n, err = recvmsg(fd, &msg, flags); n == -1 { return } @@ -479,31 +480,30 @@ func recvmsgRaw(fd int, iov []Iovec, oob []byte, flags int, rsa *RawSockaddrAny) //sys sendmsg(s int, msg *Msghdr, flags int) (n int, err error) = libsocket.__xnet_sendmsg -func sendmsgN(fd int, iov []Iovec, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) { +func sendmsgN(fd int, p, oob []byte, ptr unsafe.Pointer, salen _Socklen, flags int) (n int, err error) { var msg Msghdr msg.Name = (*byte)(unsafe.Pointer(ptr)) msg.Namelen = uint32(salen) - var dummy byte - var empty bool + var iov Iovec + if len(p) > 0 { + iov.Base = (*int8)(unsafe.Pointer(&p[0])) + iov.SetLen(len(p)) + } + var dummy int8 if len(oob) > 0 { // send at least one normal byte - empty = emptyIovecs(iov) - if empty { - var iova [1]Iovec - iova[0].Base = &dummy - iova[0].SetLen(1) - iov = iova[:] + if len(p) == 0 { + iov.Base = &dummy + iov.SetLen(1) } msg.Accrightslen = int32(len(oob)) } - if len(iov) > 0 { - msg.Iov = &iov[0] - msg.SetIovlen(len(iov)) - } + msg.Iov = &iov + msg.Iovlen = 1 if n, err = sendmsg(fd, &msg, flags); err != nil { return 0, err } - if len(oob) > 0 && empty { + if len(oob) > 0 && len(p) == 0 { n = 0 } return n, nil @@ -618,7 +618,6 @@ func Sendfile(outfd int, infd int, offset *int64, count int) (written int, err e //sys Getpriority(which int, who int) (n int, err error) //sysnb Getrlimit(which int, lim *Rlimit) (err error) //sysnb Getrusage(who int, rusage *Rusage) (err error) -//sysnb Getsid(pid int) (sid int, err error) //sysnb Gettimeofday(tv *Timeval) (err error) //sysnb Getuid() (uid int) //sys Kill(pid int, signum syscall.Signal) (err error) @@ -750,8 +749,8 @@ type EventPort struct { // we should handle things gracefully. To do so, we need to keep an extra // reference to the cookie around until the event is processed // thus the otherwise seemingly extraneous "cookies" map - // The key of this map is a pointer to the corresponding fCookie - cookies map[*fileObjCookie]struct{} + // The key of this map is a pointer to the corresponding &fCookie.cookie + cookies map[*interface{}]*fileObjCookie } // PortEvent is an abstraction of the port_event C struct. @@ -778,7 +777,7 @@ func NewEventPort() (*EventPort, error) { port: port, fds: make(map[uintptr]*fileObjCookie), paths: make(map[string]*fileObjCookie), - cookies: make(map[*fileObjCookie]struct{}), + cookies: make(map[*interface{}]*fileObjCookie), } return e, nil } @@ -799,7 +798,6 @@ func (e *EventPort) Close() error { } e.fds = nil e.paths = nil - e.cookies = nil return nil } @@ -827,16 +825,17 @@ func (e *EventPort) AssociatePath(path string, stat os.FileInfo, events int, coo if _, found := e.paths[path]; found { return fmt.Errorf("%v is already associated with this Event Port", path) } - fCookie, err := createFileObjCookie(path, stat, cookie) + fobj, err := createFileObj(path, stat) if err != nil { return err } - _, err = port_associate(e.port, PORT_SOURCE_FILE, uintptr(unsafe.Pointer(fCookie.fobj)), events, (*byte)(unsafe.Pointer(fCookie))) + fCookie := &fileObjCookie{fobj, cookie} + _, err = port_associate(e.port, PORT_SOURCE_FILE, uintptr(unsafe.Pointer(fobj)), events, (*byte)(unsafe.Pointer(&fCookie.cookie))) if err != nil { return err } e.paths[path] = fCookie - e.cookies[fCookie] = struct{}{} + e.cookies[&fCookie.cookie] = fCookie return nil } @@ -858,7 +857,7 @@ func (e *EventPort) DissociatePath(path string) error { if err == nil { // dissociate was successful, safe to delete the cookie fCookie := e.paths[path] - delete(e.cookies, fCookie) + delete(e.cookies, &fCookie.cookie) } delete(e.paths, path) return err @@ -871,16 +870,13 @@ func (e *EventPort) AssociateFd(fd uintptr, events int, cookie interface{}) erro if _, found := e.fds[fd]; found { return fmt.Errorf("%v is already associated with this Event Port", fd) } - fCookie, err := createFileObjCookie("", nil, cookie) - if err != nil { - return err - } - _, err = port_associate(e.port, PORT_SOURCE_FD, fd, events, (*byte)(unsafe.Pointer(fCookie))) + fCookie := &fileObjCookie{nil, cookie} + _, err := port_associate(e.port, PORT_SOURCE_FD, fd, events, (*byte)(unsafe.Pointer(&fCookie.cookie))) if err != nil { return err } e.fds[fd] = fCookie - e.cookies[fCookie] = struct{}{} + e.cookies[&fCookie.cookie] = fCookie return nil } @@ -899,31 +895,27 @@ func (e *EventPort) DissociateFd(fd uintptr) error { if err == nil { // dissociate was successful, safe to delete the cookie fCookie := e.fds[fd] - delete(e.cookies, fCookie) + delete(e.cookies, &fCookie.cookie) } delete(e.fds, fd) return err } -func createFileObjCookie(name string, stat os.FileInfo, cookie interface{}) (*fileObjCookie, error) { - fCookie := new(fileObjCookie) - fCookie.cookie = cookie - if name != "" && stat != nil { - fCookie.fobj = new(fileObj) - bs, err := ByteSliceFromString(name) - if err != nil { - return nil, err - } - fCookie.fobj.Name = (*int8)(unsafe.Pointer(&bs[0])) - s := stat.Sys().(*syscall.Stat_t) - fCookie.fobj.Atim.Sec = s.Atim.Sec - fCookie.fobj.Atim.Nsec = s.Atim.Nsec - fCookie.fobj.Mtim.Sec = s.Mtim.Sec - fCookie.fobj.Mtim.Nsec = s.Mtim.Nsec - fCookie.fobj.Ctim.Sec = s.Ctim.Sec - fCookie.fobj.Ctim.Nsec = s.Ctim.Nsec +func createFileObj(name string, stat os.FileInfo) (*fileObj, error) { + fobj := new(fileObj) + bs, err := ByteSliceFromString(name) + if err != nil { + return nil, err } - return fCookie, nil + fobj.Name = (*int8)(unsafe.Pointer(&bs[0])) + s := stat.Sys().(*syscall.Stat_t) + fobj.Atim.Sec = s.Atim.Sec + fobj.Atim.Nsec = s.Atim.Nsec + fobj.Mtim.Sec = s.Mtim.Sec + fobj.Mtim.Nsec = s.Mtim.Nsec + fobj.Ctim.Sec = s.Ctim.Sec + fobj.Ctim.Nsec = s.Ctim.Nsec + return fobj, nil } // GetOne wraps port_get(3c) and returns a single PortEvent. @@ -936,50 +928,44 @@ func (e *EventPort) GetOne(t *Timespec) (*PortEvent, error) { p := new(PortEvent) e.mu.Lock() defer e.mu.Unlock() - err = e.peIntToExt(pe, p) - if err != nil { - return nil, err - } + e.peIntToExt(pe, p) return p, nil } // peIntToExt converts a cgo portEvent struct into the friendlier PortEvent // NOTE: Always call this function while holding the e.mu mutex -func (e *EventPort) peIntToExt(peInt *portEvent, peExt *PortEvent) error { - if e.cookies == nil { - return fmt.Errorf("this EventPort is already closed") - } +func (e *EventPort) peIntToExt(peInt *portEvent, peExt *PortEvent) { peExt.Events = peInt.Events peExt.Source = peInt.Source - fCookie := (*fileObjCookie)(unsafe.Pointer(peInt.User)) - _, found := e.cookies[fCookie] - - if !found { - panic("unexpected event port address; may be due to kernel bug; see https://go.dev/issue/54254") - } - peExt.Cookie = fCookie.cookie - delete(e.cookies, fCookie) - + cookie := (*interface{})(unsafe.Pointer(peInt.User)) + peExt.Cookie = *cookie switch peInt.Source { case PORT_SOURCE_FD: + delete(e.cookies, cookie) peExt.Fd = uintptr(peInt.Object) // Only remove the fds entry if it exists and this cookie matches if fobj, ok := e.fds[peExt.Fd]; ok { - if fobj == fCookie { + if &fobj.cookie == cookie { delete(e.fds, peExt.Fd) } } case PORT_SOURCE_FILE: - peExt.fobj = fCookie.fobj + if fCookie, ok := e.cookies[cookie]; ok && uintptr(unsafe.Pointer(fCookie.fobj)) == uintptr(peInt.Object) { + // Use our stashed reference rather than using unsafe on what we got back + // the unsafe version would be (*fileObj)(unsafe.Pointer(uintptr(peInt.Object))) + peExt.fobj = fCookie.fobj + } else { + panic("mismanaged memory") + } + delete(e.cookies, cookie) peExt.Path = BytePtrToString((*byte)(unsafe.Pointer(peExt.fobj.Name))) // Only remove the paths entry if it exists and this cookie matches if fobj, ok := e.paths[peExt.Path]; ok { - if fobj == fCookie { + if &fobj.cookie == cookie { delete(e.paths, peExt.Path) } } } - return nil } // Pending wraps port_getn(3c) and returns how many events are pending. @@ -1003,7 +989,7 @@ func (e *EventPort) Get(s []PortEvent, min int, timeout *Timespec) (int, error) got := uint32(min) max := uint32(len(s)) var err error - ps := make([]portEvent, max) + ps := make([]portEvent, max, max) _, err = port_getn(e.port, &ps[0], max, &got, timeout) // got will be trustworthy with ETIME, but not any other error. if err != nil && err != ETIME { @@ -1011,18 +997,8 @@ func (e *EventPort) Get(s []PortEvent, min int, timeout *Timespec) (int, error) } e.mu.Lock() defer e.mu.Unlock() - valid := 0 for i := 0; i < int(got); i++ { - err2 := e.peIntToExt(&ps[i], &s[i]) - if err2 != nil { - if valid == 0 && err == nil { - // If err2 is the only error and there are no valid events - // to return, return it to the caller. - err = err2 - } - break - } - valid = i + 1 + e.peIntToExt(&ps[i], &s[i]) } - return valid, err + return int(got), err } diff --git a/vendor/golang.org/x/sys/unix/syscall_unix.go b/vendor/golang.org/x/sys/unix/syscall_unix.go index 9f75356..70508af 100644 --- a/vendor/golang.org/x/sys/unix/syscall_unix.go +++ b/vendor/golang.org/x/sys/unix/syscall_unix.go @@ -13,6 +13,8 @@ import ( "sync" "syscall" "unsafe" + + "golang.org/x/sys/internal/unsafeheader" ) var ( @@ -115,7 +117,11 @@ func (m *mmapper) Mmap(fd int, offset int64, length int, prot int, flags int) (d } // Use unsafe to convert addr into a []byte. - b := unsafe.Slice((*byte)(unsafe.Pointer(addr)), length) + var b []byte + hdr := (*unsafeheader.Slice)(unsafe.Pointer(&b)) + hdr.Data = unsafe.Pointer(addr) + hdr.Cap = length + hdr.Len = length // Register mapping in m and return it. p := &b[cap(b)-1] @@ -332,13 +338,8 @@ func Recvfrom(fd int, p []byte, flags int) (n int, from Sockaddr, err error) { } func Recvmsg(fd int, p, oob []byte, flags int) (n, oobn int, recvflags int, from Sockaddr, err error) { - var iov [1]Iovec - if len(p) > 0 { - iov[0].Base = &p[0] - iov[0].SetLen(len(p)) - } var rsa RawSockaddrAny - n, oobn, recvflags, err = recvmsgRaw(fd, iov[:], oob, flags, &rsa) + n, oobn, recvflags, err = recvmsgRaw(fd, p, oob, flags, &rsa) // source address is only specified if the socket is unconnected if rsa.Addr.Family != AF_UNSPEC { from, err = anyToSockaddr(fd, &rsa) @@ -346,42 +347,12 @@ func Recvmsg(fd int, p, oob []byte, flags int) (n, oobn int, recvflags int, from return } -// RecvmsgBuffers receives a message from a socket using the recvmsg -// system call. The flags are passed to recvmsg. Any non-control data -// read is scattered into the buffers slices. The results are: -// - n is the number of non-control data read into bufs -// - oobn is the number of control data read into oob; this may be interpreted using [ParseSocketControlMessage] -// - recvflags is flags returned by recvmsg -// - from is the address of the sender -func RecvmsgBuffers(fd int, buffers [][]byte, oob []byte, flags int) (n, oobn int, recvflags int, from Sockaddr, err error) { - iov := make([]Iovec, len(buffers)) - for i := range buffers { - if len(buffers[i]) > 0 { - iov[i].Base = &buffers[i][0] - iov[i].SetLen(len(buffers[i])) - } else { - iov[i].Base = (*byte)(unsafe.Pointer(&_zero)) - } - } - var rsa RawSockaddrAny - n, oobn, recvflags, err = recvmsgRaw(fd, iov, oob, flags, &rsa) - if err == nil && rsa.Addr.Family != AF_UNSPEC { - from, err = anyToSockaddr(fd, &rsa) - } - return -} - func Sendmsg(fd int, p, oob []byte, to Sockaddr, flags int) (err error) { _, err = SendmsgN(fd, p, oob, to, flags) return } func SendmsgN(fd int, p, oob []byte, to Sockaddr, flags int) (n int, err error) { - var iov [1]Iovec - if len(p) > 0 { - iov[0].Base = &p[0] - iov[0].SetLen(len(p)) - } var ptr unsafe.Pointer var salen _Socklen if to != nil { @@ -390,32 +361,7 @@ func SendmsgN(fd int, p, oob []byte, to Sockaddr, flags int) (n int, err error) return 0, err } } - return sendmsgN(fd, iov[:], oob, ptr, salen, flags) -} - -// SendmsgBuffers sends a message on a socket to an address using the sendmsg -// system call. The flags are passed to sendmsg. Any non-control data written -// is gathered from buffers. The function returns the number of bytes written -// to the socket. -func SendmsgBuffers(fd int, buffers [][]byte, oob []byte, to Sockaddr, flags int) (n int, err error) { - iov := make([]Iovec, len(buffers)) - for i := range buffers { - if len(buffers[i]) > 0 { - iov[i].Base = &buffers[i][0] - iov[i].SetLen(len(buffers[i])) - } else { - iov[i].Base = (*byte)(unsafe.Pointer(&_zero)) - } - } - var ptr unsafe.Pointer - var salen _Socklen - if to != nil { - ptr, salen, err = to.sockaddr() - if err != nil { - return 0, err - } - } - return sendmsgN(fd, iov, oob, ptr, salen, flags) + return sendmsgN(fd, p, oob, ptr, salen, flags) } func Send(s int, buf []byte, flags int) (err error) { @@ -538,13 +484,3 @@ func Lutimes(path string, tv []Timeval) error { } return UtimesNanoAt(AT_FDCWD, path, ts, AT_SYMLINK_NOFOLLOW) } - -// emptyIovec reports whether there are no bytes in the slice of Iovec. -func emptyIovecs(iov []Iovec) bool { - for i := range iov { - if iov[i].Len > 0 { - return false - } - } - return true -} diff --git a/vendor/golang.org/x/sys/unix/sysvshm_unix.go b/vendor/golang.org/x/sys/unix/sysvshm_unix.go index 5bb41d1..0bb4c8d 100644 --- a/vendor/golang.org/x/sys/unix/sysvshm_unix.go +++ b/vendor/golang.org/x/sys/unix/sysvshm_unix.go @@ -7,7 +7,11 @@ package unix -import "unsafe" +import ( + "unsafe" + + "golang.org/x/sys/internal/unsafeheader" +) // SysvShmAttach attaches the Sysv shared memory segment associated with the // shared memory identifier id. @@ -30,7 +34,12 @@ func SysvShmAttach(id int, addr uintptr, flag int) ([]byte, error) { } // Use unsafe to convert addr into a []byte. - b := unsafe.Slice((*byte)(unsafe.Pointer(addr)), int(info.Segsz)) + // TODO: convert to unsafe.Slice once we can assume Go 1.17 + var b []byte + hdr := (*unsafeheader.Slice)(unsafe.Pointer(&b)) + hdr.Data = unsafe.Pointer(addr) + hdr.Cap = int(info.Segsz) + hdr.Len = int(info.Segsz) return b, nil } diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go index f8c2c51..4409001 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_386.go @@ -151,7 +151,6 @@ const ( BIOCSETF = 0x80084267 BIOCSETFNR = 0x80084282 BIOCSETIF = 0x8020426c - BIOCSETVLANPCP = 0x80044285 BIOCSETWF = 0x8008427b BIOCSETZBUF = 0x800c4281 BIOCSHDRCMPLT = 0x80044275 @@ -448,7 +447,7 @@ const ( DLT_IEEE802_16_MAC_CPS_RADIO = 0xc1 DLT_INFINIBAND = 0xf7 DLT_IPFILTER = 0x74 - DLT_IPMB_KONTRON = 0xc7 + DLT_IPMB = 0xc7 DLT_IPMB_LINUX = 0xd1 DLT_IPMI_HPM_2 = 0x104 DLT_IPNET = 0xe2 @@ -488,11 +487,10 @@ const ( DLT_LINUX_LAPD = 0xb1 DLT_LINUX_PPP_WITHDIRECTION = 0xa6 DLT_LINUX_SLL = 0x71 - DLT_LINUX_SLL2 = 0x114 DLT_LOOP = 0x6c DLT_LORATAP = 0x10e DLT_LTALK = 0x72 - DLT_MATCHING_MAX = 0x114 + DLT_MATCHING_MAX = 0x113 DLT_MATCHING_MIN = 0x68 DLT_MFR = 0xb6 DLT_MOST = 0xd3 @@ -736,7 +734,6 @@ const ( IPPROTO_CMTP = 0x26 IPPROTO_CPHB = 0x49 IPPROTO_CPNX = 0x48 - IPPROTO_DCCP = 0x21 IPPROTO_DDP = 0x25 IPPROTO_DGP = 0x56 IPPROTO_DIVERT = 0x102 @@ -817,6 +814,7 @@ const ( IPPROTO_SCTP = 0x84 IPPROTO_SDRP = 0x2a IPPROTO_SEND = 0x103 + IPPROTO_SEP = 0x21 IPPROTO_SHIM6 = 0x8c IPPROTO_SKIP = 0x39 IPPROTO_SPACER = 0x7fff @@ -913,7 +911,6 @@ const ( IPV6_V6ONLY = 0x1b IPV6_VERSION = 0x60 IPV6_VERSION_MASK = 0xf0 - IPV6_VLAN_PCP = 0x4b IP_ADD_MEMBERSHIP = 0xc IP_ADD_SOURCE_MEMBERSHIP = 0x46 IP_BINDANY = 0x18 @@ -992,12 +989,8 @@ const ( IP_TOS = 0x3 IP_TTL = 0x4 IP_UNBLOCK_SOURCE = 0x49 - IP_VLAN_PCP = 0x4b ISIG = 0x80 ISTRIP = 0x20 - ITIMER_PROF = 0x2 - ITIMER_REAL = 0x0 - ITIMER_VIRTUAL = 0x1 IXANY = 0x800 IXOFF = 0x400 IXON = 0x200 @@ -1007,6 +1000,7 @@ const ( KERN_VERSION = 0x4 LOCAL_CONNWAIT = 0x4 LOCAL_CREDS = 0x2 + LOCAL_CREDS_PERSISTENT = 0x3 LOCAL_PEERCRED = 0x1 LOCAL_VENDOR = 0x80000000 LOCK_EX = 0x2 @@ -1185,8 +1179,6 @@ const ( O_NONBLOCK = 0x4 O_RDONLY = 0x0 O_RDWR = 0x2 - O_RESOLVE_BENEATH = 0x800000 - O_SEARCH = 0x40000 O_SHLOCK = 0x10 O_SYNC = 0x80 O_TRUNC = 0x400 @@ -1197,10 +1189,6 @@ const ( PARMRK = 0x8 PARODD = 0x2000 PENDIN = 0x20000000 - PIOD_READ_D = 0x1 - PIOD_READ_I = 0x3 - PIOD_WRITE_D = 0x2 - PIOD_WRITE_I = 0x4 PRIO_PGRP = 0x1 PRIO_PROCESS = 0x0 PRIO_USER = 0x2 @@ -1208,60 +1196,6 @@ const ( PROT_NONE = 0x0 PROT_READ = 0x1 PROT_WRITE = 0x2 - PTRACE_DEFAULT = 0x1 - PTRACE_EXEC = 0x1 - PTRACE_FORK = 0x8 - PTRACE_LWP = 0x10 - PTRACE_SCE = 0x2 - PTRACE_SCX = 0x4 - PTRACE_SYSCALL = 0x6 - PTRACE_VFORK = 0x20 - PT_ATTACH = 0xa - PT_CLEARSTEP = 0x10 - PT_CONTINUE = 0x7 - PT_DETACH = 0xb - PT_FIRSTMACH = 0x40 - PT_FOLLOW_FORK = 0x17 - PT_GETDBREGS = 0x25 - PT_GETFPREGS = 0x23 - PT_GETFSBASE = 0x47 - PT_GETGSBASE = 0x49 - PT_GETLWPLIST = 0xf - PT_GETNUMLWPS = 0xe - PT_GETREGS = 0x21 - PT_GETXMMREGS = 0x40 - PT_GETXSTATE = 0x45 - PT_GETXSTATE_INFO = 0x44 - PT_GET_EVENT_MASK = 0x19 - PT_GET_SC_ARGS = 0x1b - PT_GET_SC_RET = 0x1c - PT_IO = 0xc - PT_KILL = 0x8 - PT_LWPINFO = 0xd - PT_LWP_EVENTS = 0x18 - PT_READ_D = 0x2 - PT_READ_I = 0x1 - PT_RESUME = 0x13 - PT_SETDBREGS = 0x26 - PT_SETFPREGS = 0x24 - PT_SETFSBASE = 0x48 - PT_SETGSBASE = 0x4a - PT_SETREGS = 0x22 - PT_SETSTEP = 0x11 - PT_SETXMMREGS = 0x41 - PT_SETXSTATE = 0x46 - PT_SET_EVENT_MASK = 0x1a - PT_STEP = 0x9 - PT_SUSPEND = 0x12 - PT_SYSCALL = 0x16 - PT_TO_SCE = 0x14 - PT_TO_SCX = 0x15 - PT_TRACE_ME = 0x0 - PT_VM_ENTRY = 0x29 - PT_VM_TIMESTAMP = 0x28 - PT_WRITE_D = 0x5 - PT_WRITE_I = 0x4 - P_ZONEID = 0xc RLIMIT_AS = 0xa RLIMIT_CORE = 0x4 RLIMIT_CPU = 0x0 @@ -1386,12 +1320,10 @@ const ( SIOCGHWADDR = 0xc020693e SIOCGI2C = 0xc020693d SIOCGIFADDR = 0xc0206921 - SIOCGIFALIAS = 0xc044692d SIOCGIFBRDADDR = 0xc0206923 SIOCGIFCAP = 0xc020691f SIOCGIFCONF = 0xc0086924 SIOCGIFDESCR = 0xc020692a - SIOCGIFDOWNREASON = 0xc058699a SIOCGIFDSTADDR = 0xc0206922 SIOCGIFFIB = 0xc020695c SIOCGIFFLAGS = 0xc0206911 @@ -1482,7 +1414,6 @@ const ( SO_RCVBUF = 0x1002 SO_RCVLOWAT = 0x1004 SO_RCVTIMEO = 0x1006 - SO_RERROR = 0x20000 SO_REUSEADDR = 0x4 SO_REUSEPORT = 0x200 SO_REUSEPORT_LB = 0x10000 @@ -1541,40 +1472,22 @@ const ( TCOFLUSH = 0x2 TCOOFF = 0x1 TCOON = 0x2 - TCPOPT_EOL = 0x0 - TCPOPT_FAST_OPEN = 0x22 - TCPOPT_MAXSEG = 0x2 - TCPOPT_NOP = 0x1 - TCPOPT_PAD = 0x0 - TCPOPT_SACK = 0x5 - TCPOPT_SACK_PERMITTED = 0x4 - TCPOPT_SIGNATURE = 0x13 - TCPOPT_TIMESTAMP = 0x8 - TCPOPT_WINDOW = 0x3 TCP_BBR_ACK_COMP_ALG = 0x448 - TCP_BBR_ALGORITHM = 0x43b TCP_BBR_DRAIN_INC_EXTRA = 0x43c TCP_BBR_DRAIN_PG = 0x42e TCP_BBR_EXTRA_GAIN = 0x449 - TCP_BBR_EXTRA_STATE = 0x453 - TCP_BBR_FLOOR_MIN_TSO = 0x454 - TCP_BBR_HDWR_PACE = 0x451 - TCP_BBR_HOLD_TARGET = 0x436 TCP_BBR_IWINTSO = 0x42b TCP_BBR_LOWGAIN_FD = 0x436 TCP_BBR_LOWGAIN_HALF = 0x435 TCP_BBR_LOWGAIN_THRESH = 0x434 TCP_BBR_MAX_RTO = 0x439 TCP_BBR_MIN_RTO = 0x438 - TCP_BBR_MIN_TOPACEOUT = 0x455 TCP_BBR_ONE_RETRAN = 0x431 TCP_BBR_PACE_CROSS = 0x442 TCP_BBR_PACE_DEL_TAR = 0x43f - TCP_BBR_PACE_OH = 0x435 TCP_BBR_PACE_PER_SEC = 0x43e TCP_BBR_PACE_SEG_MAX = 0x440 TCP_BBR_PACE_SEG_MIN = 0x441 - TCP_BBR_POLICER_DETECT = 0x457 TCP_BBR_PROBE_RTT_GAIN = 0x44d TCP_BBR_PROBE_RTT_INT = 0x430 TCP_BBR_PROBE_RTT_LEN = 0x44e @@ -1583,18 +1496,12 @@ const ( TCP_BBR_REC_OVER_HPTS = 0x43a TCP_BBR_RETRAN_WTSO = 0x44b TCP_BBR_RWND_IS_APP = 0x42f - TCP_BBR_SEND_IWND_IN_TSO = 0x44f TCP_BBR_STARTUP_EXIT_EPOCH = 0x43d TCP_BBR_STARTUP_LOSS_EXIT = 0x432 TCP_BBR_STARTUP_PG = 0x42d - TCP_BBR_TMR_PACE_OH = 0x448 - TCP_BBR_TSLIMITS = 0x434 - TCP_BBR_TSTMP_RAISES = 0x456 TCP_BBR_UNLIMITED = 0x43b TCP_BBR_USEDEL_RATE = 0x437 TCP_BBR_USE_LOWGAIN = 0x433 - TCP_BBR_USE_RACK_CHEAT = 0x450 - TCP_BBR_UTTER_MAX_TSO = 0x452 TCP_CA_NAME_MAX = 0x10 TCP_CCALGOOPT = 0x41 TCP_CONGESTION = 0x40 @@ -1634,7 +1541,6 @@ const ( TCP_PCAP_OUT = 0x800 TCP_RACK_EARLY_RECOV = 0x423 TCP_RACK_EARLY_SEG = 0x424 - TCP_RACK_GP_INCREASE = 0x446 TCP_RACK_IDLE_REDUCE_HIGH = 0x444 TCP_RACK_MIN_PACE = 0x445 TCP_RACK_MIN_PACE_SEG = 0x446 @@ -1648,6 +1554,7 @@ const ( TCP_RACK_PRR_SENDALOT = 0x421 TCP_RACK_REORD_FADE = 0x426 TCP_RACK_REORD_THRESH = 0x425 + TCP_RACK_SESS_CWV = 0x42a TCP_RACK_TLP_INC_VAR = 0x429 TCP_RACK_TLP_REDUCE = 0x41c TCP_RACK_TLP_THRESH = 0x427 @@ -1787,13 +1694,12 @@ const ( EIDRM = syscall.Errno(0x52) EILSEQ = syscall.Errno(0x56) EINPROGRESS = syscall.Errno(0x24) - EINTEGRITY = syscall.Errno(0x61) EINTR = syscall.Errno(0x4) EINVAL = syscall.Errno(0x16) EIO = syscall.Errno(0x5) EISCONN = syscall.Errno(0x38) EISDIR = syscall.Errno(0x15) - ELAST = syscall.Errno(0x61) + ELAST = syscall.Errno(0x60) ELOOP = syscall.Errno(0x3e) EMFILE = syscall.Errno(0x18) EMLINK = syscall.Errno(0x1f) @@ -1936,7 +1842,7 @@ var errorList = [...]struct { {32, "EPIPE", "broken pipe"}, {33, "EDOM", "numerical argument out of domain"}, {34, "ERANGE", "result too large"}, - {35, "EWOULDBLOCK", "resource temporarily unavailable"}, + {35, "EAGAIN", "resource temporarily unavailable"}, {36, "EINPROGRESS", "operation now in progress"}, {37, "EALREADY", "operation already in progress"}, {38, "ENOTSOCK", "socket operation on non-socket"}, @@ -1998,7 +1904,6 @@ var errorList = [...]struct { {94, "ECAPMODE", "not permitted in capability mode"}, {95, "ENOTRECOVERABLE", "state not recoverable"}, {96, "EOWNERDEAD", "previous owner died"}, - {97, "EINTEGRITY", "integrity check failed"}, } // Signal table diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go index 96310c3..64520d3 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_amd64.go @@ -151,7 +151,6 @@ const ( BIOCSETF = 0x80104267 BIOCSETFNR = 0x80104282 BIOCSETIF = 0x8020426c - BIOCSETVLANPCP = 0x80044285 BIOCSETWF = 0x8010427b BIOCSETZBUF = 0x80184281 BIOCSHDRCMPLT = 0x80044275 @@ -448,7 +447,7 @@ const ( DLT_IEEE802_16_MAC_CPS_RADIO = 0xc1 DLT_INFINIBAND = 0xf7 DLT_IPFILTER = 0x74 - DLT_IPMB_KONTRON = 0xc7 + DLT_IPMB = 0xc7 DLT_IPMB_LINUX = 0xd1 DLT_IPMI_HPM_2 = 0x104 DLT_IPNET = 0xe2 @@ -488,11 +487,10 @@ const ( DLT_LINUX_LAPD = 0xb1 DLT_LINUX_PPP_WITHDIRECTION = 0xa6 DLT_LINUX_SLL = 0x71 - DLT_LINUX_SLL2 = 0x114 DLT_LOOP = 0x6c DLT_LORATAP = 0x10e DLT_LTALK = 0x72 - DLT_MATCHING_MAX = 0x114 + DLT_MATCHING_MAX = 0x113 DLT_MATCHING_MIN = 0x68 DLT_MFR = 0xb6 DLT_MOST = 0xd3 @@ -736,7 +734,6 @@ const ( IPPROTO_CMTP = 0x26 IPPROTO_CPHB = 0x49 IPPROTO_CPNX = 0x48 - IPPROTO_DCCP = 0x21 IPPROTO_DDP = 0x25 IPPROTO_DGP = 0x56 IPPROTO_DIVERT = 0x102 @@ -817,6 +814,7 @@ const ( IPPROTO_SCTP = 0x84 IPPROTO_SDRP = 0x2a IPPROTO_SEND = 0x103 + IPPROTO_SEP = 0x21 IPPROTO_SHIM6 = 0x8c IPPROTO_SKIP = 0x39 IPPROTO_SPACER = 0x7fff @@ -913,7 +911,6 @@ const ( IPV6_V6ONLY = 0x1b IPV6_VERSION = 0x60 IPV6_VERSION_MASK = 0xf0 - IPV6_VLAN_PCP = 0x4b IP_ADD_MEMBERSHIP = 0xc IP_ADD_SOURCE_MEMBERSHIP = 0x46 IP_BINDANY = 0x18 @@ -992,12 +989,8 @@ const ( IP_TOS = 0x3 IP_TTL = 0x4 IP_UNBLOCK_SOURCE = 0x49 - IP_VLAN_PCP = 0x4b ISIG = 0x80 ISTRIP = 0x20 - ITIMER_PROF = 0x2 - ITIMER_REAL = 0x0 - ITIMER_VIRTUAL = 0x1 IXANY = 0x800 IXOFF = 0x400 IXON = 0x200 @@ -1007,6 +1000,7 @@ const ( KERN_VERSION = 0x4 LOCAL_CONNWAIT = 0x4 LOCAL_CREDS = 0x2 + LOCAL_CREDS_PERSISTENT = 0x3 LOCAL_PEERCRED = 0x1 LOCAL_VENDOR = 0x80000000 LOCK_EX = 0x2 @@ -1186,8 +1180,6 @@ const ( O_NONBLOCK = 0x4 O_RDONLY = 0x0 O_RDWR = 0x2 - O_RESOLVE_BENEATH = 0x800000 - O_SEARCH = 0x40000 O_SHLOCK = 0x10 O_SYNC = 0x80 O_TRUNC = 0x400 @@ -1198,10 +1190,6 @@ const ( PARMRK = 0x8 PARODD = 0x2000 PENDIN = 0x20000000 - PIOD_READ_D = 0x1 - PIOD_READ_I = 0x3 - PIOD_WRITE_D = 0x2 - PIOD_WRITE_I = 0x4 PRIO_PGRP = 0x1 PRIO_PROCESS = 0x0 PRIO_USER = 0x2 @@ -1209,58 +1197,6 @@ const ( PROT_NONE = 0x0 PROT_READ = 0x1 PROT_WRITE = 0x2 - PTRACE_DEFAULT = 0x1 - PTRACE_EXEC = 0x1 - PTRACE_FORK = 0x8 - PTRACE_LWP = 0x10 - PTRACE_SCE = 0x2 - PTRACE_SCX = 0x4 - PTRACE_SYSCALL = 0x6 - PTRACE_VFORK = 0x20 - PT_ATTACH = 0xa - PT_CLEARSTEP = 0x10 - PT_CONTINUE = 0x7 - PT_DETACH = 0xb - PT_FIRSTMACH = 0x40 - PT_FOLLOW_FORK = 0x17 - PT_GETDBREGS = 0x25 - PT_GETFPREGS = 0x23 - PT_GETFSBASE = 0x47 - PT_GETGSBASE = 0x49 - PT_GETLWPLIST = 0xf - PT_GETNUMLWPS = 0xe - PT_GETREGS = 0x21 - PT_GETXSTATE = 0x45 - PT_GETXSTATE_INFO = 0x44 - PT_GET_EVENT_MASK = 0x19 - PT_GET_SC_ARGS = 0x1b - PT_GET_SC_RET = 0x1c - PT_IO = 0xc - PT_KILL = 0x8 - PT_LWPINFO = 0xd - PT_LWP_EVENTS = 0x18 - PT_READ_D = 0x2 - PT_READ_I = 0x1 - PT_RESUME = 0x13 - PT_SETDBREGS = 0x26 - PT_SETFPREGS = 0x24 - PT_SETFSBASE = 0x48 - PT_SETGSBASE = 0x4a - PT_SETREGS = 0x22 - PT_SETSTEP = 0x11 - PT_SETXSTATE = 0x46 - PT_SET_EVENT_MASK = 0x1a - PT_STEP = 0x9 - PT_SUSPEND = 0x12 - PT_SYSCALL = 0x16 - PT_TO_SCE = 0x14 - PT_TO_SCX = 0x15 - PT_TRACE_ME = 0x0 - PT_VM_ENTRY = 0x29 - PT_VM_TIMESTAMP = 0x28 - PT_WRITE_D = 0x5 - PT_WRITE_I = 0x4 - P_ZONEID = 0xc RLIMIT_AS = 0xa RLIMIT_CORE = 0x4 RLIMIT_CPU = 0x0 @@ -1385,12 +1321,10 @@ const ( SIOCGHWADDR = 0xc020693e SIOCGI2C = 0xc020693d SIOCGIFADDR = 0xc0206921 - SIOCGIFALIAS = 0xc044692d SIOCGIFBRDADDR = 0xc0206923 SIOCGIFCAP = 0xc020691f SIOCGIFCONF = 0xc0106924 SIOCGIFDESCR = 0xc020692a - SIOCGIFDOWNREASON = 0xc058699a SIOCGIFDSTADDR = 0xc0206922 SIOCGIFFIB = 0xc020695c SIOCGIFFLAGS = 0xc0206911 @@ -1481,7 +1415,6 @@ const ( SO_RCVBUF = 0x1002 SO_RCVLOWAT = 0x1004 SO_RCVTIMEO = 0x1006 - SO_RERROR = 0x20000 SO_REUSEADDR = 0x4 SO_REUSEPORT = 0x200 SO_REUSEPORT_LB = 0x10000 @@ -1540,40 +1473,22 @@ const ( TCOFLUSH = 0x2 TCOOFF = 0x1 TCOON = 0x2 - TCPOPT_EOL = 0x0 - TCPOPT_FAST_OPEN = 0x22 - TCPOPT_MAXSEG = 0x2 - TCPOPT_NOP = 0x1 - TCPOPT_PAD = 0x0 - TCPOPT_SACK = 0x5 - TCPOPT_SACK_PERMITTED = 0x4 - TCPOPT_SIGNATURE = 0x13 - TCPOPT_TIMESTAMP = 0x8 - TCPOPT_WINDOW = 0x3 TCP_BBR_ACK_COMP_ALG = 0x448 - TCP_BBR_ALGORITHM = 0x43b TCP_BBR_DRAIN_INC_EXTRA = 0x43c TCP_BBR_DRAIN_PG = 0x42e TCP_BBR_EXTRA_GAIN = 0x449 - TCP_BBR_EXTRA_STATE = 0x453 - TCP_BBR_FLOOR_MIN_TSO = 0x454 - TCP_BBR_HDWR_PACE = 0x451 - TCP_BBR_HOLD_TARGET = 0x436 TCP_BBR_IWINTSO = 0x42b TCP_BBR_LOWGAIN_FD = 0x436 TCP_BBR_LOWGAIN_HALF = 0x435 TCP_BBR_LOWGAIN_THRESH = 0x434 TCP_BBR_MAX_RTO = 0x439 TCP_BBR_MIN_RTO = 0x438 - TCP_BBR_MIN_TOPACEOUT = 0x455 TCP_BBR_ONE_RETRAN = 0x431 TCP_BBR_PACE_CROSS = 0x442 TCP_BBR_PACE_DEL_TAR = 0x43f - TCP_BBR_PACE_OH = 0x435 TCP_BBR_PACE_PER_SEC = 0x43e TCP_BBR_PACE_SEG_MAX = 0x440 TCP_BBR_PACE_SEG_MIN = 0x441 - TCP_BBR_POLICER_DETECT = 0x457 TCP_BBR_PROBE_RTT_GAIN = 0x44d TCP_BBR_PROBE_RTT_INT = 0x430 TCP_BBR_PROBE_RTT_LEN = 0x44e @@ -1582,18 +1497,12 @@ const ( TCP_BBR_REC_OVER_HPTS = 0x43a TCP_BBR_RETRAN_WTSO = 0x44b TCP_BBR_RWND_IS_APP = 0x42f - TCP_BBR_SEND_IWND_IN_TSO = 0x44f TCP_BBR_STARTUP_EXIT_EPOCH = 0x43d TCP_BBR_STARTUP_LOSS_EXIT = 0x432 TCP_BBR_STARTUP_PG = 0x42d - TCP_BBR_TMR_PACE_OH = 0x448 - TCP_BBR_TSLIMITS = 0x434 - TCP_BBR_TSTMP_RAISES = 0x456 TCP_BBR_UNLIMITED = 0x43b TCP_BBR_USEDEL_RATE = 0x437 TCP_BBR_USE_LOWGAIN = 0x433 - TCP_BBR_USE_RACK_CHEAT = 0x450 - TCP_BBR_UTTER_MAX_TSO = 0x452 TCP_CA_NAME_MAX = 0x10 TCP_CCALGOOPT = 0x41 TCP_CONGESTION = 0x40 @@ -1633,7 +1542,6 @@ const ( TCP_PCAP_OUT = 0x800 TCP_RACK_EARLY_RECOV = 0x423 TCP_RACK_EARLY_SEG = 0x424 - TCP_RACK_GP_INCREASE = 0x446 TCP_RACK_IDLE_REDUCE_HIGH = 0x444 TCP_RACK_MIN_PACE = 0x445 TCP_RACK_MIN_PACE_SEG = 0x446 @@ -1647,6 +1555,7 @@ const ( TCP_RACK_PRR_SENDALOT = 0x421 TCP_RACK_REORD_FADE = 0x426 TCP_RACK_REORD_THRESH = 0x425 + TCP_RACK_SESS_CWV = 0x42a TCP_RACK_TLP_INC_VAR = 0x429 TCP_RACK_TLP_REDUCE = 0x41c TCP_RACK_TLP_THRESH = 0x427 @@ -1784,13 +1693,12 @@ const ( EIDRM = syscall.Errno(0x52) EILSEQ = syscall.Errno(0x56) EINPROGRESS = syscall.Errno(0x24) - EINTEGRITY = syscall.Errno(0x61) EINTR = syscall.Errno(0x4) EINVAL = syscall.Errno(0x16) EIO = syscall.Errno(0x5) EISCONN = syscall.Errno(0x38) EISDIR = syscall.Errno(0x15) - ELAST = syscall.Errno(0x61) + ELAST = syscall.Errno(0x60) ELOOP = syscall.Errno(0x3e) EMFILE = syscall.Errno(0x18) EMLINK = syscall.Errno(0x1f) @@ -1933,7 +1841,7 @@ var errorList = [...]struct { {32, "EPIPE", "broken pipe"}, {33, "EDOM", "numerical argument out of domain"}, {34, "ERANGE", "result too large"}, - {35, "EWOULDBLOCK", "resource temporarily unavailable"}, + {35, "EAGAIN", "resource temporarily unavailable"}, {36, "EINPROGRESS", "operation now in progress"}, {37, "EALREADY", "operation already in progress"}, {38, "ENOTSOCK", "socket operation on non-socket"}, @@ -1995,7 +1903,6 @@ var errorList = [...]struct { {94, "ECAPMODE", "not permitted in capability mode"}, {95, "ENOTRECOVERABLE", "state not recoverable"}, {96, "EOWNERDEAD", "previous owner died"}, - {97, "EINTEGRITY", "integrity check failed"}, } // Signal table diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go index 777b69d..99e9a0e 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm.go @@ -151,7 +151,6 @@ const ( BIOCSETF = 0x80084267 BIOCSETFNR = 0x80084282 BIOCSETIF = 0x8020426c - BIOCSETVLANPCP = 0x80044285 BIOCSETWF = 0x8008427b BIOCSETZBUF = 0x800c4281 BIOCSHDRCMPLT = 0x80044275 @@ -363,7 +362,7 @@ const ( CTL_KERN = 0x1 CTL_MAXNAME = 0x18 CTL_NET = 0x4 - DIOCGATTR = 0xc148648e + DIOCGATTR = 0xc144648e DIOCGDELETE = 0x80106488 DIOCGFLUSH = 0x20006487 DIOCGFRONTSTUFF = 0x40086486 @@ -378,7 +377,7 @@ const ( DIOCGSTRIPESIZE = 0x4008648b DIOCSKERNELDUMP = 0x804c6490 DIOCSKERNELDUMP_FREEBSD11 = 0x80046485 - DIOCZONECMD = 0xc078648f + DIOCZONECMD = 0xc06c648f DLT_A429 = 0xb8 DLT_A653_ICM = 0xb9 DLT_AIRONET_HEADER = 0x78 @@ -408,9 +407,7 @@ const ( DLT_C_HDLC_WITH_DIR = 0xcd DLT_DBUS = 0xe7 DLT_DECT = 0xdd - DLT_DISPLAYPORT_AUX = 0x113 DLT_DOCSIS = 0x8f - DLT_DOCSIS31_XRA31 = 0x111 DLT_DVB_CI = 0xeb DLT_ECONET = 0x73 DLT_EN10MB = 0x1 @@ -420,7 +417,6 @@ const ( DLT_ERF = 0xc5 DLT_ERF_ETH = 0xaf DLT_ERF_POS = 0xb0 - DLT_ETHERNET_MPACKET = 0x112 DLT_FC_2 = 0xe0 DLT_FC_2_WITH_FRAME_DELIMS = 0xe1 DLT_FDDI = 0xa @@ -448,7 +444,7 @@ const ( DLT_IEEE802_16_MAC_CPS_RADIO = 0xc1 DLT_INFINIBAND = 0xf7 DLT_IPFILTER = 0x74 - DLT_IPMB_KONTRON = 0xc7 + DLT_IPMB = 0xc7 DLT_IPMB_LINUX = 0xd1 DLT_IPMI_HPM_2 = 0x104 DLT_IPNET = 0xe2 @@ -488,11 +484,9 @@ const ( DLT_LINUX_LAPD = 0xb1 DLT_LINUX_PPP_WITHDIRECTION = 0xa6 DLT_LINUX_SLL = 0x71 - DLT_LINUX_SLL2 = 0x114 DLT_LOOP = 0x6c - DLT_LORATAP = 0x10e DLT_LTALK = 0x72 - DLT_MATCHING_MAX = 0x114 + DLT_MATCHING_MAX = 0x109 DLT_MATCHING_MIN = 0x68 DLT_MFR = 0xb6 DLT_MOST = 0xd3 @@ -508,9 +502,7 @@ const ( DLT_NFC_LLCP = 0xf5 DLT_NFLOG = 0xef DLT_NG40 = 0xf4 - DLT_NORDIC_BLE = 0x110 DLT_NULL = 0x0 - DLT_OPENFLOW = 0x10b DLT_PCI_EXP = 0x7d DLT_PFLOG = 0x75 DLT_PFSYNC = 0x79 @@ -534,18 +526,15 @@ const ( DLT_RTAC_SERIAL = 0xfa DLT_SCCP = 0x8e DLT_SCTP = 0xf8 - DLT_SDLC = 0x10c DLT_SITA = 0xc4 DLT_SLIP = 0x8 DLT_SLIP_BSDOS = 0xd DLT_STANAG_5066_D_PDU = 0xed DLT_SUNATM = 0x7b DLT_SYMANTEC_FIREWALL = 0x63 - DLT_TI_LLN_SNIFFER = 0x10d DLT_TZSP = 0x80 DLT_USB = 0xba DLT_USBPCAP = 0xf9 - DLT_USB_DARWIN = 0x10a DLT_USB_FREEBSD = 0xba DLT_USB_LINUX = 0xbd DLT_USB_LINUX_MMAPPED = 0xdc @@ -565,7 +554,6 @@ const ( DLT_USER7 = 0x9a DLT_USER8 = 0x9b DLT_USER9 = 0x9c - DLT_VSOCK = 0x10f DLT_WATTSTOPPER_DLM = 0x107 DLT_WIHART = 0xdf DLT_WIRESHARK_UPPER_PDU = 0xfc @@ -590,7 +578,6 @@ const ( ECHONL = 0x10 ECHOPRT = 0x20 EVFILT_AIO = -0x3 - EVFILT_EMPTY = -0xd EVFILT_FS = -0x9 EVFILT_LIO = -0xa EVFILT_PROC = -0x5 @@ -598,12 +585,11 @@ const ( EVFILT_READ = -0x1 EVFILT_SENDFILE = -0xc EVFILT_SIGNAL = -0x6 - EVFILT_SYSCOUNT = 0xd + EVFILT_SYSCOUNT = 0xc EVFILT_TIMER = -0x7 EVFILT_USER = -0xb EVFILT_VNODE = -0x4 EVFILT_WRITE = -0x2 - EVNAMEMAP_NAME_SIZE = 0x40 EV_ADD = 0x1 EV_CLEAR = 0x20 EV_DELETE = 0x2 @@ -620,7 +606,6 @@ const ( EV_RECEIPT = 0x40 EV_SYSFLAGS = 0xf000 EXTA = 0x4b00 - EXTATTR_MAXNAMELEN = 0xff EXTATTR_NAMESPACE_EMPTY = 0x0 EXTATTR_NAMESPACE_SYSTEM = 0x2 EXTATTR_NAMESPACE_USER = 0x1 @@ -662,7 +647,6 @@ const ( IEXTEN = 0x400 IFAN_ARRIVAL = 0x0 IFAN_DEPARTURE = 0x1 - IFCAP_WOL_MAGIC = 0x2000 IFF_ALLMULTI = 0x200 IFF_ALTPHYS = 0x4000 IFF_BROADCAST = 0x2 @@ -679,7 +663,6 @@ const ( IFF_MONITOR = 0x40000 IFF_MULTICAST = 0x8000 IFF_NOARP = 0x80 - IFF_NOGROUP = 0x800000 IFF_OACTIVE = 0x400 IFF_POINTOPOINT = 0x10 IFF_PPROMISC = 0x20000 @@ -736,7 +719,6 @@ const ( IPPROTO_CMTP = 0x26 IPPROTO_CPHB = 0x49 IPPROTO_CPNX = 0x48 - IPPROTO_DCCP = 0x21 IPPROTO_DDP = 0x25 IPPROTO_DGP = 0x56 IPPROTO_DIVERT = 0x102 @@ -817,6 +799,7 @@ const ( IPPROTO_SCTP = 0x84 IPPROTO_SDRP = 0x2a IPPROTO_SEND = 0x103 + IPPROTO_SEP = 0x21 IPPROTO_SHIM6 = 0x8c IPPROTO_SKIP = 0x39 IPPROTO_SPACER = 0x7fff @@ -854,7 +837,6 @@ const ( IPV6_DSTOPTS = 0x32 IPV6_FLOWID = 0x43 IPV6_FLOWINFO_MASK = 0xffffff0f - IPV6_FLOWLABEL_LEN = 0x14 IPV6_FLOWLABEL_MASK = 0xffff0f00 IPV6_FLOWTYPE = 0x44 IPV6_FRAGTTL = 0x78 @@ -875,13 +857,13 @@ const ( IPV6_MAX_GROUP_SRC_FILTER = 0x200 IPV6_MAX_MEMBERSHIPS = 0xfff IPV6_MAX_SOCK_SRC_FILTER = 0x80 + IPV6_MIN_MEMBERSHIPS = 0x1f IPV6_MMTU = 0x500 IPV6_MSFILTER = 0x4a IPV6_MULTICAST_HOPS = 0xa IPV6_MULTICAST_IF = 0x9 IPV6_MULTICAST_LOOP = 0xb IPV6_NEXTHOP = 0x30 - IPV6_ORIGDSTADDR = 0x48 IPV6_PATHMTU = 0x2c IPV6_PKTINFO = 0x2e IPV6_PORTRANGE = 0xe @@ -893,7 +875,6 @@ const ( IPV6_RECVFLOWID = 0x46 IPV6_RECVHOPLIMIT = 0x25 IPV6_RECVHOPOPTS = 0x27 - IPV6_RECVORIGDSTADDR = 0x48 IPV6_RECVPATHMTU = 0x2b IPV6_RECVPKTINFO = 0x24 IPV6_RECVRSSBUCKETID = 0x47 @@ -913,7 +894,6 @@ const ( IPV6_V6ONLY = 0x1b IPV6_VERSION = 0x60 IPV6_VERSION_MASK = 0xf0 - IPV6_VLAN_PCP = 0x4b IP_ADD_MEMBERSHIP = 0xc IP_ADD_SOURCE_MEMBERSHIP = 0x46 IP_BINDANY = 0x18 @@ -955,8 +935,10 @@ const ( IP_MAX_MEMBERSHIPS = 0xfff IP_MAX_SOCK_MUTE_FILTER = 0x80 IP_MAX_SOCK_SRC_FILTER = 0x80 + IP_MAX_SOURCE_FILTER = 0x400 IP_MF = 0x2000 IP_MINTTL = 0x42 + IP_MIN_MEMBERSHIPS = 0x1f IP_MSFILTER = 0x4a IP_MSS = 0x240 IP_MULTICAST_IF = 0x9 @@ -966,7 +948,6 @@ const ( IP_OFFMASK = 0x1fff IP_ONESBCAST = 0x17 IP_OPTIONS = 0x1 - IP_ORIGDSTADDR = 0x1b IP_PORTRANGE = 0x13 IP_PORTRANGE_DEFAULT = 0x0 IP_PORTRANGE_HIGH = 0x1 @@ -975,7 +956,6 @@ const ( IP_RECVFLOWID = 0x5d IP_RECVIF = 0x14 IP_RECVOPTS = 0x5 - IP_RECVORIGDSTADDR = 0x1b IP_RECVRETOPTS = 0x6 IP_RECVRSSBUCKETID = 0x5e IP_RECVTOS = 0x44 @@ -992,12 +972,8 @@ const ( IP_TOS = 0x3 IP_TTL = 0x4 IP_UNBLOCK_SOURCE = 0x49 - IP_VLAN_PCP = 0x4b ISIG = 0x80 ISTRIP = 0x20 - ITIMER_PROF = 0x2 - ITIMER_REAL = 0x0 - ITIMER_VIRTUAL = 0x1 IXANY = 0x800 IXOFF = 0x400 IXON = 0x200 @@ -1007,6 +983,7 @@ const ( KERN_VERSION = 0x4 LOCAL_CONNWAIT = 0x4 LOCAL_CREDS = 0x2 + LOCAL_CREDS_PERSISTENT = 0x3 LOCAL_PEERCRED = 0x1 LOCAL_VENDOR = 0x80000000 LOCK_EX = 0x2 @@ -1094,12 +1071,10 @@ const ( MNT_SUSPEND = 0x4 MNT_SYNCHRONOUS = 0x2 MNT_UNION = 0x20 - MNT_UNTRUSTED = 0x800000000 MNT_UPDATE = 0x10000 - MNT_UPDATEMASK = 0xad8d0807e + MNT_UPDATEMASK = 0x2d8d0807e MNT_USER = 0x8000 - MNT_VERIFIED = 0x400000000 - MNT_VISFLAGMASK = 0xffef0ffff + MNT_VISFLAGMASK = 0x3fef0ffff MNT_WAIT = 0x1 MSG_CMSG_CLOEXEC = 0x40000 MSG_COMPAT = 0x8000 @@ -1128,7 +1103,6 @@ const ( NFDBITS = 0x20 NOFLSH = 0x80000000 NOKERNINFO = 0x2000000 - NOTE_ABSTIME = 0x10 NOTE_ATTRIB = 0x8 NOTE_CHILD = 0x4 NOTE_CLOSE = 0x100 @@ -1185,8 +1159,6 @@ const ( O_NONBLOCK = 0x4 O_RDONLY = 0x0 O_RDWR = 0x2 - O_RESOLVE_BENEATH = 0x800000 - O_SEARCH = 0x40000 O_SHLOCK = 0x10 O_SYNC = 0x80 O_TRUNC = 0x400 @@ -1197,10 +1169,6 @@ const ( PARMRK = 0x8 PARODD = 0x2000 PENDIN = 0x20000000 - PIOD_READ_D = 0x1 - PIOD_READ_I = 0x3 - PIOD_WRITE_D = 0x2 - PIOD_WRITE_I = 0x4 PRIO_PGRP = 0x1 PRIO_PROCESS = 0x0 PRIO_USER = 0x2 @@ -1208,53 +1176,6 @@ const ( PROT_NONE = 0x0 PROT_READ = 0x1 PROT_WRITE = 0x2 - PTRACE_DEFAULT = 0x1 - PTRACE_EXEC = 0x1 - PTRACE_FORK = 0x8 - PTRACE_LWP = 0x10 - PTRACE_SCE = 0x2 - PTRACE_SCX = 0x4 - PTRACE_SYSCALL = 0x6 - PTRACE_VFORK = 0x20 - PT_ATTACH = 0xa - PT_CLEARSTEP = 0x10 - PT_CONTINUE = 0x7 - PT_DETACH = 0xb - PT_FIRSTMACH = 0x40 - PT_FOLLOW_FORK = 0x17 - PT_GETDBREGS = 0x25 - PT_GETFPREGS = 0x23 - PT_GETLWPLIST = 0xf - PT_GETNUMLWPS = 0xe - PT_GETREGS = 0x21 - PT_GETVFPREGS = 0x40 - PT_GET_EVENT_MASK = 0x19 - PT_GET_SC_ARGS = 0x1b - PT_GET_SC_RET = 0x1c - PT_IO = 0xc - PT_KILL = 0x8 - PT_LWPINFO = 0xd - PT_LWP_EVENTS = 0x18 - PT_READ_D = 0x2 - PT_READ_I = 0x1 - PT_RESUME = 0x13 - PT_SETDBREGS = 0x26 - PT_SETFPREGS = 0x24 - PT_SETREGS = 0x22 - PT_SETSTEP = 0x11 - PT_SETVFPREGS = 0x41 - PT_SET_EVENT_MASK = 0x1a - PT_STEP = 0x9 - PT_SUSPEND = 0x12 - PT_SYSCALL = 0x16 - PT_TO_SCE = 0x14 - PT_TO_SCX = 0x15 - PT_TRACE_ME = 0x0 - PT_VM_ENTRY = 0x29 - PT_VM_TIMESTAMP = 0x28 - PT_WRITE_D = 0x5 - PT_WRITE_I = 0x4 - P_ZONEID = 0xc RLIMIT_AS = 0xa RLIMIT_CORE = 0x4 RLIMIT_CPU = 0x0 @@ -1336,6 +1257,7 @@ const ( RTV_WEIGHT = 0x100 RT_ALL_FIBS = -0x1 RT_BLACKHOLE = 0x40 + RT_CACHING_CONTEXT = 0x1 RT_DEFAULT_FIB = 0x0 RT_HAS_GW = 0x80 RT_HAS_HEADER = 0x10 @@ -1345,17 +1267,15 @@ const ( RT_LLE_CACHE = 0x100 RT_MAY_LOOP = 0x8 RT_MAY_LOOP_BIT = 0x3 + RT_NORTREF = 0x2 RT_REJECT = 0x20 RUSAGE_CHILDREN = -0x1 RUSAGE_SELF = 0x0 RUSAGE_THREAD = 0x1 SCM_BINTIME = 0x4 SCM_CREDS = 0x3 - SCM_MONOTONIC = 0x6 - SCM_REALTIME = 0x5 SCM_RIGHTS = 0x1 SCM_TIMESTAMP = 0x2 - SCM_TIME_INFO = 0x7 SEEK_CUR = 0x1 SEEK_DATA = 0x3 SEEK_END = 0x2 @@ -1379,12 +1299,10 @@ const ( SIOCGHWADDR = 0xc020693e SIOCGI2C = 0xc020693d SIOCGIFADDR = 0xc0206921 - SIOCGIFALIAS = 0xc044692d SIOCGIFBRDADDR = 0xc0206923 SIOCGIFCAP = 0xc020691f SIOCGIFCONF = 0xc0086924 SIOCGIFDESCR = 0xc020692a - SIOCGIFDOWNREASON = 0xc058699a SIOCGIFDSTADDR = 0xc0206922 SIOCGIFFIB = 0xc020695c SIOCGIFFLAGS = 0xc0206911 @@ -1400,11 +1318,8 @@ const ( SIOCGIFPDSTADDR = 0xc0206948 SIOCGIFPHYS = 0xc0206935 SIOCGIFPSRCADDR = 0xc0206947 - SIOCGIFRSSHASH = 0xc0186997 - SIOCGIFRSSKEY = 0xc0946996 SIOCGIFSTATUS = 0xc331693b SIOCGIFXMEDIA = 0xc028698b - SIOCGLANPCP = 0xc0206998 SIOCGLOWAT = 0x40047303 SIOCGPGRP = 0x40047309 SIOCGPRIVATE_0 = 0xc0206950 @@ -1435,7 +1350,6 @@ const ( SIOCSIFPHYS = 0x80206936 SIOCSIFRVNET = 0xc020695b SIOCSIFVNET = 0xc020695a - SIOCSLANPCP = 0x80206999 SIOCSLOWAT = 0x80047302 SIOCSPGRP = 0x80047308 SIOCSTUNFIB = 0x8020695f @@ -1455,7 +1369,6 @@ const ( SO_BINTIME = 0x2000 SO_BROADCAST = 0x20 SO_DEBUG = 0x1 - SO_DOMAIN = 0x1019 SO_DONTROUTE = 0x10 SO_ERROR = 0x1007 SO_KEEPALIVE = 0x8 @@ -1464,7 +1377,6 @@ const ( SO_LISTENINCQLEN = 0x1013 SO_LISTENQLEN = 0x1012 SO_LISTENQLIMIT = 0x1011 - SO_MAX_PACING_RATE = 0x1018 SO_NOSIGPIPE = 0x800 SO_NO_DDP = 0x8000 SO_NO_OFFLOAD = 0x4000 @@ -1475,22 +1387,13 @@ const ( SO_RCVBUF = 0x1002 SO_RCVLOWAT = 0x1004 SO_RCVTIMEO = 0x1006 - SO_RERROR = 0x20000 SO_REUSEADDR = 0x4 SO_REUSEPORT = 0x200 - SO_REUSEPORT_LB = 0x10000 SO_SETFIB = 0x1014 SO_SNDBUF = 0x1001 SO_SNDLOWAT = 0x1003 SO_SNDTIMEO = 0x1005 SO_TIMESTAMP = 0x400 - SO_TS_BINTIME = 0x1 - SO_TS_CLOCK = 0x1017 - SO_TS_CLOCK_MAX = 0x3 - SO_TS_DEFAULT = 0x0 - SO_TS_MONOTONIC = 0x3 - SO_TS_REALTIME = 0x2 - SO_TS_REALTIME_MICRO = 0x0 SO_TYPE = 0x1008 SO_USELOOPBACK = 0x40 SO_USER_COOKIE = 0x1015 @@ -1534,69 +1437,10 @@ const ( TCOFLUSH = 0x2 TCOOFF = 0x1 TCOON = 0x2 - TCPOPT_EOL = 0x0 - TCPOPT_FAST_OPEN = 0x22 - TCPOPT_MAXSEG = 0x2 - TCPOPT_NOP = 0x1 - TCPOPT_PAD = 0x0 - TCPOPT_SACK = 0x5 - TCPOPT_SACK_PERMITTED = 0x4 - TCPOPT_SIGNATURE = 0x13 - TCPOPT_TIMESTAMP = 0x8 - TCPOPT_WINDOW = 0x3 - TCP_BBR_ACK_COMP_ALG = 0x448 - TCP_BBR_ALGORITHM = 0x43b - TCP_BBR_DRAIN_INC_EXTRA = 0x43c - TCP_BBR_DRAIN_PG = 0x42e - TCP_BBR_EXTRA_GAIN = 0x449 - TCP_BBR_EXTRA_STATE = 0x453 - TCP_BBR_FLOOR_MIN_TSO = 0x454 - TCP_BBR_HDWR_PACE = 0x451 - TCP_BBR_HOLD_TARGET = 0x436 - TCP_BBR_IWINTSO = 0x42b - TCP_BBR_LOWGAIN_FD = 0x436 - TCP_BBR_LOWGAIN_HALF = 0x435 - TCP_BBR_LOWGAIN_THRESH = 0x434 - TCP_BBR_MAX_RTO = 0x439 - TCP_BBR_MIN_RTO = 0x438 - TCP_BBR_MIN_TOPACEOUT = 0x455 - TCP_BBR_ONE_RETRAN = 0x431 - TCP_BBR_PACE_CROSS = 0x442 - TCP_BBR_PACE_DEL_TAR = 0x43f - TCP_BBR_PACE_OH = 0x435 - TCP_BBR_PACE_PER_SEC = 0x43e - TCP_BBR_PACE_SEG_MAX = 0x440 - TCP_BBR_PACE_SEG_MIN = 0x441 - TCP_BBR_POLICER_DETECT = 0x457 - TCP_BBR_PROBE_RTT_GAIN = 0x44d - TCP_BBR_PROBE_RTT_INT = 0x430 - TCP_BBR_PROBE_RTT_LEN = 0x44e - TCP_BBR_RACK_RTT_USE = 0x44a - TCP_BBR_RECFORCE = 0x42c - TCP_BBR_REC_OVER_HPTS = 0x43a - TCP_BBR_RETRAN_WTSO = 0x44b - TCP_BBR_RWND_IS_APP = 0x42f - TCP_BBR_SEND_IWND_IN_TSO = 0x44f - TCP_BBR_STARTUP_EXIT_EPOCH = 0x43d - TCP_BBR_STARTUP_LOSS_EXIT = 0x432 - TCP_BBR_STARTUP_PG = 0x42d - TCP_BBR_TMR_PACE_OH = 0x448 - TCP_BBR_TSLIMITS = 0x434 - TCP_BBR_TSTMP_RAISES = 0x456 - TCP_BBR_UNLIMITED = 0x43b - TCP_BBR_USEDEL_RATE = 0x437 - TCP_BBR_USE_LOWGAIN = 0x433 - TCP_BBR_USE_RACK_CHEAT = 0x450 - TCP_BBR_UTTER_MAX_TSO = 0x452 TCP_CA_NAME_MAX = 0x10 TCP_CCALGOOPT = 0x41 TCP_CONGESTION = 0x40 - TCP_DATA_AFTER_CLOSE = 0x44c - TCP_DELACK = 0x48 TCP_FASTOPEN = 0x401 - TCP_FASTOPEN_MAX_COOKIE_LEN = 0x10 - TCP_FASTOPEN_MIN_COOKIE_LEN = 0x4 - TCP_FASTOPEN_PSK_LEN = 0x10 TCP_FUNCTION_BLK = 0x2000 TCP_FUNCTION_NAME_LEN_MAX = 0x20 TCP_INFO = 0x20 @@ -1604,12 +1448,6 @@ const ( TCP_KEEPIDLE = 0x100 TCP_KEEPINIT = 0x80 TCP_KEEPINTVL = 0x200 - TCP_LOG = 0x22 - TCP_LOGBUF = 0x23 - TCP_LOGDUMP = 0x25 - TCP_LOGDUMPID = 0x26 - TCP_LOGID = 0x24 - TCP_LOG_ID_LEN = 0x40 TCP_MAXBURST = 0x4 TCP_MAXHLEN = 0x3c TCP_MAXOLEN = 0x28 @@ -1625,30 +1463,8 @@ const ( TCP_NOPUSH = 0x4 TCP_PCAP_IN = 0x1000 TCP_PCAP_OUT = 0x800 - TCP_RACK_EARLY_RECOV = 0x423 - TCP_RACK_EARLY_SEG = 0x424 - TCP_RACK_GP_INCREASE = 0x446 - TCP_RACK_IDLE_REDUCE_HIGH = 0x444 - TCP_RACK_MIN_PACE = 0x445 - TCP_RACK_MIN_PACE_SEG = 0x446 - TCP_RACK_MIN_TO = 0x422 - TCP_RACK_PACE_ALWAYS = 0x41f - TCP_RACK_PACE_MAX_SEG = 0x41e - TCP_RACK_PACE_REDUCE = 0x41d - TCP_RACK_PKT_DELAY = 0x428 - TCP_RACK_PROP = 0x41b - TCP_RACK_PROP_RATE = 0x420 - TCP_RACK_PRR_SENDALOT = 0x421 - TCP_RACK_REORD_FADE = 0x426 - TCP_RACK_REORD_THRESH = 0x425 - TCP_RACK_TLP_INC_VAR = 0x429 - TCP_RACK_TLP_REDUCE = 0x41c - TCP_RACK_TLP_THRESH = 0x427 - TCP_RACK_TLP_USE = 0x447 TCP_VENDOR = 0x80000000 TCSAFLUSH = 0x2 - TIMER_ABSTIME = 0x1 - TIMER_RELTIME = 0x0 TIOCCBRK = 0x2000747a TIOCCDTR = 0x20007478 TIOCCONS = 0x80047462 @@ -1712,8 +1528,6 @@ const ( TIOCTIMESTAMP = 0x40107459 TIOCUCNTL = 0x80047466 TOSTOP = 0x400000 - UTIME_NOW = -0x1 - UTIME_OMIT = -0x2 VDISCARD = 0xf VDSUSP = 0xb VEOF = 0x0 @@ -1778,13 +1592,12 @@ const ( EIDRM = syscall.Errno(0x52) EILSEQ = syscall.Errno(0x56) EINPROGRESS = syscall.Errno(0x24) - EINTEGRITY = syscall.Errno(0x61) EINTR = syscall.Errno(0x4) EINVAL = syscall.Errno(0x16) EIO = syscall.Errno(0x5) EISCONN = syscall.Errno(0x38) EISDIR = syscall.Errno(0x15) - ELAST = syscall.Errno(0x61) + ELAST = syscall.Errno(0x60) ELOOP = syscall.Errno(0x3e) EMFILE = syscall.Errno(0x18) EMLINK = syscall.Errno(0x1f) @@ -1927,7 +1740,7 @@ var errorList = [...]struct { {32, "EPIPE", "broken pipe"}, {33, "EDOM", "numerical argument out of domain"}, {34, "ERANGE", "result too large"}, - {35, "EWOULDBLOCK", "resource temporarily unavailable"}, + {35, "EAGAIN", "resource temporarily unavailable"}, {36, "EINPROGRESS", "operation now in progress"}, {37, "EALREADY", "operation already in progress"}, {38, "ENOTSOCK", "socket operation on non-socket"}, @@ -1989,7 +1802,6 @@ var errorList = [...]struct { {94, "ECAPMODE", "not permitted in capability mode"}, {95, "ENOTRECOVERABLE", "state not recoverable"}, {96, "EOWNERDEAD", "previous owner died"}, - {97, "EINTEGRITY", "integrity check failed"}, } // Signal table diff --git a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go index c557ac2..4c83771 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_freebsd_arm64.go @@ -151,7 +151,6 @@ const ( BIOCSETF = 0x80104267 BIOCSETFNR = 0x80104282 BIOCSETIF = 0x8020426c - BIOCSETVLANPCP = 0x80044285 BIOCSETWF = 0x8010427b BIOCSETZBUF = 0x80184281 BIOCSHDRCMPLT = 0x80044275 @@ -448,7 +447,7 @@ const ( DLT_IEEE802_16_MAC_CPS_RADIO = 0xc1 DLT_INFINIBAND = 0xf7 DLT_IPFILTER = 0x74 - DLT_IPMB_KONTRON = 0xc7 + DLT_IPMB = 0xc7 DLT_IPMB_LINUX = 0xd1 DLT_IPMI_HPM_2 = 0x104 DLT_IPNET = 0xe2 @@ -488,11 +487,10 @@ const ( DLT_LINUX_LAPD = 0xb1 DLT_LINUX_PPP_WITHDIRECTION = 0xa6 DLT_LINUX_SLL = 0x71 - DLT_LINUX_SLL2 = 0x114 DLT_LOOP = 0x6c DLT_LORATAP = 0x10e DLT_LTALK = 0x72 - DLT_MATCHING_MAX = 0x114 + DLT_MATCHING_MAX = 0x113 DLT_MATCHING_MIN = 0x68 DLT_MFR = 0xb6 DLT_MOST = 0xd3 @@ -736,7 +734,6 @@ const ( IPPROTO_CMTP = 0x26 IPPROTO_CPHB = 0x49 IPPROTO_CPNX = 0x48 - IPPROTO_DCCP = 0x21 IPPROTO_DDP = 0x25 IPPROTO_DGP = 0x56 IPPROTO_DIVERT = 0x102 @@ -817,6 +814,7 @@ const ( IPPROTO_SCTP = 0x84 IPPROTO_SDRP = 0x2a IPPROTO_SEND = 0x103 + IPPROTO_SEP = 0x21 IPPROTO_SHIM6 = 0x8c IPPROTO_SKIP = 0x39 IPPROTO_SPACER = 0x7fff @@ -913,7 +911,6 @@ const ( IPV6_V6ONLY = 0x1b IPV6_VERSION = 0x60 IPV6_VERSION_MASK = 0xf0 - IPV6_VLAN_PCP = 0x4b IP_ADD_MEMBERSHIP = 0xc IP_ADD_SOURCE_MEMBERSHIP = 0x46 IP_BINDANY = 0x18 @@ -992,12 +989,8 @@ const ( IP_TOS = 0x3 IP_TTL = 0x4 IP_UNBLOCK_SOURCE = 0x49 - IP_VLAN_PCP = 0x4b ISIG = 0x80 ISTRIP = 0x20 - ITIMER_PROF = 0x2 - ITIMER_REAL = 0x0 - ITIMER_VIRTUAL = 0x1 IXANY = 0x800 IXOFF = 0x400 IXON = 0x200 @@ -1007,6 +1000,7 @@ const ( KERN_VERSION = 0x4 LOCAL_CONNWAIT = 0x4 LOCAL_CREDS = 0x2 + LOCAL_CREDS_PERSISTENT = 0x3 LOCAL_PEERCRED = 0x1 LOCAL_VENDOR = 0x80000000 LOCK_EX = 0x2 @@ -1186,8 +1180,6 @@ const ( O_NONBLOCK = 0x4 O_RDONLY = 0x0 O_RDWR = 0x2 - O_RESOLVE_BENEATH = 0x800000 - O_SEARCH = 0x40000 O_SHLOCK = 0x10 O_SYNC = 0x80 O_TRUNC = 0x400 @@ -1198,10 +1190,6 @@ const ( PARMRK = 0x8 PARODD = 0x2000 PENDIN = 0x20000000 - PIOD_READ_D = 0x1 - PIOD_READ_I = 0x3 - PIOD_WRITE_D = 0x2 - PIOD_WRITE_I = 0x4 PRIO_PGRP = 0x1 PRIO_PROCESS = 0x0 PRIO_USER = 0x2 @@ -1209,51 +1197,6 @@ const ( PROT_NONE = 0x0 PROT_READ = 0x1 PROT_WRITE = 0x2 - PTRACE_DEFAULT = 0x1 - PTRACE_EXEC = 0x1 - PTRACE_FORK = 0x8 - PTRACE_LWP = 0x10 - PTRACE_SCE = 0x2 - PTRACE_SCX = 0x4 - PTRACE_SYSCALL = 0x6 - PTRACE_VFORK = 0x20 - PT_ATTACH = 0xa - PT_CLEARSTEP = 0x10 - PT_CONTINUE = 0x7 - PT_DETACH = 0xb - PT_FIRSTMACH = 0x40 - PT_FOLLOW_FORK = 0x17 - PT_GETDBREGS = 0x25 - PT_GETFPREGS = 0x23 - PT_GETLWPLIST = 0xf - PT_GETNUMLWPS = 0xe - PT_GETREGS = 0x21 - PT_GET_EVENT_MASK = 0x19 - PT_GET_SC_ARGS = 0x1b - PT_GET_SC_RET = 0x1c - PT_IO = 0xc - PT_KILL = 0x8 - PT_LWPINFO = 0xd - PT_LWP_EVENTS = 0x18 - PT_READ_D = 0x2 - PT_READ_I = 0x1 - PT_RESUME = 0x13 - PT_SETDBREGS = 0x26 - PT_SETFPREGS = 0x24 - PT_SETREGS = 0x22 - PT_SETSTEP = 0x11 - PT_SET_EVENT_MASK = 0x1a - PT_STEP = 0x9 - PT_SUSPEND = 0x12 - PT_SYSCALL = 0x16 - PT_TO_SCE = 0x14 - PT_TO_SCX = 0x15 - PT_TRACE_ME = 0x0 - PT_VM_ENTRY = 0x29 - PT_VM_TIMESTAMP = 0x28 - PT_WRITE_D = 0x5 - PT_WRITE_I = 0x4 - P_ZONEID = 0xc RLIMIT_AS = 0xa RLIMIT_CORE = 0x4 RLIMIT_CPU = 0x0 @@ -1378,12 +1321,10 @@ const ( SIOCGHWADDR = 0xc020693e SIOCGI2C = 0xc020693d SIOCGIFADDR = 0xc0206921 - SIOCGIFALIAS = 0xc044692d SIOCGIFBRDADDR = 0xc0206923 SIOCGIFCAP = 0xc020691f SIOCGIFCONF = 0xc0106924 SIOCGIFDESCR = 0xc020692a - SIOCGIFDOWNREASON = 0xc058699a SIOCGIFDSTADDR = 0xc0206922 SIOCGIFFIB = 0xc020695c SIOCGIFFLAGS = 0xc0206911 @@ -1474,7 +1415,6 @@ const ( SO_RCVBUF = 0x1002 SO_RCVLOWAT = 0x1004 SO_RCVTIMEO = 0x1006 - SO_RERROR = 0x20000 SO_REUSEADDR = 0x4 SO_REUSEPORT = 0x200 SO_REUSEPORT_LB = 0x10000 @@ -1533,40 +1473,22 @@ const ( TCOFLUSH = 0x2 TCOOFF = 0x1 TCOON = 0x2 - TCPOPT_EOL = 0x0 - TCPOPT_FAST_OPEN = 0x22 - TCPOPT_MAXSEG = 0x2 - TCPOPT_NOP = 0x1 - TCPOPT_PAD = 0x0 - TCPOPT_SACK = 0x5 - TCPOPT_SACK_PERMITTED = 0x4 - TCPOPT_SIGNATURE = 0x13 - TCPOPT_TIMESTAMP = 0x8 - TCPOPT_WINDOW = 0x3 TCP_BBR_ACK_COMP_ALG = 0x448 - TCP_BBR_ALGORITHM = 0x43b TCP_BBR_DRAIN_INC_EXTRA = 0x43c TCP_BBR_DRAIN_PG = 0x42e TCP_BBR_EXTRA_GAIN = 0x449 - TCP_BBR_EXTRA_STATE = 0x453 - TCP_BBR_FLOOR_MIN_TSO = 0x454 - TCP_BBR_HDWR_PACE = 0x451 - TCP_BBR_HOLD_TARGET = 0x436 TCP_BBR_IWINTSO = 0x42b TCP_BBR_LOWGAIN_FD = 0x436 TCP_BBR_LOWGAIN_HALF = 0x435 TCP_BBR_LOWGAIN_THRESH = 0x434 TCP_BBR_MAX_RTO = 0x439 TCP_BBR_MIN_RTO = 0x438 - TCP_BBR_MIN_TOPACEOUT = 0x455 TCP_BBR_ONE_RETRAN = 0x431 TCP_BBR_PACE_CROSS = 0x442 TCP_BBR_PACE_DEL_TAR = 0x43f - TCP_BBR_PACE_OH = 0x435 TCP_BBR_PACE_PER_SEC = 0x43e TCP_BBR_PACE_SEG_MAX = 0x440 TCP_BBR_PACE_SEG_MIN = 0x441 - TCP_BBR_POLICER_DETECT = 0x457 TCP_BBR_PROBE_RTT_GAIN = 0x44d TCP_BBR_PROBE_RTT_INT = 0x430 TCP_BBR_PROBE_RTT_LEN = 0x44e @@ -1575,18 +1497,12 @@ const ( TCP_BBR_REC_OVER_HPTS = 0x43a TCP_BBR_RETRAN_WTSO = 0x44b TCP_BBR_RWND_IS_APP = 0x42f - TCP_BBR_SEND_IWND_IN_TSO = 0x44f TCP_BBR_STARTUP_EXIT_EPOCH = 0x43d TCP_BBR_STARTUP_LOSS_EXIT = 0x432 TCP_BBR_STARTUP_PG = 0x42d - TCP_BBR_TMR_PACE_OH = 0x448 - TCP_BBR_TSLIMITS = 0x434 - TCP_BBR_TSTMP_RAISES = 0x456 TCP_BBR_UNLIMITED = 0x43b TCP_BBR_USEDEL_RATE = 0x437 TCP_BBR_USE_LOWGAIN = 0x433 - TCP_BBR_USE_RACK_CHEAT = 0x450 - TCP_BBR_UTTER_MAX_TSO = 0x452 TCP_CA_NAME_MAX = 0x10 TCP_CCALGOOPT = 0x41 TCP_CONGESTION = 0x40 @@ -1626,7 +1542,6 @@ const ( TCP_PCAP_OUT = 0x800 TCP_RACK_EARLY_RECOV = 0x423 TCP_RACK_EARLY_SEG = 0x424 - TCP_RACK_GP_INCREASE = 0x446 TCP_RACK_IDLE_REDUCE_HIGH = 0x444 TCP_RACK_MIN_PACE = 0x445 TCP_RACK_MIN_PACE_SEG = 0x446 @@ -1640,6 +1555,7 @@ const ( TCP_RACK_PRR_SENDALOT = 0x421 TCP_RACK_REORD_FADE = 0x426 TCP_RACK_REORD_THRESH = 0x425 + TCP_RACK_SESS_CWV = 0x42a TCP_RACK_TLP_INC_VAR = 0x429 TCP_RACK_TLP_REDUCE = 0x41c TCP_RACK_TLP_THRESH = 0x427 @@ -1778,13 +1694,12 @@ const ( EIDRM = syscall.Errno(0x52) EILSEQ = syscall.Errno(0x56) EINPROGRESS = syscall.Errno(0x24) - EINTEGRITY = syscall.Errno(0x61) EINTR = syscall.Errno(0x4) EINVAL = syscall.Errno(0x16) EIO = syscall.Errno(0x5) EISCONN = syscall.Errno(0x38) EISDIR = syscall.Errno(0x15) - ELAST = syscall.Errno(0x61) + ELAST = syscall.Errno(0x60) ELOOP = syscall.Errno(0x3e) EMFILE = syscall.Errno(0x18) EMLINK = syscall.Errno(0x1f) @@ -1927,7 +1842,7 @@ var errorList = [...]struct { {32, "EPIPE", "broken pipe"}, {33, "EDOM", "numerical argument out of domain"}, {34, "ERANGE", "result too large"}, - {35, "EWOULDBLOCK", "resource temporarily unavailable"}, + {35, "EAGAIN", "resource temporarily unavailable"}, {36, "EINPROGRESS", "operation now in progress"}, {37, "EALREADY", "operation already in progress"}, {38, "ENOTSOCK", "socket operation on non-socket"}, @@ -1989,7 +1904,6 @@ var errorList = [...]struct { {94, "ECAPMODE", "not permitted in capability mode"}, {95, "ENOTRECOVERABLE", "state not recoverable"}, {96, "EOWNERDEAD", "previous owner died"}, - {97, "EINTEGRITY", "integrity check failed"}, } // Signal table diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux.go b/vendor/golang.org/x/sys/unix/zerrors_linux.go index 785d693..3de79fa 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux.go @@ -140,306 +140,6 @@ const ( ARPHRD_VOID = 0xffff ARPHRD_VSOCKMON = 0x33a ARPHRD_X25 = 0x10f - AUDIT_ADD = 0x3eb - AUDIT_ADD_RULE = 0x3f3 - AUDIT_ALWAYS = 0x2 - AUDIT_ANOM_ABEND = 0x6a5 - AUDIT_ANOM_CREAT = 0x6a7 - AUDIT_ANOM_LINK = 0x6a6 - AUDIT_ANOM_PROMISCUOUS = 0x6a4 - AUDIT_ARCH = 0xb - AUDIT_ARCH_AARCH64 = 0xc00000b7 - AUDIT_ARCH_ALPHA = 0xc0009026 - AUDIT_ARCH_ARCOMPACT = 0x4000005d - AUDIT_ARCH_ARCOMPACTBE = 0x5d - AUDIT_ARCH_ARCV2 = 0x400000c3 - AUDIT_ARCH_ARCV2BE = 0xc3 - AUDIT_ARCH_ARM = 0x40000028 - AUDIT_ARCH_ARMEB = 0x28 - AUDIT_ARCH_C6X = 0x4000008c - AUDIT_ARCH_C6XBE = 0x8c - AUDIT_ARCH_CRIS = 0x4000004c - AUDIT_ARCH_CSKY = 0x400000fc - AUDIT_ARCH_FRV = 0x5441 - AUDIT_ARCH_H8300 = 0x2e - AUDIT_ARCH_HEXAGON = 0xa4 - AUDIT_ARCH_I386 = 0x40000003 - AUDIT_ARCH_IA64 = 0xc0000032 - AUDIT_ARCH_LOONGARCH32 = 0x40000102 - AUDIT_ARCH_LOONGARCH64 = 0xc0000102 - AUDIT_ARCH_M32R = 0x58 - AUDIT_ARCH_M68K = 0x4 - AUDIT_ARCH_MICROBLAZE = 0xbd - AUDIT_ARCH_MIPS = 0x8 - AUDIT_ARCH_MIPS64 = 0x80000008 - AUDIT_ARCH_MIPS64N32 = 0xa0000008 - AUDIT_ARCH_MIPSEL = 0x40000008 - AUDIT_ARCH_MIPSEL64 = 0xc0000008 - AUDIT_ARCH_MIPSEL64N32 = 0xe0000008 - AUDIT_ARCH_NDS32 = 0x400000a7 - AUDIT_ARCH_NDS32BE = 0xa7 - AUDIT_ARCH_NIOS2 = 0x40000071 - AUDIT_ARCH_OPENRISC = 0x5c - AUDIT_ARCH_PARISC = 0xf - AUDIT_ARCH_PARISC64 = 0x8000000f - AUDIT_ARCH_PPC = 0x14 - AUDIT_ARCH_PPC64 = 0x80000015 - AUDIT_ARCH_PPC64LE = 0xc0000015 - AUDIT_ARCH_RISCV32 = 0x400000f3 - AUDIT_ARCH_RISCV64 = 0xc00000f3 - AUDIT_ARCH_S390 = 0x16 - AUDIT_ARCH_S390X = 0x80000016 - AUDIT_ARCH_SH = 0x2a - AUDIT_ARCH_SH64 = 0x8000002a - AUDIT_ARCH_SHEL = 0x4000002a - AUDIT_ARCH_SHEL64 = 0xc000002a - AUDIT_ARCH_SPARC = 0x2 - AUDIT_ARCH_SPARC64 = 0x8000002b - AUDIT_ARCH_TILEGX = 0xc00000bf - AUDIT_ARCH_TILEGX32 = 0x400000bf - AUDIT_ARCH_TILEPRO = 0x400000bc - AUDIT_ARCH_UNICORE = 0x4000006e - AUDIT_ARCH_X86_64 = 0xc000003e - AUDIT_ARCH_XTENSA = 0x5e - AUDIT_ARG0 = 0xc8 - AUDIT_ARG1 = 0xc9 - AUDIT_ARG2 = 0xca - AUDIT_ARG3 = 0xcb - AUDIT_AVC = 0x578 - AUDIT_AVC_PATH = 0x57a - AUDIT_BITMASK_SIZE = 0x40 - AUDIT_BIT_MASK = 0x8000000 - AUDIT_BIT_TEST = 0x48000000 - AUDIT_BPF = 0x536 - AUDIT_BPRM_FCAPS = 0x529 - AUDIT_CAPSET = 0x52a - AUDIT_CLASS_CHATTR = 0x2 - AUDIT_CLASS_CHATTR_32 = 0x3 - AUDIT_CLASS_DIR_WRITE = 0x0 - AUDIT_CLASS_DIR_WRITE_32 = 0x1 - AUDIT_CLASS_READ = 0x4 - AUDIT_CLASS_READ_32 = 0x5 - AUDIT_CLASS_SIGNAL = 0x8 - AUDIT_CLASS_SIGNAL_32 = 0x9 - AUDIT_CLASS_WRITE = 0x6 - AUDIT_CLASS_WRITE_32 = 0x7 - AUDIT_COMPARE_AUID_TO_EUID = 0x10 - AUDIT_COMPARE_AUID_TO_FSUID = 0xe - AUDIT_COMPARE_AUID_TO_OBJ_UID = 0x5 - AUDIT_COMPARE_AUID_TO_SUID = 0xf - AUDIT_COMPARE_EGID_TO_FSGID = 0x17 - AUDIT_COMPARE_EGID_TO_OBJ_GID = 0x4 - AUDIT_COMPARE_EGID_TO_SGID = 0x18 - AUDIT_COMPARE_EUID_TO_FSUID = 0x12 - AUDIT_COMPARE_EUID_TO_OBJ_UID = 0x3 - AUDIT_COMPARE_EUID_TO_SUID = 0x11 - AUDIT_COMPARE_FSGID_TO_OBJ_GID = 0x9 - AUDIT_COMPARE_FSUID_TO_OBJ_UID = 0x8 - AUDIT_COMPARE_GID_TO_EGID = 0x14 - AUDIT_COMPARE_GID_TO_FSGID = 0x15 - AUDIT_COMPARE_GID_TO_OBJ_GID = 0x2 - AUDIT_COMPARE_GID_TO_SGID = 0x16 - AUDIT_COMPARE_SGID_TO_FSGID = 0x19 - AUDIT_COMPARE_SGID_TO_OBJ_GID = 0x7 - AUDIT_COMPARE_SUID_TO_FSUID = 0x13 - AUDIT_COMPARE_SUID_TO_OBJ_UID = 0x6 - AUDIT_COMPARE_UID_TO_AUID = 0xa - AUDIT_COMPARE_UID_TO_EUID = 0xb - AUDIT_COMPARE_UID_TO_FSUID = 0xc - AUDIT_COMPARE_UID_TO_OBJ_UID = 0x1 - AUDIT_COMPARE_UID_TO_SUID = 0xd - AUDIT_CONFIG_CHANGE = 0x519 - AUDIT_CWD = 0x51b - AUDIT_DAEMON_ABORT = 0x4b2 - AUDIT_DAEMON_CONFIG = 0x4b3 - AUDIT_DAEMON_END = 0x4b1 - AUDIT_DAEMON_START = 0x4b0 - AUDIT_DEL = 0x3ec - AUDIT_DEL_RULE = 0x3f4 - AUDIT_DEVMAJOR = 0x64 - AUDIT_DEVMINOR = 0x65 - AUDIT_DIR = 0x6b - AUDIT_DM_CTRL = 0x53a - AUDIT_DM_EVENT = 0x53b - AUDIT_EGID = 0x6 - AUDIT_EOE = 0x528 - AUDIT_EQUAL = 0x40000000 - AUDIT_EUID = 0x2 - AUDIT_EVENT_LISTENER = 0x537 - AUDIT_EXE = 0x70 - AUDIT_EXECVE = 0x51d - AUDIT_EXIT = 0x67 - AUDIT_FAIL_PANIC = 0x2 - AUDIT_FAIL_PRINTK = 0x1 - AUDIT_FAIL_SILENT = 0x0 - AUDIT_FANOTIFY = 0x533 - AUDIT_FD_PAIR = 0x525 - AUDIT_FEATURE_BITMAP_ALL = 0x7f - AUDIT_FEATURE_BITMAP_BACKLOG_LIMIT = 0x1 - AUDIT_FEATURE_BITMAP_BACKLOG_WAIT_TIME = 0x2 - AUDIT_FEATURE_BITMAP_EXCLUDE_EXTEND = 0x8 - AUDIT_FEATURE_BITMAP_EXECUTABLE_PATH = 0x4 - AUDIT_FEATURE_BITMAP_FILTER_FS = 0x40 - AUDIT_FEATURE_BITMAP_LOST_RESET = 0x20 - AUDIT_FEATURE_BITMAP_SESSIONID_FILTER = 0x10 - AUDIT_FEATURE_CHANGE = 0x530 - AUDIT_FEATURE_LOGINUID_IMMUTABLE = 0x1 - AUDIT_FEATURE_ONLY_UNSET_LOGINUID = 0x0 - AUDIT_FEATURE_VERSION = 0x1 - AUDIT_FIELD_COMPARE = 0x6f - AUDIT_FILETYPE = 0x6c - AUDIT_FILTERKEY = 0xd2 - AUDIT_FILTER_ENTRY = 0x2 - AUDIT_FILTER_EXCLUDE = 0x5 - AUDIT_FILTER_EXIT = 0x4 - AUDIT_FILTER_FS = 0x6 - AUDIT_FILTER_PREPEND = 0x10 - AUDIT_FILTER_TASK = 0x1 - AUDIT_FILTER_TYPE = 0x5 - AUDIT_FILTER_URING_EXIT = 0x7 - AUDIT_FILTER_USER = 0x0 - AUDIT_FILTER_WATCH = 0x3 - AUDIT_FIRST_KERN_ANOM_MSG = 0x6a4 - AUDIT_FIRST_USER_MSG = 0x44c - AUDIT_FIRST_USER_MSG2 = 0x834 - AUDIT_FSGID = 0x8 - AUDIT_FSTYPE = 0x1a - AUDIT_FSUID = 0x4 - AUDIT_GET = 0x3e8 - AUDIT_GET_FEATURE = 0x3fb - AUDIT_GID = 0x5 - AUDIT_GREATER_THAN = 0x20000000 - AUDIT_GREATER_THAN_OR_EQUAL = 0x60000000 - AUDIT_INODE = 0x66 - AUDIT_INTEGRITY_DATA = 0x708 - AUDIT_INTEGRITY_EVM_XATTR = 0x70e - AUDIT_INTEGRITY_HASH = 0x70b - AUDIT_INTEGRITY_METADATA = 0x709 - AUDIT_INTEGRITY_PCR = 0x70c - AUDIT_INTEGRITY_POLICY_RULE = 0x70f - AUDIT_INTEGRITY_RULE = 0x70d - AUDIT_INTEGRITY_STATUS = 0x70a - AUDIT_IPC = 0x517 - AUDIT_IPC_SET_PERM = 0x51f - AUDIT_KERNEL = 0x7d0 - AUDIT_KERNEL_OTHER = 0x524 - AUDIT_KERN_MODULE = 0x532 - AUDIT_LAST_FEATURE = 0x1 - AUDIT_LAST_KERN_ANOM_MSG = 0x707 - AUDIT_LAST_USER_MSG = 0x4af - AUDIT_LAST_USER_MSG2 = 0xbb7 - AUDIT_LESS_THAN = 0x10000000 - AUDIT_LESS_THAN_OR_EQUAL = 0x50000000 - AUDIT_LIST = 0x3ea - AUDIT_LIST_RULES = 0x3f5 - AUDIT_LOGIN = 0x3ee - AUDIT_LOGINUID = 0x9 - AUDIT_LOGINUID_SET = 0x18 - AUDIT_MAC_CALIPSO_ADD = 0x58a - AUDIT_MAC_CALIPSO_DEL = 0x58b - AUDIT_MAC_CIPSOV4_ADD = 0x57f - AUDIT_MAC_CIPSOV4_DEL = 0x580 - AUDIT_MAC_CONFIG_CHANGE = 0x57d - AUDIT_MAC_IPSEC_ADDSA = 0x583 - AUDIT_MAC_IPSEC_ADDSPD = 0x585 - AUDIT_MAC_IPSEC_DELSA = 0x584 - AUDIT_MAC_IPSEC_DELSPD = 0x586 - AUDIT_MAC_IPSEC_EVENT = 0x587 - AUDIT_MAC_MAP_ADD = 0x581 - AUDIT_MAC_MAP_DEL = 0x582 - AUDIT_MAC_POLICY_LOAD = 0x57b - AUDIT_MAC_STATUS = 0x57c - AUDIT_MAC_UNLBL_ALLOW = 0x57e - AUDIT_MAC_UNLBL_STCADD = 0x588 - AUDIT_MAC_UNLBL_STCDEL = 0x589 - AUDIT_MAKE_EQUIV = 0x3f7 - AUDIT_MAX_FIELDS = 0x40 - AUDIT_MAX_FIELD_COMPARE = 0x19 - AUDIT_MAX_KEY_LEN = 0x100 - AUDIT_MESSAGE_TEXT_MAX = 0x2170 - AUDIT_MMAP = 0x52b - AUDIT_MQ_GETSETATTR = 0x523 - AUDIT_MQ_NOTIFY = 0x522 - AUDIT_MQ_OPEN = 0x520 - AUDIT_MQ_SENDRECV = 0x521 - AUDIT_MSGTYPE = 0xc - AUDIT_NEGATE = 0x80000000 - AUDIT_NETFILTER_CFG = 0x52d - AUDIT_NETFILTER_PKT = 0x52c - AUDIT_NEVER = 0x0 - AUDIT_NLGRP_MAX = 0x1 - AUDIT_NOT_EQUAL = 0x30000000 - AUDIT_NR_FILTERS = 0x8 - AUDIT_OBJ_GID = 0x6e - AUDIT_OBJ_LEV_HIGH = 0x17 - AUDIT_OBJ_LEV_LOW = 0x16 - AUDIT_OBJ_PID = 0x526 - AUDIT_OBJ_ROLE = 0x14 - AUDIT_OBJ_TYPE = 0x15 - AUDIT_OBJ_UID = 0x6d - AUDIT_OBJ_USER = 0x13 - AUDIT_OPENAT2 = 0x539 - AUDIT_OPERATORS = 0x78000000 - AUDIT_PATH = 0x516 - AUDIT_PERM = 0x6a - AUDIT_PERM_ATTR = 0x8 - AUDIT_PERM_EXEC = 0x1 - AUDIT_PERM_READ = 0x4 - AUDIT_PERM_WRITE = 0x2 - AUDIT_PERS = 0xa - AUDIT_PID = 0x0 - AUDIT_POSSIBLE = 0x1 - AUDIT_PPID = 0x12 - AUDIT_PROCTITLE = 0x52f - AUDIT_REPLACE = 0x531 - AUDIT_SADDR_FAM = 0x71 - AUDIT_SECCOMP = 0x52e - AUDIT_SELINUX_ERR = 0x579 - AUDIT_SESSIONID = 0x19 - AUDIT_SET = 0x3e9 - AUDIT_SET_FEATURE = 0x3fa - AUDIT_SGID = 0x7 - AUDIT_SID_UNSET = 0xffffffff - AUDIT_SIGNAL_INFO = 0x3f2 - AUDIT_SOCKADDR = 0x51a - AUDIT_SOCKETCALL = 0x518 - AUDIT_STATUS_BACKLOG_LIMIT = 0x10 - AUDIT_STATUS_BACKLOG_WAIT_TIME = 0x20 - AUDIT_STATUS_BACKLOG_WAIT_TIME_ACTUAL = 0x80 - AUDIT_STATUS_ENABLED = 0x1 - AUDIT_STATUS_FAILURE = 0x2 - AUDIT_STATUS_LOST = 0x40 - AUDIT_STATUS_PID = 0x4 - AUDIT_STATUS_RATE_LIMIT = 0x8 - AUDIT_SUBJ_CLR = 0x11 - AUDIT_SUBJ_ROLE = 0xe - AUDIT_SUBJ_SEN = 0x10 - AUDIT_SUBJ_TYPE = 0xf - AUDIT_SUBJ_USER = 0xd - AUDIT_SUCCESS = 0x68 - AUDIT_SUID = 0x3 - AUDIT_SYSCALL = 0x514 - AUDIT_SYSCALL_CLASSES = 0x10 - AUDIT_TIME_ADJNTPVAL = 0x535 - AUDIT_TIME_INJOFFSET = 0x534 - AUDIT_TRIM = 0x3f6 - AUDIT_TTY = 0x527 - AUDIT_TTY_GET = 0x3f8 - AUDIT_TTY_SET = 0x3f9 - AUDIT_UID = 0x1 - AUDIT_UID_UNSET = 0xffffffff - AUDIT_UNUSED_BITS = 0x7fffc00 - AUDIT_URINGOP = 0x538 - AUDIT_USER = 0x3ed - AUDIT_USER_AVC = 0x453 - AUDIT_USER_TTY = 0x464 - AUDIT_VERSION_BACKLOG_LIMIT = 0x1 - AUDIT_VERSION_BACKLOG_WAIT_TIME = 0x2 - AUDIT_VERSION_LATEST = 0x7f - AUDIT_WATCH = 0x69 - AUDIT_WATCH_INS = 0x3ef - AUDIT_WATCH_LIST = 0x3f1 - AUDIT_WATCH_REM = 0x3f0 AUTOFS_SUPER_MAGIC = 0x187 B0 = 0x0 B110 = 0x3 @@ -484,7 +184,6 @@ const ( BPF_F_ALLOW_MULTI = 0x2 BPF_F_ALLOW_OVERRIDE = 0x1 BPF_F_ANY_ALIGNMENT = 0x2 - BPF_F_KPROBE_MULTI_RETURN = 0x1 BPF_F_QUERY_EFFECTIVE = 0x1 BPF_F_REPLACE = 0x4 BPF_F_SLEEPABLE = 0x10 @@ -492,8 +191,6 @@ const ( BPF_F_TEST_RND_HI32 = 0x4 BPF_F_TEST_RUN_ON_CPU = 0x1 BPF_F_TEST_STATE_FREQ = 0x8 - BPF_F_TEST_XDP_LIVE_FRAMES = 0x2 - BPF_F_XDP_HAS_FRAGS = 0x20 BPF_H = 0x8 BPF_IMM = 0x0 BPF_IND = 0x40 @@ -820,9 +517,9 @@ const ( DM_UUID_FLAG = 0x4000 DM_UUID_LEN = 0x81 DM_VERSION = 0xc138fd00 - DM_VERSION_EXTRA = "-ioctl (2022-02-22)" + DM_VERSION_EXTRA = "-ioctl (2021-03-22)" DM_VERSION_MAJOR = 0x4 - DM_VERSION_MINOR = 0x2e + DM_VERSION_MINOR = 0x2d DM_VERSION_PATCHLEVEL = 0x0 DT_BLK = 0x6 DT_CHR = 0x2 @@ -838,55 +535,6 @@ const ( EFD_SEMAPHORE = 0x1 EFIVARFS_MAGIC = 0xde5e81e4 EFS_SUPER_MAGIC = 0x414a53 - EM_386 = 0x3 - EM_486 = 0x6 - EM_68K = 0x4 - EM_860 = 0x7 - EM_88K = 0x5 - EM_AARCH64 = 0xb7 - EM_ALPHA = 0x9026 - EM_ALTERA_NIOS2 = 0x71 - EM_ARCOMPACT = 0x5d - EM_ARCV2 = 0xc3 - EM_ARM = 0x28 - EM_BLACKFIN = 0x6a - EM_BPF = 0xf7 - EM_CRIS = 0x4c - EM_CSKY = 0xfc - EM_CYGNUS_M32R = 0x9041 - EM_CYGNUS_MN10300 = 0xbeef - EM_FRV = 0x5441 - EM_H8_300 = 0x2e - EM_HEXAGON = 0xa4 - EM_IA_64 = 0x32 - EM_LOONGARCH = 0x102 - EM_M32 = 0x1 - EM_M32R = 0x58 - EM_MICROBLAZE = 0xbd - EM_MIPS = 0x8 - EM_MIPS_RS3_LE = 0xa - EM_MIPS_RS4_BE = 0xa - EM_MN10300 = 0x59 - EM_NDS32 = 0xa7 - EM_NONE = 0x0 - EM_OPENRISC = 0x5c - EM_PARISC = 0xf - EM_PPC = 0x14 - EM_PPC64 = 0x15 - EM_RISCV = 0xf3 - EM_S390 = 0x16 - EM_S390_OLD = 0xa390 - EM_SH = 0x2a - EM_SPARC = 0x2 - EM_SPARC32PLUS = 0x12 - EM_SPARCV9 = 0x2b - EM_SPU = 0x17 - EM_TILEGX = 0xbf - EM_TILEPRO = 0xbc - EM_TI_C6000 = 0x8c - EM_UNICORE = 0x6e - EM_X86_64 = 0x3e - EM_XTENSA = 0x5e ENCODING_DEFAULT = 0x0 ENCODING_FM_MARK = 0x3 ENCODING_FM_SPACE = 0x4 @@ -1064,7 +712,6 @@ const ( ETH_P_EDSA = 0xdada ETH_P_ERSPAN = 0x88be ETH_P_ERSPAN2 = 0x22eb - ETH_P_ETHERCAT = 0x88a4 ETH_P_FCOE = 0x8906 ETH_P_FIP = 0x8914 ETH_P_HDLC = 0x19 @@ -1102,7 +749,6 @@ const ( ETH_P_PPP_MP = 0x8 ETH_P_PPP_SES = 0x8864 ETH_P_PREAUTH = 0x88c7 - ETH_P_PROFINET = 0x8892 ETH_P_PRP = 0x88fb ETH_P_PUP = 0x200 ETH_P_PUPAT = 0x201 @@ -1191,7 +837,6 @@ const ( FAN_FS_ERROR = 0x8000 FAN_MARK_ADD = 0x1 FAN_MARK_DONT_FOLLOW = 0x4 - FAN_MARK_EVICTABLE = 0x200 FAN_MARK_FILESYSTEM = 0x100 FAN_MARK_FLUSH = 0x80 FAN_MARK_IGNORED_MASK = 0x20 @@ -1410,7 +1055,7 @@ const ( IFA_F_STABLE_PRIVACY = 0x800 IFA_F_TEMPORARY = 0x1 IFA_F_TENTATIVE = 0x40 - IFA_MAX = 0xb + IFA_MAX = 0xa IFF_ALLMULTI = 0x200 IFF_ATTACH_QUEUE = 0x200 IFF_AUTOMEDIA = 0x4000 @@ -1665,7 +1310,6 @@ const ( KEXEC_ARCH_ARM = 0x280000 KEXEC_ARCH_DEFAULT = 0x0 KEXEC_ARCH_IA_64 = 0x320000 - KEXEC_ARCH_LOONGARCH = 0x1020000 KEXEC_ARCH_MASK = 0xffff0000 KEXEC_ARCH_MIPS = 0x80000 KEXEC_ARCH_MIPS_LE = 0xa0000 @@ -1758,7 +1402,6 @@ const ( LANDLOCK_ACCESS_FS_MAKE_SYM = 0x1000 LANDLOCK_ACCESS_FS_READ_DIR = 0x8 LANDLOCK_ACCESS_FS_READ_FILE = 0x4 - LANDLOCK_ACCESS_FS_REFER = 0x2000 LANDLOCK_ACCESS_FS_REMOVE_DIR = 0x10 LANDLOCK_ACCESS_FS_REMOVE_FILE = 0x20 LANDLOCK_ACCESS_FS_WRITE_FILE = 0x2 @@ -2114,7 +1757,6 @@ const ( NLM_F_ACK_TLVS = 0x200 NLM_F_APPEND = 0x800 NLM_F_ATOMIC = 0x400 - NLM_F_BULK = 0x200 NLM_F_CAPPED = 0x100 NLM_F_CREATE = 0x400 NLM_F_DUMP = 0x300 @@ -2432,11 +2074,6 @@ const ( PR_SET_UNALIGN = 0x6 PR_SET_VMA = 0x53564d41 PR_SET_VMA_ANON_NAME = 0x0 - PR_SME_GET_VL = 0x40 - PR_SME_SET_VL = 0x3f - PR_SME_SET_VL_ONEXEC = 0x40000 - PR_SME_VL_INHERIT = 0x20000 - PR_SME_VL_LEN_MASK = 0xffff PR_SPEC_DISABLE = 0x4 PR_SPEC_DISABLE_NOEXEC = 0x10 PR_SPEC_ENABLE = 0x2 @@ -2589,9 +2226,8 @@ const ( RTC_FEATURE_ALARM = 0x0 RTC_FEATURE_ALARM_RES_2S = 0x3 RTC_FEATURE_ALARM_RES_MINUTE = 0x1 - RTC_FEATURE_ALARM_WAKEUP_ONLY = 0x7 RTC_FEATURE_BACKUP_SWITCH_MODE = 0x6 - RTC_FEATURE_CNT = 0x8 + RTC_FEATURE_CNT = 0x7 RTC_FEATURE_CORRECTION = 0x5 RTC_FEATURE_NEED_WEEK_DAY = 0x2 RTC_FEATURE_UPDATE_INTERRUPT = 0x4 @@ -2665,7 +2301,6 @@ const ( RTM_DELRULE = 0x21 RTM_DELTCLASS = 0x29 RTM_DELTFILTER = 0x2d - RTM_DELTUNNEL = 0x79 RTM_DELVLAN = 0x71 RTM_F_CLONED = 0x200 RTM_F_EQUALIZE = 0x400 @@ -2698,9 +2333,8 @@ const ( RTM_GETSTATS = 0x5e RTM_GETTCLASS = 0x2a RTM_GETTFILTER = 0x2e - RTM_GETTUNNEL = 0x7a RTM_GETVLAN = 0x72 - RTM_MAX = 0x7b + RTM_MAX = 0x77 RTM_NEWACTION = 0x30 RTM_NEWADDR = 0x14 RTM_NEWADDRLABEL = 0x48 @@ -2724,13 +2358,11 @@ const ( RTM_NEWSTATS = 0x5c RTM_NEWTCLASS = 0x28 RTM_NEWTFILTER = 0x2c - RTM_NEWTUNNEL = 0x78 - RTM_NR_FAMILIES = 0x1b - RTM_NR_MSGTYPES = 0x6c + RTM_NR_FAMILIES = 0x1a + RTM_NR_MSGTYPES = 0x68 RTM_SETDCB = 0x4f RTM_SETLINK = 0x13 RTM_SETNEIGHTBL = 0x43 - RTM_SETSTATS = 0x5f RTNH_ALIGNTO = 0x4 RTNH_COMPARE_MASK = 0x59 RTNH_F_DEAD = 0x1 @@ -2911,9 +2543,6 @@ const ( SOCK_RDM = 0x4 SOCK_SEQPACKET = 0x5 SOCK_SNDBUF_LOCK = 0x1 - SOCK_TXREHASH_DEFAULT = 0xff - SOCK_TXREHASH_DISABLED = 0x0 - SOCK_TXREHASH_ENABLED = 0x1 SOL_AAL = 0x109 SOL_ALG = 0x117 SOL_ATM = 0x108 @@ -2929,8 +2558,6 @@ const ( SOL_IUCV = 0x115 SOL_KCM = 0x119 SOL_LLC = 0x10c - SOL_MCTP = 0x11d - SOL_MPTCP = 0x11c SOL_NETBEUI = 0x10b SOL_NETLINK = 0x10e SOL_NFC = 0x118 @@ -2940,7 +2567,6 @@ const ( SOL_RAW = 0xff SOL_RDS = 0x114 SOL_RXRPC = 0x110 - SOL_SMC = 0x11e SOL_TCP = 0x6 SOL_TIPC = 0x10f SOL_TLS = 0x11a @@ -3047,7 +2673,7 @@ const ( TASKSTATS_GENL_NAME = "TASKSTATS" TASKSTATS_GENL_VERSION = 0x1 TASKSTATS_TYPE_MAX = 0x6 - TASKSTATS_VERSION = 0xd + TASKSTATS_VERSION = 0xb TCIFLUSH = 0x0 TCIOFF = 0x2 TCIOFLUSH = 0x2 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go index 36c0dfc..234fd4a 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_386.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_386.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/386/include -m32 +// mkerrors.sh -Wall -Werror -static -I/tmp/include -m32 // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && linux // +build 386,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/386/include -m32 _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include -m32 /build/unix/_const.go package unix @@ -326,7 +326,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x12 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x14 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x14 @@ -351,7 +350,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go index 4ff9427..58619b7 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_amd64.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/amd64/include -m64 +// mkerrors.sh -Wall -Werror -static -I/tmp/include -m64 // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && linux // +build amd64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/amd64/include -m64 _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include -m64 /build/unix/_const.go package unix @@ -327,7 +327,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x12 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x14 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x14 @@ -352,7 +351,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go index 3eaa0fb..3a64ff5 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/arm/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && linux // +build arm,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/arm/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -333,7 +333,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x12 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x14 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x14 @@ -358,7 +357,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go index d7995bd..abe0b92 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_arm64.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/arm64/include -fsigned-char +// mkerrors.sh -Wall -Werror -static -I/tmp/include -fsigned-char // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && linux // +build arm64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/arm64/include -fsigned-char _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include -fsigned-char /build/unix/_const.go package unix @@ -323,7 +323,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x12 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x14 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x14 @@ -348,7 +347,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 @@ -513,7 +511,6 @@ const ( WORDSIZE = 0x40 XCASE = 0x4 XTABS = 0x1800 - ZA_MAGIC = 0x54366345 _HIDIOCGRAWNAME = 0x80804804 _HIDIOCGRAWPHYS = 0x80404805 _HIDIOCGRAWUNIQ = 0x80404808 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go index 179bffb..14d7a84 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/mips/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips && linux // +build mips,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/mips/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -326,7 +326,6 @@ const ( SO_RCVBUF = 0x1002 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x1004 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x1006 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x1006 @@ -352,7 +351,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x1008 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go index 1fba17b..99e7c4a 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/mips64/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64 && linux // +build mips64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/mips64/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -326,7 +326,6 @@ const ( SO_RCVBUF = 0x1002 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x1004 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x1006 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x1006 @@ -352,7 +351,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x1008 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go index b77dde3..496364c 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mips64le.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/mips64le/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64le && linux // +build mips64le,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/mips64le/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -326,7 +326,6 @@ const ( SO_RCVBUF = 0x1002 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x1004 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x1006 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x1006 @@ -352,7 +351,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x1008 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go index 78c6c75..3e40830 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_mipsle.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/mipsle/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mipsle && linux // +build mipsle,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/mipsle/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -326,7 +326,6 @@ const ( SO_RCVBUF = 0x1002 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x1004 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x1006 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x1006 @@ -352,7 +351,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x1008 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go index 1c0d31f..1151a7d 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/ppc/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc && linux // +build ppc,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/ppc/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -381,7 +381,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x10 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x12 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x12 @@ -406,7 +405,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go index 959dd9b..ed17f24 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/ppc64/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && linux // +build ppc64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/ppc64/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -385,7 +385,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x10 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x12 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x12 @@ -410,7 +409,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go index 5a873cd..d84a37c 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_ppc64le.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/ppc64le/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64le && linux // +build ppc64le,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/ppc64le/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -385,7 +385,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x10 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x12 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x12 @@ -410,7 +409,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go index e336d14..5cafba8 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_riscv64.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/riscv64/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && linux // +build riscv64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/riscv64/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -314,7 +314,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x12 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x14 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x14 @@ -339,7 +338,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go index 390c01d..6d122da 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_s390x.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/s390x/include -fsigned-char +// mkerrors.sh -Wall -Werror -static -I/tmp/include -fsigned-char // Code generated by the command above; see README.md. DO NOT EDIT. //go:build s390x && linux // +build s390x,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/s390x/include -fsigned-char _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include -fsigned-char /build/unix/_const.go package unix @@ -389,7 +389,6 @@ const ( SO_RCVBUF = 0x8 SO_RCVBUFFORCE = 0x21 SO_RCVLOWAT = 0x12 - SO_RCVMARK = 0x4b SO_RCVTIMEO = 0x14 SO_RCVTIMEO_NEW = 0x42 SO_RCVTIMEO_OLD = 0x14 @@ -414,7 +413,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x40 SO_TIMESTAMPNS_OLD = 0x23 SO_TIMESTAMP_NEW = 0x3f - SO_TXREHASH = 0x4a SO_TXTIME = 0x3d SO_TYPE = 0x3 SO_WIFI_STATUS = 0x29 diff --git a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go index 98a6e5f..6bd19e5 100644 --- a/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/zerrors_linux_sparc64.go @@ -1,11 +1,11 @@ -// mkerrors.sh -Wall -Werror -static -I/tmp/sparc64/include +// mkerrors.sh -Wall -Werror -static -I/tmp/include // Code generated by the command above; see README.md. DO NOT EDIT. //go:build sparc64 && linux // +build sparc64,linux // Code generated by cmd/cgo -godefs; DO NOT EDIT. -// cgo -godefs -- -Wall -Werror -static -I/tmp/sparc64/include _const.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/_const.go package unix @@ -380,7 +380,6 @@ const ( SO_RCVBUF = 0x1002 SO_RCVBUFFORCE = 0x100b SO_RCVLOWAT = 0x800 - SO_RCVMARK = 0x54 SO_RCVTIMEO = 0x2000 SO_RCVTIMEO_NEW = 0x44 SO_RCVTIMEO_OLD = 0x2000 @@ -405,7 +404,6 @@ const ( SO_TIMESTAMPNS_NEW = 0x42 SO_TIMESTAMPNS_OLD = 0x21 SO_TIMESTAMP_NEW = 0x46 - SO_TXREHASH = 0x53 SO_TXTIME = 0x3f SO_TYPE = 0x1008 SO_WIFI_STATUS = 0x25 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.1_13.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.1_13.s index f5bb40e..d6c3e25 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.1_13.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.1_13.s @@ -1,4 +1,4 @@ -// go run mkasm.go darwin amd64 +// go run mkasm_darwin.go amd64 // Code generated by the command above; DO NOT EDIT. //go:build go1.13 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go index 467deed..8793765 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.go @@ -1643,30 +1643,6 @@ var libc_mknod_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { - var _p0 *byte - _p0, err = BytePtrFromString(fsType) - if err != nil { - return - } - var _p1 *byte - _p1, err = BytePtrFromString(dir) - if err != nil { - return - } - _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -var libc_mount_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mount mount "/usr/lib/libSystem.B.dylib" - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func Open(path string, mode int, perm uint32) (fd int, err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s index b41467a..8da90cf 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_amd64.s @@ -1,4 +1,4 @@ -// go run mkasm.go darwin amd64 +// go run mkasm_darwin.go amd64 // Code generated by the command above; DO NOT EDIT. //go:build go1.12 @@ -600,12 +600,6 @@ TEXT libc_mknod_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknod_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknod_trampoline_addr(SB)/8, $libc_mknod_trampoline<>(SB) -TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 - JMP libc_mount(SB) - -GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 -DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) - TEXT libc_open_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_open(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.1_13.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.1_13.s index 0c3f76b..3579897 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.1_13.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.1_13.s @@ -1,4 +1,4 @@ -// go run mkasm.go darwin arm64 +// go run mkasm_darwin.go arm64 // Code generated by the command above; DO NOT EDIT. //go:build go1.13 diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go index 35938d3..f47eedd 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.go @@ -1643,30 +1643,6 @@ var libc_mknod_trampoline_addr uintptr // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Mount(fsType string, dir string, flags int, data unsafe.Pointer) (err error) { - var _p0 *byte - _p0, err = BytePtrFromString(fsType) - if err != nil { - return - } - var _p1 *byte - _p1, err = BytePtrFromString(dir) - if err != nil { - return - } - _, _, e1 := syscall_syscall6(libc_mount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), uintptr(flags), uintptr(data), 0, 0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -var libc_mount_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mount mount "/usr/lib/libSystem.B.dylib" - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func Open(path string, mode int, perm uint32) (fd int, err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s index e1f9204..4d26f7d 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s +++ b/vendor/golang.org/x/sys/unix/zsyscall_darwin_arm64.s @@ -1,4 +1,4 @@ -// go run mkasm.go darwin arm64 +// go run mkasm_darwin.go arm64 // Code generated by the command above; DO NOT EDIT. //go:build go1.12 @@ -600,12 +600,6 @@ TEXT libc_mknod_trampoline<>(SB),NOSPLIT,$0-0 GLOBL ·libc_mknod_trampoline_addr(SB), RODATA, $8 DATA ·libc_mknod_trampoline_addr(SB)/8, $libc_mknod_trampoline<>(SB) -TEXT libc_mount_trampoline<>(SB),NOSPLIT,$0-0 - JMP libc_mount(SB) - -GLOBL ·libc_mount_trampoline_addr(SB), RODATA, $8 -DATA ·libc_mount_trampoline_addr(SB)/8, $libc_mount_trampoline<>(SB) - TEXT libc_open_trampoline<>(SB),NOSPLIT,$0-0 JMP libc_open(SB) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go index 039c4aa..e9d9997 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_386.go @@ -912,7 +912,7 @@ func Fpathconf(fd int, name int) (val int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstat(fd int, stat *Stat_t) (err error) { +func fstat(fd int, stat *stat_freebsd11_t) (err error) { _, _, e1 := Syscall(SYS_FSTAT, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) @@ -922,7 +922,17 @@ func Fstat(fd int, stat *Stat_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { +func fstat_freebsd12(fd int, stat *Stat_t) (err error) { + _, _, e1 := Syscall(SYS_FSTAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fstatat(fd int, path string, stat *stat_freebsd11_t, flags int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -937,7 +947,22 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstatfs(fd int, stat *Statfs_t) (err error) { +func fstatat_freebsd12(fd int, path string, stat *Stat_t, flags int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_FSTATAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fstatfs(fd int, stat *statfs_freebsd11_t) (err error) { _, _, e1 := Syscall(SYS_FSTATFS, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) @@ -947,6 +972,16 @@ func Fstatfs(fd int, stat *Statfs_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fstatfs_freebsd12(fd int, stat *Statfs_t) (err error) { + _, _, e1 := Syscall(SYS_FSTATFS_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Fsync(fd int) (err error) { _, _, e1 := Syscall(SYS_FSYNC, uintptr(fd), 0, 0) if e1 != 0 { @@ -967,7 +1002,7 @@ func Ftruncate(fd int, length int64) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) { +func getdirentries(fd int, buf []byte, basep *uintptr) (n int, err error) { var _p0 unsafe.Pointer if len(buf) > 0 { _p0 = unsafe.Pointer(&buf[0]) @@ -984,6 +1019,23 @@ func getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func getdirentries_freebsd12(fd int, buf []byte, basep *uint64) (n int, err error) { + var _p0 unsafe.Pointer + if len(buf) > 0 { + _p0 = unsafe.Pointer(&buf[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + r0, _, e1 := Syscall6(SYS_GETDIRENTRIES_FREEBSD12, uintptr(fd), uintptr(_p0), uintptr(len(buf)), uintptr(unsafe.Pointer(basep)), 0, 0) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Getdtablesize() (size int) { r0, _, _ := Syscall(SYS_GETDTABLESIZE, 0, 0, 0) size = int(r0) @@ -1205,6 +1257,21 @@ func Listen(s int, backlog int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func lstat(path string, stat *stat_freebsd11_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_LSTAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Mkdir(path string, mode uint32) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) @@ -1250,13 +1317,43 @@ func Mkfifo(path string, mode uint32) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Mknodat(fd int, path string, mode uint32, dev uint64) (err error) { +func mknod(path string, mode uint32, dev int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { return } - _, _, e1 := Syscall6(SYS_MKNODAT, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), uintptr(dev>>32), 0) + _, _, e1 := Syscall(SYS_MKNOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func mknodat(fd int, path string, mode uint32, dev int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_MKNODAT, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func mknodat_freebsd12(fd int, path string, mode uint32, dev uint64) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_MKNODAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), uintptr(dev>>32), 0) if e1 != 0 { err = errnoErr(e1) } @@ -1656,7 +1753,22 @@ func Setuid(uid int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Statfs(path string, stat *Statfs_t) (err error) { +func stat(path string, stat *stat_freebsd11_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_STAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func statfs(path string, stat *statfs_freebsd11_t) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -1671,6 +1783,21 @@ func Statfs(path string, stat *Statfs_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func statfs_freebsd12(path string, stat *Statfs_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_STATFS_FREEBSD12, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Symlink(path string, link string) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go index 0535d3c..edd373b 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_amd64.go @@ -912,7 +912,7 @@ func Fpathconf(fd int, name int) (val int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstat(fd int, stat *Stat_t) (err error) { +func fstat(fd int, stat *stat_freebsd11_t) (err error) { _, _, e1 := Syscall(SYS_FSTAT, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) @@ -922,7 +922,17 @@ func Fstat(fd int, stat *Stat_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { +func fstat_freebsd12(fd int, stat *Stat_t) (err error) { + _, _, e1 := Syscall(SYS_FSTAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fstatat(fd int, path string, stat *stat_freebsd11_t, flags int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -937,7 +947,22 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstatfs(fd int, stat *Statfs_t) (err error) { +func fstatat_freebsd12(fd int, path string, stat *Stat_t, flags int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_FSTATAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fstatfs(fd int, stat *statfs_freebsd11_t) (err error) { _, _, e1 := Syscall(SYS_FSTATFS, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) @@ -947,6 +972,16 @@ func Fstatfs(fd int, stat *Statfs_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fstatfs_freebsd12(fd int, stat *Statfs_t) (err error) { + _, _, e1 := Syscall(SYS_FSTATFS_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Fsync(fd int) (err error) { _, _, e1 := Syscall(SYS_FSYNC, uintptr(fd), 0, 0) if e1 != 0 { @@ -967,7 +1002,7 @@ func Ftruncate(fd int, length int64) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) { +func getdirentries(fd int, buf []byte, basep *uintptr) (n int, err error) { var _p0 unsafe.Pointer if len(buf) > 0 { _p0 = unsafe.Pointer(&buf[0]) @@ -984,6 +1019,23 @@ func getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func getdirentries_freebsd12(fd int, buf []byte, basep *uint64) (n int, err error) { + var _p0 unsafe.Pointer + if len(buf) > 0 { + _p0 = unsafe.Pointer(&buf[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + r0, _, e1 := Syscall6(SYS_GETDIRENTRIES_FREEBSD12, uintptr(fd), uintptr(_p0), uintptr(len(buf)), uintptr(unsafe.Pointer(basep)), 0, 0) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Getdtablesize() (size int) { r0, _, _ := Syscall(SYS_GETDTABLESIZE, 0, 0, 0) size = int(r0) @@ -1205,6 +1257,21 @@ func Listen(s int, backlog int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func lstat(path string, stat *stat_freebsd11_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_LSTAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Mkdir(path string, mode uint32) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) @@ -1250,7 +1317,22 @@ func Mkfifo(path string, mode uint32) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Mknodat(fd int, path string, mode uint32, dev uint64) (err error) { +func mknod(path string, mode uint32, dev int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_MKNOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func mknodat(fd int, path string, mode uint32, dev int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -1265,6 +1347,21 @@ func Mknodat(fd int, path string, mode uint32, dev uint64) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func mknodat_freebsd12(fd int, path string, mode uint32, dev uint64) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_MKNODAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := Syscall(SYS_NANOSLEEP, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { @@ -1656,7 +1753,22 @@ func Setuid(uid int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Statfs(path string, stat *Statfs_t) (err error) { +func stat(path string, stat *stat_freebsd11_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_STAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func statfs(path string, stat *statfs_freebsd11_t) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -1671,6 +1783,21 @@ func Statfs(path string, stat *Statfs_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func statfs_freebsd12(path string, stat *Statfs_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_STATFS_FREEBSD12, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Symlink(path string, link string) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go index 1018b52..82e9764 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm.go @@ -351,6 +351,22 @@ func Munlockall() (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { + var _p0 unsafe.Pointer + if len(mib) > 0 { + _p0 = unsafe.Pointer(&mib[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + _, _, e1 := Syscall6(SYS___SYSCTL, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func pipe2(p *[2]_C_int, flags int) (err error) { _, _, e1 := RawSyscall(SYS_PIPE2, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) if e1 != 0 { @@ -388,22 +404,6 @@ func ioctl(fd int, req uint, arg uintptr) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { - var _p0 unsafe.Pointer - if len(mib) > 0 { - _p0 = unsafe.Pointer(&mib[0]) - } else { - _p0 = unsafe.Pointer(&_zero) - } - _, _, e1 := Syscall6(SYS___SYSCTL, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func ptrace(request int, pid int, addr uintptr, data int) (err error) { _, _, e1 := Syscall6(SYS_PTRACE, uintptr(request), uintptr(pid), uintptr(addr), uintptr(data), 0, 0) if e1 != 0 { @@ -912,7 +912,7 @@ func Fpathconf(fd int, name int) (val int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstat(fd int, stat *Stat_t) (err error) { +func fstat(fd int, stat *stat_freebsd11_t) (err error) { _, _, e1 := Syscall(SYS_FSTAT, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) @@ -922,7 +922,17 @@ func Fstat(fd int, stat *Stat_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { +func fstat_freebsd12(fd int, stat *Stat_t) (err error) { + _, _, e1 := Syscall(SYS_FSTAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fstatat(fd int, path string, stat *stat_freebsd11_t, flags int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -937,7 +947,22 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstatfs(fd int, stat *Statfs_t) (err error) { +func fstatat_freebsd12(fd int, path string, stat *Stat_t, flags int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_FSTATAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fstatfs(fd int, stat *statfs_freebsd11_t) (err error) { _, _, e1 := Syscall(SYS_FSTATFS, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) @@ -947,6 +972,16 @@ func Fstatfs(fd int, stat *Statfs_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fstatfs_freebsd12(fd int, stat *Statfs_t) (err error) { + _, _, e1 := Syscall(SYS_FSTATFS_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Fsync(fd int) (err error) { _, _, e1 := Syscall(SYS_FSYNC, uintptr(fd), 0, 0) if e1 != 0 { @@ -967,7 +1002,7 @@ func Ftruncate(fd int, length int64) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) { +func getdirentries(fd int, buf []byte, basep *uintptr) (n int, err error) { var _p0 unsafe.Pointer if len(buf) > 0 { _p0 = unsafe.Pointer(&buf[0]) @@ -984,6 +1019,23 @@ func getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func getdirentries_freebsd12(fd int, buf []byte, basep *uint64) (n int, err error) { + var _p0 unsafe.Pointer + if len(buf) > 0 { + _p0 = unsafe.Pointer(&buf[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + r0, _, e1 := Syscall6(SYS_GETDIRENTRIES_FREEBSD12, uintptr(fd), uintptr(_p0), uintptr(len(buf)), uintptr(unsafe.Pointer(basep)), 0, 0) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Getdtablesize() (size int) { r0, _, _ := Syscall(SYS_GETDTABLESIZE, 0, 0, 0) size = int(r0) @@ -1205,6 +1257,21 @@ func Listen(s int, backlog int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func lstat(path string, stat *stat_freebsd11_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_LSTAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Mkdir(path string, mode uint32) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) @@ -1250,13 +1317,43 @@ func Mkfifo(path string, mode uint32) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Mknodat(fd int, path string, mode uint32, dev uint64) (err error) { +func mknod(path string, mode uint32, dev int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { return } - _, _, e1 := Syscall6(SYS_MKNODAT, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0, uintptr(dev), uintptr(dev>>32)) + _, _, e1 := Syscall(SYS_MKNOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func mknodat(fd int, path string, mode uint32, dev int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_MKNODAT, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func mknodat_freebsd12(fd int, path string, mode uint32, dev uint64) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_MKNODAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) if e1 != 0 { err = errnoErr(e1) } @@ -1656,7 +1753,22 @@ func Setuid(uid int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Statfs(path string, stat *Statfs_t) (err error) { +func stat(path string, stat *stat_freebsd11_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_STAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func statfs(path string, stat *statfs_freebsd11_t) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -1671,6 +1783,21 @@ func Statfs(path string, stat *Statfs_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func statfs_freebsd12(path string, stat *Statfs_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_STATFS_FREEBSD12, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Symlink(path string, link string) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go index 3802f4b..a6479ac 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_freebsd_arm64.go @@ -912,7 +912,7 @@ func Fpathconf(fd int, name int) (val int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstat(fd int, stat *Stat_t) (err error) { +func fstat(fd int, stat *stat_freebsd11_t) (err error) { _, _, e1 := Syscall(SYS_FSTAT, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) @@ -922,7 +922,17 @@ func Fstat(fd int, stat *Stat_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { +func fstat_freebsd12(fd int, stat *Stat_t) (err error) { + _, _, e1 := Syscall(SYS_FSTAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fstatat(fd int, path string, stat *stat_freebsd11_t, flags int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -937,7 +947,22 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fstatfs(fd int, stat *Statfs_t) (err error) { +func fstatat_freebsd12(fd int, path string, stat *Stat_t, flags int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_FSTATAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func fstatfs(fd int, stat *statfs_freebsd11_t) (err error) { _, _, e1 := Syscall(SYS_FSTATFS, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) @@ -947,6 +972,16 @@ func Fstatfs(fd int, stat *Statfs_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func fstatfs_freebsd12(fd int, stat *Statfs_t) (err error) { + _, _, e1 := Syscall(SYS_FSTATFS_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Fsync(fd int) (err error) { _, _, e1 := Syscall(SYS_FSYNC, uintptr(fd), 0, 0) if e1 != 0 { @@ -967,7 +1002,7 @@ func Ftruncate(fd int, length int64) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) { +func getdirentries(fd int, buf []byte, basep *uintptr) (n int, err error) { var _p0 unsafe.Pointer if len(buf) > 0 { _p0 = unsafe.Pointer(&buf[0]) @@ -984,6 +1019,23 @@ func getdirentries(fd int, buf []byte, basep *uint64) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func getdirentries_freebsd12(fd int, buf []byte, basep *uint64) (n int, err error) { + var _p0 unsafe.Pointer + if len(buf) > 0 { + _p0 = unsafe.Pointer(&buf[0]) + } else { + _p0 = unsafe.Pointer(&_zero) + } + r0, _, e1 := Syscall6(SYS_GETDIRENTRIES_FREEBSD12, uintptr(fd), uintptr(_p0), uintptr(len(buf)), uintptr(unsafe.Pointer(basep)), 0, 0) + n = int(r0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Getdtablesize() (size int) { r0, _, _ := Syscall(SYS_GETDTABLESIZE, 0, 0, 0) size = int(r0) @@ -1205,6 +1257,21 @@ func Listen(s int, backlog int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func lstat(path string, stat *stat_freebsd11_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_LSTAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Mkdir(path string, mode uint32) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) @@ -1250,7 +1317,22 @@ func Mkfifo(path string, mode uint32) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Mknodat(fd int, path string, mode uint32, dev uint64) (err error) { +func mknod(path string, mode uint32, dev int) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_MKNOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func mknodat(fd int, path string, mode uint32, dev int) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -1265,6 +1347,21 @@ func Mknodat(fd int, path string, mode uint32, dev uint64) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func mknodat_freebsd12(fd int, path string, mode uint32, dev uint64) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall6(SYS_MKNODAT_FREEBSD12, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Nanosleep(time *Timespec, leftover *Timespec) (err error) { _, _, e1 := Syscall(SYS_NANOSLEEP, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { @@ -1656,7 +1753,22 @@ func Setuid(uid int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Statfs(path string, stat *Statfs_t) (err error) { +func stat(path string, stat *stat_freebsd11_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_STAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func statfs(path string, stat *statfs_freebsd11_t) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) if err != nil { @@ -1671,6 +1783,21 @@ func Statfs(path string, stat *Statfs_t) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func statfs_freebsd12(path string, stat *Statfs_t) (err error) { + var _p0 *byte + _p0, err = BytePtrFromString(path) + if err != nil { + return + } + _, _, e1 := Syscall(SYS_STATFS_FREEBSD12, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Symlink(path string, link string) (err error) { var _p0 *byte _p0, err = BytePtrFromString(path) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux.go b/vendor/golang.org/x/sys/unix/zsyscall_linux.go index bc4a275..198b4ac 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux.go @@ -828,49 +828,6 @@ func Fsync(fd int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Fsmount(fd int, flags int, mountAttrs int) (fsfd int, err error) { - r0, _, e1 := Syscall(SYS_FSMOUNT, uintptr(fd), uintptr(flags), uintptr(mountAttrs)) - fsfd = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func Fsopen(fsName string, flags int) (fd int, err error) { - var _p0 *byte - _p0, err = BytePtrFromString(fsName) - if err != nil { - return - } - r0, _, e1 := Syscall(SYS_FSOPEN, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) - fd = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - -func Fspick(dirfd int, pathName string, flags int) (fd int, err error) { - var _p0 *byte - _p0, err = BytePtrFromString(pathName) - if err != nil { - return - } - r0, _, e1 := Syscall(SYS_FSPICK, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) - fd = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func Getdents(fd int, buf []byte) (n int, err error) { var _p0 unsafe.Pointer if len(buf) > 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go index c81b0ad..88af526 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_386.go @@ -287,6 +287,46 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID32, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID32, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID32, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID32, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int, err error) { r0, _, e1 := Syscall6(SYS_SPLICE, uintptr(rfd), uintptr(unsafe.Pointer(roff)), uintptr(wfd), uintptr(unsafe.Pointer(woff)), uintptr(len), uintptr(flags)) n = int(r0) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go index 2206bce..2a0c4aa 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_amd64.go @@ -334,6 +334,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -344,6 +374,16 @@ func Setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go index edf6b39..4882bde 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm.go @@ -412,6 +412,46 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID32, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID32, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID32, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID32, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go index 190609f..9f8c24e 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_arm64.go @@ -289,6 +289,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -299,6 +329,16 @@ func setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go index 5f984cb..d7d6f42 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips.go @@ -248,6 +248,46 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go index 46fc380..7f1f8e6 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64.go @@ -278,6 +278,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -288,6 +318,16 @@ func Setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go index cbd0d4d..f933d0f 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mips64le.go @@ -278,6 +278,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -288,6 +318,16 @@ func Setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go index 0c13d15..297d0a9 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_mipsle.go @@ -248,6 +248,46 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go index e01432a..2e32e7a 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc.go @@ -308,6 +308,46 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go index 13c7ee7..3c53170 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64.go @@ -349,6 +349,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -359,6 +389,16 @@ func Setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go index 02d0c0f..a00c674 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_ppc64le.go @@ -349,6 +349,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -359,6 +389,16 @@ func Setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go index 9fee3b1..a1a9bcb 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_riscv64.go @@ -180,17 +180,6 @@ func Listen(s int, n int) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func MemfdSecret(flags int) (fd int, err error) { - r0, _, e1 := Syscall(SYS_MEMFD_SECRET, uintptr(flags), 0, 0) - fd = int(r0) - if e1 != 0 { - err = errnoErr(e1) - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func pread(fd int, p []byte, offset int64) (n int, err error) { var _p0 unsafe.Pointer if len(p) > 0 { @@ -269,6 +258,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -279,6 +298,16 @@ func Setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go index 647bbfe..e0dabc6 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_s390x.go @@ -319,6 +319,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -329,6 +359,16 @@ func Setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Splice(rfd int, roff *int64, wfd int, woff *int64, len int, flags int) (n int64, err error) { r0, _, e1 := Syscall6(SYS_SPLICE, uintptr(rfd), uintptr(unsafe.Pointer(roff)), uintptr(wfd), uintptr(unsafe.Pointer(woff)), uintptr(len), uintptr(flags)) n = int64(r0) diff --git a/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go index ada057f..368623c 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_linux_sparc64.go @@ -329,6 +329,36 @@ func setfsuid(uid int) (prev int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setregid(rgid int, egid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresgid(rgid int, egid int, sgid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + +func Setresuid(ruid int, euid int, suid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Setrlimit(resource int, rlim *Rlimit) (err error) { _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(resource), uintptr(unsafe.Pointer(rlim)), 0) if e1 != 0 { @@ -339,6 +369,16 @@ func Setrlimit(resource int, rlim *Rlimit) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT +func Setreuid(ruid int, euid int) (err error) { + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) + if e1 != 0 { + err = errnoErr(e1) + } + return +} + +// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT + func Shutdown(fd int, how int) (err error) { _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(fd), uintptr(how), 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go index 2925fe0..a057fc5 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_386.go @@ -1,4 +1,4 @@ -// go run mksyscall.go -l32 -openbsd -libc -tags openbsd,386 syscall_bsd.go syscall_openbsd.go syscall_openbsd_386.go +// go run mksyscall.go -l32 -openbsd -tags openbsd,386 syscall_bsd.go syscall_openbsd.go syscall_openbsd_386.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build openbsd && 386 @@ -16,7 +16,7 @@ var _ syscall.Errno // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getgroups(ngid int, gid *_Gid_t) (n int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getgroups_trampoline_addr, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) + r0, _, e1 := RawSyscall(SYS_GETGROUPS, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -24,28 +24,20 @@ func getgroups(ngid int, gid *_Gid_t) (n int, err error) { return } -var libc_getgroups_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getgroups getgroups "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func setgroups(ngid int, gid *_Gid_t) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setgroups_trampoline_addr, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) + _, _, e1 := RawSyscall(SYS_SETGROUPS, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setgroups_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setgroups setgroups "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err error) { - r0, _, e1 := syscall_syscall6(libc_wait4_trampoline_addr, uintptr(pid), uintptr(unsafe.Pointer(wstatus)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) + r0, _, e1 := Syscall6(SYS_WAIT4, uintptr(pid), uintptr(unsafe.Pointer(wstatus)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) wpid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -53,14 +45,10 @@ func wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err return } -var libc_wait4_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_wait4 wait4 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { - r0, _, e1 := syscall_syscall(libc_accept_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + r0, _, e1 := Syscall(SYS_ACCEPT, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -68,42 +56,30 @@ func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { return } -var libc_accept_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_accept accept "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { - _, _, e1 := syscall_syscall(libc_bind_trampoline_addr, uintptr(s), uintptr(addr), uintptr(addrlen)) + _, _, e1 := Syscall(SYS_BIND, uintptr(s), uintptr(addr), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_bind_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_bind bind "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func connect(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { - _, _, e1 := syscall_syscall(libc_connect_trampoline_addr, uintptr(s), uintptr(addr), uintptr(addrlen)) + _, _, e1 := Syscall(SYS_CONNECT, uintptr(s), uintptr(addr), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_connect_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_connect connect "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func socket(domain int, typ int, proto int) (fd int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_socket_trampoline_addr, uintptr(domain), uintptr(typ), uintptr(proto)) + r0, _, e1 := RawSyscall(SYS_SOCKET, uintptr(domain), uintptr(typ), uintptr(proto)) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -111,94 +87,66 @@ func socket(domain int, typ int, proto int) (fd int, err error) { return } -var libc_socket_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_socket socket "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen) (err error) { - _, _, e1 := syscall_syscall6(libc_getsockopt_trampoline_addr, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) + _, _, e1 := Syscall6(SYS_GETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getsockopt_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsockopt getsockopt "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) { - _, _, e1 := syscall_syscall6(libc_setsockopt_trampoline_addr, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) + _, _, e1 := Syscall6(SYS_SETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setsockopt_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setsockopt setsockopt "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getpeername(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getpeername_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + _, _, e1 := RawSyscall(SYS_GETPEERNAME, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getpeername_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpeername getpeername "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getsockname(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getsockname_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + _, _, e1 := RawSyscall(SYS_GETSOCKNAME, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getsockname_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsockname getsockname "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Shutdown(s int, how int) (err error) { - _, _, e1 := syscall_syscall(libc_shutdown_trampoline_addr, uintptr(s), uintptr(how), 0) + _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(s), uintptr(how), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_shutdown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_shutdown shutdown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func socketpair(domain int, typ int, proto int, fd *[2]int32) (err error) { - _, _, e1 := syscall_rawSyscall6(libc_socketpair_trampoline_addr, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) + _, _, e1 := RawSyscall6(SYS_SOCKETPAIR, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_socketpair_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_socketpair socketpair "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error) { @@ -208,7 +156,7 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_recvfrom_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) + r0, _, e1 := Syscall6(SYS_RECVFROM, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -216,10 +164,6 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl return } -var libc_recvfrom_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_recvfrom recvfrom "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) (err error) { @@ -229,21 +173,17 @@ func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) ( } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall6(libc_sendto_trampoline_addr, uintptr(s), uintptr(_p0), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen)) + _, _, e1 := Syscall6(SYS_SENDTO, uintptr(s), uintptr(_p0), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sendto_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sendto sendto "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_recvmsg_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + r0, _, e1 := Syscall(SYS_RECVMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -251,14 +191,10 @@ func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { return } -var libc_recvmsg_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_recvmsg recvmsg "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_sendmsg_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + r0, _, e1 := Syscall(SYS_SENDMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -266,14 +202,10 @@ func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { return } -var libc_sendmsg_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sendmsg sendmsg "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func kevent(kq int, change unsafe.Pointer, nchange int, event unsafe.Pointer, nevent int, timeout *Timespec) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_kevent_trampoline_addr, uintptr(kq), uintptr(change), uintptr(nchange), uintptr(event), uintptr(nevent), uintptr(unsafe.Pointer(timeout))) + r0, _, e1 := Syscall6(SYS_KEVENT, uintptr(kq), uintptr(change), uintptr(nchange), uintptr(event), uintptr(nevent), uintptr(unsafe.Pointer(timeout))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -281,10 +213,6 @@ func kevent(kq int, change unsafe.Pointer, nchange int, event unsafe.Pointer, ne return } -var libc_kevent_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kevent kevent "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func utimes(path string, timeval *[2]Timeval) (err error) { @@ -293,35 +221,27 @@ func utimes(path string, timeval *[2]Timeval) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_utimes_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(timeval)), 0) + _, _, e1 := Syscall(SYS_UTIMES, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(timeval)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_utimes_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_utimes utimes "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func futimes(fd int, timeval *[2]Timeval) (err error) { - _, _, e1 := syscall_syscall(libc_futimes_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(timeval)), 0) + _, _, e1 := Syscall(SYS_FUTIMES, uintptr(fd), uintptr(unsafe.Pointer(timeval)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_futimes_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_futimes futimes "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_poll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) + r0, _, e1 := Syscall(SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -329,10 +249,6 @@ func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { return } -var libc_poll_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_poll poll "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Madvise(b []byte, behav int) (err error) { @@ -342,17 +258,13 @@ func Madvise(b []byte, behav int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_madvise_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(behav)) + _, _, e1 := Syscall(SYS_MADVISE, uintptr(_p0), uintptr(len(b)), uintptr(behav)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_madvise_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_madvise madvise "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mlock(b []byte) (err error) { @@ -362,31 +274,23 @@ func Mlock(b []byte) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_mlock_trampoline_addr, uintptr(_p0), uintptr(len(b)), 0) + _, _, e1 := Syscall(SYS_MLOCK, uintptr(_p0), uintptr(len(b)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mlock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mlock mlock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mlockall(flags int) (err error) { - _, _, e1 := syscall_syscall(libc_mlockall_trampoline_addr, uintptr(flags), 0, 0) + _, _, e1 := Syscall(SYS_MLOCKALL, uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mlockall_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mlockall mlockall "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mprotect(b []byte, prot int) (err error) { @@ -396,17 +300,13 @@ func Mprotect(b []byte, prot int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_mprotect_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(prot)) + _, _, e1 := Syscall(SYS_MPROTECT, uintptr(_p0), uintptr(len(b)), uintptr(prot)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mprotect_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mprotect mprotect "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Msync(b []byte, flags int) (err error) { @@ -416,17 +316,13 @@ func Msync(b []byte, flags int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_msync_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(flags)) + _, _, e1 := Syscall(SYS_MSYNC, uintptr(_p0), uintptr(len(b)), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_msync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_msync msync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Munlock(b []byte) (err error) { @@ -436,45 +332,33 @@ func Munlock(b []byte) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_munlock_trampoline_addr, uintptr(_p0), uintptr(len(b)), 0) + _, _, e1 := Syscall(SYS_MUNLOCK, uintptr(_p0), uintptr(len(b)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munlock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munlock munlock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Munlockall() (err error) { - _, _, e1 := syscall_syscall(libc_munlockall_trampoline_addr, 0, 0, 0) + _, _, e1 := Syscall(SYS_MUNLOCKALL, 0, 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munlockall_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munlockall munlockall "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pipe2(p *[2]_C_int, flags int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_pipe2_trampoline_addr, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) + _, _, e1 := RawSyscall(SYS_PIPE2, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_pipe2_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pipe2 pipe2 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getdents(fd int, buf []byte) (n int, err error) { @@ -484,7 +368,7 @@ func Getdents(fd int, buf []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_getdents_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(buf))) + r0, _, e1 := Syscall(SYS_GETDENTS, uintptr(fd), uintptr(_p0), uintptr(len(buf))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -492,10 +376,6 @@ func Getdents(fd int, buf []byte) (n int, err error) { return } -var libc_getdents_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getdents getdents "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getcwd(buf []byte) (n int, err error) { @@ -505,7 +385,7 @@ func Getcwd(buf []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_getcwd_trampoline_addr, uintptr(_p0), uintptr(len(buf)), 0) + r0, _, e1 := Syscall(SYS___GETCWD, uintptr(_p0), uintptr(len(buf)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -513,24 +393,16 @@ func Getcwd(buf []byte) (n int, err error) { return } -var libc_getcwd_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getcwd getcwd "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func ioctl(fd int, req uint, arg uintptr) (err error) { - _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) + _, _, e1 := Syscall(SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -540,21 +412,17 @@ func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall6(libc_sysctl_trampoline_addr, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) + _, _, e1 := Syscall6(SYS___SYSCTL, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sysctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sysctl sysctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) + r0, _, e1 := Syscall6(SYS_PPOLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -562,10 +430,6 @@ func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, return } -var libc_ppoll_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ppoll ppoll "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Access(path string, mode uint32) (err error) { @@ -574,31 +438,23 @@ func Access(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_access_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_ACCESS, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_access_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_access access "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Adjtime(delta *Timeval, olddelta *Timeval) (err error) { - _, _, e1 := syscall_syscall(libc_adjtime_trampoline_addr, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0) + _, _, e1 := Syscall(SYS_ADJTIME, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_adjtime_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_adjtime adjtime "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chdir(path string) (err error) { @@ -607,17 +463,13 @@ func Chdir(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_CHDIR, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chdir chdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chflags(path string, flags int) (err error) { @@ -626,17 +478,13 @@ func Chflags(path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chflags_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_CHFLAGS, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chflags_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chflags chflags "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chmod(path string, mode uint32) (err error) { @@ -645,17 +493,13 @@ func Chmod(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chmod_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_CHMOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chmod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chmod chmod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chown(path string, uid int, gid int) (err error) { @@ -664,17 +508,13 @@ func Chown(path string, uid int, gid int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chown_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_CHOWN, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chown chown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chroot(path string) (err error) { @@ -683,35 +523,27 @@ func Chroot(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chroot_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_CHROOT, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chroot_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chroot chroot "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Close(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_close_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_CLOSE, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_close_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_close close "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup(fd int) (nfd int, err error) { - r0, _, e1 := syscall_syscall(libc_dup_trampoline_addr, uintptr(fd), 0, 0) + r0, _, e1 := Syscall(SYS_DUP, uintptr(fd), 0, 0) nfd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -719,49 +551,33 @@ func Dup(fd int) (nfd int, err error) { return } -var libc_dup_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup dup "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup2(from int, to int) (err error) { - _, _, e1 := syscall_syscall(libc_dup2_trampoline_addr, uintptr(from), uintptr(to), 0) + _, _, e1 := Syscall(SYS_DUP2, uintptr(from), uintptr(to), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_dup2_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup2 dup2 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup3(from int, to int, flags int) (err error) { - _, _, e1 := syscall_syscall(libc_dup3_trampoline_addr, uintptr(from), uintptr(to), uintptr(flags)) + _, _, e1 := Syscall(SYS_DUP3, uintptr(from), uintptr(to), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_dup3_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup3 dup3 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Exit(code int) { - syscall_syscall(libc_exit_trampoline_addr, uintptr(code), 0, 0) + Syscall(SYS_EXIT, uintptr(code), 0, 0) return } -var libc_exit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_exit exit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { @@ -770,59 +586,43 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_faccessat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FACCESSAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_faccessat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_faccessat faccessat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchdir(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_fchdir_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_FCHDIR, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchdir fchdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchflags(fd int, flags int) (err error) { - _, _, e1 := syscall_syscall(libc_fchflags_trampoline_addr, uintptr(fd), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_FCHFLAGS, uintptr(fd), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchflags_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchflags fchflags "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchmod(fd int, mode uint32) (err error) { - _, _, e1 := syscall_syscall(libc_fchmod_trampoline_addr, uintptr(fd), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_FCHMOD, uintptr(fd), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchmod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchmod fchmod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { @@ -831,31 +631,23 @@ func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fchmodat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FCHMODAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchmodat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchmodat fchmodat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchown(fd int, uid int, gid int) (err error) { - _, _, e1 := syscall_syscall(libc_fchown_trampoline_addr, uintptr(fd), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_FCHOWN, uintptr(fd), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchown fchown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { @@ -864,35 +656,27 @@ func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fchownat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) + _, _, e1 := Syscall6(SYS_FCHOWNAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchownat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchownat fchownat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Flock(fd int, how int) (err error) { - _, _, e1 := syscall_syscall(libc_flock_trampoline_addr, uintptr(fd), uintptr(how), 0) + _, _, e1 := Syscall(SYS_FLOCK, uintptr(fd), uintptr(how), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_flock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_flock flock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fpathconf(fd int, name int) (val int, err error) { - r0, _, e1 := syscall_syscall(libc_fpathconf_trampoline_addr, uintptr(fd), uintptr(name), 0) + r0, _, e1 := Syscall(SYS_FPATHCONF, uintptr(fd), uintptr(name), 0) val = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -900,24 +684,16 @@ func Fpathconf(fd int, name int) (val int, err error) { return } -var libc_fpathconf_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fpathconf fpathconf "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstat(fd int, stat *Stat_t) (err error) { - _, _, e1 := syscall_syscall(libc_fstat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_FSTAT, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstat fstat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { @@ -926,99 +702,71 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fstatat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FSTATAT, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstatat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstatat fstatat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstatfs(fd int, stat *Statfs_t) (err error) { - _, _, e1 := syscall_syscall(libc_fstatfs_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_FSTATFS, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstatfs_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstatfs fstatfs "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fsync(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_fsync_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_FSYNC, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fsync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fsync fsync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Ftruncate(fd int, length int64) (err error) { - _, _, e1 := syscall_syscall(libc_ftruncate_trampoline_addr, uintptr(fd), uintptr(length), uintptr(length>>32)) + _, _, e1 := Syscall6(SYS_FTRUNCATE, uintptr(fd), 0, uintptr(length), uintptr(length>>32), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_ftruncate_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ftruncate ftruncate "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getegid() (egid int) { - r0, _, _ := syscall_rawSyscall(libc_getegid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETEGID, 0, 0, 0) egid = int(r0) return } -var libc_getegid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getegid getegid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Geteuid() (uid int) { - r0, _, _ := syscall_rawSyscall(libc_geteuid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETEUID, 0, 0, 0) uid = int(r0) return } -var libc_geteuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_geteuid geteuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getgid() (gid int) { - r0, _, _ := syscall_rawSyscall(libc_getgid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETGID, 0, 0, 0) gid = int(r0) return } -var libc_getgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getgid getgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpgid(pid int) (pgid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getpgid_trampoline_addr, uintptr(pid), 0, 0) + r0, _, e1 := RawSyscall(SYS_GETPGID, uintptr(pid), 0, 0) pgid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1026,50 +774,34 @@ func Getpgid(pid int) (pgid int, err error) { return } -var libc_getpgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpgid getpgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpgrp() (pgrp int) { - r0, _, _ := syscall_rawSyscall(libc_getpgrp_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPGRP, 0, 0, 0) pgrp = int(r0) return } -var libc_getpgrp_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpgrp getpgrp "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpid() (pid int) { - r0, _, _ := syscall_rawSyscall(libc_getpid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) pid = int(r0) return } -var libc_getpid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpid getpid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getppid() (ppid int) { - r0, _, _ := syscall_rawSyscall(libc_getppid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPPID, 0, 0, 0) ppid = int(r0) return } -var libc_getppid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getppid getppid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpriority(which int, who int) (prio int, err error) { - r0, _, e1 := syscall_syscall(libc_getpriority_trampoline_addr, uintptr(which), uintptr(who), 0) + r0, _, e1 := Syscall(SYS_GETPRIORITY, uintptr(which), uintptr(who), 0) prio = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1077,28 +809,20 @@ func Getpriority(which int, who int) (prio int, err error) { return } -var libc_getpriority_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpriority getpriority "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrlimit(which int, lim *Rlimit) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getrlimit_trampoline_addr, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) + _, _, e1 := RawSyscall(SYS_GETRLIMIT, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getrlimit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrlimit getrlimit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrtable() (rtable int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getrtable_trampoline_addr, 0, 0, 0) + r0, _, e1 := RawSyscall(SYS_GETRTABLE, 0, 0, 0) rtable = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1106,28 +830,20 @@ func Getrtable() (rtable int, err error) { return } -var libc_getrtable_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrtable getrtable "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrusage(who int, rusage *Rusage) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getrusage_trampoline_addr, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) + _, _, e1 := RawSyscall(SYS_GETRUSAGE, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getrusage_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrusage getrusage "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getsid(pid int) (sid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getsid_trampoline_addr, uintptr(pid), 0, 0) + r0, _, e1 := RawSyscall(SYS_GETSID, uintptr(pid), 0, 0) sid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1135,66 +851,46 @@ func Getsid(pid int) (sid int, err error) { return } -var libc_getsid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsid getsid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Gettimeofday(tv *Timeval) (err error) { - _, _, e1 := syscall_rawSyscall(libc_gettimeofday_trampoline_addr, uintptr(unsafe.Pointer(tv)), 0, 0) + _, _, e1 := RawSyscall(SYS_GETTIMEOFDAY, uintptr(unsafe.Pointer(tv)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_gettimeofday_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_gettimeofday gettimeofday "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getuid() (uid int) { - r0, _, _ := syscall_rawSyscall(libc_getuid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETUID, 0, 0, 0) uid = int(r0) return } -var libc_getuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getuid getuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Issetugid() (tainted bool) { - r0, _, _ := syscall_syscall(libc_issetugid_trampoline_addr, 0, 0, 0) + r0, _, _ := Syscall(SYS_ISSETUGID, 0, 0, 0) tainted = bool(r0 != 0) return } -var libc_issetugid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_issetugid issetugid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Kill(pid int, signum syscall.Signal) (err error) { - _, _, e1 := syscall_syscall(libc_kill_trampoline_addr, uintptr(pid), uintptr(signum), 0) + _, _, e1 := Syscall(SYS_KILL, uintptr(pid), uintptr(signum), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_kill_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kill kill "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Kqueue() (fd int, err error) { - r0, _, e1 := syscall_syscall(libc_kqueue_trampoline_addr, 0, 0, 0) + r0, _, e1 := Syscall(SYS_KQUEUE, 0, 0, 0) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1202,10 +898,6 @@ func Kqueue() (fd int, err error) { return } -var libc_kqueue_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kqueue kqueue "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Lchown(path string, uid int, gid int) (err error) { @@ -1214,17 +906,13 @@ func Lchown(path string, uid int, gid int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_lchown_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_LCHOWN, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_lchown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lchown lchown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Link(path string, link string) (err error) { @@ -1238,17 +926,13 @@ func Link(path string, link string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_link_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_LINK, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_link_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_link link "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Linkat(pathfd int, path string, linkfd int, link string, flags int) (err error) { @@ -1262,31 +946,23 @@ func Linkat(pathfd int, path string, linkfd int, link string, flags int) (err er if err != nil { return } - _, _, e1 := syscall_syscall6(libc_linkat_trampoline_addr, uintptr(pathfd), uintptr(unsafe.Pointer(_p0)), uintptr(linkfd), uintptr(unsafe.Pointer(_p1)), uintptr(flags), 0) + _, _, e1 := Syscall6(SYS_LINKAT, uintptr(pathfd), uintptr(unsafe.Pointer(_p0)), uintptr(linkfd), uintptr(unsafe.Pointer(_p1)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_linkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_linkat linkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Listen(s int, backlog int) (err error) { - _, _, e1 := syscall_syscall(libc_listen_trampoline_addr, uintptr(s), uintptr(backlog), 0) + _, _, e1 := Syscall(SYS_LISTEN, uintptr(s), uintptr(backlog), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_listen_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_listen listen "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Lstat(path string, stat *Stat_t) (err error) { @@ -1295,17 +971,13 @@ func Lstat(path string, stat *Stat_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_lstat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_LSTAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_lstat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lstat lstat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkdir(path string, mode uint32) (err error) { @@ -1314,17 +986,13 @@ func Mkdir(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_MKDIR, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkdir mkdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkdirat(dirfd int, path string, mode uint32) (err error) { @@ -1333,17 +1001,13 @@ func Mkdirat(dirfd int, path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkdirat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) + _, _, e1 := Syscall(SYS_MKDIRAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkdirat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkdirat mkdirat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkfifo(path string, mode uint32) (err error) { @@ -1352,17 +1016,13 @@ func Mkfifo(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkfifo_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_MKFIFO, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkfifo_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkfifo mkfifo "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkfifoat(dirfd int, path string, mode uint32) (err error) { @@ -1371,17 +1031,13 @@ func Mkfifoat(dirfd int, path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkfifoat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) + _, _, e1 := Syscall(SYS_MKFIFOAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkfifoat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkfifoat mkfifoat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mknod(path string, mode uint32, dev int) (err error) { @@ -1390,17 +1046,13 @@ func Mknod(path string, mode uint32, dev int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mknod_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) + _, _, e1 := Syscall(SYS_MKNOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mknod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mknod mknod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { @@ -1409,31 +1061,23 @@ func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_mknodat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) + _, _, e1 := Syscall6(SYS_MKNODAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mknodat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mknodat mknodat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Nanosleep(time *Timespec, leftover *Timespec) (err error) { - _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) + _, _, e1 := Syscall(SYS_NANOSLEEP, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_nanosleep_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_nanosleep nanosleep "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Open(path string, mode int, perm uint32) (fd int, err error) { @@ -1442,7 +1086,7 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall(libc_open_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm)) + r0, _, e1 := Syscall(SYS_OPEN, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm)) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1450,10 +1094,6 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { return } -var libc_open_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_open open "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { @@ -1462,7 +1102,7 @@ func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall6(libc_openat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0) + r0, _, e1 := Syscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1470,10 +1110,6 @@ func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { return } -var libc_openat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_openat openat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Pathconf(path string, name int) (val int, err error) { @@ -1482,7 +1118,7 @@ func Pathconf(path string, name int) (val int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall(libc_pathconf_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0) + r0, _, e1 := Syscall(SYS_PATHCONF, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0) val = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1490,10 +1126,6 @@ func Pathconf(path string, name int) (val int, err error) { return } -var libc_pathconf_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pathconf pathconf "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pread(fd int, p []byte, offset int64) (n int, err error) { @@ -1503,7 +1135,7 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_pread_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(offset), uintptr(offset>>32), 0) + r0, _, e1 := Syscall6(SYS_PREAD, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), uintptr(offset>>32)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1511,10 +1143,6 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { return } -var libc_pread_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pread pread "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pwrite(fd int, p []byte, offset int64) (n int, err error) { @@ -1524,7 +1152,7 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_pwrite_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(offset), uintptr(offset>>32), 0) + r0, _, e1 := Syscall6(SYS_PWRITE, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), uintptr(offset>>32)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1532,10 +1160,6 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { return } -var libc_pwrite_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pwrite pwrite "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func read(fd int, p []byte) (n int, err error) { @@ -1545,7 +1169,7 @@ func read(fd int, p []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p))) + r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(_p0), uintptr(len(p))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1553,10 +1177,6 @@ func read(fd int, p []byte) (n int, err error) { return } -var libc_read_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_read read "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Readlink(path string, buf []byte) (n int, err error) { @@ -1571,7 +1191,7 @@ func Readlink(path string, buf []byte) (n int, err error) { } else { _p1 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_readlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf))) + r0, _, e1 := Syscall(SYS_READLINK, uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1579,10 +1199,6 @@ func Readlink(path string, buf []byte) (n int, err error) { return } -var libc_readlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_readlink readlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { @@ -1597,7 +1213,7 @@ func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { } else { _p1 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_readlinkat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf)), 0, 0) + r0, _, e1 := Syscall6(SYS_READLINKAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf)), 0, 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1605,10 +1221,6 @@ func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { return } -var libc_readlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_readlinkat readlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Rename(from string, to string) (err error) { @@ -1622,17 +1234,13 @@ func Rename(from string, to string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_rename_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_RENAME, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_rename_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_rename rename "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Renameat(fromfd int, from string, tofd int, to string) (err error) { @@ -1646,17 +1254,13 @@ func Renameat(fromfd int, from string, tofd int, to string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_renameat_trampoline_addr, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), 0, 0) + _, _, e1 := Syscall6(SYS_RENAMEAT, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_renameat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_renameat renameat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Revoke(path string) (err error) { @@ -1665,17 +1269,13 @@ func Revoke(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_revoke_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_REVOKE, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_revoke_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_revoke revoke "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Rmdir(path string) (err error) { @@ -1684,21 +1284,17 @@ func Rmdir(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_rmdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_RMDIR, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_rmdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_rmdir rmdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { - r0, r1, e1 := syscall_syscall6(libc_lseek_trampoline_addr, uintptr(fd), uintptr(offset), uintptr(offset>>32), uintptr(whence), 0, 0) + r0, r1, e1 := Syscall6(SYS_LSEEK, uintptr(fd), 0, uintptr(offset), uintptr(offset>>32), uintptr(whence), 0) newoffset = int64(int64(r1)<<32 | int64(r0)) if e1 != 0 { err = errnoErr(e1) @@ -1706,14 +1302,10 @@ func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { return } -var libc_lseek_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lseek lseek "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_select_trampoline_addr, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) + r0, _, e1 := Syscall6(SYS_SELECT, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1721,52 +1313,36 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err return } -var libc_select_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_select select "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setegid(egid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setegid_trampoline_addr, uintptr(egid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETEGID, uintptr(egid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setegid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setegid setegid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Seteuid(euid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_seteuid_trampoline_addr, uintptr(euid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETEUID, uintptr(euid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_seteuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_seteuid seteuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setgid(gid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setgid_trampoline_addr, uintptr(gid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETGID, uintptr(gid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setgid setgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setlogin(name string) (err error) { @@ -1775,133 +1351,97 @@ func Setlogin(name string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_setlogin_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_SETLOGIN, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setlogin_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setlogin setlogin "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setpgid(pid int, pgid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setpgid_trampoline_addr, uintptr(pid), uintptr(pgid), 0) + _, _, e1 := RawSyscall(SYS_SETPGID, uintptr(pid), uintptr(pgid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setpgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setpgid setpgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setpriority(which int, who int, prio int) (err error) { - _, _, e1 := syscall_syscall(libc_setpriority_trampoline_addr, uintptr(which), uintptr(who), uintptr(prio)) + _, _, e1 := Syscall(SYS_SETPRIORITY, uintptr(which), uintptr(who), uintptr(prio)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setpriority_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setpriority setpriority "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setregid(rgid int, egid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setregid_trampoline_addr, uintptr(rgid), uintptr(egid), 0) + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setregid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setregid setregid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setreuid(ruid int, euid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setreuid_trampoline_addr, uintptr(ruid), uintptr(euid), 0) + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setreuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setreuid setreuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setresgid(rgid int, egid int, sgid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setresgid_trampoline_addr, uintptr(rgid), uintptr(egid), uintptr(sgid)) + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setresgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setresgid setresgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setresuid(ruid int, euid int, suid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setresuid_trampoline_addr, uintptr(ruid), uintptr(euid), uintptr(suid)) + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setresuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setresuid setresuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setrlimit(which int, lim *Rlimit) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setrlimit_trampoline_addr, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) + _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setrlimit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setrlimit setrlimit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setrtable(rtable int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setrtable_trampoline_addr, uintptr(rtable), 0, 0) + _, _, e1 := RawSyscall(SYS_SETRTABLE, uintptr(rtable), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setrtable_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setrtable setrtable "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setsid() (pid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_setsid_trampoline_addr, 0, 0, 0) + r0, _, e1 := RawSyscall(SYS_SETSID, 0, 0, 0) pid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1909,38 +1449,26 @@ func Setsid() (pid int, err error) { return } -var libc_setsid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setsid setsid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Settimeofday(tp *Timeval) (err error) { - _, _, e1 := syscall_rawSyscall(libc_settimeofday_trampoline_addr, uintptr(unsafe.Pointer(tp)), 0, 0) + _, _, e1 := RawSyscall(SYS_SETTIMEOFDAY, uintptr(unsafe.Pointer(tp)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_settimeofday_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_settimeofday settimeofday "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setuid(uid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setuid_trampoline_addr, uintptr(uid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETUID, uintptr(uid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setuid setuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Stat(path string, stat *Stat_t) (err error) { @@ -1949,17 +1477,13 @@ func Stat(path string, stat *Stat_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_stat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_STAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_stat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_stat stat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Statfs(path string, stat *Statfs_t) (err error) { @@ -1968,17 +1492,13 @@ func Statfs(path string, stat *Statfs_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_statfs_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_STATFS, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_statfs_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_statfs statfs "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Symlink(path string, link string) (err error) { @@ -1992,17 +1512,13 @@ func Symlink(path string, link string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_symlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_SYMLINK, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_symlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_symlink symlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Symlinkat(oldpath string, newdirfd int, newpath string) (err error) { @@ -2016,31 +1532,23 @@ func Symlinkat(oldpath string, newdirfd int, newpath string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_symlinkat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1))) + _, _, e1 := Syscall(SYS_SYMLINKAT, uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_symlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_symlinkat symlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Sync() (err error) { - _, _, e1 := syscall_syscall(libc_sync_trampoline_addr, 0, 0, 0) + _, _, e1 := Syscall(SYS_SYNC, 0, 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sync sync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Truncate(path string, length int64) (err error) { @@ -2049,29 +1557,21 @@ func Truncate(path string, length int64) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_truncate_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(length), uintptr(length>>32)) + _, _, e1 := Syscall6(SYS_TRUNCATE, uintptr(unsafe.Pointer(_p0)), 0, uintptr(length), uintptr(length>>32), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_truncate_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_truncate truncate "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Umask(newmask int) (oldmask int) { - r0, _, _ := syscall_syscall(libc_umask_trampoline_addr, uintptr(newmask), 0, 0) + r0, _, _ := Syscall(SYS_UMASK, uintptr(newmask), 0, 0) oldmask = int(r0) return } -var libc_umask_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_umask umask "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unlink(path string) (err error) { @@ -2080,17 +1580,13 @@ func Unlink(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_UNLINK, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unlink unlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unlinkat(dirfd int, path string, flags int) (err error) { @@ -2099,17 +1595,13 @@ func Unlinkat(dirfd int, path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unlinkat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) + _, _, e1 := Syscall(SYS_UNLINKAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unlinkat unlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unmount(path string, flags int) (err error) { @@ -2118,17 +1610,13 @@ func Unmount(path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unmount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_UNMOUNT, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unmount_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unmount unmount "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func write(fd int, p []byte) (n int, err error) { @@ -2138,7 +1626,7 @@ func write(fd int, p []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p))) + r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(_p0), uintptr(len(p))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2146,14 +1634,10 @@ func write(fd int, p []byte) (n int, err error) { return } -var libc_write_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_write write "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) { - r0, _, e1 := syscall_syscall9(libc_mmap_trampoline_addr, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), uintptr(pos), uintptr(pos>>32), 0, 0) + r0, _, e1 := Syscall9(SYS_MMAP, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), 0, uintptr(pos), uintptr(pos>>32), 0) ret = uintptr(r0) if e1 != 0 { err = errnoErr(e1) @@ -2161,28 +1645,20 @@ func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) ( return } -var libc_mmap_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mmap mmap "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func munmap(addr uintptr, length uintptr) (err error) { - _, _, e1 := syscall_syscall(libc_munmap_trampoline_addr, uintptr(addr), uintptr(length), 0) + _, _, e1 := Syscall(SYS_MUNMAP, uintptr(addr), uintptr(length), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munmap_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munmap munmap "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) + r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2193,7 +1669,7 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) + r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2209,13 +1685,9 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error if err != nil { return } - _, _, e1 := syscall_syscall6(libc_utimensat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_UTIMENSAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } - -var libc_utimensat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_utimensat utimensat "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go index 98446d2..04db8fa 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_amd64.go @@ -1,4 +1,4 @@ -// go run mksyscall.go -openbsd -libc -tags openbsd,amd64 syscall_bsd.go syscall_openbsd.go syscall_openbsd_amd64.go +// go run mksyscall.go -openbsd -tags openbsd,amd64 syscall_bsd.go syscall_openbsd.go syscall_openbsd_amd64.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build openbsd && amd64 @@ -16,7 +16,7 @@ var _ syscall.Errno // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getgroups(ngid int, gid *_Gid_t) (n int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getgroups_trampoline_addr, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) + r0, _, e1 := RawSyscall(SYS_GETGROUPS, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -24,28 +24,20 @@ func getgroups(ngid int, gid *_Gid_t) (n int, err error) { return } -var libc_getgroups_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getgroups getgroups "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func setgroups(ngid int, gid *_Gid_t) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setgroups_trampoline_addr, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) + _, _, e1 := RawSyscall(SYS_SETGROUPS, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setgroups_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setgroups setgroups "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err error) { - r0, _, e1 := syscall_syscall6(libc_wait4_trampoline_addr, uintptr(pid), uintptr(unsafe.Pointer(wstatus)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) + r0, _, e1 := Syscall6(SYS_WAIT4, uintptr(pid), uintptr(unsafe.Pointer(wstatus)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) wpid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -53,14 +45,10 @@ func wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err return } -var libc_wait4_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_wait4 wait4 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { - r0, _, e1 := syscall_syscall(libc_accept_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + r0, _, e1 := Syscall(SYS_ACCEPT, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -68,42 +56,30 @@ func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { return } -var libc_accept_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_accept accept "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { - _, _, e1 := syscall_syscall(libc_bind_trampoline_addr, uintptr(s), uintptr(addr), uintptr(addrlen)) + _, _, e1 := Syscall(SYS_BIND, uintptr(s), uintptr(addr), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_bind_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_bind bind "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func connect(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { - _, _, e1 := syscall_syscall(libc_connect_trampoline_addr, uintptr(s), uintptr(addr), uintptr(addrlen)) + _, _, e1 := Syscall(SYS_CONNECT, uintptr(s), uintptr(addr), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_connect_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_connect connect "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func socket(domain int, typ int, proto int) (fd int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_socket_trampoline_addr, uintptr(domain), uintptr(typ), uintptr(proto)) + r0, _, e1 := RawSyscall(SYS_SOCKET, uintptr(domain), uintptr(typ), uintptr(proto)) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -111,94 +87,66 @@ func socket(domain int, typ int, proto int) (fd int, err error) { return } -var libc_socket_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_socket socket "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen) (err error) { - _, _, e1 := syscall_syscall6(libc_getsockopt_trampoline_addr, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) + _, _, e1 := Syscall6(SYS_GETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getsockopt_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsockopt getsockopt "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) { - _, _, e1 := syscall_syscall6(libc_setsockopt_trampoline_addr, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) + _, _, e1 := Syscall6(SYS_SETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setsockopt_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setsockopt setsockopt "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getpeername(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getpeername_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + _, _, e1 := RawSyscall(SYS_GETPEERNAME, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getpeername_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpeername getpeername "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getsockname(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getsockname_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + _, _, e1 := RawSyscall(SYS_GETSOCKNAME, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getsockname_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsockname getsockname "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Shutdown(s int, how int) (err error) { - _, _, e1 := syscall_syscall(libc_shutdown_trampoline_addr, uintptr(s), uintptr(how), 0) + _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(s), uintptr(how), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_shutdown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_shutdown shutdown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func socketpair(domain int, typ int, proto int, fd *[2]int32) (err error) { - _, _, e1 := syscall_rawSyscall6(libc_socketpair_trampoline_addr, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) + _, _, e1 := RawSyscall6(SYS_SOCKETPAIR, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_socketpair_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_socketpair socketpair "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error) { @@ -208,7 +156,7 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_recvfrom_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) + r0, _, e1 := Syscall6(SYS_RECVFROM, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -216,10 +164,6 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl return } -var libc_recvfrom_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_recvfrom recvfrom "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) (err error) { @@ -229,21 +173,17 @@ func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) ( } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall6(libc_sendto_trampoline_addr, uintptr(s), uintptr(_p0), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen)) + _, _, e1 := Syscall6(SYS_SENDTO, uintptr(s), uintptr(_p0), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sendto_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sendto sendto "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_recvmsg_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + r0, _, e1 := Syscall(SYS_RECVMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -251,14 +191,10 @@ func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { return } -var libc_recvmsg_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_recvmsg recvmsg "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_sendmsg_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + r0, _, e1 := Syscall(SYS_SENDMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -266,14 +202,10 @@ func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { return } -var libc_sendmsg_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sendmsg sendmsg "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func kevent(kq int, change unsafe.Pointer, nchange int, event unsafe.Pointer, nevent int, timeout *Timespec) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_kevent_trampoline_addr, uintptr(kq), uintptr(change), uintptr(nchange), uintptr(event), uintptr(nevent), uintptr(unsafe.Pointer(timeout))) + r0, _, e1 := Syscall6(SYS_KEVENT, uintptr(kq), uintptr(change), uintptr(nchange), uintptr(event), uintptr(nevent), uintptr(unsafe.Pointer(timeout))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -281,10 +213,6 @@ func kevent(kq int, change unsafe.Pointer, nchange int, event unsafe.Pointer, ne return } -var libc_kevent_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kevent kevent "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func utimes(path string, timeval *[2]Timeval) (err error) { @@ -293,35 +221,27 @@ func utimes(path string, timeval *[2]Timeval) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_utimes_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(timeval)), 0) + _, _, e1 := Syscall(SYS_UTIMES, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(timeval)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_utimes_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_utimes utimes "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func futimes(fd int, timeval *[2]Timeval) (err error) { - _, _, e1 := syscall_syscall(libc_futimes_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(timeval)), 0) + _, _, e1 := Syscall(SYS_FUTIMES, uintptr(fd), uintptr(unsafe.Pointer(timeval)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_futimes_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_futimes futimes "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_poll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) + r0, _, e1 := Syscall(SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -329,10 +249,6 @@ func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { return } -var libc_poll_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_poll poll "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Madvise(b []byte, behav int) (err error) { @@ -342,17 +258,13 @@ func Madvise(b []byte, behav int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_madvise_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(behav)) + _, _, e1 := Syscall(SYS_MADVISE, uintptr(_p0), uintptr(len(b)), uintptr(behav)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_madvise_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_madvise madvise "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mlock(b []byte) (err error) { @@ -362,31 +274,23 @@ func Mlock(b []byte) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_mlock_trampoline_addr, uintptr(_p0), uintptr(len(b)), 0) + _, _, e1 := Syscall(SYS_MLOCK, uintptr(_p0), uintptr(len(b)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mlock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mlock mlock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mlockall(flags int) (err error) { - _, _, e1 := syscall_syscall(libc_mlockall_trampoline_addr, uintptr(flags), 0, 0) + _, _, e1 := Syscall(SYS_MLOCKALL, uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mlockall_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mlockall mlockall "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mprotect(b []byte, prot int) (err error) { @@ -396,17 +300,13 @@ func Mprotect(b []byte, prot int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_mprotect_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(prot)) + _, _, e1 := Syscall(SYS_MPROTECT, uintptr(_p0), uintptr(len(b)), uintptr(prot)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mprotect_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mprotect mprotect "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Msync(b []byte, flags int) (err error) { @@ -416,17 +316,13 @@ func Msync(b []byte, flags int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_msync_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(flags)) + _, _, e1 := Syscall(SYS_MSYNC, uintptr(_p0), uintptr(len(b)), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_msync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_msync msync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Munlock(b []byte) (err error) { @@ -436,45 +332,33 @@ func Munlock(b []byte) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_munlock_trampoline_addr, uintptr(_p0), uintptr(len(b)), 0) + _, _, e1 := Syscall(SYS_MUNLOCK, uintptr(_p0), uintptr(len(b)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munlock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munlock munlock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Munlockall() (err error) { - _, _, e1 := syscall_syscall(libc_munlockall_trampoline_addr, 0, 0, 0) + _, _, e1 := Syscall(SYS_MUNLOCKALL, 0, 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munlockall_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munlockall munlockall "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pipe2(p *[2]_C_int, flags int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_pipe2_trampoline_addr, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) + _, _, e1 := RawSyscall(SYS_PIPE2, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_pipe2_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pipe2 pipe2 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getdents(fd int, buf []byte) (n int, err error) { @@ -484,7 +368,7 @@ func Getdents(fd int, buf []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_getdents_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(buf))) + r0, _, e1 := Syscall(SYS_GETDENTS, uintptr(fd), uintptr(_p0), uintptr(len(buf))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -492,10 +376,6 @@ func Getdents(fd int, buf []byte) (n int, err error) { return } -var libc_getdents_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getdents getdents "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getcwd(buf []byte) (n int, err error) { @@ -505,7 +385,7 @@ func Getcwd(buf []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_getcwd_trampoline_addr, uintptr(_p0), uintptr(len(buf)), 0) + r0, _, e1 := Syscall(SYS___GETCWD, uintptr(_p0), uintptr(len(buf)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -513,24 +393,16 @@ func Getcwd(buf []byte) (n int, err error) { return } -var libc_getcwd_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getcwd getcwd "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func ioctl(fd int, req uint, arg uintptr) (err error) { - _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) + _, _, e1 := Syscall(SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -540,21 +412,17 @@ func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall6(libc_sysctl_trampoline_addr, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) + _, _, e1 := Syscall6(SYS___SYSCTL, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sysctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sysctl sysctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) + r0, _, e1 := Syscall6(SYS_PPOLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -562,10 +430,6 @@ func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, return } -var libc_ppoll_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ppoll ppoll "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Access(path string, mode uint32) (err error) { @@ -574,31 +438,23 @@ func Access(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_access_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_ACCESS, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_access_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_access access "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Adjtime(delta *Timeval, olddelta *Timeval) (err error) { - _, _, e1 := syscall_syscall(libc_adjtime_trampoline_addr, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0) + _, _, e1 := Syscall(SYS_ADJTIME, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_adjtime_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_adjtime adjtime "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chdir(path string) (err error) { @@ -607,17 +463,13 @@ func Chdir(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_CHDIR, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chdir chdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chflags(path string, flags int) (err error) { @@ -626,17 +478,13 @@ func Chflags(path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chflags_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_CHFLAGS, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chflags_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chflags chflags "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chmod(path string, mode uint32) (err error) { @@ -645,17 +493,13 @@ func Chmod(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chmod_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_CHMOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chmod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chmod chmod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chown(path string, uid int, gid int) (err error) { @@ -664,17 +508,13 @@ func Chown(path string, uid int, gid int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chown_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_CHOWN, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chown chown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chroot(path string) (err error) { @@ -683,35 +523,27 @@ func Chroot(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chroot_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_CHROOT, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chroot_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chroot chroot "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Close(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_close_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_CLOSE, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_close_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_close close "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup(fd int) (nfd int, err error) { - r0, _, e1 := syscall_syscall(libc_dup_trampoline_addr, uintptr(fd), 0, 0) + r0, _, e1 := Syscall(SYS_DUP, uintptr(fd), 0, 0) nfd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -719,49 +551,33 @@ func Dup(fd int) (nfd int, err error) { return } -var libc_dup_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup dup "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup2(from int, to int) (err error) { - _, _, e1 := syscall_syscall(libc_dup2_trampoline_addr, uintptr(from), uintptr(to), 0) + _, _, e1 := Syscall(SYS_DUP2, uintptr(from), uintptr(to), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_dup2_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup2 dup2 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup3(from int, to int, flags int) (err error) { - _, _, e1 := syscall_syscall(libc_dup3_trampoline_addr, uintptr(from), uintptr(to), uintptr(flags)) + _, _, e1 := Syscall(SYS_DUP3, uintptr(from), uintptr(to), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_dup3_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup3 dup3 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Exit(code int) { - syscall_syscall(libc_exit_trampoline_addr, uintptr(code), 0, 0) + Syscall(SYS_EXIT, uintptr(code), 0, 0) return } -var libc_exit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_exit exit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { @@ -770,59 +586,43 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_faccessat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FACCESSAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_faccessat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_faccessat faccessat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchdir(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_fchdir_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_FCHDIR, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchdir fchdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchflags(fd int, flags int) (err error) { - _, _, e1 := syscall_syscall(libc_fchflags_trampoline_addr, uintptr(fd), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_FCHFLAGS, uintptr(fd), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchflags_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchflags fchflags "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchmod(fd int, mode uint32) (err error) { - _, _, e1 := syscall_syscall(libc_fchmod_trampoline_addr, uintptr(fd), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_FCHMOD, uintptr(fd), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchmod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchmod fchmod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { @@ -831,31 +631,23 @@ func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fchmodat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FCHMODAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchmodat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchmodat fchmodat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchown(fd int, uid int, gid int) (err error) { - _, _, e1 := syscall_syscall(libc_fchown_trampoline_addr, uintptr(fd), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_FCHOWN, uintptr(fd), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchown fchown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { @@ -864,35 +656,27 @@ func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fchownat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) + _, _, e1 := Syscall6(SYS_FCHOWNAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchownat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchownat fchownat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Flock(fd int, how int) (err error) { - _, _, e1 := syscall_syscall(libc_flock_trampoline_addr, uintptr(fd), uintptr(how), 0) + _, _, e1 := Syscall(SYS_FLOCK, uintptr(fd), uintptr(how), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_flock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_flock flock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fpathconf(fd int, name int) (val int, err error) { - r0, _, e1 := syscall_syscall(libc_fpathconf_trampoline_addr, uintptr(fd), uintptr(name), 0) + r0, _, e1 := Syscall(SYS_FPATHCONF, uintptr(fd), uintptr(name), 0) val = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -900,24 +684,16 @@ func Fpathconf(fd int, name int) (val int, err error) { return } -var libc_fpathconf_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fpathconf fpathconf "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstat(fd int, stat *Stat_t) (err error) { - _, _, e1 := syscall_syscall(libc_fstat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_FSTAT, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstat fstat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { @@ -926,99 +702,71 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fstatat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FSTATAT, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstatat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstatat fstatat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstatfs(fd int, stat *Statfs_t) (err error) { - _, _, e1 := syscall_syscall(libc_fstatfs_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_FSTATFS, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstatfs_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstatfs fstatfs "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fsync(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_fsync_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_FSYNC, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fsync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fsync fsync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Ftruncate(fd int, length int64) (err error) { - _, _, e1 := syscall_syscall(libc_ftruncate_trampoline_addr, uintptr(fd), uintptr(length), 0) + _, _, e1 := Syscall(SYS_FTRUNCATE, uintptr(fd), 0, uintptr(length)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_ftruncate_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ftruncate ftruncate "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getegid() (egid int) { - r0, _, _ := syscall_rawSyscall(libc_getegid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETEGID, 0, 0, 0) egid = int(r0) return } -var libc_getegid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getegid getegid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Geteuid() (uid int) { - r0, _, _ := syscall_rawSyscall(libc_geteuid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETEUID, 0, 0, 0) uid = int(r0) return } -var libc_geteuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_geteuid geteuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getgid() (gid int) { - r0, _, _ := syscall_rawSyscall(libc_getgid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETGID, 0, 0, 0) gid = int(r0) return } -var libc_getgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getgid getgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpgid(pid int) (pgid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getpgid_trampoline_addr, uintptr(pid), 0, 0) + r0, _, e1 := RawSyscall(SYS_GETPGID, uintptr(pid), 0, 0) pgid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1026,50 +774,34 @@ func Getpgid(pid int) (pgid int, err error) { return } -var libc_getpgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpgid getpgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpgrp() (pgrp int) { - r0, _, _ := syscall_rawSyscall(libc_getpgrp_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPGRP, 0, 0, 0) pgrp = int(r0) return } -var libc_getpgrp_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpgrp getpgrp "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpid() (pid int) { - r0, _, _ := syscall_rawSyscall(libc_getpid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) pid = int(r0) return } -var libc_getpid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpid getpid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getppid() (ppid int) { - r0, _, _ := syscall_rawSyscall(libc_getppid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPPID, 0, 0, 0) ppid = int(r0) return } -var libc_getppid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getppid getppid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpriority(which int, who int) (prio int, err error) { - r0, _, e1 := syscall_syscall(libc_getpriority_trampoline_addr, uintptr(which), uintptr(who), 0) + r0, _, e1 := Syscall(SYS_GETPRIORITY, uintptr(which), uintptr(who), 0) prio = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1077,28 +809,20 @@ func Getpriority(which int, who int) (prio int, err error) { return } -var libc_getpriority_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpriority getpriority "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrlimit(which int, lim *Rlimit) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getrlimit_trampoline_addr, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) + _, _, e1 := RawSyscall(SYS_GETRLIMIT, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getrlimit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrlimit getrlimit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrtable() (rtable int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getrtable_trampoline_addr, 0, 0, 0) + r0, _, e1 := RawSyscall(SYS_GETRTABLE, 0, 0, 0) rtable = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1106,28 +830,20 @@ func Getrtable() (rtable int, err error) { return } -var libc_getrtable_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrtable getrtable "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrusage(who int, rusage *Rusage) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getrusage_trampoline_addr, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) + _, _, e1 := RawSyscall(SYS_GETRUSAGE, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getrusage_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrusage getrusage "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getsid(pid int) (sid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getsid_trampoline_addr, uintptr(pid), 0, 0) + r0, _, e1 := RawSyscall(SYS_GETSID, uintptr(pid), 0, 0) sid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1135,66 +851,46 @@ func Getsid(pid int) (sid int, err error) { return } -var libc_getsid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsid getsid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Gettimeofday(tv *Timeval) (err error) { - _, _, e1 := syscall_rawSyscall(libc_gettimeofday_trampoline_addr, uintptr(unsafe.Pointer(tv)), 0, 0) + _, _, e1 := RawSyscall(SYS_GETTIMEOFDAY, uintptr(unsafe.Pointer(tv)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_gettimeofday_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_gettimeofday gettimeofday "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getuid() (uid int) { - r0, _, _ := syscall_rawSyscall(libc_getuid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETUID, 0, 0, 0) uid = int(r0) return } -var libc_getuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getuid getuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Issetugid() (tainted bool) { - r0, _, _ := syscall_syscall(libc_issetugid_trampoline_addr, 0, 0, 0) + r0, _, _ := Syscall(SYS_ISSETUGID, 0, 0, 0) tainted = bool(r0 != 0) return } -var libc_issetugid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_issetugid issetugid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Kill(pid int, signum syscall.Signal) (err error) { - _, _, e1 := syscall_syscall(libc_kill_trampoline_addr, uintptr(pid), uintptr(signum), 0) + _, _, e1 := Syscall(SYS_KILL, uintptr(pid), uintptr(signum), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_kill_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kill kill "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Kqueue() (fd int, err error) { - r0, _, e1 := syscall_syscall(libc_kqueue_trampoline_addr, 0, 0, 0) + r0, _, e1 := Syscall(SYS_KQUEUE, 0, 0, 0) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1202,10 +898,6 @@ func Kqueue() (fd int, err error) { return } -var libc_kqueue_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kqueue kqueue "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Lchown(path string, uid int, gid int) (err error) { @@ -1214,17 +906,13 @@ func Lchown(path string, uid int, gid int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_lchown_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_LCHOWN, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_lchown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lchown lchown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Link(path string, link string) (err error) { @@ -1238,17 +926,13 @@ func Link(path string, link string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_link_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_LINK, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_link_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_link link "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Linkat(pathfd int, path string, linkfd int, link string, flags int) (err error) { @@ -1262,31 +946,23 @@ func Linkat(pathfd int, path string, linkfd int, link string, flags int) (err er if err != nil { return } - _, _, e1 := syscall_syscall6(libc_linkat_trampoline_addr, uintptr(pathfd), uintptr(unsafe.Pointer(_p0)), uintptr(linkfd), uintptr(unsafe.Pointer(_p1)), uintptr(flags), 0) + _, _, e1 := Syscall6(SYS_LINKAT, uintptr(pathfd), uintptr(unsafe.Pointer(_p0)), uintptr(linkfd), uintptr(unsafe.Pointer(_p1)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_linkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_linkat linkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Listen(s int, backlog int) (err error) { - _, _, e1 := syscall_syscall(libc_listen_trampoline_addr, uintptr(s), uintptr(backlog), 0) + _, _, e1 := Syscall(SYS_LISTEN, uintptr(s), uintptr(backlog), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_listen_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_listen listen "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Lstat(path string, stat *Stat_t) (err error) { @@ -1295,17 +971,13 @@ func Lstat(path string, stat *Stat_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_lstat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_LSTAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_lstat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lstat lstat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkdir(path string, mode uint32) (err error) { @@ -1314,17 +986,13 @@ func Mkdir(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_MKDIR, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkdir mkdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkdirat(dirfd int, path string, mode uint32) (err error) { @@ -1333,17 +1001,13 @@ func Mkdirat(dirfd int, path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkdirat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) + _, _, e1 := Syscall(SYS_MKDIRAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkdirat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkdirat mkdirat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkfifo(path string, mode uint32) (err error) { @@ -1352,17 +1016,13 @@ func Mkfifo(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkfifo_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_MKFIFO, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkfifo_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkfifo mkfifo "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkfifoat(dirfd int, path string, mode uint32) (err error) { @@ -1371,17 +1031,13 @@ func Mkfifoat(dirfd int, path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkfifoat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) + _, _, e1 := Syscall(SYS_MKFIFOAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkfifoat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkfifoat mkfifoat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mknod(path string, mode uint32, dev int) (err error) { @@ -1390,17 +1046,13 @@ func Mknod(path string, mode uint32, dev int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mknod_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) + _, _, e1 := Syscall(SYS_MKNOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mknod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mknod mknod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { @@ -1409,31 +1061,23 @@ func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_mknodat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) + _, _, e1 := Syscall6(SYS_MKNODAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mknodat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mknodat mknodat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Nanosleep(time *Timespec, leftover *Timespec) (err error) { - _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) + _, _, e1 := Syscall(SYS_NANOSLEEP, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_nanosleep_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_nanosleep nanosleep "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Open(path string, mode int, perm uint32) (fd int, err error) { @@ -1442,7 +1086,7 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall(libc_open_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm)) + r0, _, e1 := Syscall(SYS_OPEN, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm)) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1450,10 +1094,6 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { return } -var libc_open_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_open open "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { @@ -1462,7 +1102,7 @@ func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall6(libc_openat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0) + r0, _, e1 := Syscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1470,10 +1110,6 @@ func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { return } -var libc_openat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_openat openat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Pathconf(path string, name int) (val int, err error) { @@ -1482,7 +1118,7 @@ func Pathconf(path string, name int) (val int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall(libc_pathconf_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0) + r0, _, e1 := Syscall(SYS_PATHCONF, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0) val = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1490,10 +1126,6 @@ func Pathconf(path string, name int) (val int, err error) { return } -var libc_pathconf_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pathconf pathconf "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pread(fd int, p []byte, offset int64) (n int, err error) { @@ -1503,7 +1135,7 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_pread_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(offset), 0, 0) + r0, _, e1 := Syscall6(SYS_PREAD, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1511,10 +1143,6 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { return } -var libc_pread_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pread pread "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pwrite(fd int, p []byte, offset int64) (n int, err error) { @@ -1524,7 +1152,7 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_pwrite_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(offset), 0, 0) + r0, _, e1 := Syscall6(SYS_PWRITE, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1532,10 +1160,6 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { return } -var libc_pwrite_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pwrite pwrite "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func read(fd int, p []byte) (n int, err error) { @@ -1545,7 +1169,7 @@ func read(fd int, p []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p))) + r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(_p0), uintptr(len(p))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1553,10 +1177,6 @@ func read(fd int, p []byte) (n int, err error) { return } -var libc_read_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_read read "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Readlink(path string, buf []byte) (n int, err error) { @@ -1571,7 +1191,7 @@ func Readlink(path string, buf []byte) (n int, err error) { } else { _p1 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_readlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf))) + r0, _, e1 := Syscall(SYS_READLINK, uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1579,10 +1199,6 @@ func Readlink(path string, buf []byte) (n int, err error) { return } -var libc_readlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_readlink readlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { @@ -1597,7 +1213,7 @@ func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { } else { _p1 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_readlinkat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf)), 0, 0) + r0, _, e1 := Syscall6(SYS_READLINKAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf)), 0, 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1605,10 +1221,6 @@ func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { return } -var libc_readlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_readlinkat readlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Rename(from string, to string) (err error) { @@ -1622,17 +1234,13 @@ func Rename(from string, to string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_rename_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_RENAME, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_rename_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_rename rename "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Renameat(fromfd int, from string, tofd int, to string) (err error) { @@ -1646,17 +1254,13 @@ func Renameat(fromfd int, from string, tofd int, to string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_renameat_trampoline_addr, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), 0, 0) + _, _, e1 := Syscall6(SYS_RENAMEAT, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_renameat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_renameat renameat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Revoke(path string) (err error) { @@ -1665,17 +1269,13 @@ func Revoke(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_revoke_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_REVOKE, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_revoke_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_revoke revoke "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Rmdir(path string) (err error) { @@ -1684,21 +1284,17 @@ func Rmdir(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_rmdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_RMDIR, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_rmdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_rmdir rmdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { - r0, _, e1 := syscall_syscall(libc_lseek_trampoline_addr, uintptr(fd), uintptr(offset), uintptr(whence)) + r0, _, e1 := Syscall6(SYS_LSEEK, uintptr(fd), 0, uintptr(offset), uintptr(whence), 0, 0) newoffset = int64(r0) if e1 != 0 { err = errnoErr(e1) @@ -1706,14 +1302,10 @@ func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { return } -var libc_lseek_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lseek lseek "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_select_trampoline_addr, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) + r0, _, e1 := Syscall6(SYS_SELECT, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1721,52 +1313,36 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err return } -var libc_select_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_select select "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setegid(egid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setegid_trampoline_addr, uintptr(egid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETEGID, uintptr(egid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setegid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setegid setegid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Seteuid(euid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_seteuid_trampoline_addr, uintptr(euid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETEUID, uintptr(euid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_seteuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_seteuid seteuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setgid(gid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setgid_trampoline_addr, uintptr(gid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETGID, uintptr(gid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setgid setgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setlogin(name string) (err error) { @@ -1775,133 +1351,97 @@ func Setlogin(name string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_setlogin_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_SETLOGIN, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setlogin_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setlogin setlogin "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setpgid(pid int, pgid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setpgid_trampoline_addr, uintptr(pid), uintptr(pgid), 0) + _, _, e1 := RawSyscall(SYS_SETPGID, uintptr(pid), uintptr(pgid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setpgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setpgid setpgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setpriority(which int, who int, prio int) (err error) { - _, _, e1 := syscall_syscall(libc_setpriority_trampoline_addr, uintptr(which), uintptr(who), uintptr(prio)) + _, _, e1 := Syscall(SYS_SETPRIORITY, uintptr(which), uintptr(who), uintptr(prio)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setpriority_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setpriority setpriority "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setregid(rgid int, egid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setregid_trampoline_addr, uintptr(rgid), uintptr(egid), 0) + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setregid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setregid setregid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setreuid(ruid int, euid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setreuid_trampoline_addr, uintptr(ruid), uintptr(euid), 0) + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setreuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setreuid setreuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setresgid(rgid int, egid int, sgid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setresgid_trampoline_addr, uintptr(rgid), uintptr(egid), uintptr(sgid)) + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setresgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setresgid setresgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setresuid(ruid int, euid int, suid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setresuid_trampoline_addr, uintptr(ruid), uintptr(euid), uintptr(suid)) + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setresuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setresuid setresuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setrlimit(which int, lim *Rlimit) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setrlimit_trampoline_addr, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) + _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setrlimit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setrlimit setrlimit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setrtable(rtable int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setrtable_trampoline_addr, uintptr(rtable), 0, 0) + _, _, e1 := RawSyscall(SYS_SETRTABLE, uintptr(rtable), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setrtable_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setrtable setrtable "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setsid() (pid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_setsid_trampoline_addr, 0, 0, 0) + r0, _, e1 := RawSyscall(SYS_SETSID, 0, 0, 0) pid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1909,38 +1449,26 @@ func Setsid() (pid int, err error) { return } -var libc_setsid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setsid setsid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Settimeofday(tp *Timeval) (err error) { - _, _, e1 := syscall_rawSyscall(libc_settimeofday_trampoline_addr, uintptr(unsafe.Pointer(tp)), 0, 0) + _, _, e1 := RawSyscall(SYS_SETTIMEOFDAY, uintptr(unsafe.Pointer(tp)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_settimeofday_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_settimeofday settimeofday "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setuid(uid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setuid_trampoline_addr, uintptr(uid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETUID, uintptr(uid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setuid setuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Stat(path string, stat *Stat_t) (err error) { @@ -1949,17 +1477,13 @@ func Stat(path string, stat *Stat_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_stat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_STAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_stat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_stat stat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Statfs(path string, stat *Statfs_t) (err error) { @@ -1968,17 +1492,13 @@ func Statfs(path string, stat *Statfs_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_statfs_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_STATFS, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_statfs_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_statfs statfs "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Symlink(path string, link string) (err error) { @@ -1992,17 +1512,13 @@ func Symlink(path string, link string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_symlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_SYMLINK, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_symlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_symlink symlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Symlinkat(oldpath string, newdirfd int, newpath string) (err error) { @@ -2016,31 +1532,23 @@ func Symlinkat(oldpath string, newdirfd int, newpath string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_symlinkat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1))) + _, _, e1 := Syscall(SYS_SYMLINKAT, uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_symlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_symlinkat symlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Sync() (err error) { - _, _, e1 := syscall_syscall(libc_sync_trampoline_addr, 0, 0, 0) + _, _, e1 := Syscall(SYS_SYNC, 0, 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sync sync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Truncate(path string, length int64) (err error) { @@ -2049,29 +1557,21 @@ func Truncate(path string, length int64) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_truncate_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(length), 0) + _, _, e1 := Syscall(SYS_TRUNCATE, uintptr(unsafe.Pointer(_p0)), 0, uintptr(length)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_truncate_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_truncate truncate "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Umask(newmask int) (oldmask int) { - r0, _, _ := syscall_syscall(libc_umask_trampoline_addr, uintptr(newmask), 0, 0) + r0, _, _ := Syscall(SYS_UMASK, uintptr(newmask), 0, 0) oldmask = int(r0) return } -var libc_umask_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_umask umask "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unlink(path string) (err error) { @@ -2080,17 +1580,13 @@ func Unlink(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_UNLINK, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unlink unlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unlinkat(dirfd int, path string, flags int) (err error) { @@ -2099,17 +1595,13 @@ func Unlinkat(dirfd int, path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unlinkat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) + _, _, e1 := Syscall(SYS_UNLINKAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unlinkat unlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unmount(path string, flags int) (err error) { @@ -2118,17 +1610,13 @@ func Unmount(path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unmount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_UNMOUNT, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unmount_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unmount unmount "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func write(fd int, p []byte) (n int, err error) { @@ -2138,7 +1626,7 @@ func write(fd int, p []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p))) + r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(_p0), uintptr(len(p))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2146,14 +1634,10 @@ func write(fd int, p []byte) (n int, err error) { return } -var libc_write_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_write write "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) { - r0, _, e1 := syscall_syscall6(libc_mmap_trampoline_addr, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), uintptr(pos)) + r0, _, e1 := Syscall9(SYS_MMAP, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), 0, uintptr(pos), 0, 0) ret = uintptr(r0) if e1 != 0 { err = errnoErr(e1) @@ -2161,28 +1645,20 @@ func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) ( return } -var libc_mmap_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mmap mmap "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func munmap(addr uintptr, length uintptr) (err error) { - _, _, e1 := syscall_syscall(libc_munmap_trampoline_addr, uintptr(addr), uintptr(length), 0) + _, _, e1 := Syscall(SYS_MUNMAP, uintptr(addr), uintptr(length), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munmap_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munmap munmap "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) + r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2193,7 +1669,7 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) + r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2209,13 +1685,9 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error if err != nil { return } - _, _, e1 := syscall_syscall6(libc_utimensat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_UTIMENSAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } - -var libc_utimensat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_utimensat utimensat "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go index 8da6791..69f8030 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm.go @@ -1,4 +1,4 @@ -// go run mksyscall.go -l32 -openbsd -arm -libc -tags openbsd,arm syscall_bsd.go syscall_openbsd.go syscall_openbsd_arm.go +// go run mksyscall.go -l32 -openbsd -arm -tags openbsd,arm syscall_bsd.go syscall_openbsd.go syscall_openbsd_arm.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build openbsd && arm @@ -16,7 +16,7 @@ var _ syscall.Errno // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getgroups(ngid int, gid *_Gid_t) (n int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getgroups_trampoline_addr, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) + r0, _, e1 := RawSyscall(SYS_GETGROUPS, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -24,28 +24,20 @@ func getgroups(ngid int, gid *_Gid_t) (n int, err error) { return } -var libc_getgroups_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getgroups getgroups "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func setgroups(ngid int, gid *_Gid_t) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setgroups_trampoline_addr, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) + _, _, e1 := RawSyscall(SYS_SETGROUPS, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setgroups_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setgroups setgroups "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err error) { - r0, _, e1 := syscall_syscall6(libc_wait4_trampoline_addr, uintptr(pid), uintptr(unsafe.Pointer(wstatus)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) + r0, _, e1 := Syscall6(SYS_WAIT4, uintptr(pid), uintptr(unsafe.Pointer(wstatus)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) wpid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -53,14 +45,10 @@ func wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err return } -var libc_wait4_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_wait4 wait4 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { - r0, _, e1 := syscall_syscall(libc_accept_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + r0, _, e1 := Syscall(SYS_ACCEPT, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -68,42 +56,30 @@ func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { return } -var libc_accept_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_accept accept "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { - _, _, e1 := syscall_syscall(libc_bind_trampoline_addr, uintptr(s), uintptr(addr), uintptr(addrlen)) + _, _, e1 := Syscall(SYS_BIND, uintptr(s), uintptr(addr), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_bind_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_bind bind "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func connect(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { - _, _, e1 := syscall_syscall(libc_connect_trampoline_addr, uintptr(s), uintptr(addr), uintptr(addrlen)) + _, _, e1 := Syscall(SYS_CONNECT, uintptr(s), uintptr(addr), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_connect_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_connect connect "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func socket(domain int, typ int, proto int) (fd int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_socket_trampoline_addr, uintptr(domain), uintptr(typ), uintptr(proto)) + r0, _, e1 := RawSyscall(SYS_SOCKET, uintptr(domain), uintptr(typ), uintptr(proto)) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -111,94 +87,66 @@ func socket(domain int, typ int, proto int) (fd int, err error) { return } -var libc_socket_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_socket socket "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen) (err error) { - _, _, e1 := syscall_syscall6(libc_getsockopt_trampoline_addr, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) + _, _, e1 := Syscall6(SYS_GETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getsockopt_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsockopt getsockopt "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) { - _, _, e1 := syscall_syscall6(libc_setsockopt_trampoline_addr, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) + _, _, e1 := Syscall6(SYS_SETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setsockopt_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setsockopt setsockopt "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getpeername(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getpeername_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + _, _, e1 := RawSyscall(SYS_GETPEERNAME, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getpeername_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpeername getpeername "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getsockname(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getsockname_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + _, _, e1 := RawSyscall(SYS_GETSOCKNAME, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getsockname_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsockname getsockname "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Shutdown(s int, how int) (err error) { - _, _, e1 := syscall_syscall(libc_shutdown_trampoline_addr, uintptr(s), uintptr(how), 0) + _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(s), uintptr(how), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_shutdown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_shutdown shutdown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func socketpair(domain int, typ int, proto int, fd *[2]int32) (err error) { - _, _, e1 := syscall_rawSyscall6(libc_socketpair_trampoline_addr, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) + _, _, e1 := RawSyscall6(SYS_SOCKETPAIR, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_socketpair_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_socketpair socketpair "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error) { @@ -208,7 +156,7 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_recvfrom_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) + r0, _, e1 := Syscall6(SYS_RECVFROM, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -216,10 +164,6 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl return } -var libc_recvfrom_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_recvfrom recvfrom "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) (err error) { @@ -229,21 +173,17 @@ func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) ( } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall6(libc_sendto_trampoline_addr, uintptr(s), uintptr(_p0), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen)) + _, _, e1 := Syscall6(SYS_SENDTO, uintptr(s), uintptr(_p0), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sendto_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sendto sendto "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_recvmsg_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + r0, _, e1 := Syscall(SYS_RECVMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -251,14 +191,10 @@ func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { return } -var libc_recvmsg_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_recvmsg recvmsg "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_sendmsg_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + r0, _, e1 := Syscall(SYS_SENDMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -266,14 +202,10 @@ func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { return } -var libc_sendmsg_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sendmsg sendmsg "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func kevent(kq int, change unsafe.Pointer, nchange int, event unsafe.Pointer, nevent int, timeout *Timespec) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_kevent_trampoline_addr, uintptr(kq), uintptr(change), uintptr(nchange), uintptr(event), uintptr(nevent), uintptr(unsafe.Pointer(timeout))) + r0, _, e1 := Syscall6(SYS_KEVENT, uintptr(kq), uintptr(change), uintptr(nchange), uintptr(event), uintptr(nevent), uintptr(unsafe.Pointer(timeout))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -281,10 +213,6 @@ func kevent(kq int, change unsafe.Pointer, nchange int, event unsafe.Pointer, ne return } -var libc_kevent_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kevent kevent "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func utimes(path string, timeval *[2]Timeval) (err error) { @@ -293,35 +221,27 @@ func utimes(path string, timeval *[2]Timeval) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_utimes_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(timeval)), 0) + _, _, e1 := Syscall(SYS_UTIMES, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(timeval)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_utimes_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_utimes utimes "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func futimes(fd int, timeval *[2]Timeval) (err error) { - _, _, e1 := syscall_syscall(libc_futimes_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(timeval)), 0) + _, _, e1 := Syscall(SYS_FUTIMES, uintptr(fd), uintptr(unsafe.Pointer(timeval)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_futimes_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_futimes futimes "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_poll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) + r0, _, e1 := Syscall(SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -329,10 +249,6 @@ func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { return } -var libc_poll_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_poll poll "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Madvise(b []byte, behav int) (err error) { @@ -342,17 +258,13 @@ func Madvise(b []byte, behav int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_madvise_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(behav)) + _, _, e1 := Syscall(SYS_MADVISE, uintptr(_p0), uintptr(len(b)), uintptr(behav)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_madvise_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_madvise madvise "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mlock(b []byte) (err error) { @@ -362,31 +274,23 @@ func Mlock(b []byte) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_mlock_trampoline_addr, uintptr(_p0), uintptr(len(b)), 0) + _, _, e1 := Syscall(SYS_MLOCK, uintptr(_p0), uintptr(len(b)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mlock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mlock mlock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mlockall(flags int) (err error) { - _, _, e1 := syscall_syscall(libc_mlockall_trampoline_addr, uintptr(flags), 0, 0) + _, _, e1 := Syscall(SYS_MLOCKALL, uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mlockall_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mlockall mlockall "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mprotect(b []byte, prot int) (err error) { @@ -396,17 +300,13 @@ func Mprotect(b []byte, prot int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_mprotect_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(prot)) + _, _, e1 := Syscall(SYS_MPROTECT, uintptr(_p0), uintptr(len(b)), uintptr(prot)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mprotect_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mprotect mprotect "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Msync(b []byte, flags int) (err error) { @@ -416,17 +316,13 @@ func Msync(b []byte, flags int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_msync_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(flags)) + _, _, e1 := Syscall(SYS_MSYNC, uintptr(_p0), uintptr(len(b)), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_msync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_msync msync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Munlock(b []byte) (err error) { @@ -436,45 +332,33 @@ func Munlock(b []byte) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_munlock_trampoline_addr, uintptr(_p0), uintptr(len(b)), 0) + _, _, e1 := Syscall(SYS_MUNLOCK, uintptr(_p0), uintptr(len(b)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munlock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munlock munlock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Munlockall() (err error) { - _, _, e1 := syscall_syscall(libc_munlockall_trampoline_addr, 0, 0, 0) + _, _, e1 := Syscall(SYS_MUNLOCKALL, 0, 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munlockall_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munlockall munlockall "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pipe2(p *[2]_C_int, flags int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_pipe2_trampoline_addr, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) + _, _, e1 := RawSyscall(SYS_PIPE2, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_pipe2_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pipe2 pipe2 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getdents(fd int, buf []byte) (n int, err error) { @@ -484,7 +368,7 @@ func Getdents(fd int, buf []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_getdents_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(buf))) + r0, _, e1 := Syscall(SYS_GETDENTS, uintptr(fd), uintptr(_p0), uintptr(len(buf))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -492,10 +376,6 @@ func Getdents(fd int, buf []byte) (n int, err error) { return } -var libc_getdents_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getdents getdents "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getcwd(buf []byte) (n int, err error) { @@ -505,7 +385,7 @@ func Getcwd(buf []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_getcwd_trampoline_addr, uintptr(_p0), uintptr(len(buf)), 0) + r0, _, e1 := Syscall(SYS___GETCWD, uintptr(_p0), uintptr(len(buf)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -513,24 +393,16 @@ func Getcwd(buf []byte) (n int, err error) { return } -var libc_getcwd_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getcwd getcwd "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func ioctl(fd int, req uint, arg uintptr) (err error) { - _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) + _, _, e1 := Syscall(SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -540,21 +412,17 @@ func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall6(libc_sysctl_trampoline_addr, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) + _, _, e1 := Syscall6(SYS___SYSCTL, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sysctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sysctl sysctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) + r0, _, e1 := Syscall6(SYS_PPOLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -562,10 +430,6 @@ func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, return } -var libc_ppoll_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ppoll ppoll "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Access(path string, mode uint32) (err error) { @@ -574,31 +438,23 @@ func Access(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_access_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_ACCESS, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_access_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_access access "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Adjtime(delta *Timeval, olddelta *Timeval) (err error) { - _, _, e1 := syscall_syscall(libc_adjtime_trampoline_addr, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0) + _, _, e1 := Syscall(SYS_ADJTIME, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_adjtime_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_adjtime adjtime "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chdir(path string) (err error) { @@ -607,17 +463,13 @@ func Chdir(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_CHDIR, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chdir chdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chflags(path string, flags int) (err error) { @@ -626,17 +478,13 @@ func Chflags(path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chflags_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_CHFLAGS, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chflags_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chflags chflags "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chmod(path string, mode uint32) (err error) { @@ -645,17 +493,13 @@ func Chmod(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chmod_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_CHMOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chmod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chmod chmod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chown(path string, uid int, gid int) (err error) { @@ -664,17 +508,13 @@ func Chown(path string, uid int, gid int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chown_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_CHOWN, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chown chown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chroot(path string) (err error) { @@ -683,35 +523,27 @@ func Chroot(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chroot_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_CHROOT, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chroot_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chroot chroot "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Close(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_close_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_CLOSE, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_close_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_close close "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup(fd int) (nfd int, err error) { - r0, _, e1 := syscall_syscall(libc_dup_trampoline_addr, uintptr(fd), 0, 0) + r0, _, e1 := Syscall(SYS_DUP, uintptr(fd), 0, 0) nfd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -719,49 +551,33 @@ func Dup(fd int) (nfd int, err error) { return } -var libc_dup_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup dup "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup2(from int, to int) (err error) { - _, _, e1 := syscall_syscall(libc_dup2_trampoline_addr, uintptr(from), uintptr(to), 0) + _, _, e1 := Syscall(SYS_DUP2, uintptr(from), uintptr(to), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_dup2_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup2 dup2 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup3(from int, to int, flags int) (err error) { - _, _, e1 := syscall_syscall(libc_dup3_trampoline_addr, uintptr(from), uintptr(to), uintptr(flags)) + _, _, e1 := Syscall(SYS_DUP3, uintptr(from), uintptr(to), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_dup3_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup3 dup3 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Exit(code int) { - syscall_syscall(libc_exit_trampoline_addr, uintptr(code), 0, 0) + Syscall(SYS_EXIT, uintptr(code), 0, 0) return } -var libc_exit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_exit exit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { @@ -770,59 +586,43 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_faccessat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FACCESSAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_faccessat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_faccessat faccessat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchdir(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_fchdir_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_FCHDIR, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchdir fchdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchflags(fd int, flags int) (err error) { - _, _, e1 := syscall_syscall(libc_fchflags_trampoline_addr, uintptr(fd), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_FCHFLAGS, uintptr(fd), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchflags_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchflags fchflags "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchmod(fd int, mode uint32) (err error) { - _, _, e1 := syscall_syscall(libc_fchmod_trampoline_addr, uintptr(fd), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_FCHMOD, uintptr(fd), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchmod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchmod fchmod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { @@ -831,31 +631,23 @@ func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fchmodat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FCHMODAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchmodat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchmodat fchmodat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchown(fd int, uid int, gid int) (err error) { - _, _, e1 := syscall_syscall(libc_fchown_trampoline_addr, uintptr(fd), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_FCHOWN, uintptr(fd), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchown fchown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { @@ -864,35 +656,27 @@ func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fchownat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) + _, _, e1 := Syscall6(SYS_FCHOWNAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchownat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchownat fchownat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Flock(fd int, how int) (err error) { - _, _, e1 := syscall_syscall(libc_flock_trampoline_addr, uintptr(fd), uintptr(how), 0) + _, _, e1 := Syscall(SYS_FLOCK, uintptr(fd), uintptr(how), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_flock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_flock flock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fpathconf(fd int, name int) (val int, err error) { - r0, _, e1 := syscall_syscall(libc_fpathconf_trampoline_addr, uintptr(fd), uintptr(name), 0) + r0, _, e1 := Syscall(SYS_FPATHCONF, uintptr(fd), uintptr(name), 0) val = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -900,24 +684,16 @@ func Fpathconf(fd int, name int) (val int, err error) { return } -var libc_fpathconf_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fpathconf fpathconf "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstat(fd int, stat *Stat_t) (err error) { - _, _, e1 := syscall_syscall(libc_fstat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_FSTAT, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstat fstat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { @@ -926,99 +702,71 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fstatat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FSTATAT, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstatat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstatat fstatat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstatfs(fd int, stat *Statfs_t) (err error) { - _, _, e1 := syscall_syscall(libc_fstatfs_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_FSTATFS, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstatfs_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstatfs fstatfs "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fsync(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_fsync_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_FSYNC, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fsync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fsync fsync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Ftruncate(fd int, length int64) (err error) { - _, _, e1 := syscall_syscall6(libc_ftruncate_trampoline_addr, uintptr(fd), 0, uintptr(length), uintptr(length>>32), 0, 0) + _, _, e1 := Syscall6(SYS_FTRUNCATE, uintptr(fd), 0, uintptr(length), uintptr(length>>32), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_ftruncate_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ftruncate ftruncate "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getegid() (egid int) { - r0, _, _ := syscall_rawSyscall(libc_getegid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETEGID, 0, 0, 0) egid = int(r0) return } -var libc_getegid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getegid getegid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Geteuid() (uid int) { - r0, _, _ := syscall_rawSyscall(libc_geteuid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETEUID, 0, 0, 0) uid = int(r0) return } -var libc_geteuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_geteuid geteuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getgid() (gid int) { - r0, _, _ := syscall_rawSyscall(libc_getgid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETGID, 0, 0, 0) gid = int(r0) return } -var libc_getgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getgid getgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpgid(pid int) (pgid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getpgid_trampoline_addr, uintptr(pid), 0, 0) + r0, _, e1 := RawSyscall(SYS_GETPGID, uintptr(pid), 0, 0) pgid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1026,50 +774,34 @@ func Getpgid(pid int) (pgid int, err error) { return } -var libc_getpgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpgid getpgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpgrp() (pgrp int) { - r0, _, _ := syscall_rawSyscall(libc_getpgrp_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPGRP, 0, 0, 0) pgrp = int(r0) return } -var libc_getpgrp_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpgrp getpgrp "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpid() (pid int) { - r0, _, _ := syscall_rawSyscall(libc_getpid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) pid = int(r0) return } -var libc_getpid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpid getpid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getppid() (ppid int) { - r0, _, _ := syscall_rawSyscall(libc_getppid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPPID, 0, 0, 0) ppid = int(r0) return } -var libc_getppid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getppid getppid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpriority(which int, who int) (prio int, err error) { - r0, _, e1 := syscall_syscall(libc_getpriority_trampoline_addr, uintptr(which), uintptr(who), 0) + r0, _, e1 := Syscall(SYS_GETPRIORITY, uintptr(which), uintptr(who), 0) prio = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1077,28 +809,20 @@ func Getpriority(which int, who int) (prio int, err error) { return } -var libc_getpriority_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpriority getpriority "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrlimit(which int, lim *Rlimit) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getrlimit_trampoline_addr, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) + _, _, e1 := RawSyscall(SYS_GETRLIMIT, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getrlimit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrlimit getrlimit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrtable() (rtable int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getrtable_trampoline_addr, 0, 0, 0) + r0, _, e1 := RawSyscall(SYS_GETRTABLE, 0, 0, 0) rtable = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1106,28 +830,20 @@ func Getrtable() (rtable int, err error) { return } -var libc_getrtable_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrtable getrtable "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrusage(who int, rusage *Rusage) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getrusage_trampoline_addr, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) + _, _, e1 := RawSyscall(SYS_GETRUSAGE, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getrusage_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrusage getrusage "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getsid(pid int) (sid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getsid_trampoline_addr, uintptr(pid), 0, 0) + r0, _, e1 := RawSyscall(SYS_GETSID, uintptr(pid), 0, 0) sid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1135,66 +851,46 @@ func Getsid(pid int) (sid int, err error) { return } -var libc_getsid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsid getsid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Gettimeofday(tv *Timeval) (err error) { - _, _, e1 := syscall_rawSyscall(libc_gettimeofday_trampoline_addr, uintptr(unsafe.Pointer(tv)), 0, 0) + _, _, e1 := RawSyscall(SYS_GETTIMEOFDAY, uintptr(unsafe.Pointer(tv)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_gettimeofday_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_gettimeofday gettimeofday "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getuid() (uid int) { - r0, _, _ := syscall_rawSyscall(libc_getuid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETUID, 0, 0, 0) uid = int(r0) return } -var libc_getuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getuid getuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Issetugid() (tainted bool) { - r0, _, _ := syscall_syscall(libc_issetugid_trampoline_addr, 0, 0, 0) + r0, _, _ := Syscall(SYS_ISSETUGID, 0, 0, 0) tainted = bool(r0 != 0) return } -var libc_issetugid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_issetugid issetugid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Kill(pid int, signum syscall.Signal) (err error) { - _, _, e1 := syscall_syscall(libc_kill_trampoline_addr, uintptr(pid), uintptr(signum), 0) + _, _, e1 := Syscall(SYS_KILL, uintptr(pid), uintptr(signum), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_kill_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kill kill "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Kqueue() (fd int, err error) { - r0, _, e1 := syscall_syscall(libc_kqueue_trampoline_addr, 0, 0, 0) + r0, _, e1 := Syscall(SYS_KQUEUE, 0, 0, 0) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1202,10 +898,6 @@ func Kqueue() (fd int, err error) { return } -var libc_kqueue_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kqueue kqueue "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Lchown(path string, uid int, gid int) (err error) { @@ -1214,17 +906,13 @@ func Lchown(path string, uid int, gid int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_lchown_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_LCHOWN, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_lchown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lchown lchown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Link(path string, link string) (err error) { @@ -1238,17 +926,13 @@ func Link(path string, link string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_link_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_LINK, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_link_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_link link "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Linkat(pathfd int, path string, linkfd int, link string, flags int) (err error) { @@ -1262,31 +946,23 @@ func Linkat(pathfd int, path string, linkfd int, link string, flags int) (err er if err != nil { return } - _, _, e1 := syscall_syscall6(libc_linkat_trampoline_addr, uintptr(pathfd), uintptr(unsafe.Pointer(_p0)), uintptr(linkfd), uintptr(unsafe.Pointer(_p1)), uintptr(flags), 0) + _, _, e1 := Syscall6(SYS_LINKAT, uintptr(pathfd), uintptr(unsafe.Pointer(_p0)), uintptr(linkfd), uintptr(unsafe.Pointer(_p1)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_linkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_linkat linkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Listen(s int, backlog int) (err error) { - _, _, e1 := syscall_syscall(libc_listen_trampoline_addr, uintptr(s), uintptr(backlog), 0) + _, _, e1 := Syscall(SYS_LISTEN, uintptr(s), uintptr(backlog), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_listen_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_listen listen "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Lstat(path string, stat *Stat_t) (err error) { @@ -1295,17 +971,13 @@ func Lstat(path string, stat *Stat_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_lstat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_LSTAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_lstat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lstat lstat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkdir(path string, mode uint32) (err error) { @@ -1314,17 +986,13 @@ func Mkdir(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_MKDIR, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkdir mkdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkdirat(dirfd int, path string, mode uint32) (err error) { @@ -1333,17 +1001,13 @@ func Mkdirat(dirfd int, path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkdirat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) + _, _, e1 := Syscall(SYS_MKDIRAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkdirat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkdirat mkdirat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkfifo(path string, mode uint32) (err error) { @@ -1352,17 +1016,13 @@ func Mkfifo(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkfifo_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_MKFIFO, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkfifo_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkfifo mkfifo "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkfifoat(dirfd int, path string, mode uint32) (err error) { @@ -1371,17 +1031,13 @@ func Mkfifoat(dirfd int, path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkfifoat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) + _, _, e1 := Syscall(SYS_MKFIFOAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkfifoat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkfifoat mkfifoat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mknod(path string, mode uint32, dev int) (err error) { @@ -1390,17 +1046,13 @@ func Mknod(path string, mode uint32, dev int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mknod_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) + _, _, e1 := Syscall(SYS_MKNOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mknod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mknod mknod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { @@ -1409,31 +1061,23 @@ func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_mknodat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) + _, _, e1 := Syscall6(SYS_MKNODAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mknodat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mknodat mknodat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Nanosleep(time *Timespec, leftover *Timespec) (err error) { - _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) + _, _, e1 := Syscall(SYS_NANOSLEEP, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_nanosleep_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_nanosleep nanosleep "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Open(path string, mode int, perm uint32) (fd int, err error) { @@ -1442,7 +1086,7 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall(libc_open_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm)) + r0, _, e1 := Syscall(SYS_OPEN, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm)) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1450,10 +1094,6 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { return } -var libc_open_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_open open "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { @@ -1462,7 +1102,7 @@ func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall6(libc_openat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0) + r0, _, e1 := Syscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1470,10 +1110,6 @@ func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { return } -var libc_openat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_openat openat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Pathconf(path string, name int) (val int, err error) { @@ -1482,7 +1118,7 @@ func Pathconf(path string, name int) (val int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall(libc_pathconf_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0) + r0, _, e1 := Syscall(SYS_PATHCONF, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0) val = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1490,10 +1126,6 @@ func Pathconf(path string, name int) (val int, err error) { return } -var libc_pathconf_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pathconf pathconf "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pread(fd int, p []byte, offset int64) (n int, err error) { @@ -1503,7 +1135,7 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_pread_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), uintptr(offset>>32)) + r0, _, e1 := Syscall6(SYS_PREAD, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), uintptr(offset>>32)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1511,10 +1143,6 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { return } -var libc_pread_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pread pread "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pwrite(fd int, p []byte, offset int64) (n int, err error) { @@ -1524,7 +1152,7 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_pwrite_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), uintptr(offset>>32)) + r0, _, e1 := Syscall6(SYS_PWRITE, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), uintptr(offset>>32)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1532,10 +1160,6 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { return } -var libc_pwrite_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pwrite pwrite "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func read(fd int, p []byte) (n int, err error) { @@ -1545,7 +1169,7 @@ func read(fd int, p []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p))) + r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(_p0), uintptr(len(p))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1553,10 +1177,6 @@ func read(fd int, p []byte) (n int, err error) { return } -var libc_read_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_read read "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Readlink(path string, buf []byte) (n int, err error) { @@ -1571,7 +1191,7 @@ func Readlink(path string, buf []byte) (n int, err error) { } else { _p1 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_readlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf))) + r0, _, e1 := Syscall(SYS_READLINK, uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1579,10 +1199,6 @@ func Readlink(path string, buf []byte) (n int, err error) { return } -var libc_readlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_readlink readlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { @@ -1597,7 +1213,7 @@ func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { } else { _p1 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_readlinkat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf)), 0, 0) + r0, _, e1 := Syscall6(SYS_READLINKAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf)), 0, 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1605,10 +1221,6 @@ func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { return } -var libc_readlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_readlinkat readlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Rename(from string, to string) (err error) { @@ -1622,17 +1234,13 @@ func Rename(from string, to string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_rename_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_RENAME, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_rename_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_rename rename "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Renameat(fromfd int, from string, tofd int, to string) (err error) { @@ -1646,17 +1254,13 @@ func Renameat(fromfd int, from string, tofd int, to string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_renameat_trampoline_addr, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), 0, 0) + _, _, e1 := Syscall6(SYS_RENAMEAT, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_renameat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_renameat renameat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Revoke(path string) (err error) { @@ -1665,17 +1269,13 @@ func Revoke(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_revoke_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_REVOKE, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_revoke_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_revoke revoke "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Rmdir(path string) (err error) { @@ -1684,21 +1284,17 @@ func Rmdir(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_rmdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_RMDIR, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_rmdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_rmdir rmdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { - r0, r1, e1 := syscall_syscall6(libc_lseek_trampoline_addr, uintptr(fd), 0, uintptr(offset), uintptr(offset>>32), uintptr(whence), 0) + r0, r1, e1 := Syscall6(SYS_LSEEK, uintptr(fd), 0, uintptr(offset), uintptr(offset>>32), uintptr(whence), 0) newoffset = int64(int64(r1)<<32 | int64(r0)) if e1 != 0 { err = errnoErr(e1) @@ -1706,14 +1302,10 @@ func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { return } -var libc_lseek_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lseek lseek "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_select_trampoline_addr, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) + r0, _, e1 := Syscall6(SYS_SELECT, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1721,52 +1313,36 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err return } -var libc_select_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_select select "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setegid(egid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setegid_trampoline_addr, uintptr(egid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETEGID, uintptr(egid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setegid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setegid setegid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Seteuid(euid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_seteuid_trampoline_addr, uintptr(euid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETEUID, uintptr(euid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_seteuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_seteuid seteuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setgid(gid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setgid_trampoline_addr, uintptr(gid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETGID, uintptr(gid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setgid setgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setlogin(name string) (err error) { @@ -1775,133 +1351,97 @@ func Setlogin(name string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_setlogin_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_SETLOGIN, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setlogin_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setlogin setlogin "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setpgid(pid int, pgid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setpgid_trampoline_addr, uintptr(pid), uintptr(pgid), 0) + _, _, e1 := RawSyscall(SYS_SETPGID, uintptr(pid), uintptr(pgid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setpgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setpgid setpgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setpriority(which int, who int, prio int) (err error) { - _, _, e1 := syscall_syscall(libc_setpriority_trampoline_addr, uintptr(which), uintptr(who), uintptr(prio)) + _, _, e1 := Syscall(SYS_SETPRIORITY, uintptr(which), uintptr(who), uintptr(prio)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setpriority_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setpriority setpriority "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setregid(rgid int, egid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setregid_trampoline_addr, uintptr(rgid), uintptr(egid), 0) + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setregid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setregid setregid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setreuid(ruid int, euid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setreuid_trampoline_addr, uintptr(ruid), uintptr(euid), 0) + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setreuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setreuid setreuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setresgid(rgid int, egid int, sgid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setresgid_trampoline_addr, uintptr(rgid), uintptr(egid), uintptr(sgid)) + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setresgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setresgid setresgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setresuid(ruid int, euid int, suid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setresuid_trampoline_addr, uintptr(ruid), uintptr(euid), uintptr(suid)) + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setresuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setresuid setresuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setrlimit(which int, lim *Rlimit) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setrlimit_trampoline_addr, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) + _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setrlimit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setrlimit setrlimit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setrtable(rtable int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setrtable_trampoline_addr, uintptr(rtable), 0, 0) + _, _, e1 := RawSyscall(SYS_SETRTABLE, uintptr(rtable), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setrtable_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setrtable setrtable "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setsid() (pid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_setsid_trampoline_addr, 0, 0, 0) + r0, _, e1 := RawSyscall(SYS_SETSID, 0, 0, 0) pid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1909,38 +1449,26 @@ func Setsid() (pid int, err error) { return } -var libc_setsid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setsid setsid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Settimeofday(tp *Timeval) (err error) { - _, _, e1 := syscall_rawSyscall(libc_settimeofday_trampoline_addr, uintptr(unsafe.Pointer(tp)), 0, 0) + _, _, e1 := RawSyscall(SYS_SETTIMEOFDAY, uintptr(unsafe.Pointer(tp)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_settimeofday_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_settimeofday settimeofday "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setuid(uid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setuid_trampoline_addr, uintptr(uid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETUID, uintptr(uid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setuid setuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Stat(path string, stat *Stat_t) (err error) { @@ -1949,17 +1477,13 @@ func Stat(path string, stat *Stat_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_stat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_STAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_stat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_stat stat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Statfs(path string, stat *Statfs_t) (err error) { @@ -1968,17 +1492,13 @@ func Statfs(path string, stat *Statfs_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_statfs_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_STATFS, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_statfs_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_statfs statfs "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Symlink(path string, link string) (err error) { @@ -1992,17 +1512,13 @@ func Symlink(path string, link string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_symlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_SYMLINK, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_symlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_symlink symlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Symlinkat(oldpath string, newdirfd int, newpath string) (err error) { @@ -2016,31 +1532,23 @@ func Symlinkat(oldpath string, newdirfd int, newpath string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_symlinkat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1))) + _, _, e1 := Syscall(SYS_SYMLINKAT, uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_symlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_symlinkat symlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Sync() (err error) { - _, _, e1 := syscall_syscall(libc_sync_trampoline_addr, 0, 0, 0) + _, _, e1 := Syscall(SYS_SYNC, 0, 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sync sync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Truncate(path string, length int64) (err error) { @@ -2049,29 +1557,21 @@ func Truncate(path string, length int64) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_truncate_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, uintptr(length), uintptr(length>>32), 0, 0) + _, _, e1 := Syscall6(SYS_TRUNCATE, uintptr(unsafe.Pointer(_p0)), 0, uintptr(length), uintptr(length>>32), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_truncate_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_truncate truncate "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Umask(newmask int) (oldmask int) { - r0, _, _ := syscall_syscall(libc_umask_trampoline_addr, uintptr(newmask), 0, 0) + r0, _, _ := Syscall(SYS_UMASK, uintptr(newmask), 0, 0) oldmask = int(r0) return } -var libc_umask_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_umask umask "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unlink(path string) (err error) { @@ -2080,17 +1580,13 @@ func Unlink(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_UNLINK, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unlink unlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unlinkat(dirfd int, path string, flags int) (err error) { @@ -2099,17 +1595,13 @@ func Unlinkat(dirfd int, path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unlinkat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) + _, _, e1 := Syscall(SYS_UNLINKAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unlinkat unlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unmount(path string, flags int) (err error) { @@ -2118,17 +1610,13 @@ func Unmount(path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unmount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_UNMOUNT, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unmount_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unmount unmount "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func write(fd int, p []byte) (n int, err error) { @@ -2138,7 +1626,7 @@ func write(fd int, p []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p))) + r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(_p0), uintptr(len(p))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2146,14 +1634,10 @@ func write(fd int, p []byte) (n int, err error) { return } -var libc_write_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_write write "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) { - r0, _, e1 := syscall_syscall9(libc_mmap_trampoline_addr, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), 0, uintptr(pos), uintptr(pos>>32), 0) + r0, _, e1 := Syscall9(SYS_MMAP, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), 0, uintptr(pos), uintptr(pos>>32), 0) ret = uintptr(r0) if e1 != 0 { err = errnoErr(e1) @@ -2161,28 +1645,20 @@ func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) ( return } -var libc_mmap_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mmap mmap "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func munmap(addr uintptr, length uintptr) (err error) { - _, _, e1 := syscall_syscall(libc_munmap_trampoline_addr, uintptr(addr), uintptr(length), 0) + _, _, e1 := Syscall(SYS_MUNMAP, uintptr(addr), uintptr(length), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munmap_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munmap munmap "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) + r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2193,7 +1669,7 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) + r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2209,13 +1685,9 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error if err != nil { return } - _, _, e1 := syscall_syscall6(libc_utimensat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_UTIMENSAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } - -var libc_utimensat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_utimensat utimensat "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go index 800aab6..c96a505 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_openbsd_arm64.go @@ -1,4 +1,4 @@ -// go run mksyscall.go -openbsd -libc -tags openbsd,arm64 syscall_bsd.go syscall_openbsd.go syscall_openbsd_arm64.go +// go run mksyscall.go -openbsd -tags openbsd,arm64 syscall_bsd.go syscall_openbsd.go syscall_openbsd_arm64.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build openbsd && arm64 @@ -16,7 +16,7 @@ var _ syscall.Errno // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getgroups(ngid int, gid *_Gid_t) (n int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getgroups_trampoline_addr, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) + r0, _, e1 := RawSyscall(SYS_GETGROUPS, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -24,28 +24,20 @@ func getgroups(ngid int, gid *_Gid_t) (n int, err error) { return } -var libc_getgroups_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getgroups getgroups "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func setgroups(ngid int, gid *_Gid_t) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setgroups_trampoline_addr, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) + _, _, e1 := RawSyscall(SYS_SETGROUPS, uintptr(ngid), uintptr(unsafe.Pointer(gid)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setgroups_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setgroups setgroups "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err error) { - r0, _, e1 := syscall_syscall6(libc_wait4_trampoline_addr, uintptr(pid), uintptr(unsafe.Pointer(wstatus)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) + r0, _, e1 := Syscall6(SYS_WAIT4, uintptr(pid), uintptr(unsafe.Pointer(wstatus)), uintptr(options), uintptr(unsafe.Pointer(rusage)), 0, 0) wpid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -53,14 +45,10 @@ func wait4(pid int, wstatus *_C_int, options int, rusage *Rusage) (wpid int, err return } -var libc_wait4_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_wait4 wait4 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { - r0, _, e1 := syscall_syscall(libc_accept_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + r0, _, e1 := Syscall(SYS_ACCEPT, uintptr(s), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -68,42 +56,30 @@ func accept(s int, rsa *RawSockaddrAny, addrlen *_Socklen) (fd int, err error) { return } -var libc_accept_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_accept accept "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func bind(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { - _, _, e1 := syscall_syscall(libc_bind_trampoline_addr, uintptr(s), uintptr(addr), uintptr(addrlen)) + _, _, e1 := Syscall(SYS_BIND, uintptr(s), uintptr(addr), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_bind_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_bind bind "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func connect(s int, addr unsafe.Pointer, addrlen _Socklen) (err error) { - _, _, e1 := syscall_syscall(libc_connect_trampoline_addr, uintptr(s), uintptr(addr), uintptr(addrlen)) + _, _, e1 := Syscall(SYS_CONNECT, uintptr(s), uintptr(addr), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_connect_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_connect connect "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func socket(domain int, typ int, proto int) (fd int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_socket_trampoline_addr, uintptr(domain), uintptr(typ), uintptr(proto)) + r0, _, e1 := RawSyscall(SYS_SOCKET, uintptr(domain), uintptr(typ), uintptr(proto)) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -111,94 +87,66 @@ func socket(domain int, typ int, proto int) (fd int, err error) { return } -var libc_socket_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_socket socket "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getsockopt(s int, level int, name int, val unsafe.Pointer, vallen *_Socklen) (err error) { - _, _, e1 := syscall_syscall6(libc_getsockopt_trampoline_addr, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) + _, _, e1 := Syscall6(SYS_GETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(unsafe.Pointer(vallen)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getsockopt_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsockopt getsockopt "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func setsockopt(s int, level int, name int, val unsafe.Pointer, vallen uintptr) (err error) { - _, _, e1 := syscall_syscall6(libc_setsockopt_trampoline_addr, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) + _, _, e1 := Syscall6(SYS_SETSOCKOPT, uintptr(s), uintptr(level), uintptr(name), uintptr(val), uintptr(vallen), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setsockopt_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setsockopt setsockopt "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getpeername(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getpeername_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + _, _, e1 := RawSyscall(SYS_GETPEERNAME, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getpeername_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpeername getpeername "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func getsockname(fd int, rsa *RawSockaddrAny, addrlen *_Socklen) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getsockname_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) + _, _, e1 := RawSyscall(SYS_GETSOCKNAME, uintptr(fd), uintptr(unsafe.Pointer(rsa)), uintptr(unsafe.Pointer(addrlen))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getsockname_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsockname getsockname "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Shutdown(s int, how int) (err error) { - _, _, e1 := syscall_syscall(libc_shutdown_trampoline_addr, uintptr(s), uintptr(how), 0) + _, _, e1 := Syscall(SYS_SHUTDOWN, uintptr(s), uintptr(how), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_shutdown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_shutdown shutdown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func socketpair(domain int, typ int, proto int, fd *[2]int32) (err error) { - _, _, e1 := syscall_rawSyscall6(libc_socketpair_trampoline_addr, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) + _, _, e1 := RawSyscall6(SYS_SOCKETPAIR, uintptr(domain), uintptr(typ), uintptr(proto), uintptr(unsafe.Pointer(fd)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_socketpair_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_socketpair socketpair "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Socklen) (n int, err error) { @@ -208,7 +156,7 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_recvfrom_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) + r0, _, e1 := Syscall6(SYS_RECVFROM, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(flags), uintptr(unsafe.Pointer(from)), uintptr(unsafe.Pointer(fromlen))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -216,10 +164,6 @@ func recvfrom(fd int, p []byte, flags int, from *RawSockaddrAny, fromlen *_Sockl return } -var libc_recvfrom_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_recvfrom recvfrom "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) (err error) { @@ -229,21 +173,17 @@ func sendto(s int, buf []byte, flags int, to unsafe.Pointer, addrlen _Socklen) ( } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall6(libc_sendto_trampoline_addr, uintptr(s), uintptr(_p0), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen)) + _, _, e1 := Syscall6(SYS_SENDTO, uintptr(s), uintptr(_p0), uintptr(len(buf)), uintptr(flags), uintptr(to), uintptr(addrlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sendto_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sendto sendto "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_recvmsg_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + r0, _, e1 := Syscall(SYS_RECVMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -251,14 +191,10 @@ func recvmsg(s int, msg *Msghdr, flags int) (n int, err error) { return } -var libc_recvmsg_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_recvmsg recvmsg "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_sendmsg_trampoline_addr, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) + r0, _, e1 := Syscall(SYS_SENDMSG, uintptr(s), uintptr(unsafe.Pointer(msg)), uintptr(flags)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -266,14 +202,10 @@ func sendmsg(s int, msg *Msghdr, flags int) (n int, err error) { return } -var libc_sendmsg_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sendmsg sendmsg "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func kevent(kq int, change unsafe.Pointer, nchange int, event unsafe.Pointer, nevent int, timeout *Timespec) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_kevent_trampoline_addr, uintptr(kq), uintptr(change), uintptr(nchange), uintptr(event), uintptr(nevent), uintptr(unsafe.Pointer(timeout))) + r0, _, e1 := Syscall6(SYS_KEVENT, uintptr(kq), uintptr(change), uintptr(nchange), uintptr(event), uintptr(nevent), uintptr(unsafe.Pointer(timeout))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -281,10 +213,6 @@ func kevent(kq int, change unsafe.Pointer, nchange int, event unsafe.Pointer, ne return } -var libc_kevent_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kevent kevent "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func utimes(path string, timeval *[2]Timeval) (err error) { @@ -293,35 +221,27 @@ func utimes(path string, timeval *[2]Timeval) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_utimes_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(timeval)), 0) + _, _, e1 := Syscall(SYS_UTIMES, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(timeval)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_utimes_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_utimes utimes "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func futimes(fd int, timeval *[2]Timeval) (err error) { - _, _, e1 := syscall_syscall(libc_futimes_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(timeval)), 0) + _, _, e1 := Syscall(SYS_FUTIMES, uintptr(fd), uintptr(unsafe.Pointer(timeval)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_futimes_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_futimes futimes "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_poll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) + r0, _, e1 := Syscall(SYS_POLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(timeout)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -329,10 +249,6 @@ func poll(fds *PollFd, nfds int, timeout int) (n int, err error) { return } -var libc_poll_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_poll poll "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Madvise(b []byte, behav int) (err error) { @@ -342,17 +258,13 @@ func Madvise(b []byte, behav int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_madvise_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(behav)) + _, _, e1 := Syscall(SYS_MADVISE, uintptr(_p0), uintptr(len(b)), uintptr(behav)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_madvise_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_madvise madvise "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mlock(b []byte) (err error) { @@ -362,31 +274,23 @@ func Mlock(b []byte) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_mlock_trampoline_addr, uintptr(_p0), uintptr(len(b)), 0) + _, _, e1 := Syscall(SYS_MLOCK, uintptr(_p0), uintptr(len(b)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mlock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mlock mlock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mlockall(flags int) (err error) { - _, _, e1 := syscall_syscall(libc_mlockall_trampoline_addr, uintptr(flags), 0, 0) + _, _, e1 := Syscall(SYS_MLOCKALL, uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mlockall_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mlockall mlockall "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mprotect(b []byte, prot int) (err error) { @@ -396,17 +300,13 @@ func Mprotect(b []byte, prot int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_mprotect_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(prot)) + _, _, e1 := Syscall(SYS_MPROTECT, uintptr(_p0), uintptr(len(b)), uintptr(prot)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mprotect_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mprotect mprotect "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Msync(b []byte, flags int) (err error) { @@ -416,17 +316,13 @@ func Msync(b []byte, flags int) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_msync_trampoline_addr, uintptr(_p0), uintptr(len(b)), uintptr(flags)) + _, _, e1 := Syscall(SYS_MSYNC, uintptr(_p0), uintptr(len(b)), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_msync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_msync msync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Munlock(b []byte) (err error) { @@ -436,45 +332,33 @@ func Munlock(b []byte) (err error) { } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall(libc_munlock_trampoline_addr, uintptr(_p0), uintptr(len(b)), 0) + _, _, e1 := Syscall(SYS_MUNLOCK, uintptr(_p0), uintptr(len(b)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munlock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munlock munlock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Munlockall() (err error) { - _, _, e1 := syscall_syscall(libc_munlockall_trampoline_addr, 0, 0, 0) + _, _, e1 := Syscall(SYS_MUNLOCKALL, 0, 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munlockall_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munlockall munlockall "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pipe2(p *[2]_C_int, flags int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_pipe2_trampoline_addr, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) + _, _, e1 := RawSyscall(SYS_PIPE2, uintptr(unsafe.Pointer(p)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_pipe2_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pipe2 pipe2 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getdents(fd int, buf []byte) (n int, err error) { @@ -484,7 +368,7 @@ func Getdents(fd int, buf []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_getdents_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(buf))) + r0, _, e1 := Syscall(SYS_GETDENTS, uintptr(fd), uintptr(_p0), uintptr(len(buf))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -492,10 +376,6 @@ func Getdents(fd int, buf []byte) (n int, err error) { return } -var libc_getdents_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getdents getdents "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getcwd(buf []byte) (n int, err error) { @@ -505,7 +385,7 @@ func Getcwd(buf []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_getcwd_trampoline_addr, uintptr(_p0), uintptr(len(buf)), 0) + r0, _, e1 := Syscall(SYS___GETCWD, uintptr(_p0), uintptr(len(buf)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -513,24 +393,16 @@ func Getcwd(buf []byte) (n int, err error) { return } -var libc_getcwd_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getcwd getcwd "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func ioctl(fd int, req uint, arg uintptr) (err error) { - _, _, e1 := syscall_syscall(libc_ioctl_trampoline_addr, uintptr(fd), uintptr(req), uintptr(arg)) + _, _, e1 := Syscall(SYS_IOCTL, uintptr(fd), uintptr(req), uintptr(arg)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_ioctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ioctl ioctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) (err error) { @@ -540,21 +412,17 @@ func sysctl(mib []_C_int, old *byte, oldlen *uintptr, new *byte, newlen uintptr) } else { _p0 = unsafe.Pointer(&_zero) } - _, _, e1 := syscall_syscall6(libc_sysctl_trampoline_addr, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) + _, _, e1 := Syscall6(SYS___SYSCTL, uintptr(_p0), uintptr(len(mib)), uintptr(unsafe.Pointer(old)), uintptr(unsafe.Pointer(oldlen)), uintptr(unsafe.Pointer(new)), uintptr(newlen)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sysctl_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sysctl sysctl "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_ppoll_trampoline_addr, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) + r0, _, e1 := Syscall6(SYS_PPOLL, uintptr(unsafe.Pointer(fds)), uintptr(nfds), uintptr(unsafe.Pointer(timeout)), uintptr(unsafe.Pointer(sigmask)), 0, 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -562,10 +430,6 @@ func ppoll(fds *PollFd, nfds int, timeout *Timespec, sigmask *Sigset_t) (n int, return } -var libc_ppoll_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ppoll ppoll "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Access(path string, mode uint32) (err error) { @@ -574,31 +438,23 @@ func Access(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_access_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_ACCESS, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_access_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_access access "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Adjtime(delta *Timeval, olddelta *Timeval) (err error) { - _, _, e1 := syscall_syscall(libc_adjtime_trampoline_addr, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0) + _, _, e1 := Syscall(SYS_ADJTIME, uintptr(unsafe.Pointer(delta)), uintptr(unsafe.Pointer(olddelta)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_adjtime_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_adjtime adjtime "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chdir(path string) (err error) { @@ -607,17 +463,13 @@ func Chdir(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_CHDIR, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chdir chdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chflags(path string, flags int) (err error) { @@ -626,17 +478,13 @@ func Chflags(path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chflags_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_CHFLAGS, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chflags_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chflags chflags "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chmod(path string, mode uint32) (err error) { @@ -645,17 +493,13 @@ func Chmod(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chmod_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_CHMOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chmod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chmod chmod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chown(path string, uid int, gid int) (err error) { @@ -664,17 +508,13 @@ func Chown(path string, uid int, gid int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chown_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_CHOWN, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chown chown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Chroot(path string) (err error) { @@ -683,35 +523,27 @@ func Chroot(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_chroot_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_CHROOT, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_chroot_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_chroot chroot "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Close(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_close_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_CLOSE, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_close_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_close close "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup(fd int) (nfd int, err error) { - r0, _, e1 := syscall_syscall(libc_dup_trampoline_addr, uintptr(fd), 0, 0) + r0, _, e1 := Syscall(SYS_DUP, uintptr(fd), 0, 0) nfd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -719,49 +551,33 @@ func Dup(fd int) (nfd int, err error) { return } -var libc_dup_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup dup "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup2(from int, to int) (err error) { - _, _, e1 := syscall_syscall(libc_dup2_trampoline_addr, uintptr(from), uintptr(to), 0) + _, _, e1 := Syscall(SYS_DUP2, uintptr(from), uintptr(to), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_dup2_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup2 dup2 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Dup3(from int, to int, flags int) (err error) { - _, _, e1 := syscall_syscall(libc_dup3_trampoline_addr, uintptr(from), uintptr(to), uintptr(flags)) + _, _, e1 := Syscall(SYS_DUP3, uintptr(from), uintptr(to), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_dup3_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_dup3 dup3 "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Exit(code int) { - syscall_syscall(libc_exit_trampoline_addr, uintptr(code), 0, 0) + Syscall(SYS_EXIT, uintptr(code), 0, 0) return } -var libc_exit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_exit exit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { @@ -770,59 +586,43 @@ func Faccessat(dirfd int, path string, mode uint32, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_faccessat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FACCESSAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_faccessat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_faccessat faccessat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchdir(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_fchdir_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_FCHDIR, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchdir fchdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchflags(fd int, flags int) (err error) { - _, _, e1 := syscall_syscall(libc_fchflags_trampoline_addr, uintptr(fd), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_FCHFLAGS, uintptr(fd), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchflags_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchflags fchflags "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchmod(fd int, mode uint32) (err error) { - _, _, e1 := syscall_syscall(libc_fchmod_trampoline_addr, uintptr(fd), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_FCHMOD, uintptr(fd), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchmod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchmod fchmod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { @@ -831,31 +631,23 @@ func Fchmodat(dirfd int, path string, mode uint32, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fchmodat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FCHMODAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchmodat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchmodat fchmodat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchown(fd int, uid int, gid int) (err error) { - _, _, e1 := syscall_syscall(libc_fchown_trampoline_addr, uintptr(fd), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_FCHOWN, uintptr(fd), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchown fchown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { @@ -864,35 +656,27 @@ func Fchownat(dirfd int, path string, uid int, gid int, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fchownat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) + _, _, e1 := Syscall6(SYS_FCHOWNAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fchownat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fchownat fchownat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Flock(fd int, how int) (err error) { - _, _, e1 := syscall_syscall(libc_flock_trampoline_addr, uintptr(fd), uintptr(how), 0) + _, _, e1 := Syscall(SYS_FLOCK, uintptr(fd), uintptr(how), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_flock_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_flock flock "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fpathconf(fd int, name int) (val int, err error) { - r0, _, e1 := syscall_syscall(libc_fpathconf_trampoline_addr, uintptr(fd), uintptr(name), 0) + r0, _, e1 := Syscall(SYS_FPATHCONF, uintptr(fd), uintptr(name), 0) val = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -900,24 +684,16 @@ func Fpathconf(fd int, name int) (val int, err error) { return } -var libc_fpathconf_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fpathconf fpathconf "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstat(fd int, stat *Stat_t) (err error) { - _, _, e1 := syscall_syscall(libc_fstat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_FSTAT, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstat fstat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { @@ -926,99 +702,71 @@ func Fstatat(fd int, path string, stat *Stat_t, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_fstatat_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_FSTATAT, uintptr(fd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstatat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstatat fstatat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fstatfs(fd int, stat *Statfs_t) (err error) { - _, _, e1 := syscall_syscall(libc_fstatfs_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_FSTATFS, uintptr(fd), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fstatfs_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fstatfs fstatfs "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Fsync(fd int) (err error) { - _, _, e1 := syscall_syscall(libc_fsync_trampoline_addr, uintptr(fd), 0, 0) + _, _, e1 := Syscall(SYS_FSYNC, uintptr(fd), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_fsync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_fsync fsync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Ftruncate(fd int, length int64) (err error) { - _, _, e1 := syscall_syscall(libc_ftruncate_trampoline_addr, uintptr(fd), uintptr(length), 0) + _, _, e1 := Syscall(SYS_FTRUNCATE, uintptr(fd), 0, uintptr(length)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_ftruncate_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_ftruncate ftruncate "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getegid() (egid int) { - r0, _, _ := syscall_rawSyscall(libc_getegid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETEGID, 0, 0, 0) egid = int(r0) return } -var libc_getegid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getegid getegid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Geteuid() (uid int) { - r0, _, _ := syscall_rawSyscall(libc_geteuid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETEUID, 0, 0, 0) uid = int(r0) return } -var libc_geteuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_geteuid geteuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getgid() (gid int) { - r0, _, _ := syscall_rawSyscall(libc_getgid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETGID, 0, 0, 0) gid = int(r0) return } -var libc_getgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getgid getgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpgid(pid int) (pgid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getpgid_trampoline_addr, uintptr(pid), 0, 0) + r0, _, e1 := RawSyscall(SYS_GETPGID, uintptr(pid), 0, 0) pgid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1026,50 +774,34 @@ func Getpgid(pid int) (pgid int, err error) { return } -var libc_getpgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpgid getpgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpgrp() (pgrp int) { - r0, _, _ := syscall_rawSyscall(libc_getpgrp_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPGRP, 0, 0, 0) pgrp = int(r0) return } -var libc_getpgrp_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpgrp getpgrp "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpid() (pid int) { - r0, _, _ := syscall_rawSyscall(libc_getpid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPID, 0, 0, 0) pid = int(r0) return } -var libc_getpid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpid getpid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getppid() (ppid int) { - r0, _, _ := syscall_rawSyscall(libc_getppid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETPPID, 0, 0, 0) ppid = int(r0) return } -var libc_getppid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getppid getppid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getpriority(which int, who int) (prio int, err error) { - r0, _, e1 := syscall_syscall(libc_getpriority_trampoline_addr, uintptr(which), uintptr(who), 0) + r0, _, e1 := Syscall(SYS_GETPRIORITY, uintptr(which), uintptr(who), 0) prio = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1077,28 +809,20 @@ func Getpriority(which int, who int) (prio int, err error) { return } -var libc_getpriority_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getpriority getpriority "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrlimit(which int, lim *Rlimit) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getrlimit_trampoline_addr, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) + _, _, e1 := RawSyscall(SYS_GETRLIMIT, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getrlimit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrlimit getrlimit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrtable() (rtable int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getrtable_trampoline_addr, 0, 0, 0) + r0, _, e1 := RawSyscall(SYS_GETRTABLE, 0, 0, 0) rtable = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1106,28 +830,20 @@ func Getrtable() (rtable int, err error) { return } -var libc_getrtable_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrtable getrtable "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getrusage(who int, rusage *Rusage) (err error) { - _, _, e1 := syscall_rawSyscall(libc_getrusage_trampoline_addr, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) + _, _, e1 := RawSyscall(SYS_GETRUSAGE, uintptr(who), uintptr(unsafe.Pointer(rusage)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_getrusage_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getrusage getrusage "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getsid(pid int) (sid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_getsid_trampoline_addr, uintptr(pid), 0, 0) + r0, _, e1 := RawSyscall(SYS_GETSID, uintptr(pid), 0, 0) sid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1135,66 +851,46 @@ func Getsid(pid int) (sid int, err error) { return } -var libc_getsid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getsid getsid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Gettimeofday(tv *Timeval) (err error) { - _, _, e1 := syscall_rawSyscall(libc_gettimeofday_trampoline_addr, uintptr(unsafe.Pointer(tv)), 0, 0) + _, _, e1 := RawSyscall(SYS_GETTIMEOFDAY, uintptr(unsafe.Pointer(tv)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_gettimeofday_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_gettimeofday gettimeofday "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Getuid() (uid int) { - r0, _, _ := syscall_rawSyscall(libc_getuid_trampoline_addr, 0, 0, 0) + r0, _, _ := RawSyscall(SYS_GETUID, 0, 0, 0) uid = int(r0) return } -var libc_getuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_getuid getuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Issetugid() (tainted bool) { - r0, _, _ := syscall_syscall(libc_issetugid_trampoline_addr, 0, 0, 0) + r0, _, _ := Syscall(SYS_ISSETUGID, 0, 0, 0) tainted = bool(r0 != 0) return } -var libc_issetugid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_issetugid issetugid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Kill(pid int, signum syscall.Signal) (err error) { - _, _, e1 := syscall_syscall(libc_kill_trampoline_addr, uintptr(pid), uintptr(signum), 0) + _, _, e1 := Syscall(SYS_KILL, uintptr(pid), uintptr(signum), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_kill_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kill kill "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Kqueue() (fd int, err error) { - r0, _, e1 := syscall_syscall(libc_kqueue_trampoline_addr, 0, 0, 0) + r0, _, e1 := Syscall(SYS_KQUEUE, 0, 0, 0) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1202,10 +898,6 @@ func Kqueue() (fd int, err error) { return } -var libc_kqueue_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_kqueue kqueue "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Lchown(path string, uid int, gid int) (err error) { @@ -1214,17 +906,13 @@ func Lchown(path string, uid int, gid int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_lchown_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) + _, _, e1 := Syscall(SYS_LCHOWN, uintptr(unsafe.Pointer(_p0)), uintptr(uid), uintptr(gid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_lchown_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lchown lchown "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Link(path string, link string) (err error) { @@ -1238,17 +926,13 @@ func Link(path string, link string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_link_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_LINK, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_link_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_link link "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Linkat(pathfd int, path string, linkfd int, link string, flags int) (err error) { @@ -1262,31 +946,23 @@ func Linkat(pathfd int, path string, linkfd int, link string, flags int) (err er if err != nil { return } - _, _, e1 := syscall_syscall6(libc_linkat_trampoline_addr, uintptr(pathfd), uintptr(unsafe.Pointer(_p0)), uintptr(linkfd), uintptr(unsafe.Pointer(_p1)), uintptr(flags), 0) + _, _, e1 := Syscall6(SYS_LINKAT, uintptr(pathfd), uintptr(unsafe.Pointer(_p0)), uintptr(linkfd), uintptr(unsafe.Pointer(_p1)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_linkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_linkat linkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Listen(s int, backlog int) (err error) { - _, _, e1 := syscall_syscall(libc_listen_trampoline_addr, uintptr(s), uintptr(backlog), 0) + _, _, e1 := Syscall(SYS_LISTEN, uintptr(s), uintptr(backlog), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_listen_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_listen listen "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Lstat(path string, stat *Stat_t) (err error) { @@ -1295,17 +971,13 @@ func Lstat(path string, stat *Stat_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_lstat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_LSTAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_lstat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lstat lstat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkdir(path string, mode uint32) (err error) { @@ -1314,17 +986,13 @@ func Mkdir(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_MKDIR, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkdir mkdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkdirat(dirfd int, path string, mode uint32) (err error) { @@ -1333,17 +1001,13 @@ func Mkdirat(dirfd int, path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkdirat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) + _, _, e1 := Syscall(SYS_MKDIRAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkdirat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkdirat mkdirat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkfifo(path string, mode uint32) (err error) { @@ -1352,17 +1016,13 @@ func Mkfifo(path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkfifo_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) + _, _, e1 := Syscall(SYS_MKFIFO, uintptr(unsafe.Pointer(_p0)), uintptr(mode), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkfifo_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkfifo mkfifo "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mkfifoat(dirfd int, path string, mode uint32) (err error) { @@ -1371,17 +1031,13 @@ func Mkfifoat(dirfd int, path string, mode uint32) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mkfifoat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) + _, _, e1 := Syscall(SYS_MKFIFOAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mkfifoat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mkfifoat mkfifoat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mknod(path string, mode uint32, dev int) (err error) { @@ -1390,17 +1046,13 @@ func Mknod(path string, mode uint32, dev int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_mknod_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) + _, _, e1 := Syscall(SYS_MKNOD, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mknod_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mknod mknod "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { @@ -1409,31 +1061,23 @@ func Mknodat(dirfd int, path string, mode uint32, dev int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_mknodat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) + _, _, e1 := Syscall6(SYS_MKNODAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(dev), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_mknodat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mknodat mknodat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Nanosleep(time *Timespec, leftover *Timespec) (err error) { - _, _, e1 := syscall_syscall(libc_nanosleep_trampoline_addr, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) + _, _, e1 := Syscall(SYS_NANOSLEEP, uintptr(unsafe.Pointer(time)), uintptr(unsafe.Pointer(leftover)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_nanosleep_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_nanosleep nanosleep "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Open(path string, mode int, perm uint32) (fd int, err error) { @@ -1442,7 +1086,7 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall(libc_open_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm)) + r0, _, e1 := Syscall(SYS_OPEN, uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm)) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1450,10 +1094,6 @@ func Open(path string, mode int, perm uint32) (fd int, err error) { return } -var libc_open_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_open open "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { @@ -1462,7 +1102,7 @@ func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall6(libc_openat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0) + r0, _, e1 := Syscall6(SYS_OPENAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(mode), uintptr(perm), 0, 0) fd = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1470,10 +1110,6 @@ func Openat(dirfd int, path string, mode int, perm uint32) (fd int, err error) { return } -var libc_openat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_openat openat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Pathconf(path string, name int) (val int, err error) { @@ -1482,7 +1118,7 @@ func Pathconf(path string, name int) (val int, err error) { if err != nil { return } - r0, _, e1 := syscall_syscall(libc_pathconf_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0) + r0, _, e1 := Syscall(SYS_PATHCONF, uintptr(unsafe.Pointer(_p0)), uintptr(name), 0) val = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1490,10 +1126,6 @@ func Pathconf(path string, name int) (val int, err error) { return } -var libc_pathconf_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pathconf pathconf "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pread(fd int, p []byte, offset int64) (n int, err error) { @@ -1503,7 +1135,7 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_pread_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(offset), 0, 0) + r0, _, e1 := Syscall6(SYS_PREAD, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1511,10 +1143,6 @@ func pread(fd int, p []byte, offset int64) (n int, err error) { return } -var libc_pread_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pread pread "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func pwrite(fd int, p []byte, offset int64) (n int, err error) { @@ -1524,7 +1152,7 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_pwrite_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p)), uintptr(offset), 0, 0) + r0, _, e1 := Syscall6(SYS_PWRITE, uintptr(fd), uintptr(_p0), uintptr(len(p)), 0, uintptr(offset), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1532,10 +1160,6 @@ func pwrite(fd int, p []byte, offset int64) (n int, err error) { return } -var libc_pwrite_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_pwrite pwrite "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func read(fd int, p []byte) (n int, err error) { @@ -1545,7 +1169,7 @@ func read(fd int, p []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p))) + r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(_p0), uintptr(len(p))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1553,10 +1177,6 @@ func read(fd int, p []byte) (n int, err error) { return } -var libc_read_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_read read "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Readlink(path string, buf []byte) (n int, err error) { @@ -1571,7 +1191,7 @@ func Readlink(path string, buf []byte) (n int, err error) { } else { _p1 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_readlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf))) + r0, _, e1 := Syscall(SYS_READLINK, uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1579,10 +1199,6 @@ func Readlink(path string, buf []byte) (n int, err error) { return } -var libc_readlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_readlink readlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { @@ -1597,7 +1213,7 @@ func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { } else { _p1 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall6(libc_readlinkat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf)), 0, 0) + r0, _, e1 := Syscall6(SYS_READLINKAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(_p1), uintptr(len(buf)), 0, 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1605,10 +1221,6 @@ func Readlinkat(dirfd int, path string, buf []byte) (n int, err error) { return } -var libc_readlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_readlinkat readlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Rename(from string, to string) (err error) { @@ -1622,17 +1234,13 @@ func Rename(from string, to string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_rename_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_RENAME, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_rename_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_rename rename "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Renameat(fromfd int, from string, tofd int, to string) (err error) { @@ -1646,17 +1254,13 @@ func Renameat(fromfd int, from string, tofd int, to string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall6(libc_renameat_trampoline_addr, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), 0, 0) + _, _, e1 := Syscall6(SYS_RENAMEAT, uintptr(fromfd), uintptr(unsafe.Pointer(_p0)), uintptr(tofd), uintptr(unsafe.Pointer(_p1)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_renameat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_renameat renameat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Revoke(path string) (err error) { @@ -1665,17 +1269,13 @@ func Revoke(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_revoke_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_REVOKE, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_revoke_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_revoke revoke "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Rmdir(path string) (err error) { @@ -1684,21 +1284,17 @@ func Rmdir(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_rmdir_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_RMDIR, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_rmdir_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_rmdir rmdir "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { - r0, _, e1 := syscall_syscall(libc_lseek_trampoline_addr, uintptr(fd), uintptr(offset), uintptr(whence)) + r0, _, e1 := Syscall6(SYS_LSEEK, uintptr(fd), 0, uintptr(offset), uintptr(whence), 0, 0) newoffset = int64(r0) if e1 != 0 { err = errnoErr(e1) @@ -1706,14 +1302,10 @@ func Seek(fd int, offset int64, whence int) (newoffset int64, err error) { return } -var libc_lseek_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_lseek lseek "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err error) { - r0, _, e1 := syscall_syscall6(libc_select_trampoline_addr, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) + r0, _, e1 := Syscall6(SYS_SELECT, uintptr(nfd), uintptr(unsafe.Pointer(r)), uintptr(unsafe.Pointer(w)), uintptr(unsafe.Pointer(e)), uintptr(unsafe.Pointer(timeout)), 0) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1721,52 +1313,36 @@ func Select(nfd int, r *FdSet, w *FdSet, e *FdSet, timeout *Timeval) (n int, err return } -var libc_select_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_select select "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setegid(egid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setegid_trampoline_addr, uintptr(egid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETEGID, uintptr(egid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setegid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setegid setegid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Seteuid(euid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_seteuid_trampoline_addr, uintptr(euid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETEUID, uintptr(euid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_seteuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_seteuid seteuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setgid(gid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setgid_trampoline_addr, uintptr(gid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETGID, uintptr(gid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setgid setgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setlogin(name string) (err error) { @@ -1775,133 +1351,97 @@ func Setlogin(name string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_setlogin_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_SETLOGIN, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setlogin_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setlogin setlogin "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setpgid(pid int, pgid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setpgid_trampoline_addr, uintptr(pid), uintptr(pgid), 0) + _, _, e1 := RawSyscall(SYS_SETPGID, uintptr(pid), uintptr(pgid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setpgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setpgid setpgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setpriority(which int, who int, prio int) (err error) { - _, _, e1 := syscall_syscall(libc_setpriority_trampoline_addr, uintptr(which), uintptr(who), uintptr(prio)) + _, _, e1 := Syscall(SYS_SETPRIORITY, uintptr(which), uintptr(who), uintptr(prio)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setpriority_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setpriority setpriority "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setregid(rgid int, egid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setregid_trampoline_addr, uintptr(rgid), uintptr(egid), 0) + _, _, e1 := RawSyscall(SYS_SETREGID, uintptr(rgid), uintptr(egid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setregid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setregid setregid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setreuid(ruid int, euid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setreuid_trampoline_addr, uintptr(ruid), uintptr(euid), 0) + _, _, e1 := RawSyscall(SYS_SETREUID, uintptr(ruid), uintptr(euid), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setreuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setreuid setreuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setresgid(rgid int, egid int, sgid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setresgid_trampoline_addr, uintptr(rgid), uintptr(egid), uintptr(sgid)) + _, _, e1 := RawSyscall(SYS_SETRESGID, uintptr(rgid), uintptr(egid), uintptr(sgid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setresgid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setresgid setresgid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setresuid(ruid int, euid int, suid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setresuid_trampoline_addr, uintptr(ruid), uintptr(euid), uintptr(suid)) + _, _, e1 := RawSyscall(SYS_SETRESUID, uintptr(ruid), uintptr(euid), uintptr(suid)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setresuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setresuid setresuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setrlimit(which int, lim *Rlimit) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setrlimit_trampoline_addr, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) + _, _, e1 := RawSyscall(SYS_SETRLIMIT, uintptr(which), uintptr(unsafe.Pointer(lim)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setrlimit_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setrlimit setrlimit "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setrtable(rtable int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setrtable_trampoline_addr, uintptr(rtable), 0, 0) + _, _, e1 := RawSyscall(SYS_SETRTABLE, uintptr(rtable), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setrtable_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setrtable setrtable "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setsid() (pid int, err error) { - r0, _, e1 := syscall_rawSyscall(libc_setsid_trampoline_addr, 0, 0, 0) + r0, _, e1 := RawSyscall(SYS_SETSID, 0, 0, 0) pid = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -1909,38 +1449,26 @@ func Setsid() (pid int, err error) { return } -var libc_setsid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setsid setsid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Settimeofday(tp *Timeval) (err error) { - _, _, e1 := syscall_rawSyscall(libc_settimeofday_trampoline_addr, uintptr(unsafe.Pointer(tp)), 0, 0) + _, _, e1 := RawSyscall(SYS_SETTIMEOFDAY, uintptr(unsafe.Pointer(tp)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_settimeofday_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_settimeofday settimeofday "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Setuid(uid int) (err error) { - _, _, e1 := syscall_rawSyscall(libc_setuid_trampoline_addr, uintptr(uid), 0, 0) + _, _, e1 := RawSyscall(SYS_SETUID, uintptr(uid), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_setuid_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_setuid setuid "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Stat(path string, stat *Stat_t) (err error) { @@ -1949,17 +1477,13 @@ func Stat(path string, stat *Stat_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_stat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_STAT, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_stat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_stat stat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Statfs(path string, stat *Statfs_t) (err error) { @@ -1968,17 +1492,13 @@ func Statfs(path string, stat *Statfs_t) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_statfs_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) + _, _, e1 := Syscall(SYS_STATFS, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(stat)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_statfs_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_statfs statfs "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Symlink(path string, link string) (err error) { @@ -1992,17 +1512,13 @@ func Symlink(path string, link string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_symlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) + _, _, e1 := Syscall(SYS_SYMLINK, uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(_p1)), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_symlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_symlink symlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Symlinkat(oldpath string, newdirfd int, newpath string) (err error) { @@ -2016,31 +1532,23 @@ func Symlinkat(oldpath string, newdirfd int, newpath string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_symlinkat_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1))) + _, _, e1 := Syscall(SYS_SYMLINKAT, uintptr(unsafe.Pointer(_p0)), uintptr(newdirfd), uintptr(unsafe.Pointer(_p1))) if e1 != 0 { err = errnoErr(e1) } return } -var libc_symlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_symlinkat symlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Sync() (err error) { - _, _, e1 := syscall_syscall(libc_sync_trampoline_addr, 0, 0, 0) + _, _, e1 := Syscall(SYS_SYNC, 0, 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_sync_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_sync sync "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Truncate(path string, length int64) (err error) { @@ -2049,29 +1557,21 @@ func Truncate(path string, length int64) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_truncate_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(length), 0) + _, _, e1 := Syscall(SYS_TRUNCATE, uintptr(unsafe.Pointer(_p0)), 0, uintptr(length)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_truncate_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_truncate truncate "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Umask(newmask int) (oldmask int) { - r0, _, _ := syscall_syscall(libc_umask_trampoline_addr, uintptr(newmask), 0, 0) + r0, _, _ := Syscall(SYS_UMASK, uintptr(newmask), 0, 0) oldmask = int(r0) return } -var libc_umask_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_umask umask "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unlink(path string) (err error) { @@ -2080,17 +1580,13 @@ func Unlink(path string) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unlink_trampoline_addr, uintptr(unsafe.Pointer(_p0)), 0, 0) + _, _, e1 := Syscall(SYS_UNLINK, uintptr(unsafe.Pointer(_p0)), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unlink_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unlink unlink "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unlinkat(dirfd int, path string, flags int) (err error) { @@ -2099,17 +1595,13 @@ func Unlinkat(dirfd int, path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unlinkat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) + _, _, e1 := Syscall(SYS_UNLINKAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(flags)) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unlinkat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unlinkat unlinkat "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func Unmount(path string, flags int) (err error) { @@ -2118,17 +1610,13 @@ func Unmount(path string, flags int) (err error) { if err != nil { return } - _, _, e1 := syscall_syscall(libc_unmount_trampoline_addr, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) + _, _, e1 := Syscall(SYS_UNMOUNT, uintptr(unsafe.Pointer(_p0)), uintptr(flags), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_unmount_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_unmount unmount "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func write(fd int, p []byte) (n int, err error) { @@ -2138,7 +1626,7 @@ func write(fd int, p []byte) (n int, err error) { } else { _p0 = unsafe.Pointer(&_zero) } - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(_p0), uintptr(len(p))) + r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(_p0), uintptr(len(p))) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2146,14 +1634,10 @@ func write(fd int, p []byte) (n int, err error) { return } -var libc_write_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_write write "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) (ret uintptr, err error) { - r0, _, e1 := syscall_syscall6(libc_mmap_trampoline_addr, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), uintptr(pos)) + r0, _, e1 := Syscall9(SYS_MMAP, uintptr(addr), uintptr(length), uintptr(prot), uintptr(flag), uintptr(fd), 0, uintptr(pos), 0, 0) ret = uintptr(r0) if e1 != 0 { err = errnoErr(e1) @@ -2161,28 +1645,20 @@ func mmap(addr uintptr, length uintptr, prot int, flag int, fd int, pos int64) ( return } -var libc_mmap_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_mmap mmap "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func munmap(addr uintptr, length uintptr) (err error) { - _, _, e1 := syscall_syscall(libc_munmap_trampoline_addr, uintptr(addr), uintptr(length), 0) + _, _, e1 := Syscall(SYS_MUNMAP, uintptr(addr), uintptr(length), 0) if e1 != 0 { err = errnoErr(e1) } return } -var libc_munmap_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_munmap munmap "libc.so" - // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func readlen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_read_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) + r0, _, e1 := Syscall(SYS_READ, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2193,7 +1669,7 @@ func readlen(fd int, buf *byte, nbuf int) (n int, err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT func writelen(fd int, buf *byte, nbuf int) (n int, err error) { - r0, _, e1 := syscall_syscall(libc_write_trampoline_addr, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) + r0, _, e1 := Syscall(SYS_WRITE, uintptr(fd), uintptr(unsafe.Pointer(buf)), uintptr(nbuf)) n = int(r0) if e1 != 0 { err = errnoErr(e1) @@ -2209,13 +1685,9 @@ func utimensat(dirfd int, path string, times *[2]Timespec, flags int) (err error if err != nil { return } - _, _, e1 := syscall_syscall6(libc_utimensat_trampoline_addr, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flags), 0, 0) + _, _, e1 := Syscall6(SYS_UTIMENSAT, uintptr(dirfd), uintptr(unsafe.Pointer(_p0)), uintptr(unsafe.Pointer(times)), uintptr(flags), 0, 0) if e1 != 0 { err = errnoErr(e1) } return } - -var libc_utimensat_trampoline_addr uintptr - -//go:cgo_import_dynamic libc_utimensat utimensat "libc.so" diff --git a/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go b/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go index fdf53f8..d12f4fb 100644 --- a/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsyscall_solaris_amd64.go @@ -66,7 +66,6 @@ import ( //go:cgo_import_dynamic libc_getpriority getpriority "libc.so" //go:cgo_import_dynamic libc_getrlimit getrlimit "libc.so" //go:cgo_import_dynamic libc_getrusage getrusage "libc.so" -//go:cgo_import_dynamic libc_getsid getsid "libc.so" //go:cgo_import_dynamic libc_gettimeofday gettimeofday "libc.so" //go:cgo_import_dynamic libc_getuid getuid "libc.so" //go:cgo_import_dynamic libc_kill kill "libc.so" @@ -203,7 +202,6 @@ import ( //go:linkname procGetpriority libc_getpriority //go:linkname procGetrlimit libc_getrlimit //go:linkname procGetrusage libc_getrusage -//go:linkname procGetsid libc_getsid //go:linkname procGettimeofday libc_gettimeofday //go:linkname procGetuid libc_getuid //go:linkname procKill libc_kill @@ -341,7 +339,6 @@ var ( procGetpriority, procGetrlimit, procGetrusage, - procGetsid, procGettimeofday, procGetuid, procKill, @@ -1047,17 +1044,6 @@ func Getrusage(who int, rusage *Rusage) (err error) { // THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT -func Getsid(pid int) (sid int, err error) { - r0, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGetsid)), 1, uintptr(pid), 0, 0, 0, 0, 0) - sid = int(r0) - if e1 != 0 { - err = e1 - } - return -} - -// THIS FILE IS GENERATED BY THE COMMAND AT THE TOP; DO NOT EDIT - func Gettimeofday(tv *Timeval) (err error) { _, _, e1 := rawSysvicall6(uintptr(unsafe.Pointer(&procGettimeofday)), 1, uintptr(unsafe.Pointer(tv)), 0, 0, 0, 0, 0) if e1 != 0 { diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go index 4e0d961..59d5dfc 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_386.go @@ -1,4 +1,4 @@ -// go run mksysnum.go https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12 +// go run mksysnum.go https://svn.freebsd.org/base/stable/11/sys/kern/syscalls.master // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && freebsd @@ -19,9 +19,10 @@ const ( SYS_UNLINK = 10 // { int unlink(char *path); } SYS_CHDIR = 12 // { int chdir(char *path); } SYS_FCHDIR = 13 // { int fchdir(int fd); } + SYS_MKNOD = 14 // { int mknod(char *path, int mode, int dev); } SYS_CHMOD = 15 // { int chmod(char *path, int mode); } SYS_CHOWN = 16 // { int chown(char *path, int uid, int gid); } - SYS_BREAK = 17 // { caddr_t break(char *nsize); } + SYS_OBREAK = 17 // { int obreak(char *nsize); } break obreak_args int SYS_GETPID = 20 // { pid_t getpid(void); } SYS_MOUNT = 21 // { int mount(char *type, char *path, int flags, caddr_t data); } SYS_UNMOUNT = 22 // { int unmount(char *path, int flags); } @@ -42,6 +43,7 @@ const ( SYS_KILL = 37 // { int kill(int pid, int signum); } SYS_GETPPID = 39 // { pid_t getppid(void); } SYS_DUP = 41 // { int dup(u_int fd); } + SYS_PIPE = 42 // { int pipe(void); } SYS_GETEGID = 43 // { gid_t getegid(void); } SYS_PROFIL = 44 // { int profil(caddr_t samples, size_t size, size_t offset, u_int scale); } SYS_KTRACE = 45 // { int ktrace(const char *fname, int ops, int facs, int pid); } @@ -56,14 +58,15 @@ const ( SYS_SYMLINK = 57 // { int symlink(char *path, char *link); } SYS_READLINK = 58 // { ssize_t readlink(char *path, char *buf, size_t count); } SYS_EXECVE = 59 // { int execve(char *fname, char **argv, char **envv); } - SYS_UMASK = 60 // { int umask(int newmask); } + SYS_UMASK = 60 // { int umask(int newmask); } umask umask_args int SYS_CHROOT = 61 // { int chroot(char *path); } SYS_MSYNC = 65 // { int msync(void *addr, size_t len, int flags); } SYS_VFORK = 66 // { int vfork(void); } SYS_SBRK = 69 // { int sbrk(int incr); } SYS_SSTK = 70 // { int sstk(int incr); } + SYS_OVADVISE = 72 // { int ovadvise(int anom); } vadvise ovadvise_args int SYS_MUNMAP = 73 // { int munmap(void *addr, size_t len); } - SYS_MPROTECT = 74 // { int mprotect(void *addr, size_t len, int prot); } + SYS_MPROTECT = 74 // { int mprotect(const void *addr, size_t len, int prot); } SYS_MADVISE = 75 // { int madvise(void *addr, size_t len, int behav); } SYS_MINCORE = 78 // { int mincore(const void *addr, size_t len, char *vec); } SYS_GETGROUPS = 79 // { int getgroups(u_int gidsetsize, gid_t *gidset); } @@ -121,10 +124,14 @@ const ( SYS_SETGID = 181 // { int setgid(gid_t gid); } SYS_SETEGID = 182 // { int setegid(gid_t egid); } SYS_SETEUID = 183 // { int seteuid(uid_t euid); } + SYS_STAT = 188 // { int stat(char *path, struct stat *ub); } + SYS_FSTAT = 189 // { int fstat(int fd, struct stat *sb); } + SYS_LSTAT = 190 // { int lstat(char *path, struct stat *ub); } SYS_PATHCONF = 191 // { int pathconf(char *path, int name); } SYS_FPATHCONF = 192 // { int fpathconf(int fd, int name); } SYS_GETRLIMIT = 194 // { int getrlimit(u_int which, struct rlimit *rlp); } getrlimit __getrlimit_args int SYS_SETRLIMIT = 195 // { int setrlimit(u_int which, struct rlimit *rlp); } setrlimit __setrlimit_args int + SYS_GETDIRENTRIES = 196 // { int getdirentries(int fd, char *buf, u_int count, long *basep); } SYS___SYSCTL = 202 // { int __sysctl(int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } __sysctl sysctl_args int SYS_MLOCK = 203 // { int mlock(const void *addr, size_t len); } SYS_MUNLOCK = 204 // { int munlock(const void *addr, size_t len); } @@ -136,12 +143,12 @@ const ( SYS_SEMOP = 222 // { int semop(int semid, struct sembuf *sops, size_t nsops); } SYS_MSGGET = 225 // { int msgget(key_t key, int msgflg); } SYS_MSGSND = 226 // { int msgsnd(int msqid, const void *msgp, size_t msgsz, int msgflg); } - SYS_MSGRCV = 227 // { ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } + SYS_MSGRCV = 227 // { int msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } SYS_SHMAT = 228 // { int shmat(int shmid, const void *shmaddr, int shmflg); } SYS_SHMDT = 230 // { int shmdt(const void *shmaddr); } SYS_SHMGET = 231 // { int shmget(key_t key, size_t size, int shmflg); } SYS_CLOCK_GETTIME = 232 // { int clock_gettime(clockid_t clock_id, struct timespec *tp); } - SYS_CLOCK_SETTIME = 233 // { int clock_settime(clockid_t clock_id, const struct timespec *tp); } + SYS_CLOCK_SETTIME = 233 // { int clock_settime( clockid_t clock_id, const struct timespec *tp); } SYS_CLOCK_GETRES = 234 // { int clock_getres(clockid_t clock_id, struct timespec *tp); } SYS_KTIMER_CREATE = 235 // { int ktimer_create(clockid_t clock_id, struct sigevent *evp, int *timerid); } SYS_KTIMER_DELETE = 236 // { int ktimer_delete(int timerid); } @@ -150,44 +157,50 @@ const ( SYS_KTIMER_GETOVERRUN = 239 // { int ktimer_getoverrun(int timerid); } SYS_NANOSLEEP = 240 // { int nanosleep(const struct timespec *rqtp, struct timespec *rmtp); } SYS_FFCLOCK_GETCOUNTER = 241 // { int ffclock_getcounter(ffcounter *ffcount); } - SYS_FFCLOCK_SETESTIMATE = 242 // { int ffclock_setestimate(struct ffclock_estimate *cest); } - SYS_FFCLOCK_GETESTIMATE = 243 // { int ffclock_getestimate(struct ffclock_estimate *cest); } + SYS_FFCLOCK_SETESTIMATE = 242 // { int ffclock_setestimate( struct ffclock_estimate *cest); } + SYS_FFCLOCK_GETESTIMATE = 243 // { int ffclock_getestimate( struct ffclock_estimate *cest); } SYS_CLOCK_NANOSLEEP = 244 // { int clock_nanosleep(clockid_t clock_id, int flags, const struct timespec *rqtp, struct timespec *rmtp); } - SYS_CLOCK_GETCPUCLOCKID2 = 247 // { int clock_getcpuclockid2(id_t id, int which, clockid_t *clock_id); } + SYS_CLOCK_GETCPUCLOCKID2 = 247 // { int clock_getcpuclockid2(id_t id,int which, clockid_t *clock_id); } SYS_NTP_GETTIME = 248 // { int ntp_gettime(struct ntptimeval *ntvp); } SYS_MINHERIT = 250 // { int minherit(void *addr, size_t len, int inherit); } SYS_RFORK = 251 // { int rfork(int flags); } + SYS_OPENBSD_POLL = 252 // { int openbsd_poll(struct pollfd *fds, u_int nfds, int timeout); } SYS_ISSETUGID = 253 // { int issetugid(void); } SYS_LCHOWN = 254 // { int lchown(char *path, int uid, int gid); } SYS_AIO_READ = 255 // { int aio_read(struct aiocb *aiocbp); } SYS_AIO_WRITE = 256 // { int aio_write(struct aiocb *aiocbp); } - SYS_LIO_LISTIO = 257 // { int lio_listio(int mode, struct aiocb* const *acb_list, int nent, struct sigevent *sig); } + SYS_LIO_LISTIO = 257 // { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } + SYS_GETDENTS = 272 // { int getdents(int fd, char *buf, size_t count); } SYS_LCHMOD = 274 // { int lchmod(char *path, mode_t mode); } SYS_LUTIMES = 276 // { int lutimes(char *path, struct timeval *tptr); } + SYS_NSTAT = 278 // { int nstat(char *path, struct nstat *ub); } + SYS_NFSTAT = 279 // { int nfstat(int fd, struct nstat *sb); } + SYS_NLSTAT = 280 // { int nlstat(char *path, struct nstat *ub); } SYS_PREADV = 289 // { ssize_t preadv(int fd, struct iovec *iovp, u_int iovcnt, off_t offset); } SYS_PWRITEV = 290 // { ssize_t pwritev(int fd, struct iovec *iovp, u_int iovcnt, off_t offset); } SYS_FHOPEN = 298 // { int fhopen(const struct fhandle *u_fhp, int flags); } + SYS_FHSTAT = 299 // { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } SYS_MODNEXT = 300 // { int modnext(int modid); } - SYS_MODSTAT = 301 // { int modstat(int modid, struct module_stat* stat); } + SYS_MODSTAT = 301 // { int modstat(int modid, struct module_stat *stat); } SYS_MODFNEXT = 302 // { int modfnext(int modid); } SYS_MODFIND = 303 // { int modfind(const char *name); } SYS_KLDLOAD = 304 // { int kldload(const char *file); } SYS_KLDUNLOAD = 305 // { int kldunload(int fileid); } SYS_KLDFIND = 306 // { int kldfind(const char *file); } SYS_KLDNEXT = 307 // { int kldnext(int fileid); } - SYS_KLDSTAT = 308 // { int kldstat(int fileid, struct kld_file_stat *stat); } + SYS_KLDSTAT = 308 // { int kldstat(int fileid, struct kld_file_stat* stat); } SYS_KLDFIRSTMOD = 309 // { int kldfirstmod(int fileid); } SYS_GETSID = 310 // { int getsid(pid_t pid); } SYS_SETRESUID = 311 // { int setresuid(uid_t ruid, uid_t euid, uid_t suid); } SYS_SETRESGID = 312 // { int setresgid(gid_t rgid, gid_t egid, gid_t sgid); } SYS_AIO_RETURN = 314 // { ssize_t aio_return(struct aiocb *aiocbp); } - SYS_AIO_SUSPEND = 315 // { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } + SYS_AIO_SUSPEND = 315 // { int aio_suspend( struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } SYS_AIO_CANCEL = 316 // { int aio_cancel(int fd, struct aiocb *aiocbp); } SYS_AIO_ERROR = 317 // { int aio_error(struct aiocb *aiocbp); } SYS_YIELD = 321 // { int yield(void); } SYS_MLOCKALL = 324 // { int mlockall(int how); } SYS_MUNLOCKALL = 325 // { int munlockall(void); } - SYS___GETCWD = 326 // { int __getcwd(char *buf, size_t buflen); } + SYS___GETCWD = 326 // { int __getcwd(char *buf, u_int buflen); } SYS_SCHED_SETPARAM = 327 // { int sched_setparam (pid_t pid, const struct sched_param *param); } SYS_SCHED_GETPARAM = 328 // { int sched_getparam (pid_t pid, struct sched_param *param); } SYS_SCHED_SETSCHEDULER = 329 // { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } @@ -213,13 +226,14 @@ const ( SYS___ACL_ACLCHECK_FILE = 353 // { int __acl_aclcheck_file(const char *path, acl_type_t type, struct acl *aclp); } SYS___ACL_ACLCHECK_FD = 354 // { int __acl_aclcheck_fd(int filedes, acl_type_t type, struct acl *aclp); } SYS_EXTATTRCTL = 355 // { int extattrctl(const char *path, int cmd, const char *filename, int attrnamespace, const char *attrname); } - SYS_EXTATTR_SET_FILE = 356 // { ssize_t extattr_set_file(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_GET_FILE = 357 // { ssize_t extattr_get_file(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_SET_FILE = 356 // { ssize_t extattr_set_file( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_GET_FILE = 357 // { ssize_t extattr_get_file( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_DELETE_FILE = 358 // { int extattr_delete_file(const char *path, int attrnamespace, const char *attrname); } - SYS_AIO_WAITCOMPLETE = 359 // { ssize_t aio_waitcomplete(struct aiocb **aiocbp, struct timespec *timeout); } + SYS_AIO_WAITCOMPLETE = 359 // { ssize_t aio_waitcomplete( struct aiocb **aiocbp, struct timespec *timeout); } SYS_GETRESUID = 360 // { int getresuid(uid_t *ruid, uid_t *euid, uid_t *suid); } SYS_GETRESGID = 361 // { int getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid); } SYS_KQUEUE = 362 // { int kqueue(void); } + SYS_KEVENT = 363 // { int kevent(int fd, struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } SYS_EXTATTR_SET_FD = 371 // { ssize_t extattr_set_fd(int fd, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_GET_FD = 372 // { ssize_t extattr_get_fd(int fd, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_DELETE_FD = 373 // { int extattr_delete_fd(int fd, int attrnamespace, const char *attrname); } @@ -237,6 +251,10 @@ const ( SYS_UUIDGEN = 392 // { int uuidgen(struct uuid *store, int count); } SYS_SENDFILE = 393 // { int sendfile(int fd, int s, off_t offset, size_t nbytes, struct sf_hdtr *hdtr, off_t *sbytes, int flags); } SYS_MAC_SYSCALL = 394 // { int mac_syscall(const char *policy, int call, void *arg); } + SYS_GETFSSTAT = 395 // { int getfsstat(struct statfs *buf, long bufsize, int mode); } + SYS_STATFS = 396 // { int statfs(char *path, struct statfs *buf); } + SYS_FSTATFS = 397 // { int fstatfs(int fd, struct statfs *buf); } + SYS_FHSTATFS = 398 // { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } SYS_KSEM_CLOSE = 400 // { int ksem_close(semid_t id); } SYS_KSEM_POST = 401 // { int ksem_post(semid_t id); } SYS_KSEM_WAIT = 402 // { int ksem_wait(semid_t id); } @@ -249,14 +267,14 @@ const ( SYS___MAC_GET_PID = 409 // { int __mac_get_pid(pid_t pid, struct mac *mac_p); } SYS___MAC_GET_LINK = 410 // { int __mac_get_link(const char *path_p, struct mac *mac_p); } SYS___MAC_SET_LINK = 411 // { int __mac_set_link(const char *path_p, struct mac *mac_p); } - SYS_EXTATTR_SET_LINK = 412 // { ssize_t extattr_set_link(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_GET_LINK = 413 // { ssize_t extattr_get_link(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_DELETE_LINK = 414 // { int extattr_delete_link(const char *path, int attrnamespace, const char *attrname); } + SYS_EXTATTR_SET_LINK = 412 // { ssize_t extattr_set_link( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_GET_LINK = 413 // { ssize_t extattr_get_link( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_DELETE_LINK = 414 // { int extattr_delete_link( const char *path, int attrnamespace, const char *attrname); } SYS___MAC_EXECVE = 415 // { int __mac_execve(char *fname, char **argv, char **envv, struct mac *mac_p); } SYS_SIGACTION = 416 // { int sigaction(int sig, const struct sigaction *act, struct sigaction *oact); } - SYS_SIGRETURN = 417 // { int sigreturn(const struct __ucontext *sigcntxp); } + SYS_SIGRETURN = 417 // { int sigreturn( const struct __ucontext *sigcntxp); } SYS_GETCONTEXT = 421 // { int getcontext(struct __ucontext *ucp); } - SYS_SETCONTEXT = 422 // { int setcontext(const struct __ucontext *ucp); } + SYS_SETCONTEXT = 422 // { int setcontext( const struct __ucontext *ucp); } SYS_SWAPCONTEXT = 423 // { int swapcontext(struct __ucontext *oucp, const struct __ucontext *ucp); } SYS_SWAPOFF = 424 // { int swapoff(const char *name); } SYS___ACL_GET_LINK = 425 // { int __acl_get_link(const char *path, acl_type_t type, struct acl *aclp); } @@ -270,10 +288,10 @@ const ( SYS_THR_KILL = 433 // { int thr_kill(long id, int sig); } SYS_JAIL_ATTACH = 436 // { int jail_attach(int jid); } SYS_EXTATTR_LIST_FD = 437 // { ssize_t extattr_list_fd(int fd, int attrnamespace, void *data, size_t nbytes); } - SYS_EXTATTR_LIST_FILE = 438 // { ssize_t extattr_list_file(const char *path, int attrnamespace, void *data, size_t nbytes); } - SYS_EXTATTR_LIST_LINK = 439 // { ssize_t extattr_list_link(const char *path, int attrnamespace, void *data, size_t nbytes); } + SYS_EXTATTR_LIST_FILE = 438 // { ssize_t extattr_list_file( const char *path, int attrnamespace, void *data, size_t nbytes); } + SYS_EXTATTR_LIST_LINK = 439 // { ssize_t extattr_list_link( const char *path, int attrnamespace, void *data, size_t nbytes); } SYS_KSEM_TIMEDWAIT = 441 // { int ksem_timedwait(semid_t id, const struct timespec *abstime); } - SYS_THR_SUSPEND = 442 // { int thr_suspend(const struct timespec *timeout); } + SYS_THR_SUSPEND = 442 // { int thr_suspend( const struct timespec *timeout); } SYS_THR_WAKE = 443 // { int thr_wake(long id); } SYS_KLDUNLOADF = 444 // { int kldunloadf(int fileid, int flags); } SYS_AUDIT = 445 // { int audit(const void *record, u_int length); } @@ -282,17 +300,17 @@ const ( SYS_SETAUID = 448 // { int setauid(uid_t *auid); } SYS_GETAUDIT = 449 // { int getaudit(struct auditinfo *auditinfo); } SYS_SETAUDIT = 450 // { int setaudit(struct auditinfo *auditinfo); } - SYS_GETAUDIT_ADDR = 451 // { int getaudit_addr(struct auditinfo_addr *auditinfo_addr, u_int length); } - SYS_SETAUDIT_ADDR = 452 // { int setaudit_addr(struct auditinfo_addr *auditinfo_addr, u_int length); } + SYS_GETAUDIT_ADDR = 451 // { int getaudit_addr( struct auditinfo_addr *auditinfo_addr, u_int length); } + SYS_SETAUDIT_ADDR = 452 // { int setaudit_addr( struct auditinfo_addr *auditinfo_addr, u_int length); } SYS_AUDITCTL = 453 // { int auditctl(char *path); } SYS__UMTX_OP = 454 // { int _umtx_op(void *obj, int op, u_long val, void *uaddr1, void *uaddr2); } SYS_THR_NEW = 455 // { int thr_new(struct thr_param *param, int param_size); } SYS_SIGQUEUE = 456 // { int sigqueue(pid_t pid, int signum, void *value); } SYS_KMQ_OPEN = 457 // { int kmq_open(const char *path, int flags, mode_t mode, const struct mq_attr *attr); } - SYS_KMQ_SETATTR = 458 // { int kmq_setattr(int mqd, const struct mq_attr *attr, struct mq_attr *oattr); } - SYS_KMQ_TIMEDRECEIVE = 459 // { int kmq_timedreceive(int mqd, char *msg_ptr, size_t msg_len, unsigned *msg_prio, const struct timespec *abs_timeout); } - SYS_KMQ_TIMEDSEND = 460 // { int kmq_timedsend(int mqd, const char *msg_ptr, size_t msg_len, unsigned msg_prio, const struct timespec *abs_timeout); } - SYS_KMQ_NOTIFY = 461 // { int kmq_notify(int mqd, const struct sigevent *sigev); } + SYS_KMQ_SETATTR = 458 // { int kmq_setattr(int mqd, const struct mq_attr *attr, struct mq_attr *oattr); } + SYS_KMQ_TIMEDRECEIVE = 459 // { int kmq_timedreceive(int mqd, char *msg_ptr, size_t msg_len, unsigned *msg_prio, const struct timespec *abs_timeout); } + SYS_KMQ_TIMEDSEND = 460 // { int kmq_timedsend(int mqd, const char *msg_ptr, size_t msg_len,unsigned msg_prio, const struct timespec *abs_timeout);} + SYS_KMQ_NOTIFY = 461 // { int kmq_notify(int mqd, const struct sigevent *sigev); } SYS_KMQ_UNLINK = 462 // { int kmq_unlink(const char *path); } SYS_ABORT2 = 463 // { int abort2(const char *why, int nargs, void **args); } SYS_THR_SET_NAME = 464 // { int thr_set_name(long id, const char *name); } @@ -301,7 +319,7 @@ const ( SYS_SCTP_PEELOFF = 471 // { int sctp_peeloff(int sd, uint32_t name); } SYS_SCTP_GENERIC_SENDMSG = 472 // { int sctp_generic_sendmsg(int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags); } SYS_SCTP_GENERIC_SENDMSG_IOV = 473 // { int sctp_generic_sendmsg_iov(int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags); } - SYS_SCTP_GENERIC_RECVMSG = 474 // { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags); } + SYS_SCTP_GENERIC_RECVMSG = 474 // { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, struct sockaddr * from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags); } SYS_PREAD = 475 // { ssize_t pread(int fd, void *buf, size_t nbyte, off_t offset); } SYS_PWRITE = 476 // { ssize_t pwrite(int fd, const void *buf, size_t nbyte, off_t offset); } SYS_MMAP = 477 // { caddr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); } @@ -320,12 +338,14 @@ const ( SYS_FCHMODAT = 490 // { int fchmodat(int fd, char *path, mode_t mode, int flag); } SYS_FCHOWNAT = 491 // { int fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag); } SYS_FEXECVE = 492 // { int fexecve(int fd, char **argv, char **envv); } + SYS_FSTATAT = 493 // { int fstatat(int fd, char *path, struct stat *buf, int flag); } SYS_FUTIMESAT = 494 // { int futimesat(int fd, char *path, struct timeval *times); } SYS_LINKAT = 495 // { int linkat(int fd1, char *path1, int fd2, char *path2, int flag); } SYS_MKDIRAT = 496 // { int mkdirat(int fd, char *path, mode_t mode); } SYS_MKFIFOAT = 497 // { int mkfifoat(int fd, char *path, mode_t mode); } + SYS_MKNODAT = 498 // { int mknodat(int fd, char *path, mode_t mode, dev_t dev); } SYS_OPENAT = 499 // { int openat(int fd, char *path, int flag, mode_t mode); } - SYS_READLINKAT = 500 // { ssize_t readlinkat(int fd, char *path, char *buf, size_t bufsize); } + SYS_READLINKAT = 500 // { int readlinkat(int fd, char *path, char *buf, size_t bufsize); } SYS_RENAMEAT = 501 // { int renameat(int oldfd, char *old, int newfd, char *new); } SYS_SYMLINKAT = 502 // { int symlinkat(char *path1, int fd, char *path2); } SYS_UNLINKAT = 503 // { int unlinkat(int fd, char *path, int flag); } @@ -371,24 +391,7 @@ const ( SYS_PPOLL = 545 // { int ppoll(struct pollfd *fds, u_int nfds, const struct timespec *ts, const sigset_t *set); } SYS_FUTIMENS = 546 // { int futimens(int fd, struct timespec *times); } SYS_UTIMENSAT = 547 // { int utimensat(int fd, char *path, struct timespec *times, int flag); } + SYS_NUMA_GETAFFINITY = 548 // { int numa_getaffinity(cpuwhich_t which, id_t id, struct vm_domain_policy_entry *policy); } + SYS_NUMA_SETAFFINITY = 549 // { int numa_setaffinity(cpuwhich_t which, id_t id, const struct vm_domain_policy_entry *policy); } SYS_FDATASYNC = 550 // { int fdatasync(int fd); } - SYS_FSTAT = 551 // { int fstat(int fd, struct stat *sb); } - SYS_FSTATAT = 552 // { int fstatat(int fd, char *path, struct stat *buf, int flag); } - SYS_FHSTAT = 553 // { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } - SYS_GETDIRENTRIES = 554 // { ssize_t getdirentries(int fd, char *buf, size_t count, off_t *basep); } - SYS_STATFS = 555 // { int statfs(char *path, struct statfs *buf); } - SYS_FSTATFS = 556 // { int fstatfs(int fd, struct statfs *buf); } - SYS_GETFSSTAT = 557 // { int getfsstat(struct statfs *buf, long bufsize, int mode); } - SYS_FHSTATFS = 558 // { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } - SYS_MKNODAT = 559 // { int mknodat(int fd, char *path, mode_t mode, dev_t dev); } - SYS_KEVENT = 560 // { int kevent(int fd, struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } - SYS_CPUSET_GETDOMAIN = 561 // { int cpuset_getdomain(cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *mask, int *policy); } - SYS_CPUSET_SETDOMAIN = 562 // { int cpuset_setdomain(cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *mask, int policy); } - SYS_GETRANDOM = 563 // { int getrandom(void *buf, size_t buflen, unsigned int flags); } - SYS_GETFHAT = 564 // { int getfhat(int fd, char *path, struct fhandle *fhp, int flags); } - SYS_FHLINK = 565 // { int fhlink(struct fhandle *fhp, const char *to); } - SYS_FHLINKAT = 566 // { int fhlinkat(struct fhandle *fhp, int tofd, const char *to,); } - SYS_FHREADLINK = 567 // { int fhreadlink(struct fhandle *fhp, char *buf, size_t bufsize); } - SYS___SYSCTLBYNAME = 570 // { int __sysctlbyname(const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } - SYS_CLOSE_RANGE = 575 // { int close_range(u_int lowfd, u_int highfd, int flags); } ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go index 01636b8..342d471 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_amd64.go @@ -1,4 +1,4 @@ -// go run mksysnum.go https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12 +// go run mksysnum.go https://svn.freebsd.org/base/stable/11/sys/kern/syscalls.master // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && freebsd @@ -19,9 +19,10 @@ const ( SYS_UNLINK = 10 // { int unlink(char *path); } SYS_CHDIR = 12 // { int chdir(char *path); } SYS_FCHDIR = 13 // { int fchdir(int fd); } + SYS_MKNOD = 14 // { int mknod(char *path, int mode, int dev); } SYS_CHMOD = 15 // { int chmod(char *path, int mode); } SYS_CHOWN = 16 // { int chown(char *path, int uid, int gid); } - SYS_BREAK = 17 // { caddr_t break(char *nsize); } + SYS_OBREAK = 17 // { int obreak(char *nsize); } break obreak_args int SYS_GETPID = 20 // { pid_t getpid(void); } SYS_MOUNT = 21 // { int mount(char *type, char *path, int flags, caddr_t data); } SYS_UNMOUNT = 22 // { int unmount(char *path, int flags); } @@ -42,6 +43,7 @@ const ( SYS_KILL = 37 // { int kill(int pid, int signum); } SYS_GETPPID = 39 // { pid_t getppid(void); } SYS_DUP = 41 // { int dup(u_int fd); } + SYS_PIPE = 42 // { int pipe(void); } SYS_GETEGID = 43 // { gid_t getegid(void); } SYS_PROFIL = 44 // { int profil(caddr_t samples, size_t size, size_t offset, u_int scale); } SYS_KTRACE = 45 // { int ktrace(const char *fname, int ops, int facs, int pid); } @@ -56,14 +58,15 @@ const ( SYS_SYMLINK = 57 // { int symlink(char *path, char *link); } SYS_READLINK = 58 // { ssize_t readlink(char *path, char *buf, size_t count); } SYS_EXECVE = 59 // { int execve(char *fname, char **argv, char **envv); } - SYS_UMASK = 60 // { int umask(int newmask); } + SYS_UMASK = 60 // { int umask(int newmask); } umask umask_args int SYS_CHROOT = 61 // { int chroot(char *path); } SYS_MSYNC = 65 // { int msync(void *addr, size_t len, int flags); } SYS_VFORK = 66 // { int vfork(void); } SYS_SBRK = 69 // { int sbrk(int incr); } SYS_SSTK = 70 // { int sstk(int incr); } + SYS_OVADVISE = 72 // { int ovadvise(int anom); } vadvise ovadvise_args int SYS_MUNMAP = 73 // { int munmap(void *addr, size_t len); } - SYS_MPROTECT = 74 // { int mprotect(void *addr, size_t len, int prot); } + SYS_MPROTECT = 74 // { int mprotect(const void *addr, size_t len, int prot); } SYS_MADVISE = 75 // { int madvise(void *addr, size_t len, int behav); } SYS_MINCORE = 78 // { int mincore(const void *addr, size_t len, char *vec); } SYS_GETGROUPS = 79 // { int getgroups(u_int gidsetsize, gid_t *gidset); } @@ -121,10 +124,14 @@ const ( SYS_SETGID = 181 // { int setgid(gid_t gid); } SYS_SETEGID = 182 // { int setegid(gid_t egid); } SYS_SETEUID = 183 // { int seteuid(uid_t euid); } + SYS_STAT = 188 // { int stat(char *path, struct stat *ub); } + SYS_FSTAT = 189 // { int fstat(int fd, struct stat *sb); } + SYS_LSTAT = 190 // { int lstat(char *path, struct stat *ub); } SYS_PATHCONF = 191 // { int pathconf(char *path, int name); } SYS_FPATHCONF = 192 // { int fpathconf(int fd, int name); } SYS_GETRLIMIT = 194 // { int getrlimit(u_int which, struct rlimit *rlp); } getrlimit __getrlimit_args int SYS_SETRLIMIT = 195 // { int setrlimit(u_int which, struct rlimit *rlp); } setrlimit __setrlimit_args int + SYS_GETDIRENTRIES = 196 // { int getdirentries(int fd, char *buf, u_int count, long *basep); } SYS___SYSCTL = 202 // { int __sysctl(int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } __sysctl sysctl_args int SYS_MLOCK = 203 // { int mlock(const void *addr, size_t len); } SYS_MUNLOCK = 204 // { int munlock(const void *addr, size_t len); } @@ -136,12 +143,12 @@ const ( SYS_SEMOP = 222 // { int semop(int semid, struct sembuf *sops, size_t nsops); } SYS_MSGGET = 225 // { int msgget(key_t key, int msgflg); } SYS_MSGSND = 226 // { int msgsnd(int msqid, const void *msgp, size_t msgsz, int msgflg); } - SYS_MSGRCV = 227 // { ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } + SYS_MSGRCV = 227 // { int msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } SYS_SHMAT = 228 // { int shmat(int shmid, const void *shmaddr, int shmflg); } SYS_SHMDT = 230 // { int shmdt(const void *shmaddr); } SYS_SHMGET = 231 // { int shmget(key_t key, size_t size, int shmflg); } SYS_CLOCK_GETTIME = 232 // { int clock_gettime(clockid_t clock_id, struct timespec *tp); } - SYS_CLOCK_SETTIME = 233 // { int clock_settime(clockid_t clock_id, const struct timespec *tp); } + SYS_CLOCK_SETTIME = 233 // { int clock_settime( clockid_t clock_id, const struct timespec *tp); } SYS_CLOCK_GETRES = 234 // { int clock_getres(clockid_t clock_id, struct timespec *tp); } SYS_KTIMER_CREATE = 235 // { int ktimer_create(clockid_t clock_id, struct sigevent *evp, int *timerid); } SYS_KTIMER_DELETE = 236 // { int ktimer_delete(int timerid); } @@ -150,44 +157,50 @@ const ( SYS_KTIMER_GETOVERRUN = 239 // { int ktimer_getoverrun(int timerid); } SYS_NANOSLEEP = 240 // { int nanosleep(const struct timespec *rqtp, struct timespec *rmtp); } SYS_FFCLOCK_GETCOUNTER = 241 // { int ffclock_getcounter(ffcounter *ffcount); } - SYS_FFCLOCK_SETESTIMATE = 242 // { int ffclock_setestimate(struct ffclock_estimate *cest); } - SYS_FFCLOCK_GETESTIMATE = 243 // { int ffclock_getestimate(struct ffclock_estimate *cest); } + SYS_FFCLOCK_SETESTIMATE = 242 // { int ffclock_setestimate( struct ffclock_estimate *cest); } + SYS_FFCLOCK_GETESTIMATE = 243 // { int ffclock_getestimate( struct ffclock_estimate *cest); } SYS_CLOCK_NANOSLEEP = 244 // { int clock_nanosleep(clockid_t clock_id, int flags, const struct timespec *rqtp, struct timespec *rmtp); } - SYS_CLOCK_GETCPUCLOCKID2 = 247 // { int clock_getcpuclockid2(id_t id, int which, clockid_t *clock_id); } + SYS_CLOCK_GETCPUCLOCKID2 = 247 // { int clock_getcpuclockid2(id_t id,int which, clockid_t *clock_id); } SYS_NTP_GETTIME = 248 // { int ntp_gettime(struct ntptimeval *ntvp); } SYS_MINHERIT = 250 // { int minherit(void *addr, size_t len, int inherit); } SYS_RFORK = 251 // { int rfork(int flags); } + SYS_OPENBSD_POLL = 252 // { int openbsd_poll(struct pollfd *fds, u_int nfds, int timeout); } SYS_ISSETUGID = 253 // { int issetugid(void); } SYS_LCHOWN = 254 // { int lchown(char *path, int uid, int gid); } SYS_AIO_READ = 255 // { int aio_read(struct aiocb *aiocbp); } SYS_AIO_WRITE = 256 // { int aio_write(struct aiocb *aiocbp); } - SYS_LIO_LISTIO = 257 // { int lio_listio(int mode, struct aiocb* const *acb_list, int nent, struct sigevent *sig); } + SYS_LIO_LISTIO = 257 // { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } + SYS_GETDENTS = 272 // { int getdents(int fd, char *buf, size_t count); } SYS_LCHMOD = 274 // { int lchmod(char *path, mode_t mode); } SYS_LUTIMES = 276 // { int lutimes(char *path, struct timeval *tptr); } + SYS_NSTAT = 278 // { int nstat(char *path, struct nstat *ub); } + SYS_NFSTAT = 279 // { int nfstat(int fd, struct nstat *sb); } + SYS_NLSTAT = 280 // { int nlstat(char *path, struct nstat *ub); } SYS_PREADV = 289 // { ssize_t preadv(int fd, struct iovec *iovp, u_int iovcnt, off_t offset); } SYS_PWRITEV = 290 // { ssize_t pwritev(int fd, struct iovec *iovp, u_int iovcnt, off_t offset); } SYS_FHOPEN = 298 // { int fhopen(const struct fhandle *u_fhp, int flags); } + SYS_FHSTAT = 299 // { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } SYS_MODNEXT = 300 // { int modnext(int modid); } - SYS_MODSTAT = 301 // { int modstat(int modid, struct module_stat* stat); } + SYS_MODSTAT = 301 // { int modstat(int modid, struct module_stat *stat); } SYS_MODFNEXT = 302 // { int modfnext(int modid); } SYS_MODFIND = 303 // { int modfind(const char *name); } SYS_KLDLOAD = 304 // { int kldload(const char *file); } SYS_KLDUNLOAD = 305 // { int kldunload(int fileid); } SYS_KLDFIND = 306 // { int kldfind(const char *file); } SYS_KLDNEXT = 307 // { int kldnext(int fileid); } - SYS_KLDSTAT = 308 // { int kldstat(int fileid, struct kld_file_stat *stat); } + SYS_KLDSTAT = 308 // { int kldstat(int fileid, struct kld_file_stat* stat); } SYS_KLDFIRSTMOD = 309 // { int kldfirstmod(int fileid); } SYS_GETSID = 310 // { int getsid(pid_t pid); } SYS_SETRESUID = 311 // { int setresuid(uid_t ruid, uid_t euid, uid_t suid); } SYS_SETRESGID = 312 // { int setresgid(gid_t rgid, gid_t egid, gid_t sgid); } SYS_AIO_RETURN = 314 // { ssize_t aio_return(struct aiocb *aiocbp); } - SYS_AIO_SUSPEND = 315 // { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } + SYS_AIO_SUSPEND = 315 // { int aio_suspend( struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } SYS_AIO_CANCEL = 316 // { int aio_cancel(int fd, struct aiocb *aiocbp); } SYS_AIO_ERROR = 317 // { int aio_error(struct aiocb *aiocbp); } SYS_YIELD = 321 // { int yield(void); } SYS_MLOCKALL = 324 // { int mlockall(int how); } SYS_MUNLOCKALL = 325 // { int munlockall(void); } - SYS___GETCWD = 326 // { int __getcwd(char *buf, size_t buflen); } + SYS___GETCWD = 326 // { int __getcwd(char *buf, u_int buflen); } SYS_SCHED_SETPARAM = 327 // { int sched_setparam (pid_t pid, const struct sched_param *param); } SYS_SCHED_GETPARAM = 328 // { int sched_getparam (pid_t pid, struct sched_param *param); } SYS_SCHED_SETSCHEDULER = 329 // { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } @@ -213,13 +226,14 @@ const ( SYS___ACL_ACLCHECK_FILE = 353 // { int __acl_aclcheck_file(const char *path, acl_type_t type, struct acl *aclp); } SYS___ACL_ACLCHECK_FD = 354 // { int __acl_aclcheck_fd(int filedes, acl_type_t type, struct acl *aclp); } SYS_EXTATTRCTL = 355 // { int extattrctl(const char *path, int cmd, const char *filename, int attrnamespace, const char *attrname); } - SYS_EXTATTR_SET_FILE = 356 // { ssize_t extattr_set_file(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_GET_FILE = 357 // { ssize_t extattr_get_file(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_SET_FILE = 356 // { ssize_t extattr_set_file( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_GET_FILE = 357 // { ssize_t extattr_get_file( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_DELETE_FILE = 358 // { int extattr_delete_file(const char *path, int attrnamespace, const char *attrname); } - SYS_AIO_WAITCOMPLETE = 359 // { ssize_t aio_waitcomplete(struct aiocb **aiocbp, struct timespec *timeout); } + SYS_AIO_WAITCOMPLETE = 359 // { ssize_t aio_waitcomplete( struct aiocb **aiocbp, struct timespec *timeout); } SYS_GETRESUID = 360 // { int getresuid(uid_t *ruid, uid_t *euid, uid_t *suid); } SYS_GETRESGID = 361 // { int getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid); } SYS_KQUEUE = 362 // { int kqueue(void); } + SYS_KEVENT = 363 // { int kevent(int fd, struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } SYS_EXTATTR_SET_FD = 371 // { ssize_t extattr_set_fd(int fd, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_GET_FD = 372 // { ssize_t extattr_get_fd(int fd, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_DELETE_FD = 373 // { int extattr_delete_fd(int fd, int attrnamespace, const char *attrname); } @@ -237,6 +251,10 @@ const ( SYS_UUIDGEN = 392 // { int uuidgen(struct uuid *store, int count); } SYS_SENDFILE = 393 // { int sendfile(int fd, int s, off_t offset, size_t nbytes, struct sf_hdtr *hdtr, off_t *sbytes, int flags); } SYS_MAC_SYSCALL = 394 // { int mac_syscall(const char *policy, int call, void *arg); } + SYS_GETFSSTAT = 395 // { int getfsstat(struct statfs *buf, long bufsize, int mode); } + SYS_STATFS = 396 // { int statfs(char *path, struct statfs *buf); } + SYS_FSTATFS = 397 // { int fstatfs(int fd, struct statfs *buf); } + SYS_FHSTATFS = 398 // { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } SYS_KSEM_CLOSE = 400 // { int ksem_close(semid_t id); } SYS_KSEM_POST = 401 // { int ksem_post(semid_t id); } SYS_KSEM_WAIT = 402 // { int ksem_wait(semid_t id); } @@ -249,14 +267,14 @@ const ( SYS___MAC_GET_PID = 409 // { int __mac_get_pid(pid_t pid, struct mac *mac_p); } SYS___MAC_GET_LINK = 410 // { int __mac_get_link(const char *path_p, struct mac *mac_p); } SYS___MAC_SET_LINK = 411 // { int __mac_set_link(const char *path_p, struct mac *mac_p); } - SYS_EXTATTR_SET_LINK = 412 // { ssize_t extattr_set_link(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_GET_LINK = 413 // { ssize_t extattr_get_link(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_DELETE_LINK = 414 // { int extattr_delete_link(const char *path, int attrnamespace, const char *attrname); } + SYS_EXTATTR_SET_LINK = 412 // { ssize_t extattr_set_link( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_GET_LINK = 413 // { ssize_t extattr_get_link( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_DELETE_LINK = 414 // { int extattr_delete_link( const char *path, int attrnamespace, const char *attrname); } SYS___MAC_EXECVE = 415 // { int __mac_execve(char *fname, char **argv, char **envv, struct mac *mac_p); } SYS_SIGACTION = 416 // { int sigaction(int sig, const struct sigaction *act, struct sigaction *oact); } - SYS_SIGRETURN = 417 // { int sigreturn(const struct __ucontext *sigcntxp); } + SYS_SIGRETURN = 417 // { int sigreturn( const struct __ucontext *sigcntxp); } SYS_GETCONTEXT = 421 // { int getcontext(struct __ucontext *ucp); } - SYS_SETCONTEXT = 422 // { int setcontext(const struct __ucontext *ucp); } + SYS_SETCONTEXT = 422 // { int setcontext( const struct __ucontext *ucp); } SYS_SWAPCONTEXT = 423 // { int swapcontext(struct __ucontext *oucp, const struct __ucontext *ucp); } SYS_SWAPOFF = 424 // { int swapoff(const char *name); } SYS___ACL_GET_LINK = 425 // { int __acl_get_link(const char *path, acl_type_t type, struct acl *aclp); } @@ -270,10 +288,10 @@ const ( SYS_THR_KILL = 433 // { int thr_kill(long id, int sig); } SYS_JAIL_ATTACH = 436 // { int jail_attach(int jid); } SYS_EXTATTR_LIST_FD = 437 // { ssize_t extattr_list_fd(int fd, int attrnamespace, void *data, size_t nbytes); } - SYS_EXTATTR_LIST_FILE = 438 // { ssize_t extattr_list_file(const char *path, int attrnamespace, void *data, size_t nbytes); } - SYS_EXTATTR_LIST_LINK = 439 // { ssize_t extattr_list_link(const char *path, int attrnamespace, void *data, size_t nbytes); } + SYS_EXTATTR_LIST_FILE = 438 // { ssize_t extattr_list_file( const char *path, int attrnamespace, void *data, size_t nbytes); } + SYS_EXTATTR_LIST_LINK = 439 // { ssize_t extattr_list_link( const char *path, int attrnamespace, void *data, size_t nbytes); } SYS_KSEM_TIMEDWAIT = 441 // { int ksem_timedwait(semid_t id, const struct timespec *abstime); } - SYS_THR_SUSPEND = 442 // { int thr_suspend(const struct timespec *timeout); } + SYS_THR_SUSPEND = 442 // { int thr_suspend( const struct timespec *timeout); } SYS_THR_WAKE = 443 // { int thr_wake(long id); } SYS_KLDUNLOADF = 444 // { int kldunloadf(int fileid, int flags); } SYS_AUDIT = 445 // { int audit(const void *record, u_int length); } @@ -282,17 +300,17 @@ const ( SYS_SETAUID = 448 // { int setauid(uid_t *auid); } SYS_GETAUDIT = 449 // { int getaudit(struct auditinfo *auditinfo); } SYS_SETAUDIT = 450 // { int setaudit(struct auditinfo *auditinfo); } - SYS_GETAUDIT_ADDR = 451 // { int getaudit_addr(struct auditinfo_addr *auditinfo_addr, u_int length); } - SYS_SETAUDIT_ADDR = 452 // { int setaudit_addr(struct auditinfo_addr *auditinfo_addr, u_int length); } + SYS_GETAUDIT_ADDR = 451 // { int getaudit_addr( struct auditinfo_addr *auditinfo_addr, u_int length); } + SYS_SETAUDIT_ADDR = 452 // { int setaudit_addr( struct auditinfo_addr *auditinfo_addr, u_int length); } SYS_AUDITCTL = 453 // { int auditctl(char *path); } SYS__UMTX_OP = 454 // { int _umtx_op(void *obj, int op, u_long val, void *uaddr1, void *uaddr2); } SYS_THR_NEW = 455 // { int thr_new(struct thr_param *param, int param_size); } SYS_SIGQUEUE = 456 // { int sigqueue(pid_t pid, int signum, void *value); } SYS_KMQ_OPEN = 457 // { int kmq_open(const char *path, int flags, mode_t mode, const struct mq_attr *attr); } - SYS_KMQ_SETATTR = 458 // { int kmq_setattr(int mqd, const struct mq_attr *attr, struct mq_attr *oattr); } - SYS_KMQ_TIMEDRECEIVE = 459 // { int kmq_timedreceive(int mqd, char *msg_ptr, size_t msg_len, unsigned *msg_prio, const struct timespec *abs_timeout); } - SYS_KMQ_TIMEDSEND = 460 // { int kmq_timedsend(int mqd, const char *msg_ptr, size_t msg_len, unsigned msg_prio, const struct timespec *abs_timeout); } - SYS_KMQ_NOTIFY = 461 // { int kmq_notify(int mqd, const struct sigevent *sigev); } + SYS_KMQ_SETATTR = 458 // { int kmq_setattr(int mqd, const struct mq_attr *attr, struct mq_attr *oattr); } + SYS_KMQ_TIMEDRECEIVE = 459 // { int kmq_timedreceive(int mqd, char *msg_ptr, size_t msg_len, unsigned *msg_prio, const struct timespec *abs_timeout); } + SYS_KMQ_TIMEDSEND = 460 // { int kmq_timedsend(int mqd, const char *msg_ptr, size_t msg_len,unsigned msg_prio, const struct timespec *abs_timeout);} + SYS_KMQ_NOTIFY = 461 // { int kmq_notify(int mqd, const struct sigevent *sigev); } SYS_KMQ_UNLINK = 462 // { int kmq_unlink(const char *path); } SYS_ABORT2 = 463 // { int abort2(const char *why, int nargs, void **args); } SYS_THR_SET_NAME = 464 // { int thr_set_name(long id, const char *name); } @@ -301,7 +319,7 @@ const ( SYS_SCTP_PEELOFF = 471 // { int sctp_peeloff(int sd, uint32_t name); } SYS_SCTP_GENERIC_SENDMSG = 472 // { int sctp_generic_sendmsg(int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags); } SYS_SCTP_GENERIC_SENDMSG_IOV = 473 // { int sctp_generic_sendmsg_iov(int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags); } - SYS_SCTP_GENERIC_RECVMSG = 474 // { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags); } + SYS_SCTP_GENERIC_RECVMSG = 474 // { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, struct sockaddr * from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags); } SYS_PREAD = 475 // { ssize_t pread(int fd, void *buf, size_t nbyte, off_t offset); } SYS_PWRITE = 476 // { ssize_t pwrite(int fd, const void *buf, size_t nbyte, off_t offset); } SYS_MMAP = 477 // { caddr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); } @@ -320,12 +338,14 @@ const ( SYS_FCHMODAT = 490 // { int fchmodat(int fd, char *path, mode_t mode, int flag); } SYS_FCHOWNAT = 491 // { int fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag); } SYS_FEXECVE = 492 // { int fexecve(int fd, char **argv, char **envv); } + SYS_FSTATAT = 493 // { int fstatat(int fd, char *path, struct stat *buf, int flag); } SYS_FUTIMESAT = 494 // { int futimesat(int fd, char *path, struct timeval *times); } SYS_LINKAT = 495 // { int linkat(int fd1, char *path1, int fd2, char *path2, int flag); } SYS_MKDIRAT = 496 // { int mkdirat(int fd, char *path, mode_t mode); } SYS_MKFIFOAT = 497 // { int mkfifoat(int fd, char *path, mode_t mode); } + SYS_MKNODAT = 498 // { int mknodat(int fd, char *path, mode_t mode, dev_t dev); } SYS_OPENAT = 499 // { int openat(int fd, char *path, int flag, mode_t mode); } - SYS_READLINKAT = 500 // { ssize_t readlinkat(int fd, char *path, char *buf, size_t bufsize); } + SYS_READLINKAT = 500 // { int readlinkat(int fd, char *path, char *buf, size_t bufsize); } SYS_RENAMEAT = 501 // { int renameat(int oldfd, char *old, int newfd, char *new); } SYS_SYMLINKAT = 502 // { int symlinkat(char *path1, int fd, char *path2); } SYS_UNLINKAT = 503 // { int unlinkat(int fd, char *path, int flag); } @@ -371,24 +391,7 @@ const ( SYS_PPOLL = 545 // { int ppoll(struct pollfd *fds, u_int nfds, const struct timespec *ts, const sigset_t *set); } SYS_FUTIMENS = 546 // { int futimens(int fd, struct timespec *times); } SYS_UTIMENSAT = 547 // { int utimensat(int fd, char *path, struct timespec *times, int flag); } + SYS_NUMA_GETAFFINITY = 548 // { int numa_getaffinity(cpuwhich_t which, id_t id, struct vm_domain_policy_entry *policy); } + SYS_NUMA_SETAFFINITY = 549 // { int numa_setaffinity(cpuwhich_t which, id_t id, const struct vm_domain_policy_entry *policy); } SYS_FDATASYNC = 550 // { int fdatasync(int fd); } - SYS_FSTAT = 551 // { int fstat(int fd, struct stat *sb); } - SYS_FSTATAT = 552 // { int fstatat(int fd, char *path, struct stat *buf, int flag); } - SYS_FHSTAT = 553 // { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } - SYS_GETDIRENTRIES = 554 // { ssize_t getdirentries(int fd, char *buf, size_t count, off_t *basep); } - SYS_STATFS = 555 // { int statfs(char *path, struct statfs *buf); } - SYS_FSTATFS = 556 // { int fstatfs(int fd, struct statfs *buf); } - SYS_GETFSSTAT = 557 // { int getfsstat(struct statfs *buf, long bufsize, int mode); } - SYS_FHSTATFS = 558 // { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } - SYS_MKNODAT = 559 // { int mknodat(int fd, char *path, mode_t mode, dev_t dev); } - SYS_KEVENT = 560 // { int kevent(int fd, struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } - SYS_CPUSET_GETDOMAIN = 561 // { int cpuset_getdomain(cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *mask, int *policy); } - SYS_CPUSET_SETDOMAIN = 562 // { int cpuset_setdomain(cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *mask, int policy); } - SYS_GETRANDOM = 563 // { int getrandom(void *buf, size_t buflen, unsigned int flags); } - SYS_GETFHAT = 564 // { int getfhat(int fd, char *path, struct fhandle *fhp, int flags); } - SYS_FHLINK = 565 // { int fhlink(struct fhandle *fhp, const char *to); } - SYS_FHLINKAT = 566 // { int fhlinkat(struct fhandle *fhp, int tofd, const char *to,); } - SYS_FHREADLINK = 567 // { int fhreadlink(struct fhandle *fhp, char *buf, size_t bufsize); } - SYS___SYSCTLBYNAME = 570 // { int __sysctlbyname(const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } - SYS_CLOSE_RANGE = 575 // { int close_range(u_int lowfd, u_int highfd, int flags); } ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go index ad99bc1..e2e3d72 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm.go @@ -1,4 +1,4 @@ -// go run mksysnum.go https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12 +// go run mksysnum.go https://svn.freebsd.org/base/stable/11/sys/kern/syscalls.master // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && freebsd @@ -19,9 +19,10 @@ const ( SYS_UNLINK = 10 // { int unlink(char *path); } SYS_CHDIR = 12 // { int chdir(char *path); } SYS_FCHDIR = 13 // { int fchdir(int fd); } + SYS_MKNOD = 14 // { int mknod(char *path, int mode, int dev); } SYS_CHMOD = 15 // { int chmod(char *path, int mode); } SYS_CHOWN = 16 // { int chown(char *path, int uid, int gid); } - SYS_BREAK = 17 // { caddr_t break(char *nsize); } + SYS_OBREAK = 17 // { int obreak(char *nsize); } break obreak_args int SYS_GETPID = 20 // { pid_t getpid(void); } SYS_MOUNT = 21 // { int mount(char *type, char *path, int flags, caddr_t data); } SYS_UNMOUNT = 22 // { int unmount(char *path, int flags); } @@ -42,6 +43,7 @@ const ( SYS_KILL = 37 // { int kill(int pid, int signum); } SYS_GETPPID = 39 // { pid_t getppid(void); } SYS_DUP = 41 // { int dup(u_int fd); } + SYS_PIPE = 42 // { int pipe(void); } SYS_GETEGID = 43 // { gid_t getegid(void); } SYS_PROFIL = 44 // { int profil(caddr_t samples, size_t size, size_t offset, u_int scale); } SYS_KTRACE = 45 // { int ktrace(const char *fname, int ops, int facs, int pid); } @@ -56,14 +58,15 @@ const ( SYS_SYMLINK = 57 // { int symlink(char *path, char *link); } SYS_READLINK = 58 // { ssize_t readlink(char *path, char *buf, size_t count); } SYS_EXECVE = 59 // { int execve(char *fname, char **argv, char **envv); } - SYS_UMASK = 60 // { int umask(int newmask); } + SYS_UMASK = 60 // { int umask(int newmask); } umask umask_args int SYS_CHROOT = 61 // { int chroot(char *path); } SYS_MSYNC = 65 // { int msync(void *addr, size_t len, int flags); } SYS_VFORK = 66 // { int vfork(void); } SYS_SBRK = 69 // { int sbrk(int incr); } SYS_SSTK = 70 // { int sstk(int incr); } + SYS_OVADVISE = 72 // { int ovadvise(int anom); } vadvise ovadvise_args int SYS_MUNMAP = 73 // { int munmap(void *addr, size_t len); } - SYS_MPROTECT = 74 // { int mprotect(void *addr, size_t len, int prot); } + SYS_MPROTECT = 74 // { int mprotect(const void *addr, size_t len, int prot); } SYS_MADVISE = 75 // { int madvise(void *addr, size_t len, int behav); } SYS_MINCORE = 78 // { int mincore(const void *addr, size_t len, char *vec); } SYS_GETGROUPS = 79 // { int getgroups(u_int gidsetsize, gid_t *gidset); } @@ -121,10 +124,14 @@ const ( SYS_SETGID = 181 // { int setgid(gid_t gid); } SYS_SETEGID = 182 // { int setegid(gid_t egid); } SYS_SETEUID = 183 // { int seteuid(uid_t euid); } + SYS_STAT = 188 // { int stat(char *path, struct stat *ub); } + SYS_FSTAT = 189 // { int fstat(int fd, struct stat *sb); } + SYS_LSTAT = 190 // { int lstat(char *path, struct stat *ub); } SYS_PATHCONF = 191 // { int pathconf(char *path, int name); } SYS_FPATHCONF = 192 // { int fpathconf(int fd, int name); } SYS_GETRLIMIT = 194 // { int getrlimit(u_int which, struct rlimit *rlp); } getrlimit __getrlimit_args int SYS_SETRLIMIT = 195 // { int setrlimit(u_int which, struct rlimit *rlp); } setrlimit __setrlimit_args int + SYS_GETDIRENTRIES = 196 // { int getdirentries(int fd, char *buf, u_int count, long *basep); } SYS___SYSCTL = 202 // { int __sysctl(int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } __sysctl sysctl_args int SYS_MLOCK = 203 // { int mlock(const void *addr, size_t len); } SYS_MUNLOCK = 204 // { int munlock(const void *addr, size_t len); } @@ -136,12 +143,12 @@ const ( SYS_SEMOP = 222 // { int semop(int semid, struct sembuf *sops, size_t nsops); } SYS_MSGGET = 225 // { int msgget(key_t key, int msgflg); } SYS_MSGSND = 226 // { int msgsnd(int msqid, const void *msgp, size_t msgsz, int msgflg); } - SYS_MSGRCV = 227 // { ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } + SYS_MSGRCV = 227 // { int msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } SYS_SHMAT = 228 // { int shmat(int shmid, const void *shmaddr, int shmflg); } SYS_SHMDT = 230 // { int shmdt(const void *shmaddr); } SYS_SHMGET = 231 // { int shmget(key_t key, size_t size, int shmflg); } SYS_CLOCK_GETTIME = 232 // { int clock_gettime(clockid_t clock_id, struct timespec *tp); } - SYS_CLOCK_SETTIME = 233 // { int clock_settime(clockid_t clock_id, const struct timespec *tp); } + SYS_CLOCK_SETTIME = 233 // { int clock_settime( clockid_t clock_id, const struct timespec *tp); } SYS_CLOCK_GETRES = 234 // { int clock_getres(clockid_t clock_id, struct timespec *tp); } SYS_KTIMER_CREATE = 235 // { int ktimer_create(clockid_t clock_id, struct sigevent *evp, int *timerid); } SYS_KTIMER_DELETE = 236 // { int ktimer_delete(int timerid); } @@ -150,44 +157,50 @@ const ( SYS_KTIMER_GETOVERRUN = 239 // { int ktimer_getoverrun(int timerid); } SYS_NANOSLEEP = 240 // { int nanosleep(const struct timespec *rqtp, struct timespec *rmtp); } SYS_FFCLOCK_GETCOUNTER = 241 // { int ffclock_getcounter(ffcounter *ffcount); } - SYS_FFCLOCK_SETESTIMATE = 242 // { int ffclock_setestimate(struct ffclock_estimate *cest); } - SYS_FFCLOCK_GETESTIMATE = 243 // { int ffclock_getestimate(struct ffclock_estimate *cest); } + SYS_FFCLOCK_SETESTIMATE = 242 // { int ffclock_setestimate( struct ffclock_estimate *cest); } + SYS_FFCLOCK_GETESTIMATE = 243 // { int ffclock_getestimate( struct ffclock_estimate *cest); } SYS_CLOCK_NANOSLEEP = 244 // { int clock_nanosleep(clockid_t clock_id, int flags, const struct timespec *rqtp, struct timespec *rmtp); } - SYS_CLOCK_GETCPUCLOCKID2 = 247 // { int clock_getcpuclockid2(id_t id, int which, clockid_t *clock_id); } + SYS_CLOCK_GETCPUCLOCKID2 = 247 // { int clock_getcpuclockid2(id_t id,int which, clockid_t *clock_id); } SYS_NTP_GETTIME = 248 // { int ntp_gettime(struct ntptimeval *ntvp); } SYS_MINHERIT = 250 // { int minherit(void *addr, size_t len, int inherit); } SYS_RFORK = 251 // { int rfork(int flags); } + SYS_OPENBSD_POLL = 252 // { int openbsd_poll(struct pollfd *fds, u_int nfds, int timeout); } SYS_ISSETUGID = 253 // { int issetugid(void); } SYS_LCHOWN = 254 // { int lchown(char *path, int uid, int gid); } SYS_AIO_READ = 255 // { int aio_read(struct aiocb *aiocbp); } SYS_AIO_WRITE = 256 // { int aio_write(struct aiocb *aiocbp); } - SYS_LIO_LISTIO = 257 // { int lio_listio(int mode, struct aiocb* const *acb_list, int nent, struct sigevent *sig); } + SYS_LIO_LISTIO = 257 // { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } + SYS_GETDENTS = 272 // { int getdents(int fd, char *buf, size_t count); } SYS_LCHMOD = 274 // { int lchmod(char *path, mode_t mode); } SYS_LUTIMES = 276 // { int lutimes(char *path, struct timeval *tptr); } + SYS_NSTAT = 278 // { int nstat(char *path, struct nstat *ub); } + SYS_NFSTAT = 279 // { int nfstat(int fd, struct nstat *sb); } + SYS_NLSTAT = 280 // { int nlstat(char *path, struct nstat *ub); } SYS_PREADV = 289 // { ssize_t preadv(int fd, struct iovec *iovp, u_int iovcnt, off_t offset); } SYS_PWRITEV = 290 // { ssize_t pwritev(int fd, struct iovec *iovp, u_int iovcnt, off_t offset); } SYS_FHOPEN = 298 // { int fhopen(const struct fhandle *u_fhp, int flags); } + SYS_FHSTAT = 299 // { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } SYS_MODNEXT = 300 // { int modnext(int modid); } - SYS_MODSTAT = 301 // { int modstat(int modid, struct module_stat* stat); } + SYS_MODSTAT = 301 // { int modstat(int modid, struct module_stat *stat); } SYS_MODFNEXT = 302 // { int modfnext(int modid); } SYS_MODFIND = 303 // { int modfind(const char *name); } SYS_KLDLOAD = 304 // { int kldload(const char *file); } SYS_KLDUNLOAD = 305 // { int kldunload(int fileid); } SYS_KLDFIND = 306 // { int kldfind(const char *file); } SYS_KLDNEXT = 307 // { int kldnext(int fileid); } - SYS_KLDSTAT = 308 // { int kldstat(int fileid, struct kld_file_stat *stat); } + SYS_KLDSTAT = 308 // { int kldstat(int fileid, struct kld_file_stat* stat); } SYS_KLDFIRSTMOD = 309 // { int kldfirstmod(int fileid); } SYS_GETSID = 310 // { int getsid(pid_t pid); } SYS_SETRESUID = 311 // { int setresuid(uid_t ruid, uid_t euid, uid_t suid); } SYS_SETRESGID = 312 // { int setresgid(gid_t rgid, gid_t egid, gid_t sgid); } SYS_AIO_RETURN = 314 // { ssize_t aio_return(struct aiocb *aiocbp); } - SYS_AIO_SUSPEND = 315 // { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } + SYS_AIO_SUSPEND = 315 // { int aio_suspend( struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } SYS_AIO_CANCEL = 316 // { int aio_cancel(int fd, struct aiocb *aiocbp); } SYS_AIO_ERROR = 317 // { int aio_error(struct aiocb *aiocbp); } SYS_YIELD = 321 // { int yield(void); } SYS_MLOCKALL = 324 // { int mlockall(int how); } SYS_MUNLOCKALL = 325 // { int munlockall(void); } - SYS___GETCWD = 326 // { int __getcwd(char *buf, size_t buflen); } + SYS___GETCWD = 326 // { int __getcwd(char *buf, u_int buflen); } SYS_SCHED_SETPARAM = 327 // { int sched_setparam (pid_t pid, const struct sched_param *param); } SYS_SCHED_GETPARAM = 328 // { int sched_getparam (pid_t pid, struct sched_param *param); } SYS_SCHED_SETSCHEDULER = 329 // { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } @@ -213,13 +226,14 @@ const ( SYS___ACL_ACLCHECK_FILE = 353 // { int __acl_aclcheck_file(const char *path, acl_type_t type, struct acl *aclp); } SYS___ACL_ACLCHECK_FD = 354 // { int __acl_aclcheck_fd(int filedes, acl_type_t type, struct acl *aclp); } SYS_EXTATTRCTL = 355 // { int extattrctl(const char *path, int cmd, const char *filename, int attrnamespace, const char *attrname); } - SYS_EXTATTR_SET_FILE = 356 // { ssize_t extattr_set_file(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_GET_FILE = 357 // { ssize_t extattr_get_file(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_SET_FILE = 356 // { ssize_t extattr_set_file( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_GET_FILE = 357 // { ssize_t extattr_get_file( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_DELETE_FILE = 358 // { int extattr_delete_file(const char *path, int attrnamespace, const char *attrname); } - SYS_AIO_WAITCOMPLETE = 359 // { ssize_t aio_waitcomplete(struct aiocb **aiocbp, struct timespec *timeout); } + SYS_AIO_WAITCOMPLETE = 359 // { ssize_t aio_waitcomplete( struct aiocb **aiocbp, struct timespec *timeout); } SYS_GETRESUID = 360 // { int getresuid(uid_t *ruid, uid_t *euid, uid_t *suid); } SYS_GETRESGID = 361 // { int getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid); } SYS_KQUEUE = 362 // { int kqueue(void); } + SYS_KEVENT = 363 // { int kevent(int fd, struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } SYS_EXTATTR_SET_FD = 371 // { ssize_t extattr_set_fd(int fd, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_GET_FD = 372 // { ssize_t extattr_get_fd(int fd, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_DELETE_FD = 373 // { int extattr_delete_fd(int fd, int attrnamespace, const char *attrname); } @@ -237,6 +251,10 @@ const ( SYS_UUIDGEN = 392 // { int uuidgen(struct uuid *store, int count); } SYS_SENDFILE = 393 // { int sendfile(int fd, int s, off_t offset, size_t nbytes, struct sf_hdtr *hdtr, off_t *sbytes, int flags); } SYS_MAC_SYSCALL = 394 // { int mac_syscall(const char *policy, int call, void *arg); } + SYS_GETFSSTAT = 395 // { int getfsstat(struct statfs *buf, long bufsize, int mode); } + SYS_STATFS = 396 // { int statfs(char *path, struct statfs *buf); } + SYS_FSTATFS = 397 // { int fstatfs(int fd, struct statfs *buf); } + SYS_FHSTATFS = 398 // { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } SYS_KSEM_CLOSE = 400 // { int ksem_close(semid_t id); } SYS_KSEM_POST = 401 // { int ksem_post(semid_t id); } SYS_KSEM_WAIT = 402 // { int ksem_wait(semid_t id); } @@ -249,14 +267,14 @@ const ( SYS___MAC_GET_PID = 409 // { int __mac_get_pid(pid_t pid, struct mac *mac_p); } SYS___MAC_GET_LINK = 410 // { int __mac_get_link(const char *path_p, struct mac *mac_p); } SYS___MAC_SET_LINK = 411 // { int __mac_set_link(const char *path_p, struct mac *mac_p); } - SYS_EXTATTR_SET_LINK = 412 // { ssize_t extattr_set_link(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_GET_LINK = 413 // { ssize_t extattr_get_link(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_DELETE_LINK = 414 // { int extattr_delete_link(const char *path, int attrnamespace, const char *attrname); } + SYS_EXTATTR_SET_LINK = 412 // { ssize_t extattr_set_link( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_GET_LINK = 413 // { ssize_t extattr_get_link( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_DELETE_LINK = 414 // { int extattr_delete_link( const char *path, int attrnamespace, const char *attrname); } SYS___MAC_EXECVE = 415 // { int __mac_execve(char *fname, char **argv, char **envv, struct mac *mac_p); } SYS_SIGACTION = 416 // { int sigaction(int sig, const struct sigaction *act, struct sigaction *oact); } - SYS_SIGRETURN = 417 // { int sigreturn(const struct __ucontext *sigcntxp); } + SYS_SIGRETURN = 417 // { int sigreturn( const struct __ucontext *sigcntxp); } SYS_GETCONTEXT = 421 // { int getcontext(struct __ucontext *ucp); } - SYS_SETCONTEXT = 422 // { int setcontext(const struct __ucontext *ucp); } + SYS_SETCONTEXT = 422 // { int setcontext( const struct __ucontext *ucp); } SYS_SWAPCONTEXT = 423 // { int swapcontext(struct __ucontext *oucp, const struct __ucontext *ucp); } SYS_SWAPOFF = 424 // { int swapoff(const char *name); } SYS___ACL_GET_LINK = 425 // { int __acl_get_link(const char *path, acl_type_t type, struct acl *aclp); } @@ -270,10 +288,10 @@ const ( SYS_THR_KILL = 433 // { int thr_kill(long id, int sig); } SYS_JAIL_ATTACH = 436 // { int jail_attach(int jid); } SYS_EXTATTR_LIST_FD = 437 // { ssize_t extattr_list_fd(int fd, int attrnamespace, void *data, size_t nbytes); } - SYS_EXTATTR_LIST_FILE = 438 // { ssize_t extattr_list_file(const char *path, int attrnamespace, void *data, size_t nbytes); } - SYS_EXTATTR_LIST_LINK = 439 // { ssize_t extattr_list_link(const char *path, int attrnamespace, void *data, size_t nbytes); } + SYS_EXTATTR_LIST_FILE = 438 // { ssize_t extattr_list_file( const char *path, int attrnamespace, void *data, size_t nbytes); } + SYS_EXTATTR_LIST_LINK = 439 // { ssize_t extattr_list_link( const char *path, int attrnamespace, void *data, size_t nbytes); } SYS_KSEM_TIMEDWAIT = 441 // { int ksem_timedwait(semid_t id, const struct timespec *abstime); } - SYS_THR_SUSPEND = 442 // { int thr_suspend(const struct timespec *timeout); } + SYS_THR_SUSPEND = 442 // { int thr_suspend( const struct timespec *timeout); } SYS_THR_WAKE = 443 // { int thr_wake(long id); } SYS_KLDUNLOADF = 444 // { int kldunloadf(int fileid, int flags); } SYS_AUDIT = 445 // { int audit(const void *record, u_int length); } @@ -282,17 +300,17 @@ const ( SYS_SETAUID = 448 // { int setauid(uid_t *auid); } SYS_GETAUDIT = 449 // { int getaudit(struct auditinfo *auditinfo); } SYS_SETAUDIT = 450 // { int setaudit(struct auditinfo *auditinfo); } - SYS_GETAUDIT_ADDR = 451 // { int getaudit_addr(struct auditinfo_addr *auditinfo_addr, u_int length); } - SYS_SETAUDIT_ADDR = 452 // { int setaudit_addr(struct auditinfo_addr *auditinfo_addr, u_int length); } + SYS_GETAUDIT_ADDR = 451 // { int getaudit_addr( struct auditinfo_addr *auditinfo_addr, u_int length); } + SYS_SETAUDIT_ADDR = 452 // { int setaudit_addr( struct auditinfo_addr *auditinfo_addr, u_int length); } SYS_AUDITCTL = 453 // { int auditctl(char *path); } SYS__UMTX_OP = 454 // { int _umtx_op(void *obj, int op, u_long val, void *uaddr1, void *uaddr2); } SYS_THR_NEW = 455 // { int thr_new(struct thr_param *param, int param_size); } SYS_SIGQUEUE = 456 // { int sigqueue(pid_t pid, int signum, void *value); } SYS_KMQ_OPEN = 457 // { int kmq_open(const char *path, int flags, mode_t mode, const struct mq_attr *attr); } - SYS_KMQ_SETATTR = 458 // { int kmq_setattr(int mqd, const struct mq_attr *attr, struct mq_attr *oattr); } - SYS_KMQ_TIMEDRECEIVE = 459 // { int kmq_timedreceive(int mqd, char *msg_ptr, size_t msg_len, unsigned *msg_prio, const struct timespec *abs_timeout); } - SYS_KMQ_TIMEDSEND = 460 // { int kmq_timedsend(int mqd, const char *msg_ptr, size_t msg_len, unsigned msg_prio, const struct timespec *abs_timeout); } - SYS_KMQ_NOTIFY = 461 // { int kmq_notify(int mqd, const struct sigevent *sigev); } + SYS_KMQ_SETATTR = 458 // { int kmq_setattr(int mqd, const struct mq_attr *attr, struct mq_attr *oattr); } + SYS_KMQ_TIMEDRECEIVE = 459 // { int kmq_timedreceive(int mqd, char *msg_ptr, size_t msg_len, unsigned *msg_prio, const struct timespec *abs_timeout); } + SYS_KMQ_TIMEDSEND = 460 // { int kmq_timedsend(int mqd, const char *msg_ptr, size_t msg_len,unsigned msg_prio, const struct timespec *abs_timeout);} + SYS_KMQ_NOTIFY = 461 // { int kmq_notify(int mqd, const struct sigevent *sigev); } SYS_KMQ_UNLINK = 462 // { int kmq_unlink(const char *path); } SYS_ABORT2 = 463 // { int abort2(const char *why, int nargs, void **args); } SYS_THR_SET_NAME = 464 // { int thr_set_name(long id, const char *name); } @@ -301,7 +319,7 @@ const ( SYS_SCTP_PEELOFF = 471 // { int sctp_peeloff(int sd, uint32_t name); } SYS_SCTP_GENERIC_SENDMSG = 472 // { int sctp_generic_sendmsg(int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags); } SYS_SCTP_GENERIC_SENDMSG_IOV = 473 // { int sctp_generic_sendmsg_iov(int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags); } - SYS_SCTP_GENERIC_RECVMSG = 474 // { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags); } + SYS_SCTP_GENERIC_RECVMSG = 474 // { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, struct sockaddr * from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags); } SYS_PREAD = 475 // { ssize_t pread(int fd, void *buf, size_t nbyte, off_t offset); } SYS_PWRITE = 476 // { ssize_t pwrite(int fd, const void *buf, size_t nbyte, off_t offset); } SYS_MMAP = 477 // { caddr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); } @@ -320,12 +338,14 @@ const ( SYS_FCHMODAT = 490 // { int fchmodat(int fd, char *path, mode_t mode, int flag); } SYS_FCHOWNAT = 491 // { int fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag); } SYS_FEXECVE = 492 // { int fexecve(int fd, char **argv, char **envv); } + SYS_FSTATAT = 493 // { int fstatat(int fd, char *path, struct stat *buf, int flag); } SYS_FUTIMESAT = 494 // { int futimesat(int fd, char *path, struct timeval *times); } SYS_LINKAT = 495 // { int linkat(int fd1, char *path1, int fd2, char *path2, int flag); } SYS_MKDIRAT = 496 // { int mkdirat(int fd, char *path, mode_t mode); } SYS_MKFIFOAT = 497 // { int mkfifoat(int fd, char *path, mode_t mode); } + SYS_MKNODAT = 498 // { int mknodat(int fd, char *path, mode_t mode, dev_t dev); } SYS_OPENAT = 499 // { int openat(int fd, char *path, int flag, mode_t mode); } - SYS_READLINKAT = 500 // { ssize_t readlinkat(int fd, char *path, char *buf, size_t bufsize); } + SYS_READLINKAT = 500 // { int readlinkat(int fd, char *path, char *buf, size_t bufsize); } SYS_RENAMEAT = 501 // { int renameat(int oldfd, char *old, int newfd, char *new); } SYS_SYMLINKAT = 502 // { int symlinkat(char *path1, int fd, char *path2); } SYS_UNLINKAT = 503 // { int unlinkat(int fd, char *path, int flag); } @@ -371,24 +391,7 @@ const ( SYS_PPOLL = 545 // { int ppoll(struct pollfd *fds, u_int nfds, const struct timespec *ts, const sigset_t *set); } SYS_FUTIMENS = 546 // { int futimens(int fd, struct timespec *times); } SYS_UTIMENSAT = 547 // { int utimensat(int fd, char *path, struct timespec *times, int flag); } + SYS_NUMA_GETAFFINITY = 548 // { int numa_getaffinity(cpuwhich_t which, id_t id, struct vm_domain_policy_entry *policy); } + SYS_NUMA_SETAFFINITY = 549 // { int numa_setaffinity(cpuwhich_t which, id_t id, const struct vm_domain_policy_entry *policy); } SYS_FDATASYNC = 550 // { int fdatasync(int fd); } - SYS_FSTAT = 551 // { int fstat(int fd, struct stat *sb); } - SYS_FSTATAT = 552 // { int fstatat(int fd, char *path, struct stat *buf, int flag); } - SYS_FHSTAT = 553 // { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } - SYS_GETDIRENTRIES = 554 // { ssize_t getdirentries(int fd, char *buf, size_t count, off_t *basep); } - SYS_STATFS = 555 // { int statfs(char *path, struct statfs *buf); } - SYS_FSTATFS = 556 // { int fstatfs(int fd, struct statfs *buf); } - SYS_GETFSSTAT = 557 // { int getfsstat(struct statfs *buf, long bufsize, int mode); } - SYS_FHSTATFS = 558 // { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } - SYS_MKNODAT = 559 // { int mknodat(int fd, char *path, mode_t mode, dev_t dev); } - SYS_KEVENT = 560 // { int kevent(int fd, struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } - SYS_CPUSET_GETDOMAIN = 561 // { int cpuset_getdomain(cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *mask, int *policy); } - SYS_CPUSET_SETDOMAIN = 562 // { int cpuset_setdomain(cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *mask, int policy); } - SYS_GETRANDOM = 563 // { int getrandom(void *buf, size_t buflen, unsigned int flags); } - SYS_GETFHAT = 564 // { int getfhat(int fd, char *path, struct fhandle *fhp, int flags); } - SYS_FHLINK = 565 // { int fhlink(struct fhandle *fhp, const char *to); } - SYS_FHLINKAT = 566 // { int fhlinkat(struct fhandle *fhp, int tofd, const char *to,); } - SYS_FHREADLINK = 567 // { int fhreadlink(struct fhandle *fhp, char *buf, size_t bufsize); } - SYS___SYSCTLBYNAME = 570 // { int __sysctlbyname(const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } - SYS_CLOSE_RANGE = 575 // { int close_range(u_int lowfd, u_int highfd, int flags); } ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go index 89dcc42..61ad5ca 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_freebsd_arm64.go @@ -1,4 +1,4 @@ -// go run mksysnum.go https://cgit.freebsd.org/src/plain/sys/kern/syscalls.master?h=stable/12 +// go run mksysnum.go https://svn.freebsd.org/base/stable/11/sys/kern/syscalls.master // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && freebsd @@ -19,9 +19,10 @@ const ( SYS_UNLINK = 10 // { int unlink(char *path); } SYS_CHDIR = 12 // { int chdir(char *path); } SYS_FCHDIR = 13 // { int fchdir(int fd); } + SYS_MKNOD = 14 // { int mknod(char *path, int mode, int dev); } SYS_CHMOD = 15 // { int chmod(char *path, int mode); } SYS_CHOWN = 16 // { int chown(char *path, int uid, int gid); } - SYS_BREAK = 17 // { caddr_t break(char *nsize); } + SYS_OBREAK = 17 // { int obreak(char *nsize); } break obreak_args int SYS_GETPID = 20 // { pid_t getpid(void); } SYS_MOUNT = 21 // { int mount(char *type, char *path, int flags, caddr_t data); } SYS_UNMOUNT = 22 // { int unmount(char *path, int flags); } @@ -42,6 +43,7 @@ const ( SYS_KILL = 37 // { int kill(int pid, int signum); } SYS_GETPPID = 39 // { pid_t getppid(void); } SYS_DUP = 41 // { int dup(u_int fd); } + SYS_PIPE = 42 // { int pipe(void); } SYS_GETEGID = 43 // { gid_t getegid(void); } SYS_PROFIL = 44 // { int profil(caddr_t samples, size_t size, size_t offset, u_int scale); } SYS_KTRACE = 45 // { int ktrace(const char *fname, int ops, int facs, int pid); } @@ -56,14 +58,15 @@ const ( SYS_SYMLINK = 57 // { int symlink(char *path, char *link); } SYS_READLINK = 58 // { ssize_t readlink(char *path, char *buf, size_t count); } SYS_EXECVE = 59 // { int execve(char *fname, char **argv, char **envv); } - SYS_UMASK = 60 // { int umask(int newmask); } + SYS_UMASK = 60 // { int umask(int newmask); } umask umask_args int SYS_CHROOT = 61 // { int chroot(char *path); } SYS_MSYNC = 65 // { int msync(void *addr, size_t len, int flags); } SYS_VFORK = 66 // { int vfork(void); } SYS_SBRK = 69 // { int sbrk(int incr); } SYS_SSTK = 70 // { int sstk(int incr); } + SYS_OVADVISE = 72 // { int ovadvise(int anom); } vadvise ovadvise_args int SYS_MUNMAP = 73 // { int munmap(void *addr, size_t len); } - SYS_MPROTECT = 74 // { int mprotect(void *addr, size_t len, int prot); } + SYS_MPROTECT = 74 // { int mprotect(const void *addr, size_t len, int prot); } SYS_MADVISE = 75 // { int madvise(void *addr, size_t len, int behav); } SYS_MINCORE = 78 // { int mincore(const void *addr, size_t len, char *vec); } SYS_GETGROUPS = 79 // { int getgroups(u_int gidsetsize, gid_t *gidset); } @@ -121,10 +124,14 @@ const ( SYS_SETGID = 181 // { int setgid(gid_t gid); } SYS_SETEGID = 182 // { int setegid(gid_t egid); } SYS_SETEUID = 183 // { int seteuid(uid_t euid); } + SYS_STAT = 188 // { int stat(char *path, struct stat *ub); } + SYS_FSTAT = 189 // { int fstat(int fd, struct stat *sb); } + SYS_LSTAT = 190 // { int lstat(char *path, struct stat *ub); } SYS_PATHCONF = 191 // { int pathconf(char *path, int name); } SYS_FPATHCONF = 192 // { int fpathconf(int fd, int name); } SYS_GETRLIMIT = 194 // { int getrlimit(u_int which, struct rlimit *rlp); } getrlimit __getrlimit_args int SYS_SETRLIMIT = 195 // { int setrlimit(u_int which, struct rlimit *rlp); } setrlimit __setrlimit_args int + SYS_GETDIRENTRIES = 196 // { int getdirentries(int fd, char *buf, u_int count, long *basep); } SYS___SYSCTL = 202 // { int __sysctl(int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } __sysctl sysctl_args int SYS_MLOCK = 203 // { int mlock(const void *addr, size_t len); } SYS_MUNLOCK = 204 // { int munlock(const void *addr, size_t len); } @@ -136,12 +143,12 @@ const ( SYS_SEMOP = 222 // { int semop(int semid, struct sembuf *sops, size_t nsops); } SYS_MSGGET = 225 // { int msgget(key_t key, int msgflg); } SYS_MSGSND = 226 // { int msgsnd(int msqid, const void *msgp, size_t msgsz, int msgflg); } - SYS_MSGRCV = 227 // { ssize_t msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } + SYS_MSGRCV = 227 // { int msgrcv(int msqid, void *msgp, size_t msgsz, long msgtyp, int msgflg); } SYS_SHMAT = 228 // { int shmat(int shmid, const void *shmaddr, int shmflg); } SYS_SHMDT = 230 // { int shmdt(const void *shmaddr); } SYS_SHMGET = 231 // { int shmget(key_t key, size_t size, int shmflg); } SYS_CLOCK_GETTIME = 232 // { int clock_gettime(clockid_t clock_id, struct timespec *tp); } - SYS_CLOCK_SETTIME = 233 // { int clock_settime(clockid_t clock_id, const struct timespec *tp); } + SYS_CLOCK_SETTIME = 233 // { int clock_settime( clockid_t clock_id, const struct timespec *tp); } SYS_CLOCK_GETRES = 234 // { int clock_getres(clockid_t clock_id, struct timespec *tp); } SYS_KTIMER_CREATE = 235 // { int ktimer_create(clockid_t clock_id, struct sigevent *evp, int *timerid); } SYS_KTIMER_DELETE = 236 // { int ktimer_delete(int timerid); } @@ -150,44 +157,50 @@ const ( SYS_KTIMER_GETOVERRUN = 239 // { int ktimer_getoverrun(int timerid); } SYS_NANOSLEEP = 240 // { int nanosleep(const struct timespec *rqtp, struct timespec *rmtp); } SYS_FFCLOCK_GETCOUNTER = 241 // { int ffclock_getcounter(ffcounter *ffcount); } - SYS_FFCLOCK_SETESTIMATE = 242 // { int ffclock_setestimate(struct ffclock_estimate *cest); } - SYS_FFCLOCK_GETESTIMATE = 243 // { int ffclock_getestimate(struct ffclock_estimate *cest); } + SYS_FFCLOCK_SETESTIMATE = 242 // { int ffclock_setestimate( struct ffclock_estimate *cest); } + SYS_FFCLOCK_GETESTIMATE = 243 // { int ffclock_getestimate( struct ffclock_estimate *cest); } SYS_CLOCK_NANOSLEEP = 244 // { int clock_nanosleep(clockid_t clock_id, int flags, const struct timespec *rqtp, struct timespec *rmtp); } - SYS_CLOCK_GETCPUCLOCKID2 = 247 // { int clock_getcpuclockid2(id_t id, int which, clockid_t *clock_id); } + SYS_CLOCK_GETCPUCLOCKID2 = 247 // { int clock_getcpuclockid2(id_t id,int which, clockid_t *clock_id); } SYS_NTP_GETTIME = 248 // { int ntp_gettime(struct ntptimeval *ntvp); } SYS_MINHERIT = 250 // { int minherit(void *addr, size_t len, int inherit); } SYS_RFORK = 251 // { int rfork(int flags); } + SYS_OPENBSD_POLL = 252 // { int openbsd_poll(struct pollfd *fds, u_int nfds, int timeout); } SYS_ISSETUGID = 253 // { int issetugid(void); } SYS_LCHOWN = 254 // { int lchown(char *path, int uid, int gid); } SYS_AIO_READ = 255 // { int aio_read(struct aiocb *aiocbp); } SYS_AIO_WRITE = 256 // { int aio_write(struct aiocb *aiocbp); } - SYS_LIO_LISTIO = 257 // { int lio_listio(int mode, struct aiocb* const *acb_list, int nent, struct sigevent *sig); } + SYS_LIO_LISTIO = 257 // { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } + SYS_GETDENTS = 272 // { int getdents(int fd, char *buf, size_t count); } SYS_LCHMOD = 274 // { int lchmod(char *path, mode_t mode); } SYS_LUTIMES = 276 // { int lutimes(char *path, struct timeval *tptr); } + SYS_NSTAT = 278 // { int nstat(char *path, struct nstat *ub); } + SYS_NFSTAT = 279 // { int nfstat(int fd, struct nstat *sb); } + SYS_NLSTAT = 280 // { int nlstat(char *path, struct nstat *ub); } SYS_PREADV = 289 // { ssize_t preadv(int fd, struct iovec *iovp, u_int iovcnt, off_t offset); } SYS_PWRITEV = 290 // { ssize_t pwritev(int fd, struct iovec *iovp, u_int iovcnt, off_t offset); } SYS_FHOPEN = 298 // { int fhopen(const struct fhandle *u_fhp, int flags); } + SYS_FHSTAT = 299 // { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } SYS_MODNEXT = 300 // { int modnext(int modid); } - SYS_MODSTAT = 301 // { int modstat(int modid, struct module_stat* stat); } + SYS_MODSTAT = 301 // { int modstat(int modid, struct module_stat *stat); } SYS_MODFNEXT = 302 // { int modfnext(int modid); } SYS_MODFIND = 303 // { int modfind(const char *name); } SYS_KLDLOAD = 304 // { int kldload(const char *file); } SYS_KLDUNLOAD = 305 // { int kldunload(int fileid); } SYS_KLDFIND = 306 // { int kldfind(const char *file); } SYS_KLDNEXT = 307 // { int kldnext(int fileid); } - SYS_KLDSTAT = 308 // { int kldstat(int fileid, struct kld_file_stat *stat); } + SYS_KLDSTAT = 308 // { int kldstat(int fileid, struct kld_file_stat* stat); } SYS_KLDFIRSTMOD = 309 // { int kldfirstmod(int fileid); } SYS_GETSID = 310 // { int getsid(pid_t pid); } SYS_SETRESUID = 311 // { int setresuid(uid_t ruid, uid_t euid, uid_t suid); } SYS_SETRESGID = 312 // { int setresgid(gid_t rgid, gid_t egid, gid_t sgid); } SYS_AIO_RETURN = 314 // { ssize_t aio_return(struct aiocb *aiocbp); } - SYS_AIO_SUSPEND = 315 // { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } + SYS_AIO_SUSPEND = 315 // { int aio_suspend( struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } SYS_AIO_CANCEL = 316 // { int aio_cancel(int fd, struct aiocb *aiocbp); } SYS_AIO_ERROR = 317 // { int aio_error(struct aiocb *aiocbp); } SYS_YIELD = 321 // { int yield(void); } SYS_MLOCKALL = 324 // { int mlockall(int how); } SYS_MUNLOCKALL = 325 // { int munlockall(void); } - SYS___GETCWD = 326 // { int __getcwd(char *buf, size_t buflen); } + SYS___GETCWD = 326 // { int __getcwd(char *buf, u_int buflen); } SYS_SCHED_SETPARAM = 327 // { int sched_setparam (pid_t pid, const struct sched_param *param); } SYS_SCHED_GETPARAM = 328 // { int sched_getparam (pid_t pid, struct sched_param *param); } SYS_SCHED_SETSCHEDULER = 329 // { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } @@ -213,13 +226,14 @@ const ( SYS___ACL_ACLCHECK_FILE = 353 // { int __acl_aclcheck_file(const char *path, acl_type_t type, struct acl *aclp); } SYS___ACL_ACLCHECK_FD = 354 // { int __acl_aclcheck_fd(int filedes, acl_type_t type, struct acl *aclp); } SYS_EXTATTRCTL = 355 // { int extattrctl(const char *path, int cmd, const char *filename, int attrnamespace, const char *attrname); } - SYS_EXTATTR_SET_FILE = 356 // { ssize_t extattr_set_file(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_GET_FILE = 357 // { ssize_t extattr_get_file(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_SET_FILE = 356 // { ssize_t extattr_set_file( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_GET_FILE = 357 // { ssize_t extattr_get_file( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_DELETE_FILE = 358 // { int extattr_delete_file(const char *path, int attrnamespace, const char *attrname); } - SYS_AIO_WAITCOMPLETE = 359 // { ssize_t aio_waitcomplete(struct aiocb **aiocbp, struct timespec *timeout); } + SYS_AIO_WAITCOMPLETE = 359 // { ssize_t aio_waitcomplete( struct aiocb **aiocbp, struct timespec *timeout); } SYS_GETRESUID = 360 // { int getresuid(uid_t *ruid, uid_t *euid, uid_t *suid); } SYS_GETRESGID = 361 // { int getresgid(gid_t *rgid, gid_t *egid, gid_t *sgid); } SYS_KQUEUE = 362 // { int kqueue(void); } + SYS_KEVENT = 363 // { int kevent(int fd, struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } SYS_EXTATTR_SET_FD = 371 // { ssize_t extattr_set_fd(int fd, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_GET_FD = 372 // { ssize_t extattr_get_fd(int fd, int attrnamespace, const char *attrname, void *data, size_t nbytes); } SYS_EXTATTR_DELETE_FD = 373 // { int extattr_delete_fd(int fd, int attrnamespace, const char *attrname); } @@ -237,6 +251,10 @@ const ( SYS_UUIDGEN = 392 // { int uuidgen(struct uuid *store, int count); } SYS_SENDFILE = 393 // { int sendfile(int fd, int s, off_t offset, size_t nbytes, struct sf_hdtr *hdtr, off_t *sbytes, int flags); } SYS_MAC_SYSCALL = 394 // { int mac_syscall(const char *policy, int call, void *arg); } + SYS_GETFSSTAT = 395 // { int getfsstat(struct statfs *buf, long bufsize, int mode); } + SYS_STATFS = 396 // { int statfs(char *path, struct statfs *buf); } + SYS_FSTATFS = 397 // { int fstatfs(int fd, struct statfs *buf); } + SYS_FHSTATFS = 398 // { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } SYS_KSEM_CLOSE = 400 // { int ksem_close(semid_t id); } SYS_KSEM_POST = 401 // { int ksem_post(semid_t id); } SYS_KSEM_WAIT = 402 // { int ksem_wait(semid_t id); } @@ -249,14 +267,14 @@ const ( SYS___MAC_GET_PID = 409 // { int __mac_get_pid(pid_t pid, struct mac *mac_p); } SYS___MAC_GET_LINK = 410 // { int __mac_get_link(const char *path_p, struct mac *mac_p); } SYS___MAC_SET_LINK = 411 // { int __mac_set_link(const char *path_p, struct mac *mac_p); } - SYS_EXTATTR_SET_LINK = 412 // { ssize_t extattr_set_link(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_GET_LINK = 413 // { ssize_t extattr_get_link(const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } - SYS_EXTATTR_DELETE_LINK = 414 // { int extattr_delete_link(const char *path, int attrnamespace, const char *attrname); } + SYS_EXTATTR_SET_LINK = 412 // { ssize_t extattr_set_link( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_GET_LINK = 413 // { ssize_t extattr_get_link( const char *path, int attrnamespace, const char *attrname, void *data, size_t nbytes); } + SYS_EXTATTR_DELETE_LINK = 414 // { int extattr_delete_link( const char *path, int attrnamespace, const char *attrname); } SYS___MAC_EXECVE = 415 // { int __mac_execve(char *fname, char **argv, char **envv, struct mac *mac_p); } SYS_SIGACTION = 416 // { int sigaction(int sig, const struct sigaction *act, struct sigaction *oact); } - SYS_SIGRETURN = 417 // { int sigreturn(const struct __ucontext *sigcntxp); } + SYS_SIGRETURN = 417 // { int sigreturn( const struct __ucontext *sigcntxp); } SYS_GETCONTEXT = 421 // { int getcontext(struct __ucontext *ucp); } - SYS_SETCONTEXT = 422 // { int setcontext(const struct __ucontext *ucp); } + SYS_SETCONTEXT = 422 // { int setcontext( const struct __ucontext *ucp); } SYS_SWAPCONTEXT = 423 // { int swapcontext(struct __ucontext *oucp, const struct __ucontext *ucp); } SYS_SWAPOFF = 424 // { int swapoff(const char *name); } SYS___ACL_GET_LINK = 425 // { int __acl_get_link(const char *path, acl_type_t type, struct acl *aclp); } @@ -270,10 +288,10 @@ const ( SYS_THR_KILL = 433 // { int thr_kill(long id, int sig); } SYS_JAIL_ATTACH = 436 // { int jail_attach(int jid); } SYS_EXTATTR_LIST_FD = 437 // { ssize_t extattr_list_fd(int fd, int attrnamespace, void *data, size_t nbytes); } - SYS_EXTATTR_LIST_FILE = 438 // { ssize_t extattr_list_file(const char *path, int attrnamespace, void *data, size_t nbytes); } - SYS_EXTATTR_LIST_LINK = 439 // { ssize_t extattr_list_link(const char *path, int attrnamespace, void *data, size_t nbytes); } + SYS_EXTATTR_LIST_FILE = 438 // { ssize_t extattr_list_file( const char *path, int attrnamespace, void *data, size_t nbytes); } + SYS_EXTATTR_LIST_LINK = 439 // { ssize_t extattr_list_link( const char *path, int attrnamespace, void *data, size_t nbytes); } SYS_KSEM_TIMEDWAIT = 441 // { int ksem_timedwait(semid_t id, const struct timespec *abstime); } - SYS_THR_SUSPEND = 442 // { int thr_suspend(const struct timespec *timeout); } + SYS_THR_SUSPEND = 442 // { int thr_suspend( const struct timespec *timeout); } SYS_THR_WAKE = 443 // { int thr_wake(long id); } SYS_KLDUNLOADF = 444 // { int kldunloadf(int fileid, int flags); } SYS_AUDIT = 445 // { int audit(const void *record, u_int length); } @@ -282,17 +300,17 @@ const ( SYS_SETAUID = 448 // { int setauid(uid_t *auid); } SYS_GETAUDIT = 449 // { int getaudit(struct auditinfo *auditinfo); } SYS_SETAUDIT = 450 // { int setaudit(struct auditinfo *auditinfo); } - SYS_GETAUDIT_ADDR = 451 // { int getaudit_addr(struct auditinfo_addr *auditinfo_addr, u_int length); } - SYS_SETAUDIT_ADDR = 452 // { int setaudit_addr(struct auditinfo_addr *auditinfo_addr, u_int length); } + SYS_GETAUDIT_ADDR = 451 // { int getaudit_addr( struct auditinfo_addr *auditinfo_addr, u_int length); } + SYS_SETAUDIT_ADDR = 452 // { int setaudit_addr( struct auditinfo_addr *auditinfo_addr, u_int length); } SYS_AUDITCTL = 453 // { int auditctl(char *path); } SYS__UMTX_OP = 454 // { int _umtx_op(void *obj, int op, u_long val, void *uaddr1, void *uaddr2); } SYS_THR_NEW = 455 // { int thr_new(struct thr_param *param, int param_size); } SYS_SIGQUEUE = 456 // { int sigqueue(pid_t pid, int signum, void *value); } SYS_KMQ_OPEN = 457 // { int kmq_open(const char *path, int flags, mode_t mode, const struct mq_attr *attr); } - SYS_KMQ_SETATTR = 458 // { int kmq_setattr(int mqd, const struct mq_attr *attr, struct mq_attr *oattr); } - SYS_KMQ_TIMEDRECEIVE = 459 // { int kmq_timedreceive(int mqd, char *msg_ptr, size_t msg_len, unsigned *msg_prio, const struct timespec *abs_timeout); } - SYS_KMQ_TIMEDSEND = 460 // { int kmq_timedsend(int mqd, const char *msg_ptr, size_t msg_len, unsigned msg_prio, const struct timespec *abs_timeout); } - SYS_KMQ_NOTIFY = 461 // { int kmq_notify(int mqd, const struct sigevent *sigev); } + SYS_KMQ_SETATTR = 458 // { int kmq_setattr(int mqd, const struct mq_attr *attr, struct mq_attr *oattr); } + SYS_KMQ_TIMEDRECEIVE = 459 // { int kmq_timedreceive(int mqd, char *msg_ptr, size_t msg_len, unsigned *msg_prio, const struct timespec *abs_timeout); } + SYS_KMQ_TIMEDSEND = 460 // { int kmq_timedsend(int mqd, const char *msg_ptr, size_t msg_len,unsigned msg_prio, const struct timespec *abs_timeout);} + SYS_KMQ_NOTIFY = 461 // { int kmq_notify(int mqd, const struct sigevent *sigev); } SYS_KMQ_UNLINK = 462 // { int kmq_unlink(const char *path); } SYS_ABORT2 = 463 // { int abort2(const char *why, int nargs, void **args); } SYS_THR_SET_NAME = 464 // { int thr_set_name(long id, const char *name); } @@ -301,7 +319,7 @@ const ( SYS_SCTP_PEELOFF = 471 // { int sctp_peeloff(int sd, uint32_t name); } SYS_SCTP_GENERIC_SENDMSG = 472 // { int sctp_generic_sendmsg(int sd, caddr_t msg, int mlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags); } SYS_SCTP_GENERIC_SENDMSG_IOV = 473 // { int sctp_generic_sendmsg_iov(int sd, struct iovec *iov, int iovlen, caddr_t to, __socklen_t tolen, struct sctp_sndrcvinfo *sinfo, int flags); } - SYS_SCTP_GENERIC_RECVMSG = 474 // { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, struct sockaddr *from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags); } + SYS_SCTP_GENERIC_RECVMSG = 474 // { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, struct sockaddr * from, __socklen_t *fromlenaddr, struct sctp_sndrcvinfo *sinfo, int *msg_flags); } SYS_PREAD = 475 // { ssize_t pread(int fd, void *buf, size_t nbyte, off_t offset); } SYS_PWRITE = 476 // { ssize_t pwrite(int fd, const void *buf, size_t nbyte, off_t offset); } SYS_MMAP = 477 // { caddr_t mmap(caddr_t addr, size_t len, int prot, int flags, int fd, off_t pos); } @@ -320,12 +338,14 @@ const ( SYS_FCHMODAT = 490 // { int fchmodat(int fd, char *path, mode_t mode, int flag); } SYS_FCHOWNAT = 491 // { int fchownat(int fd, char *path, uid_t uid, gid_t gid, int flag); } SYS_FEXECVE = 492 // { int fexecve(int fd, char **argv, char **envv); } + SYS_FSTATAT = 493 // { int fstatat(int fd, char *path, struct stat *buf, int flag); } SYS_FUTIMESAT = 494 // { int futimesat(int fd, char *path, struct timeval *times); } SYS_LINKAT = 495 // { int linkat(int fd1, char *path1, int fd2, char *path2, int flag); } SYS_MKDIRAT = 496 // { int mkdirat(int fd, char *path, mode_t mode); } SYS_MKFIFOAT = 497 // { int mkfifoat(int fd, char *path, mode_t mode); } + SYS_MKNODAT = 498 // { int mknodat(int fd, char *path, mode_t mode, dev_t dev); } SYS_OPENAT = 499 // { int openat(int fd, char *path, int flag, mode_t mode); } - SYS_READLINKAT = 500 // { ssize_t readlinkat(int fd, char *path, char *buf, size_t bufsize); } + SYS_READLINKAT = 500 // { int readlinkat(int fd, char *path, char *buf, size_t bufsize); } SYS_RENAMEAT = 501 // { int renameat(int oldfd, char *old, int newfd, char *new); } SYS_SYMLINKAT = 502 // { int symlinkat(char *path1, int fd, char *path2); } SYS_UNLINKAT = 503 // { int unlinkat(int fd, char *path, int flag); } @@ -371,24 +391,7 @@ const ( SYS_PPOLL = 545 // { int ppoll(struct pollfd *fds, u_int nfds, const struct timespec *ts, const sigset_t *set); } SYS_FUTIMENS = 546 // { int futimens(int fd, struct timespec *times); } SYS_UTIMENSAT = 547 // { int utimensat(int fd, char *path, struct timespec *times, int flag); } + SYS_NUMA_GETAFFINITY = 548 // { int numa_getaffinity(cpuwhich_t which, id_t id, struct vm_domain_policy_entry *policy); } + SYS_NUMA_SETAFFINITY = 549 // { int numa_setaffinity(cpuwhich_t which, id_t id, const struct vm_domain_policy_entry *policy); } SYS_FDATASYNC = 550 // { int fdatasync(int fd); } - SYS_FSTAT = 551 // { int fstat(int fd, struct stat *sb); } - SYS_FSTATAT = 552 // { int fstatat(int fd, char *path, struct stat *buf, int flag); } - SYS_FHSTAT = 553 // { int fhstat(const struct fhandle *u_fhp, struct stat *sb); } - SYS_GETDIRENTRIES = 554 // { ssize_t getdirentries(int fd, char *buf, size_t count, off_t *basep); } - SYS_STATFS = 555 // { int statfs(char *path, struct statfs *buf); } - SYS_FSTATFS = 556 // { int fstatfs(int fd, struct statfs *buf); } - SYS_GETFSSTAT = 557 // { int getfsstat(struct statfs *buf, long bufsize, int mode); } - SYS_FHSTATFS = 558 // { int fhstatfs(const struct fhandle *u_fhp, struct statfs *buf); } - SYS_MKNODAT = 559 // { int mknodat(int fd, char *path, mode_t mode, dev_t dev); } - SYS_KEVENT = 560 // { int kevent(int fd, struct kevent *changelist, int nchanges, struct kevent *eventlist, int nevents, const struct timespec *timeout); } - SYS_CPUSET_GETDOMAIN = 561 // { int cpuset_getdomain(cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *mask, int *policy); } - SYS_CPUSET_SETDOMAIN = 562 // { int cpuset_setdomain(cpulevel_t level, cpuwhich_t which, id_t id, size_t domainsetsize, domainset_t *mask, int policy); } - SYS_GETRANDOM = 563 // { int getrandom(void *buf, size_t buflen, unsigned int flags); } - SYS_GETFHAT = 564 // { int getfhat(int fd, char *path, struct fhandle *fhp, int flags); } - SYS_FHLINK = 565 // { int fhlink(struct fhandle *fhp, const char *to); } - SYS_FHLINKAT = 566 // { int fhlinkat(struct fhandle *fhp, int tofd, const char *to,); } - SYS_FHREADLINK = 567 // { int fhreadlink(struct fhandle *fhp, char *buf, size_t bufsize); } - SYS___SYSCTLBYNAME = 570 // { int __sysctlbyname(const char *name, size_t namelen, void *old, size_t *oldlenp, void *new, size_t newlen); } - SYS_CLOSE_RANGE = 575 // { int close_range(u_int lowfd, u_int highfd, int flags); } ) diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go index c9c4ad0..62192e1 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_386.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/386/include -m32 /tmp/386/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include -m32 /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go index 12ff341..490aab5 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_amd64.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/amd64/include -m64 /tmp/amd64/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include -m64 /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go index c3fb5e7..aca17b6 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/arm/include /tmp/arm/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go index 358c847..54b4dfa 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_arm64.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/arm64/include -fsigned-char /tmp/arm64/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include -fsigned-char /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go index 202a57e..65a99ef 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/mips/include /tmp/mips/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go index 1fbceb5..841c8a6 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/mips64/include /tmp/mips64/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64 && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go index b4ffb7a..e26a7c7 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mips64le.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/mips64le/include /tmp/mips64le/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64le && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go index 867985f..2644726 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_mipsle.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/mipsle/include /tmp/mipsle/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mipsle && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go index a8cce69..26aefc1 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/ppc/include /tmp/ppc/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go index d44c5b3..8d4cd9d 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/ppc64/include /tmp/ppc64/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go index 4214dd9..3b405d1 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_ppc64le.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/ppc64le/include /tmp/ppc64le/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64le && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go index 3e594a8..c3a5af8 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_riscv64.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/riscv64/include /tmp/riscv64/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && linux @@ -309,7 +309,6 @@ const ( SYS_LANDLOCK_CREATE_RULESET = 444 SYS_LANDLOCK_ADD_RULE = 445 SYS_LANDLOCK_RESTRICT_SELF = 446 - SYS_MEMFD_SECRET = 447 SYS_PROCESS_MRELEASE = 448 SYS_FUTEX_WAITV = 449 SYS_SET_MEMPOLICY_HOME_NODE = 450 diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go index 7ea4652..8ffa664 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_s390x.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/s390x/include -fsigned-char /tmp/s390x/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include -fsigned-char /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build s390x && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go index 92f628e..6a39640 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_linux_sparc64.go @@ -1,4 +1,4 @@ -// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/sparc64/include /tmp/sparc64/include/asm/unistd.h +// go run linux/mksysnum.go -Wall -Werror -static -I/tmp/include /tmp/include/asm/unistd.h // Code generated by the command above; see README.md. DO NOT EDIT. //go:build sparc64 && linux diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go index 5977338..817edbf 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_386.go @@ -6,7 +6,6 @@ package unix -// Deprecated: Use libc wrappers instead of direct syscalls. const ( SYS_EXIT = 1 // { void sys_exit(int rval); } SYS_FORK = 2 // { int sys_fork(void); } diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go index 16af291..ea45361 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_amd64.go @@ -6,7 +6,6 @@ package unix -// Deprecated: Use libc wrappers instead of direct syscalls. const ( SYS_EXIT = 1 // { void sys_exit(int rval); } SYS_FORK = 2 // { int sys_fork(void); } diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go index f59b18a..467971e 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm.go @@ -6,7 +6,6 @@ package unix -// Deprecated: Use libc wrappers instead of direct syscalls. const ( SYS_EXIT = 1 // { void sys_exit(int rval); } SYS_FORK = 2 // { int sys_fork(void); } diff --git a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go index 721ef59..32eec5e 100644 --- a/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/zsysnum_openbsd_arm64.go @@ -6,7 +6,6 @@ package unix -// Deprecated: Use libc wrappers instead of direct syscalls. const ( SYS_EXIT = 1 // { void sys_exit(int rval); } SYS_FORK = 2 // { int sys_fork(void); } diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go index e2a64f0..885842c 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_amd64.go @@ -366,57 +366,30 @@ type ICMPv6Filter struct { Filt [8]uint32 } -type TCPConnectionInfo struct { - State uint8 - Snd_wscale uint8 - Rcv_wscale uint8 - _ uint8 - Options uint32 - Flags uint32 - Rto uint32 - Maxseg uint32 - Snd_ssthresh uint32 - Snd_cwnd uint32 - Snd_wnd uint32 - Snd_sbbytes uint32 - Rcv_wnd uint32 - Rttcur uint32 - Srtt uint32 - Rttvar uint32 - Txpackets uint64 - Txbytes uint64 - Txretransmitbytes uint64 - Rxpackets uint64 - Rxbytes uint64 - Rxoutoforderbytes uint64 - Txretransmitpackets uint64 -} - const ( - SizeofSockaddrInet4 = 0x10 - SizeofSockaddrInet6 = 0x1c - SizeofSockaddrAny = 0x6c - SizeofSockaddrUnix = 0x6a - SizeofSockaddrDatalink = 0x14 - SizeofSockaddrCtl = 0x20 - SizeofSockaddrVM = 0xc - SizeofXvsockpcb = 0xa8 - SizeofXSocket = 0x64 - SizeofXSockbuf = 0x18 - SizeofXVSockPgen = 0x20 - SizeofXucred = 0x4c - SizeofLinger = 0x8 - SizeofIovec = 0x10 - SizeofIPMreq = 0x8 - SizeofIPMreqn = 0xc - SizeofIPv6Mreq = 0x14 - SizeofMsghdr = 0x30 - SizeofCmsghdr = 0xc - SizeofInet4Pktinfo = 0xc - SizeofInet6Pktinfo = 0x14 - SizeofIPv6MTUInfo = 0x20 - SizeofICMPv6Filter = 0x20 - SizeofTCPConnectionInfo = 0x70 + SizeofSockaddrInet4 = 0x10 + SizeofSockaddrInet6 = 0x1c + SizeofSockaddrAny = 0x6c + SizeofSockaddrUnix = 0x6a + SizeofSockaddrDatalink = 0x14 + SizeofSockaddrCtl = 0x20 + SizeofSockaddrVM = 0xc + SizeofXvsockpcb = 0xa8 + SizeofXSocket = 0x64 + SizeofXSockbuf = 0x18 + SizeofXVSockPgen = 0x20 + SizeofXucred = 0x4c + SizeofLinger = 0x8 + SizeofIovec = 0x10 + SizeofIPMreq = 0x8 + SizeofIPMreqn = 0xc + SizeofIPv6Mreq = 0x14 + SizeofMsghdr = 0x30 + SizeofCmsghdr = 0xc + SizeofInet4Pktinfo = 0xc + SizeofInet6Pktinfo = 0x14 + SizeofIPv6MTUInfo = 0x20 + SizeofICMPv6Filter = 0x20 ) const ( diff --git a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go index 34aa775..b23c023 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_darwin_arm64.go @@ -366,57 +366,30 @@ type ICMPv6Filter struct { Filt [8]uint32 } -type TCPConnectionInfo struct { - State uint8 - Snd_wscale uint8 - Rcv_wscale uint8 - _ uint8 - Options uint32 - Flags uint32 - Rto uint32 - Maxseg uint32 - Snd_ssthresh uint32 - Snd_cwnd uint32 - Snd_wnd uint32 - Snd_sbbytes uint32 - Rcv_wnd uint32 - Rttcur uint32 - Srtt uint32 - Rttvar uint32 - Txpackets uint64 - Txbytes uint64 - Txretransmitbytes uint64 - Rxpackets uint64 - Rxbytes uint64 - Rxoutoforderbytes uint64 - Txretransmitpackets uint64 -} - const ( - SizeofSockaddrInet4 = 0x10 - SizeofSockaddrInet6 = 0x1c - SizeofSockaddrAny = 0x6c - SizeofSockaddrUnix = 0x6a - SizeofSockaddrDatalink = 0x14 - SizeofSockaddrCtl = 0x20 - SizeofSockaddrVM = 0xc - SizeofXvsockpcb = 0xa8 - SizeofXSocket = 0x64 - SizeofXSockbuf = 0x18 - SizeofXVSockPgen = 0x20 - SizeofXucred = 0x4c - SizeofLinger = 0x8 - SizeofIovec = 0x10 - SizeofIPMreq = 0x8 - SizeofIPMreqn = 0xc - SizeofIPv6Mreq = 0x14 - SizeofMsghdr = 0x30 - SizeofCmsghdr = 0xc - SizeofInet4Pktinfo = 0xc - SizeofInet6Pktinfo = 0x14 - SizeofIPv6MTUInfo = 0x20 - SizeofICMPv6Filter = 0x20 - SizeofTCPConnectionInfo = 0x70 + SizeofSockaddrInet4 = 0x10 + SizeofSockaddrInet6 = 0x1c + SizeofSockaddrAny = 0x6c + SizeofSockaddrUnix = 0x6a + SizeofSockaddrDatalink = 0x14 + SizeofSockaddrCtl = 0x20 + SizeofSockaddrVM = 0xc + SizeofXvsockpcb = 0xa8 + SizeofXSocket = 0x64 + SizeofXSockbuf = 0x18 + SizeofXVSockPgen = 0x20 + SizeofXucred = 0x4c + SizeofLinger = 0x8 + SizeofIovec = 0x10 + SizeofIPMreq = 0x8 + SizeofIPMreqn = 0xc + SizeofIPv6Mreq = 0x14 + SizeofMsghdr = 0x30 + SizeofCmsghdr = 0xc + SizeofInet4Pktinfo = 0xc + SizeofInet6Pktinfo = 0x14 + SizeofIPv6MTUInfo = 0x20 + SizeofICMPv6Filter = 0x20 ) const ( diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go index d9c78cd..4eec078 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_386.go @@ -90,6 +90,27 @@ type Stat_t struct { Spare [10]uint64 } +type stat_freebsd11_t struct { + Dev uint32 + Ino uint32 + Mode uint16 + Nlink uint16 + Uid uint32 + Gid uint32 + Rdev uint32 + Atim Timespec + Mtim Timespec + Ctim Timespec + Size int64 + Blocks int64 + Blksize int32 + Flags uint32 + Gen uint32 + Lspare int32 + Btim Timespec + _ [8]byte +} + type Statfs_t struct { Version uint32 Type uint32 @@ -115,6 +136,31 @@ type Statfs_t struct { Mntonname [1024]byte } +type statfs_freebsd11_t struct { + Version uint32 + Type uint32 + Flags uint64 + Bsize uint64 + Iosize uint64 + Blocks uint64 + Bfree uint64 + Bavail int64 + Files uint64 + Ffree int64 + Syncwrites uint64 + Asyncwrites uint64 + Syncreads uint64 + Asyncreads uint64 + Spare [10]uint64 + Namemax uint32 + Owner uint32 + Fsid Fsid + Charspare [80]int8 + Fstypename [16]byte + Mntfromname [88]byte + Mntonname [88]byte +} + type Flock_t struct { Start int64 Len int64 @@ -135,6 +181,14 @@ type Dirent struct { Name [256]int8 } +type dirent_freebsd11 struct { + Fileno uint32 + Reclen uint16 + Type uint8 + Namlen uint8 + Name [256]int8 +} + type Fsid struct { Val [2]int32 } @@ -283,9 +337,41 @@ const ( ) const ( - PTRACE_TRACEME = 0x0 - PTRACE_CONT = 0x7 - PTRACE_KILL = 0x8 + PTRACE_ATTACH = 0xa + PTRACE_CONT = 0x7 + PTRACE_DETACH = 0xb + PTRACE_GETFPREGS = 0x23 + PTRACE_GETFSBASE = 0x47 + PTRACE_GETLWPLIST = 0xf + PTRACE_GETNUMLWPS = 0xe + PTRACE_GETREGS = 0x21 + PTRACE_GETXSTATE = 0x45 + PTRACE_IO = 0xc + PTRACE_KILL = 0x8 + PTRACE_LWPEVENTS = 0x18 + PTRACE_LWPINFO = 0xd + PTRACE_SETFPREGS = 0x24 + PTRACE_SETREGS = 0x22 + PTRACE_SINGLESTEP = 0x9 + PTRACE_TRACEME = 0x0 +) + +const ( + PIOD_READ_D = 0x1 + PIOD_WRITE_D = 0x2 + PIOD_READ_I = 0x3 + PIOD_WRITE_I = 0x4 +) + +const ( + PL_FLAG_BORN = 0x100 + PL_FLAG_EXITED = 0x200 + PL_FLAG_SI = 0x20 +) + +const ( + TRAP_BRKPT = 0x1 + TRAP_TRACE = 0x2 ) type PtraceLwpInfoStruct struct { @@ -294,7 +380,7 @@ type PtraceLwpInfoStruct struct { Flags int32 Sigmask Sigset_t Siglist Sigset_t - Siginfo __PtraceSiginfo + Siginfo __Siginfo Tdname [20]int8 Child_pid int32 Syscall_code uint32 @@ -312,17 +398,6 @@ type __Siginfo struct { Value [4]byte _ [32]byte } -type __PtraceSiginfo struct { - Signo int32 - Errno int32 - Code int32 - Pid int32 - Uid uint32 - Status int32 - Addr uintptr - Value [4]byte - _ [32]byte -} type Sigset_t struct { Val [4]uint32 @@ -357,12 +432,10 @@ type FpReg struct { Pad [64]uint8 } -type FpExtendedPrecision struct{} - type PtraceIoDesc struct { Op int32 - Offs uintptr - Addr uintptr + Offs *byte + Addr *byte Len uint32 } @@ -371,9 +444,8 @@ type Kevent_t struct { Filter int16 Flags uint16 Fflags uint32 - Data int64 + Data int32 Udata *byte - Ext [4]uint64 } type FdSet struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go index 26991b1..7622904 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_amd64.go @@ -86,6 +86,26 @@ type Stat_t struct { Spare [10]uint64 } +type stat_freebsd11_t struct { + Dev uint32 + Ino uint32 + Mode uint16 + Nlink uint16 + Uid uint32 + Gid uint32 + Rdev uint32 + Atim Timespec + Mtim Timespec + Ctim Timespec + Size int64 + Blocks int64 + Blksize int32 + Flags uint32 + Gen uint32 + Lspare int32 + Btim Timespec +} + type Statfs_t struct { Version uint32 Type uint32 @@ -111,6 +131,31 @@ type Statfs_t struct { Mntonname [1024]byte } +type statfs_freebsd11_t struct { + Version uint32 + Type uint32 + Flags uint64 + Bsize uint64 + Iosize uint64 + Blocks uint64 + Bfree uint64 + Bavail int64 + Files uint64 + Ffree int64 + Syncwrites uint64 + Asyncwrites uint64 + Syncreads uint64 + Asyncreads uint64 + Spare [10]uint64 + Namemax uint32 + Owner uint32 + Fsid Fsid + Charspare [80]int8 + Fstypename [16]byte + Mntfromname [88]byte + Mntonname [88]byte +} + type Flock_t struct { Start int64 Len int64 @@ -132,6 +177,14 @@ type Dirent struct { Name [256]int8 } +type dirent_freebsd11 struct { + Fileno uint32 + Reclen uint16 + Type uint8 + Namlen uint8 + Name [256]int8 +} + type Fsid struct { Val [2]int32 } @@ -280,9 +333,41 @@ const ( ) const ( - PTRACE_TRACEME = 0x0 - PTRACE_CONT = 0x7 - PTRACE_KILL = 0x8 + PTRACE_ATTACH = 0xa + PTRACE_CONT = 0x7 + PTRACE_DETACH = 0xb + PTRACE_GETFPREGS = 0x23 + PTRACE_GETFSBASE = 0x47 + PTRACE_GETLWPLIST = 0xf + PTRACE_GETNUMLWPS = 0xe + PTRACE_GETREGS = 0x21 + PTRACE_GETXSTATE = 0x45 + PTRACE_IO = 0xc + PTRACE_KILL = 0x8 + PTRACE_LWPEVENTS = 0x18 + PTRACE_LWPINFO = 0xd + PTRACE_SETFPREGS = 0x24 + PTRACE_SETREGS = 0x22 + PTRACE_SINGLESTEP = 0x9 + PTRACE_TRACEME = 0x0 +) + +const ( + PIOD_READ_D = 0x1 + PIOD_WRITE_D = 0x2 + PIOD_READ_I = 0x3 + PIOD_WRITE_I = 0x4 +) + +const ( + PL_FLAG_BORN = 0x100 + PL_FLAG_EXITED = 0x200 + PL_FLAG_SI = 0x20 +) + +const ( + TRAP_BRKPT = 0x1 + TRAP_TRACE = 0x2 ) type PtraceLwpInfoStruct struct { @@ -291,7 +376,7 @@ type PtraceLwpInfoStruct struct { Flags int32 Sigmask Sigset_t Siglist Sigset_t - Siginfo __PtraceSiginfo + Siginfo __Siginfo Tdname [20]int8 Child_pid int32 Syscall_code uint32 @@ -310,18 +395,6 @@ type __Siginfo struct { _ [40]byte } -type __PtraceSiginfo struct { - Signo int32 - Errno int32 - Code int32 - Pid int32 - Uid uint32 - Status int32 - Addr uintptr - Value [8]byte - _ [40]byte -} - type Sigset_t struct { Val [4]uint32 } @@ -362,12 +435,10 @@ type FpReg struct { Spare [12]uint64 } -type FpExtendedPrecision struct{} - type PtraceIoDesc struct { Op int32 - Offs uintptr - Addr uintptr + Offs *byte + Addr *byte Len uint64 } @@ -378,7 +449,6 @@ type Kevent_t struct { Fflags uint32 Data int64 Udata *byte - Ext [4]uint64 } type FdSet struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go index f8324e7..19223ce 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm.go @@ -33,7 +33,7 @@ type Timeval struct { _ [4]byte } -type Time_t int64 +type Time_t int32 type Rusage struct { Utime Timeval @@ -88,6 +88,26 @@ type Stat_t struct { Spare [10]uint64 } +type stat_freebsd11_t struct { + Dev uint32 + Ino uint32 + Mode uint16 + Nlink uint16 + Uid uint32 + Gid uint32 + Rdev uint32 + Atim Timespec + Mtim Timespec + Ctim Timespec + Size int64 + Blocks int64 + Blksize int32 + Flags uint32 + Gen uint32 + Lspare int32 + Btim Timespec +} + type Statfs_t struct { Version uint32 Type uint32 @@ -113,6 +133,31 @@ type Statfs_t struct { Mntonname [1024]byte } +type statfs_freebsd11_t struct { + Version uint32 + Type uint32 + Flags uint64 + Bsize uint64 + Iosize uint64 + Blocks uint64 + Bfree uint64 + Bavail int64 + Files uint64 + Ffree int64 + Syncwrites uint64 + Asyncwrites uint64 + Syncreads uint64 + Asyncreads uint64 + Spare [10]uint64 + Namemax uint32 + Owner uint32 + Fsid Fsid + Charspare [80]int8 + Fstypename [16]byte + Mntfromname [88]byte + Mntonname [88]byte +} + type Flock_t struct { Start int64 Len int64 @@ -134,6 +179,14 @@ type Dirent struct { Name [256]int8 } +type dirent_freebsd11 struct { + Fileno uint32 + Reclen uint16 + Type uint8 + Namlen uint8 + Name [256]int8 +} + type Fsid struct { Val [2]int32 } @@ -282,9 +335,41 @@ const ( ) const ( - PTRACE_TRACEME = 0x0 - PTRACE_CONT = 0x7 - PTRACE_KILL = 0x8 + PTRACE_ATTACH = 0xa + PTRACE_CONT = 0x7 + PTRACE_DETACH = 0xb + PTRACE_GETFPREGS = 0x23 + PTRACE_GETFSBASE = 0x47 + PTRACE_GETLWPLIST = 0xf + PTRACE_GETNUMLWPS = 0xe + PTRACE_GETREGS = 0x21 + PTRACE_GETXSTATE = 0x45 + PTRACE_IO = 0xc + PTRACE_KILL = 0x8 + PTRACE_LWPEVENTS = 0x18 + PTRACE_LWPINFO = 0xd + PTRACE_SETFPREGS = 0x24 + PTRACE_SETREGS = 0x22 + PTRACE_SINGLESTEP = 0x9 + PTRACE_TRACEME = 0x0 +) + +const ( + PIOD_READ_D = 0x1 + PIOD_WRITE_D = 0x2 + PIOD_READ_I = 0x3 + PIOD_WRITE_I = 0x4 +) + +const ( + PL_FLAG_BORN = 0x100 + PL_FLAG_EXITED = 0x200 + PL_FLAG_SI = 0x20 +) + +const ( + TRAP_BRKPT = 0x1 + TRAP_TRACE = 0x2 ) type PtraceLwpInfoStruct struct { @@ -293,7 +378,7 @@ type PtraceLwpInfoStruct struct { Flags int32 Sigmask Sigset_t Siglist Sigset_t - Siginfo __PtraceSiginfo + Siginfo __Siginfo Tdname [20]int8 Child_pid int32 Syscall_code uint32 @@ -301,27 +386,15 @@ type PtraceLwpInfoStruct struct { } type __Siginfo struct { - Signo int32 - Errno int32 - Code int32 - Pid int32 - Uid uint32 - Status int32 - Addr *byte - Value [4]byte - _ [32]byte -} - -type __PtraceSiginfo struct { - Signo int32 - Errno int32 - Code int32 - Pid int32 - Uid uint32 - Status int32 - Addr uintptr - Value [4]byte - _ [32]byte + Signo int32 + Errno int32 + Code int32 + Pid int32 + Uid uint32 + Status int32 + Addr *byte + Value [4]byte + X_reason [32]byte } type Sigset_t struct { @@ -329,28 +402,22 @@ type Sigset_t struct { } type Reg struct { - R [13]uint32 - Sp uint32 - Lr uint32 - Pc uint32 - Cpsr uint32 + R [13]uint32 + R_sp uint32 + R_lr uint32 + R_pc uint32 + R_cpsr uint32 } type FpReg struct { - Fpsr uint32 - Fpr [8]FpExtendedPrecision -} - -type FpExtendedPrecision struct { - Exponent uint32 - Mantissa_hi uint32 - Mantissa_lo uint32 + Fpr_fpsr uint32 + Fpr [8][3]uint32 } type PtraceIoDesc struct { Op int32 - Offs uintptr - Addr uintptr + Offs *byte + Addr *byte Len uint32 } @@ -359,11 +426,8 @@ type Kevent_t struct { Filter int16 Flags uint16 Fflags uint32 - _ [4]byte - Data int64 + Data int32 Udata *byte - _ [4]byte - Ext [4]uint64 } type FdSet struct { @@ -389,7 +453,7 @@ type ifMsghdr struct { Addrs int32 Flags int32 Index uint16 - _ uint16 + _ [2]byte Data ifData } @@ -400,6 +464,7 @@ type IfMsghdr struct { Addrs int32 Flags int32 Index uint16 + _ [2]byte Data IfData } @@ -467,7 +532,7 @@ type IfaMsghdr struct { Addrs int32 Flags int32 Index uint16 - _ uint16 + _ [2]byte Metric int32 } @@ -478,7 +543,7 @@ type IfmaMsghdr struct { Addrs int32 Flags int32 Index uint16 - _ uint16 + _ [2]byte } type IfAnnounceMsghdr struct { @@ -495,7 +560,7 @@ type RtMsghdr struct { Version uint8 Type uint8 Index uint16 - _ uint16 + _ [2]byte Flags int32 Addrs int32 Pid int32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go index 4220411..8e3e33f 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_freebsd_arm64.go @@ -86,6 +86,26 @@ type Stat_t struct { Spare [10]uint64 } +type stat_freebsd11_t struct { + Dev uint32 + Ino uint32 + Mode uint16 + Nlink uint16 + Uid uint32 + Gid uint32 + Rdev uint32 + Atim Timespec + Mtim Timespec + Ctim Timespec + Size int64 + Blocks int64 + Blksize int32 + Flags uint32 + Gen uint32 + Lspare int32 + Btim Timespec +} + type Statfs_t struct { Version uint32 Type uint32 @@ -111,6 +131,31 @@ type Statfs_t struct { Mntonname [1024]byte } +type statfs_freebsd11_t struct { + Version uint32 + Type uint32 + Flags uint64 + Bsize uint64 + Iosize uint64 + Blocks uint64 + Bfree uint64 + Bavail int64 + Files uint64 + Ffree int64 + Syncwrites uint64 + Asyncwrites uint64 + Syncreads uint64 + Asyncreads uint64 + Spare [10]uint64 + Namemax uint32 + Owner uint32 + Fsid Fsid + Charspare [80]int8 + Fstypename [16]byte + Mntfromname [88]byte + Mntonname [88]byte +} + type Flock_t struct { Start int64 Len int64 @@ -132,6 +177,14 @@ type Dirent struct { Name [256]int8 } +type dirent_freebsd11 struct { + Fileno uint32 + Reclen uint16 + Type uint8 + Namlen uint8 + Name [256]int8 +} + type Fsid struct { Val [2]int32 } @@ -280,9 +333,39 @@ const ( ) const ( - PTRACE_TRACEME = 0x0 - PTRACE_CONT = 0x7 - PTRACE_KILL = 0x8 + PTRACE_ATTACH = 0xa + PTRACE_CONT = 0x7 + PTRACE_DETACH = 0xb + PTRACE_GETFPREGS = 0x23 + PTRACE_GETLWPLIST = 0xf + PTRACE_GETNUMLWPS = 0xe + PTRACE_GETREGS = 0x21 + PTRACE_IO = 0xc + PTRACE_KILL = 0x8 + PTRACE_LWPEVENTS = 0x18 + PTRACE_LWPINFO = 0xd + PTRACE_SETFPREGS = 0x24 + PTRACE_SETREGS = 0x22 + PTRACE_SINGLESTEP = 0x9 + PTRACE_TRACEME = 0x0 +) + +const ( + PIOD_READ_D = 0x1 + PIOD_WRITE_D = 0x2 + PIOD_READ_I = 0x3 + PIOD_WRITE_I = 0x4 +) + +const ( + PL_FLAG_BORN = 0x100 + PL_FLAG_EXITED = 0x200 + PL_FLAG_SI = 0x20 +) + +const ( + TRAP_BRKPT = 0x1 + TRAP_TRACE = 0x2 ) type PtraceLwpInfoStruct struct { @@ -291,7 +374,7 @@ type PtraceLwpInfoStruct struct { Flags int32 Sigmask Sigset_t Siglist Sigset_t - Siginfo __PtraceSiginfo + Siginfo __Siginfo Tdname [20]int8 Child_pid int32 Syscall_code uint32 @@ -310,18 +393,6 @@ type __Siginfo struct { _ [40]byte } -type __PtraceSiginfo struct { - Signo int32 - Errno int32 - Code int32 - Pid int32 - Uid uint32 - Status int32 - Addr uintptr - Value [8]byte - _ [40]byte -} - type Sigset_t struct { Val [4]uint32 } @@ -342,12 +413,10 @@ type FpReg struct { _ [8]byte } -type FpExtendedPrecision struct{} - type PtraceIoDesc struct { Op int32 - Offs uintptr - Addr uintptr + Offs *byte + Addr *byte Len uint64 } @@ -358,7 +427,6 @@ type Kevent_t struct { Fflags uint32 Data int64 Udata *byte - Ext [4]uint64 } type FdSet struct { diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux.go b/vendor/golang.org/x/sys/unix/ztypes_linux.go index ff68811..c55b617 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux.go @@ -764,15 +764,6 @@ const ( MOVE_MOUNT_T_AUTOMOUNTS = 0x20 MOVE_MOUNT_T_EMPTY_PATH = 0x40 MOVE_MOUNT_SET_GROUP = 0x100 - - FSOPEN_CLOEXEC = 0x1 - - FSPICK_CLOEXEC = 0x1 - FSPICK_SYMLINK_NOFOLLOW = 0x2 - FSPICK_NO_AUTOMOUNT = 0x4 - FSPICK_EMPTY_PATH = 0x8 - - FSMOUNT_CLOEXEC = 0x1 ) type OpenHow struct { @@ -945,9 +936,6 @@ type PerfEventAttr struct { Aux_watermark uint32 Sample_max_stack uint16 _ uint16 - Aux_sample_size uint32 - _ uint32 - Sig_data uint64 } type PerfEventMmapPage struct { @@ -1130,9 +1118,7 @@ const ( PERF_BR_SYSRET = 0x8 PERF_BR_COND_CALL = 0x9 PERF_BR_COND_RET = 0xa - PERF_BR_ERET = 0xb - PERF_BR_IRQ = 0xc - PERF_BR_MAX = 0xd + PERF_BR_MAX = 0xb PERF_SAMPLE_REGS_ABI_NONE = 0x0 PERF_SAMPLE_REGS_ABI_32 = 0x1 PERF_SAMPLE_REGS_ABI_64 = 0x2 @@ -1466,11 +1452,6 @@ const ( IFLA_ALT_IFNAME = 0x35 IFLA_PERM_ADDRESS = 0x36 IFLA_PROTO_DOWN_REASON = 0x37 - IFLA_PARENT_DEV_NAME = 0x38 - IFLA_PARENT_DEV_BUS_NAME = 0x39 - IFLA_GRO_MAX_SIZE = 0x3a - IFLA_TSO_MAX_SIZE = 0x3b - IFLA_TSO_MAX_SEGS = 0x3c IFLA_PROTO_DOWN_REASON_UNSPEC = 0x0 IFLA_PROTO_DOWN_REASON_MASK = 0x1 IFLA_PROTO_DOWN_REASON_VALUE = 0x2 @@ -2979,7 +2960,7 @@ const ( DEVLINK_CMD_TRAP_POLICER_NEW = 0x47 DEVLINK_CMD_TRAP_POLICER_DEL = 0x48 DEVLINK_CMD_HEALTH_REPORTER_TEST = 0x49 - DEVLINK_CMD_MAX = 0x51 + DEVLINK_CMD_MAX = 0x4d DEVLINK_PORT_TYPE_NOTSET = 0x0 DEVLINK_PORT_TYPE_AUTO = 0x1 DEVLINK_PORT_TYPE_ETH = 0x2 @@ -3208,7 +3189,7 @@ const ( DEVLINK_ATTR_RATE_NODE_NAME = 0xa8 DEVLINK_ATTR_RATE_PARENT_NODE_NAME = 0xa9 DEVLINK_ATTR_REGION_MAX_SNAPSHOTS = 0xaa - DEVLINK_ATTR_MAX = 0xae + DEVLINK_ATTR_MAX = 0xaa DEVLINK_DPIPE_FIELD_MAPPING_TYPE_NONE = 0x0 DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX = 0x1 DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT = 0x0 @@ -3648,11 +3629,7 @@ const ( ETHTOOL_A_RINGS_RX_MINI = 0x7 ETHTOOL_A_RINGS_RX_JUMBO = 0x8 ETHTOOL_A_RINGS_TX = 0x9 - ETHTOOL_A_RINGS_RX_BUF_LEN = 0xa - ETHTOOL_A_RINGS_TCP_DATA_SPLIT = 0xb - ETHTOOL_A_RINGS_CQE_SIZE = 0xc - ETHTOOL_A_RINGS_TX_PUSH = 0xd - ETHTOOL_A_RINGS_MAX = 0xd + ETHTOOL_A_RINGS_MAX = 0xa ETHTOOL_A_CHANNELS_UNSPEC = 0x0 ETHTOOL_A_CHANNELS_HEADER = 0x1 ETHTOOL_A_CHANNELS_RX_MAX = 0x2 @@ -4337,7 +4314,7 @@ const ( NL80211_ATTR_MAC_HINT = 0xc8 NL80211_ATTR_MAC_MASK = 0xd7 NL80211_ATTR_MAX_AP_ASSOC_STA = 0xca - NL80211_ATTR_MAX = 0x137 + NL80211_ATTR_MAX = 0x135 NL80211_ATTR_MAX_CRIT_PROT_DURATION = 0xb4 NL80211_ATTR_MAX_CSA_COUNTERS = 0xce NL80211_ATTR_MAX_MATCH_SETS = 0x85 @@ -4563,7 +4540,7 @@ const ( NL80211_BAND_IFTYPE_ATTR_HE_CAP_PHY = 0x3 NL80211_BAND_IFTYPE_ATTR_HE_CAP_PPE = 0x5 NL80211_BAND_IFTYPE_ATTR_IFTYPES = 0x1 - NL80211_BAND_IFTYPE_ATTR_MAX = 0xb + NL80211_BAND_IFTYPE_ATTR_MAX = 0x7 NL80211_BAND_S1GHZ = 0x4 NL80211_BITRATE_ATTR_2GHZ_SHORTPREAMBLE = 0x2 NL80211_BITRATE_ATTR_MAX = 0x2 @@ -4901,7 +4878,7 @@ const ( NL80211_FREQUENCY_ATTR_GO_CONCURRENT = 0xf NL80211_FREQUENCY_ATTR_INDOOR_ONLY = 0xe NL80211_FREQUENCY_ATTR_IR_CONCURRENT = 0xf - NL80211_FREQUENCY_ATTR_MAX = 0x1b + NL80211_FREQUENCY_ATTR_MAX = 0x19 NL80211_FREQUENCY_ATTR_MAX_TX_POWER = 0x6 NL80211_FREQUENCY_ATTR_NO_10MHZ = 0x11 NL80211_FREQUENCY_ATTR_NO_160MHZ = 0xc @@ -5268,7 +5245,7 @@ const ( NL80211_RATE_INFO_HE_RU_ALLOC_52 = 0x1 NL80211_RATE_INFO_HE_RU_ALLOC_996 = 0x5 NL80211_RATE_INFO_HE_RU_ALLOC = 0x11 - NL80211_RATE_INFO_MAX = 0x16 + NL80211_RATE_INFO_MAX = 0x11 NL80211_RATE_INFO_MCS = 0x2 NL80211_RATE_INFO_SHORT_GI = 0x4 NL80211_RATE_INFO_VHT_MCS = 0x6 @@ -5602,8 +5579,3 @@ const ( FR_ACT_UNREACHABLE = 0x7 FR_ACT_PROHIBIT = 0x8 ) - -const ( - AUDIT_NLGRP_NONE = 0x0 - AUDIT_NLGRP_READLOG = 0x1 -) diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go index 2636044..5314092 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_386.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/386/cgo -- -Wall -Werror -static -I/tmp/386/include -m32 linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include -m32 /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build 386 && linux @@ -324,13 +324,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - _ [4]byte - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go index 8187489..b02ab83 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_amd64.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/amd64/cgo -- -Wall -Werror -static -I/tmp/amd64/include -m64 linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include -m64 /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build amd64 && linux @@ -338,12 +338,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go index d161233..9e6871d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/arm/cgo -- -Wall -Werror -static -I/tmp/arm/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm && linux @@ -315,13 +315,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - _ [4]byte - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go index c28e555..b732d12 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_arm64.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/arm64/cgo -- -Wall -Werror -static -I/tmp/arm64/include -fsigned-char linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include -fsigned-char /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build arm64 && linux @@ -317,12 +317,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go index 3691299..5310f71 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/mips/cgo -- -Wall -Werror -static -I/tmp/mips/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips && linux @@ -320,13 +320,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - _ [4]byte - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go index 7473468..219bbb1 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/mips64/cgo -- -Wall -Werror -static -I/tmp/mips64/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64 && linux @@ -320,12 +320,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go index ed94485..be9432d 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mips64le.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/mips64le/cgo -- -Wall -Werror -static -I/tmp/mips64le/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mips64le && linux @@ -320,12 +320,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go index 0892a73..d0155a4 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_mipsle.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/mipsle/cgo -- -Wall -Werror -static -I/tmp/mipsle/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build mipsle && linux @@ -320,13 +320,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - _ [4]byte - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go index e1dd483..01c17bc 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/ppc/cgo -- -Wall -Werror -static -I/tmp/ppc/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc && linux @@ -327,13 +327,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - _ [4]byte - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint32 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go index d9f654c..944a9c3 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/ppc64/cgo -- -Wall -Werror -static -I/tmp/ppc64/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64 && linux @@ -327,12 +327,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go index 74acda9..5d2c90e 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_ppc64le.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/ppc64le/cgo -- -Wall -Werror -static -I/tmp/ppc64le/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build ppc64le && linux @@ -327,12 +327,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go index 50ebe69..e173cb5 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_riscv64.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/riscv64/cgo -- -Wall -Werror -static -I/tmp/riscv64/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build riscv64 && linux @@ -345,12 +345,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go index 75b34c2..6106715 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_s390x.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/s390x/cgo -- -Wall -Werror -static -I/tmp/s390x/include -fsigned-char linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include -fsigned-char /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build s390x && linux @@ -340,12 +340,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go index 429c3bf..ca7b37b 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_linux_sparc64.go @@ -1,4 +1,4 @@ -// cgo -godefs -objdir=/tmp/sparc64/cgo -- -Wall -Werror -static -I/tmp/sparc64/include linux/types.go | go run mkpost.go +// cgo -godefs -- -Wall -Werror -static -I/tmp/include /build/unix/linux/types.go | go run mkpost.go // Code generated by the command above; see README.md. DO NOT EDIT. //go:build sparc64 && linux @@ -322,12 +322,6 @@ type Taskstats struct { Ac_btime64 uint64 Compact_count uint64 Compact_delay_total uint64 - Ac_tgid uint32 - Ac_tgetime uint64 - Ac_exe_dev uint64 - Ac_exe_inode uint64 - Wpcopy_count uint64 - Wpcopy_delay_total uint64 } type cpuMask uint64 diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go index 2ed718c..baf5fe6 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_386.go @@ -94,10 +94,10 @@ type Statfs_t struct { F_namemax uint32 F_owner uint32 F_ctime uint64 - F_fstypename [16]byte - F_mntonname [90]byte - F_mntfromname [90]byte - F_mntfromspec [90]byte + F_fstypename [16]int8 + F_mntonname [90]int8 + F_mntfromname [90]int8 + F_mntfromspec [90]int8 Pad_cgo_0 [2]byte Mount_info [160]byte } diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go index b4fb97e..e21ae8e 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_amd64.go @@ -96,10 +96,10 @@ type Statfs_t struct { F_namemax uint32 F_owner uint32 F_ctime uint64 - F_fstypename [16]byte - F_mntonname [90]byte - F_mntfromname [90]byte - F_mntfromspec [90]byte + F_fstypename [16]int8 + F_mntonname [90]int8 + F_mntfromname [90]int8 + F_mntfromspec [90]int8 _ [2]byte Mount_info [160]byte } diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go index 2c46750..f190651 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm.go @@ -98,10 +98,10 @@ type Statfs_t struct { F_namemax uint32 F_owner uint32 F_ctime uint64 - F_fstypename [16]byte - F_mntonname [90]byte - F_mntfromname [90]byte - F_mntfromspec [90]byte + F_fstypename [16]int8 + F_mntonname [90]int8 + F_mntfromname [90]int8 + F_mntfromspec [90]int8 _ [2]byte Mount_info [160]byte } diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go index ddee045..84747c5 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_arm64.go @@ -94,10 +94,10 @@ type Statfs_t struct { F_namemax uint32 F_owner uint32 F_ctime uint64 - F_fstypename [16]byte - F_mntonname [90]byte - F_mntfromname [90]byte - F_mntfromspec [90]byte + F_fstypename [16]int8 + F_mntonname [90]int8 + F_mntfromname [90]int8 + F_mntfromspec [90]int8 _ [2]byte Mount_info [160]byte } diff --git a/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go b/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go index eb13d4e..ac5c8b6 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_openbsd_mips64.go @@ -94,10 +94,10 @@ type Statfs_t struct { F_namemax uint32 F_owner uint32 F_ctime uint64 - F_fstypename [16]byte - F_mntonname [90]byte - F_mntfromname [90]byte - F_mntfromspec [90]byte + F_fstypename [16]int8 + F_mntonname [90]int8 + F_mntfromname [90]int8 + F_mntfromspec [90]int8 _ [2]byte Mount_info [160]byte } diff --git a/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go b/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go index c1a9b83..ad4aad2 100644 --- a/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go +++ b/vendor/golang.org/x/sys/unix/ztypes_solaris_amd64.go @@ -178,7 +178,7 @@ type Linger struct { } type Iovec struct { - Base *byte + Base *int8 Len uint64 } diff --git a/vendor/golang.org/x/sys/windows/exec_windows.go b/vendor/golang.org/x/sys/windows/exec_windows.go index 75980fd..855698b 100644 --- a/vendor/golang.org/x/sys/windows/exec_windows.go +++ b/vendor/golang.org/x/sys/windows/exec_windows.go @@ -15,11 +15,11 @@ import ( // in http://msdn.microsoft.com/en-us/library/ms880421. // This function returns "" (2 double quotes) if s is empty. // Alternatively, these transformations are done: -// - every back slash (\) is doubled, but only if immediately -// followed by double quote ("); -// - every double quote (") is escaped by back slash (\); -// - finally, s is wrapped with double quotes (arg -> "arg"), -// but only if there is space or tab inside s. +// - every back slash (\) is doubled, but only if immediately +// followed by double quote ("); +// - every double quote (") is escaped by back slash (\); +// - finally, s is wrapped with double quotes (arg -> "arg"), +// but only if there is space or tab inside s. func EscapeArg(s string) string { if len(s) == 0 { return "\"\"" diff --git a/vendor/golang.org/x/sys/windows/setupapi_windows.go b/vendor/golang.org/x/sys/windows/setupapi_windows.go index f812648..14027da 100644 --- a/vendor/golang.org/x/sys/windows/setupapi_windows.go +++ b/vendor/golang.org/x/sys/windows/setupapi_windows.go @@ -296,7 +296,7 @@ const ( // Flag to indicate that the sorting from the INF file should be used. DI_INF_IS_SORTED DI_FLAGS = 0x00008000 - // Flag to indicate that only the INF specified by SP_DEVINSTALL_PARAMS.DriverPath should be searched. + // Flag to indicate that only the the INF specified by SP_DEVINSTALL_PARAMS.DriverPath should be searched. DI_ENUMSINGLEINF DI_FLAGS = 0x00010000 // Flag that prevents ConfigMgr from removing/re-enumerating devices during device diff --git a/vendor/golang.org/x/sys/windows/syscall_windows.go b/vendor/golang.org/x/sys/windows/syscall_windows.go index e279138..ce3075c 100644 --- a/vendor/golang.org/x/sys/windows/syscall_windows.go +++ b/vendor/golang.org/x/sys/windows/syscall_windows.go @@ -417,7 +417,6 @@ func NewCallbackCDecl(fn interface{}) uintptr { //sys GetModuleInformation(process Handle, module Handle, modinfo *ModuleInfo, cb uint32) (err error) = psapi.GetModuleInformation //sys GetModuleFileNameEx(process Handle, module Handle, filename *uint16, size uint32) (err error) = psapi.GetModuleFileNameExW //sys GetModuleBaseName(process Handle, module Handle, baseName *uint16, size uint32) (err error) = psapi.GetModuleBaseNameW -//sys QueryWorkingSetEx(process Handle, pv uintptr, cb uint32) (err error) = psapi.QueryWorkingSetEx // NT Native APIs //sys rtlNtStatusToDosErrorNoTeb(ntstatus NTStatus) (ret syscall.Errno) = ntdll.RtlNtStatusToDosErrorNoTeb @@ -624,6 +623,7 @@ var ( func getStdHandle(stdhandle uint32) (fd Handle) { r, _ := GetStdHandle(stdhandle) + CloseOnExec(r) return r } @@ -862,7 +862,6 @@ const socket_error = uintptr(^uint32(0)) //sys GetAdaptersAddresses(family uint32, flags uint32, reserved uintptr, adapterAddresses *IpAdapterAddresses, sizePointer *uint32) (errcode error) = iphlpapi.GetAdaptersAddresses //sys GetACP() (acp uint32) = kernel32.GetACP //sys MultiByteToWideChar(codePage uint32, dwFlags uint32, str *byte, nstr int32, wchar *uint16, nwchar int32) (nwrite int32, err error) = kernel32.MultiByteToWideChar -//sys getBestInterfaceEx(sockaddr unsafe.Pointer, pdwBestIfIndex *uint32) (errcode error) = iphlpapi.GetBestInterfaceEx // For testing: clients can set this flag to force // creation of IPv6 sockets to return EAFNOSUPPORT. @@ -972,32 +971,6 @@ func (sa *SockaddrUnix) sockaddr() (unsafe.Pointer, int32, error) { return unsafe.Pointer(&sa.raw), sl, nil } -type RawSockaddrBth struct { - AddressFamily [2]byte - BtAddr [8]byte - ServiceClassId [16]byte - Port [4]byte -} - -type SockaddrBth struct { - BtAddr uint64 - ServiceClassId GUID - Port uint32 - - raw RawSockaddrBth -} - -func (sa *SockaddrBth) sockaddr() (unsafe.Pointer, int32, error) { - family := AF_BTH - sa.raw = RawSockaddrBth{ - AddressFamily: *(*[2]byte)(unsafe.Pointer(&family)), - BtAddr: *(*[8]byte)(unsafe.Pointer(&sa.BtAddr)), - Port: *(*[4]byte)(unsafe.Pointer(&sa.Port)), - ServiceClassId: *(*[16]byte)(unsafe.Pointer(&sa.ServiceClassId)), - } - return unsafe.Pointer(&sa.raw), int32(unsafe.Sizeof(sa.raw)), nil -} - func (rsa *RawSockaddrAny) Sockaddr() (Sockaddr, error) { switch rsa.Addr.Family { case AF_UNIX: @@ -1073,14 +1046,6 @@ func Connect(fd Handle, sa Sockaddr) (err error) { return connect(fd, ptr, n) } -func GetBestInterfaceEx(sa Sockaddr, pdwBestIfIndex *uint32) (err error) { - ptr, _, err := sa.sockaddr() - if err != nil { - return err - } - return getBestInterfaceEx(ptr, pdwBestIfIndex) -} - func Getsockname(fd Handle) (sa Sockaddr, err error) { var rsa RawSockaddrAny l := int32(unsafe.Sizeof(rsa)) @@ -1734,71 +1699,3 @@ func LoadResourceData(module, resInfo Handle) (data []byte, err error) { h.Cap = int(size) return } - -// PSAPI_WORKING_SET_EX_BLOCK contains extended working set information for a page. -type PSAPI_WORKING_SET_EX_BLOCK uint64 - -// Valid returns the validity of this page. -// If this bit is 1, the subsequent members are valid; otherwise they should be ignored. -func (b PSAPI_WORKING_SET_EX_BLOCK) Valid() bool { - return (b & 1) == 1 -} - -// ShareCount is the number of processes that share this page. The maximum value of this member is 7. -func (b PSAPI_WORKING_SET_EX_BLOCK) ShareCount() uint64 { - return b.intField(1, 3) -} - -// Win32Protection is the memory protection attributes of the page. For a list of values, see -// https://docs.microsoft.com/en-us/windows/win32/memory/memory-protection-constants -func (b PSAPI_WORKING_SET_EX_BLOCK) Win32Protection() uint64 { - return b.intField(4, 11) -} - -// Shared returns the shared status of this page. -// If this bit is 1, the page can be shared. -func (b PSAPI_WORKING_SET_EX_BLOCK) Shared() bool { - return (b & (1 << 15)) == 1 -} - -// Node is the NUMA node. The maximum value of this member is 63. -func (b PSAPI_WORKING_SET_EX_BLOCK) Node() uint64 { - return b.intField(16, 6) -} - -// Locked returns the locked status of this page. -// If this bit is 1, the virtual page is locked in physical memory. -func (b PSAPI_WORKING_SET_EX_BLOCK) Locked() bool { - return (b & (1 << 22)) == 1 -} - -// LargePage returns the large page status of this page. -// If this bit is 1, the page is a large page. -func (b PSAPI_WORKING_SET_EX_BLOCK) LargePage() bool { - return (b & (1 << 23)) == 1 -} - -// Bad returns the bad status of this page. -// If this bit is 1, the page is has been reported as bad. -func (b PSAPI_WORKING_SET_EX_BLOCK) Bad() bool { - return (b & (1 << 31)) == 1 -} - -// intField extracts an integer field in the PSAPI_WORKING_SET_EX_BLOCK union. -func (b PSAPI_WORKING_SET_EX_BLOCK) intField(start, length int) uint64 { - var mask PSAPI_WORKING_SET_EX_BLOCK - for pos := start; pos < start+length; pos++ { - mask |= (1 << pos) - } - - masked := b & mask - return uint64(masked >> start) -} - -// PSAPI_WORKING_SET_EX_INFORMATION contains extended working set information for a process. -type PSAPI_WORKING_SET_EX_INFORMATION struct { - // The virtual address. - VirtualAddress Pointer - // A PSAPI_WORKING_SET_EX_BLOCK union that indicates the attributes of the page at VirtualAddress. - VirtualAttributes PSAPI_WORKING_SET_EX_BLOCK -} diff --git a/vendor/golang.org/x/sys/windows/types_windows.go b/vendor/golang.org/x/sys/windows/types_windows.go index f9eaca5..e19471c 100644 --- a/vendor/golang.org/x/sys/windows/types_windows.go +++ b/vendor/golang.org/x/sys/windows/types_windows.go @@ -160,10 +160,6 @@ const ( MAX_COMPUTERNAME_LENGTH = 15 - MAX_DHCPV6_DUID_LENGTH = 130 - - MAX_DNS_SUFFIX_STRING_LENGTH = 256 - TIME_ZONE_ID_UNKNOWN = 0 TIME_ZONE_ID_STANDARD = 1 @@ -2004,62 +2000,27 @@ type IpAdapterPrefix struct { } type IpAdapterAddresses struct { - Length uint32 - IfIndex uint32 - Next *IpAdapterAddresses - AdapterName *byte - FirstUnicastAddress *IpAdapterUnicastAddress - FirstAnycastAddress *IpAdapterAnycastAddress - FirstMulticastAddress *IpAdapterMulticastAddress - FirstDnsServerAddress *IpAdapterDnsServerAdapter - DnsSuffix *uint16 - Description *uint16 - FriendlyName *uint16 - PhysicalAddress [syscall.MAX_ADAPTER_ADDRESS_LENGTH]byte - PhysicalAddressLength uint32 - Flags uint32 - Mtu uint32 - IfType uint32 - OperStatus uint32 - Ipv6IfIndex uint32 - ZoneIndices [16]uint32 - FirstPrefix *IpAdapterPrefix - TransmitLinkSpeed uint64 - ReceiveLinkSpeed uint64 - FirstWinsServerAddress *IpAdapterWinsServerAddress - FirstGatewayAddress *IpAdapterGatewayAddress - Ipv4Metric uint32 - Ipv6Metric uint32 - Luid uint64 - Dhcpv4Server SocketAddress - CompartmentId uint32 - NetworkGuid GUID - ConnectionType uint32 - TunnelType uint32 - Dhcpv6Server SocketAddress - Dhcpv6ClientDuid [MAX_DHCPV6_DUID_LENGTH]byte - Dhcpv6ClientDuidLength uint32 - Dhcpv6Iaid uint32 - FirstDnsSuffix *IpAdapterDNSSuffix -} - -type IpAdapterWinsServerAddress struct { - Length uint32 - Reserved uint32 - Next *IpAdapterWinsServerAddress - Address SocketAddress -} - -type IpAdapterGatewayAddress struct { - Length uint32 - Reserved uint32 - Next *IpAdapterGatewayAddress - Address SocketAddress -} - -type IpAdapterDNSSuffix struct { - Next *IpAdapterDNSSuffix - String [MAX_DNS_SUFFIX_STRING_LENGTH]uint16 + Length uint32 + IfIndex uint32 + Next *IpAdapterAddresses + AdapterName *byte + FirstUnicastAddress *IpAdapterUnicastAddress + FirstAnycastAddress *IpAdapterAnycastAddress + FirstMulticastAddress *IpAdapterMulticastAddress + FirstDnsServerAddress *IpAdapterDnsServerAdapter + DnsSuffix *uint16 + Description *uint16 + FriendlyName *uint16 + PhysicalAddress [syscall.MAX_ADAPTER_ADDRESS_LENGTH]byte + PhysicalAddressLength uint32 + Flags uint32 + Mtu uint32 + IfType uint32 + OperStatus uint32 + Ipv6IfIndex uint32 + ZoneIndices [16]uint32 + FirstPrefix *IpAdapterPrefix + /* more fields might be present here. */ } const ( diff --git a/vendor/golang.org/x/sys/windows/zsyscall_windows.go b/vendor/golang.org/x/sys/windows/zsyscall_windows.go index 52d4742..68f52c1 100644 --- a/vendor/golang.org/x/sys/windows/zsyscall_windows.go +++ b/vendor/golang.org/x/sys/windows/zsyscall_windows.go @@ -177,7 +177,6 @@ var ( procDnsRecordListFree = moddnsapi.NewProc("DnsRecordListFree") procGetAdaptersAddresses = modiphlpapi.NewProc("GetAdaptersAddresses") procGetAdaptersInfo = modiphlpapi.NewProc("GetAdaptersInfo") - procGetBestInterfaceEx = modiphlpapi.NewProc("GetBestInterfaceEx") procGetIfEntry = modiphlpapi.NewProc("GetIfEntry") procAssignProcessToJobObject = modkernel32.NewProc("AssignProcessToJobObject") procCancelIo = modkernel32.NewProc("CancelIo") @@ -408,7 +407,6 @@ var ( procGetModuleBaseNameW = modpsapi.NewProc("GetModuleBaseNameW") procGetModuleFileNameExW = modpsapi.NewProc("GetModuleFileNameExW") procGetModuleInformation = modpsapi.NewProc("GetModuleInformation") - procQueryWorkingSetEx = modpsapi.NewProc("QueryWorkingSetEx") procSubscribeServiceChangeNotifications = modsechost.NewProc("SubscribeServiceChangeNotifications") procUnsubscribeServiceChangeNotifications = modsechost.NewProc("UnsubscribeServiceChangeNotifications") procGetUserNameExW = modsecur32.NewProc("GetUserNameExW") @@ -1541,14 +1539,6 @@ func GetAdaptersInfo(ai *IpAdapterInfo, ol *uint32) (errcode error) { return } -func getBestInterfaceEx(sockaddr unsafe.Pointer, pdwBestIfIndex *uint32) (errcode error) { - r0, _, _ := syscall.Syscall(procGetBestInterfaceEx.Addr(), 2, uintptr(sockaddr), uintptr(unsafe.Pointer(pdwBestIfIndex)), 0) - if r0 != 0 { - errcode = syscall.Errno(r0) - } - return -} - func GetIfEntry(pIfRow *MibIfRow) (errcode error) { r0, _, _ := syscall.Syscall(procGetIfEntry.Addr(), 1, uintptr(unsafe.Pointer(pIfRow)), 0, 0) if r0 != 0 { @@ -3505,14 +3495,6 @@ func GetModuleInformation(process Handle, module Handle, modinfo *ModuleInfo, cb return } -func QueryWorkingSetEx(process Handle, pv uintptr, cb uint32) (err error) { - r1, _, e1 := syscall.Syscall(procQueryWorkingSetEx.Addr(), 3, uintptr(process), uintptr(pv), uintptr(cb)) - if r1 == 0 { - err = errnoErr(e1) - } - return -} - func SubscribeServiceChangeNotifications(service Handle, eventType uint32, callback uintptr, callbackCtx uintptr, subscription *uintptr) (ret error) { ret = procSubscribeServiceChangeNotifications.Find() if ret != nil { diff --git a/vendor/modules.txt b/vendor/modules.txt index 215377c..45c07a8 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -10,14 +10,11 @@ blitter.com/go/cryptmt # blitter.com/go/goutmp v1.0.6 ## explicit; go 1.17 blitter.com/go/goutmp -# blitter.com/go/groestl v0.0.0-20220410000905-c4decbf31d64 -## explicit; go 1.18 -blitter.com/go/groestl # blitter.com/go/herradurakex v1.0.0 ## explicit blitter.com/go/herradurakex -# blitter.com/go/hopscotch v0.0.0-20220617051533-4b42ccd4e00a -## explicit; go 1.18 +# blitter.com/go/hopscotch v0.0.0-20211113042251-b8a306eea4dc +## explicit blitter.com/go/hopscotch # blitter.com/go/kyber v0.0.0-20200130200857-6f2021cb88d9 ## explicit; go 1.12 @@ -37,17 +34,17 @@ github.com/creack/pty # github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f ## explicit github.com/jameskeane/bcrypt -# github.com/klauspost/cpuid/v2 v2.1.1 -## explicit; go 1.15 +# github.com/klauspost/cpuid/v2 v2.0.6 +## explicit; go 1.13 github.com/klauspost/cpuid/v2 -# github.com/klauspost/reedsolomon v1.11.0 -## explicit; go 1.17 +# github.com/klauspost/reedsolomon v1.9.16 +## explicit; go 1.14 github.com/klauspost/reedsolomon # github.com/kuking/go-frodokem v1.0.2 ## explicit; go 1.14 github.com/kuking/go-frodokem -# github.com/mattn/go-isatty v0.0.16 -## explicit; go 1.15 +# github.com/mattn/go-isatty v0.0.14 +## explicit; go 1.12 github.com/mattn/go-isatty # github.com/pkg/errors v0.9.1 ## explicit @@ -66,14 +63,14 @@ github.com/tjfoc/gmsm/sm4 github.com/xtaci/kcp-go # github.com/xtaci/lossyconn v0.0.0-20200209145036-adba10fffc37 ## explicit; go 1.13 -# golang.org/x/crypto v0.0.0-20220829220503-c86fa9a7ed90 +# golang.org/x/crypto v0.0.0-20220408190544-5352b0902921 ## explicit; go 1.17 golang.org/x/crypto/argon2 golang.org/x/crypto/bcrypt golang.org/x/crypto/blake2b golang.org/x/crypto/blowfish golang.org/x/crypto/cast5 -golang.org/x/crypto/internal/alias +golang.org/x/crypto/internal/subtle golang.org/x/crypto/pbkdf2 golang.org/x/crypto/salsa20 golang.org/x/crypto/salsa20/salsa @@ -89,7 +86,7 @@ golang.org/x/net/internal/iana golang.org/x/net/internal/socket golang.org/x/net/ipv4 golang.org/x/net/ipv6 -# golang.org/x/sys v0.0.0-20220909162455-aba9fc2a8ff2 +# golang.org/x/sys v0.0.0-20220408201424-a24fb2fb8a0f ## explicit; go 1.17 golang.org/x/sys/cpu golang.org/x/sys/internal/unsafeheader