Initial aead/chacha20 support (ChaCha20_12)

Signed-off-by: Russ Magee <rmagee@gmail.com>
This commit is contained in:
Russ Magee 2020-02-21 17:21:19 -08:00
parent 8de76520e4
commit e9aa0072a5
18 changed files with 2592 additions and 2 deletions

1
go.mod
View file

@ -9,6 +9,7 @@ require (
blitter.com/go/kyber v0.0.0-20200130200857-6f2021cb88d9
blitter.com/go/mtwist v1.0.1 // indirect
blitter.com/go/newhope v0.0.0-20200130200750-192fc08a8aae
github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da
github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f
github.com/klauspost/cpuid v1.2.2 // indirect
github.com/klauspost/reedsolomon v1.9.3 // indirect

2
go.sum
View file

@ -28,6 +28,8 @@ git.schwanenlied.me/yawning/kyber.git v0.0.0-20180530164001-a270899bd22c h1:SGOx
git.schwanenlied.me/yawning/kyber.git v0.0.0-20180530164001-a270899bd22c/go.mod h1:QrbgzU5EL/1jaMD5pD4Tiikj3R5elPMa+RMwFUTGwQU=
git.schwanenlied.me/yawning/newhope.git v0.0.0-20170622154529-9598792ba8f2 h1:89TYv/+wotJ+QWrH5B/yN0pEQutr2V/5za0VoYiVGCM=
git.schwanenlied.me/yawning/newhope.git v0.0.0-20170622154529-9598792ba8f2/go.mod h1:weMqACFGzJs4Ni+K9shsRd02N4LkDrtGlkRxISK+II0=
github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da h1:KjTM2ks9d14ZYCvmHS9iAKVt9AyzRSqNU1qabPih5BY=
github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da/go.mod h1:eHEWzANqSiWQsof+nXEI9bUVUyV6F53Fp89EuCh2EAA=
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f h1:UWGE8Vi+1Agt0lrvnd7UsmvwqWKRzb9byK9iQmsbY0Y=

21
vendor/github.com/aead/chacha20/LICENSE generated vendored Normal file
View file

@ -0,0 +1,21 @@
The MIT License (MIT)
Copyright (c) 2016 Andreas Auernhammer
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

197
vendor/github.com/aead/chacha20/chacha/chacha.go generated vendored Normal file
View file

@ -0,0 +1,197 @@
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// Package chacha implements some low-level functions of the
// ChaCha cipher family.
package chacha // import "github.com/aead/chacha20/chacha"
import (
"encoding/binary"
"errors"
"math"
)
const (
// NonceSize is the size of the ChaCha20 nonce in bytes.
NonceSize = 8
// INonceSize is the size of the IETF-ChaCha20 nonce in bytes.
INonceSize = 12
// XNonceSize is the size of the XChaCha20 nonce in bytes.
XNonceSize = 24
// KeySize is the size of the key in bytes.
KeySize = 32
)
var (
useSSE2 bool
useSSSE3 bool
useAVX bool
useAVX2 bool
)
var (
errKeySize = errors.New("chacha20/chacha: bad key length")
errInvalidNonce = errors.New("chacha20/chacha: bad nonce length")
)
func setup(state *[64]byte, nonce, key []byte) (err error) {
if len(key) != KeySize {
err = errKeySize
return
}
var Nonce [16]byte
switch len(nonce) {
case NonceSize:
copy(Nonce[8:], nonce)
initialize(state, key, &Nonce)
case INonceSize:
copy(Nonce[4:], nonce)
initialize(state, key, &Nonce)
case XNonceSize:
var tmpKey [32]byte
var hNonce [16]byte
copy(hNonce[:], nonce[:16])
copy(tmpKey[:], key)
HChaCha20(&tmpKey, &hNonce, &tmpKey)
copy(Nonce[8:], nonce[16:])
initialize(state, tmpKey[:], &Nonce)
// BUG(aead): A "good" compiler will remove this (optimizations)
// But using the provided key instead of tmpKey,
// will change the key (-> probably confuses users)
for i := range tmpKey {
tmpKey[i] = 0
}
default:
err = errInvalidNonce
}
return
}
// XORKeyStream crypts bytes from src to dst using the given nonce and key.
// The length of the nonce determinds the version of ChaCha20:
// - NonceSize: ChaCha20/r with a 64 bit nonce and a 2^64 * 64 byte period.
// - INonceSize: ChaCha20/r as defined in RFC 7539 and a 2^32 * 64 byte period.
// - XNonceSize: XChaCha20/r with a 192 bit nonce and a 2^64 * 64 byte period.
// The rounds argument specifies the number of rounds performed for keystream
// generation - valid values are 8, 12 or 20. The src and dst may be the same slice
// but otherwise should not overlap. If len(dst) < len(src) this function panics.
// If the nonce is neither 64, 96 nor 192 bits long, this function panics.
func XORKeyStream(dst, src, nonce, key []byte, rounds int) {
if rounds != 20 && rounds != 12 && rounds != 8 {
panic("chacha20/chacha: bad number of rounds")
}
if len(dst) < len(src) {
panic("chacha20/chacha: dst buffer is to small")
}
if len(nonce) == INonceSize && uint64(len(src)) > (1<<38) {
panic("chacha20/chacha: src is too large")
}
var block, state [64]byte
if err := setup(&state, nonce, key); err != nil {
panic(err)
}
xorKeyStream(dst, src, &block, &state, rounds)
}
// Cipher implements ChaCha20/r (XChaCha20/r) for a given number of rounds r.
type Cipher struct {
state, block [64]byte
off int
rounds int // 20 for ChaCha20
noncesize int
}
// NewCipher returns a new *chacha.Cipher implementing the ChaCha20/r or XChaCha20/r
// (r = 8, 12 or 20) stream cipher. The nonce must be unique for one key for all time.
// The length of the nonce determinds the version of ChaCha20:
// - NonceSize: ChaCha20/r with a 64 bit nonce and a 2^64 * 64 byte period.
// - INonceSize: ChaCha20/r as defined in RFC 7539 and a 2^32 * 64 byte period.
// - XNonceSize: XChaCha20/r with a 192 bit nonce and a 2^64 * 64 byte period.
// If the nonce is neither 64, 96 nor 192 bits long, a non-nil error is returned.
func NewCipher(nonce, key []byte, rounds int) (*Cipher, error) {
if rounds != 20 && rounds != 12 && rounds != 8 {
panic("chacha20/chacha: bad number of rounds")
}
c := new(Cipher)
if err := setup(&(c.state), nonce, key); err != nil {
return nil, err
}
c.rounds = rounds
if len(nonce) == INonceSize {
c.noncesize = INonceSize
} else {
c.noncesize = NonceSize
}
return c, nil
}
// XORKeyStream crypts bytes from src to dst. Src and dst may be the same slice
// but otherwise should not overlap. If len(dst) < len(src) the function panics.
func (c *Cipher) XORKeyStream(dst, src []byte) {
if len(dst) < len(src) {
panic("chacha20/chacha: dst buffer is to small")
}
if c.off > 0 {
n := len(c.block[c.off:])
if len(src) <= n {
for i, v := range src {
dst[i] = v ^ c.block[c.off]
c.off++
}
if c.off == 64 {
c.off = 0
}
return
}
for i, v := range c.block[c.off:] {
dst[i] = src[i] ^ v
}
src = src[n:]
dst = dst[n:]
c.off = 0
}
// check for counter overflow
blocksToXOR := len(src) / 64
if len(src)%64 != 0 {
blocksToXOR++
}
var overflow bool
if c.noncesize == INonceSize {
overflow = binary.LittleEndian.Uint32(c.state[48:]) > math.MaxUint32-uint32(blocksToXOR)
} else {
overflow = binary.LittleEndian.Uint64(c.state[48:]) > math.MaxUint64-uint64(blocksToXOR)
}
if overflow {
panic("chacha20/chacha: counter overflow")
}
c.off += xorKeyStream(dst, src, &(c.block), &(c.state), c.rounds)
}
// SetCounter skips ctr * 64 byte blocks. SetCounter(0) resets the cipher.
// This function always skips the unused keystream of the current 64 byte block.
func (c *Cipher) SetCounter(ctr uint64) {
if c.noncesize == INonceSize {
binary.LittleEndian.PutUint32(c.state[48:], uint32(ctr))
} else {
binary.LittleEndian.PutUint64(c.state[48:], ctr)
}
c.off = 0
}
// HChaCha20 generates 32 pseudo-random bytes from a 128 bit nonce and a 256 bit secret key.
// It can be used as a key-derivation-function (KDF).
func HChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) { hChaCha20(out, nonce, key) }

View file

@ -0,0 +1,406 @@
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build amd64,!gccgo,!appengine,!nacl
#include "const.s"
#include "macro.s"
#define TWO 0(SP)
#define C16 32(SP)
#define C8 64(SP)
#define STATE_0 96(SP)
#define STATE_1 128(SP)
#define STATE_2 160(SP)
#define STATE_3 192(SP)
#define TMP_0 224(SP)
#define TMP_1 256(SP)
// func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
TEXT ·xorKeyStreamAVX2(SB), 4, $320-80
MOVQ dst_base+0(FP), DI
MOVQ src_base+24(FP), SI
MOVQ block+48(FP), BX
MOVQ state+56(FP), AX
MOVQ rounds+64(FP), DX
MOVQ src_len+32(FP), CX
MOVQ SP, R8
ADDQ $32, SP
ANDQ $-32, SP
VMOVDQU 0(AX), Y2
VMOVDQU 32(AX), Y3
VPERM2I128 $0x22, Y2, Y0, Y0
VPERM2I128 $0x33, Y2, Y1, Y1
VPERM2I128 $0x22, Y3, Y2, Y2
VPERM2I128 $0x33, Y3, Y3, Y3
TESTQ CX, CX
JZ done
VMOVDQU ·one_AVX2<>(SB), Y4
VPADDD Y4, Y3, Y3
VMOVDQA Y0, STATE_0
VMOVDQA Y1, STATE_1
VMOVDQA Y2, STATE_2
VMOVDQA Y3, STATE_3
VMOVDQU ·rol16_AVX2<>(SB), Y4
VMOVDQU ·rol8_AVX2<>(SB), Y5
VMOVDQU ·two_AVX2<>(SB), Y6
VMOVDQA Y4, Y14
VMOVDQA Y5, Y15
VMOVDQA Y4, C16
VMOVDQA Y5, C8
VMOVDQA Y6, TWO
CMPQ CX, $64
JBE between_0_and_64
CMPQ CX, $192
JBE between_64_and_192
CMPQ CX, $320
JBE between_192_and_320
CMPQ CX, $448
JBE between_320_and_448
at_least_512:
VMOVDQA Y0, Y4
VMOVDQA Y1, Y5
VMOVDQA Y2, Y6
VPADDQ TWO, Y3, Y7
VMOVDQA Y0, Y8
VMOVDQA Y1, Y9
VMOVDQA Y2, Y10
VPADDQ TWO, Y7, Y11
VMOVDQA Y0, Y12
VMOVDQA Y1, Y13
VMOVDQA Y2, Y14
VPADDQ TWO, Y11, Y15
MOVQ DX, R9
chacha_loop_512:
VMOVDQA Y8, TMP_0
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
VMOVDQA TMP_0, Y8
VMOVDQA Y0, TMP_0
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
CHACHA_SHUFFLE_AVX(Y13, Y14, Y15)
CHACHA_QROUND_AVX(Y12, Y13, Y14, Y15, Y0, C16, C8)
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y0, C16, C8)
VMOVDQA TMP_0, Y0
VMOVDQA Y8, TMP_0
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y8, C16, C8)
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y8, C16, C8)
VMOVDQA TMP_0, Y8
CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
CHACHA_SHUFFLE_AVX(Y15, Y14, Y13)
SUBQ $2, R9
JA chacha_loop_512
VMOVDQA Y12, TMP_0
VMOVDQA Y13, TMP_1
VPADDD STATE_0, Y0, Y0
VPADDD STATE_1, Y1, Y1
VPADDD STATE_2, Y2, Y2
VPADDD STATE_3, Y3, Y3
XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
VMOVDQA STATE_0, Y0
VMOVDQA STATE_1, Y1
VMOVDQA STATE_2, Y2
VMOVDQA STATE_3, Y3
VPADDQ TWO, Y3, Y3
VPADDD Y0, Y4, Y4
VPADDD Y1, Y5, Y5
VPADDD Y2, Y6, Y6
VPADDD Y3, Y7, Y7
XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
VPADDQ TWO, Y3, Y3
VPADDD Y0, Y8, Y8
VPADDD Y1, Y9, Y9
VPADDD Y2, Y10, Y10
VPADDD Y3, Y11, Y11
XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
VPADDQ TWO, Y3, Y3
VPADDD TMP_0, Y0, Y12
VPADDD TMP_1, Y1, Y13
VPADDD Y2, Y14, Y14
VPADDD Y3, Y15, Y15
VPADDQ TWO, Y3, Y3
CMPQ CX, $512
JB less_than_512
XOR_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
VMOVDQA Y3, STATE_3
ADDQ $512, SI
ADDQ $512, DI
SUBQ $512, CX
CMPQ CX, $448
JA at_least_512
TESTQ CX, CX
JZ done
VMOVDQA C16, Y14
VMOVDQA C8, Y15
CMPQ CX, $64
JBE between_0_and_64
CMPQ CX, $192
JBE between_64_and_192
CMPQ CX, $320
JBE between_192_and_320
JMP between_320_and_448
less_than_512:
XOR_UPPER_AVX2(DI, SI, 384, Y12, Y13, Y14, Y15, Y4, Y5)
EXTRACT_LOWER(BX, Y12, Y13, Y14, Y15, Y4)
ADDQ $448, SI
ADDQ $448, DI
SUBQ $448, CX
JMP finalize
between_320_and_448:
VMOVDQA Y0, Y4
VMOVDQA Y1, Y5
VMOVDQA Y2, Y6
VPADDQ TWO, Y3, Y7
VMOVDQA Y0, Y8
VMOVDQA Y1, Y9
VMOVDQA Y2, Y10
VPADDQ TWO, Y7, Y11
MOVQ DX, R9
chacha_loop_384:
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
CHACHA_SHUFFLE_AVX(Y1, Y2, Y3)
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
CHACHA_QROUND_AVX(Y0, Y1, Y2, Y3, Y13, Y14, Y15)
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
CHACHA_SHUFFLE_AVX(Y3, Y2, Y1)
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
SUBQ $2, R9
JA chacha_loop_384
VPADDD STATE_0, Y0, Y0
VPADDD STATE_1, Y1, Y1
VPADDD STATE_2, Y2, Y2
VPADDD STATE_3, Y3, Y3
XOR_AVX2(DI, SI, 0, Y0, Y1, Y2, Y3, Y12, Y13)
VMOVDQA STATE_0, Y0
VMOVDQA STATE_1, Y1
VMOVDQA STATE_2, Y2
VMOVDQA STATE_3, Y3
VPADDQ TWO, Y3, Y3
VPADDD Y0, Y4, Y4
VPADDD Y1, Y5, Y5
VPADDD Y2, Y6, Y6
VPADDD Y3, Y7, Y7
XOR_AVX2(DI, SI, 128, Y4, Y5, Y6, Y7, Y12, Y13)
VPADDQ TWO, Y3, Y3
VPADDD Y0, Y8, Y8
VPADDD Y1, Y9, Y9
VPADDD Y2, Y10, Y10
VPADDD Y3, Y11, Y11
VPADDQ TWO, Y3, Y3
CMPQ CX, $384
JB less_than_384
XOR_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
SUBQ $384, CX
TESTQ CX, CX
JE done
ADDQ $384, SI
ADDQ $384, DI
JMP between_0_and_64
less_than_384:
XOR_UPPER_AVX2(DI, SI, 256, Y8, Y9, Y10, Y11, Y12, Y13)
EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
ADDQ $320, SI
ADDQ $320, DI
SUBQ $320, CX
JMP finalize
between_192_and_320:
VMOVDQA Y0, Y4
VMOVDQA Y1, Y5
VMOVDQA Y2, Y6
VMOVDQA Y3, Y7
VMOVDQA Y0, Y8
VMOVDQA Y1, Y9
VMOVDQA Y2, Y10
VPADDQ TWO, Y3, Y11
MOVQ DX, R9
chacha_loop_256:
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
CHACHA_SHUFFLE_AVX(Y9, Y10, Y11)
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
CHACHA_QROUND_AVX(Y8, Y9, Y10, Y11, Y13, Y14, Y15)
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
CHACHA_SHUFFLE_AVX(Y11, Y10, Y9)
SUBQ $2, R9
JA chacha_loop_256
VPADDD Y0, Y4, Y4
VPADDD Y1, Y5, Y5
VPADDD Y2, Y6, Y6
VPADDD Y3, Y7, Y7
VPADDQ TWO, Y3, Y3
XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
VPADDD Y0, Y8, Y8
VPADDD Y1, Y9, Y9
VPADDD Y2, Y10, Y10
VPADDD Y3, Y11, Y11
VPADDQ TWO, Y3, Y3
CMPQ CX, $256
JB less_than_256
XOR_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
SUBQ $256, CX
TESTQ CX, CX
JE done
ADDQ $256, SI
ADDQ $256, DI
JMP between_0_and_64
less_than_256:
XOR_UPPER_AVX2(DI, SI, 128, Y8, Y9, Y10, Y11, Y12, Y13)
EXTRACT_LOWER(BX, Y8, Y9, Y10, Y11, Y12)
ADDQ $192, SI
ADDQ $192, DI
SUBQ $192, CX
JMP finalize
between_64_and_192:
VMOVDQA Y0, Y4
VMOVDQA Y1, Y5
VMOVDQA Y2, Y6
VMOVDQA Y3, Y7
MOVQ DX, R9
chacha_loop_128:
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
CHACHA_SHUFFLE_AVX(Y5, Y6, Y7)
CHACHA_QROUND_AVX(Y4, Y5, Y6, Y7, Y13, Y14, Y15)
CHACHA_SHUFFLE_AVX(Y7, Y6, Y5)
SUBQ $2, R9
JA chacha_loop_128
VPADDD Y0, Y4, Y4
VPADDD Y1, Y5, Y5
VPADDD Y2, Y6, Y6
VPADDD Y3, Y7, Y7
VPADDQ TWO, Y3, Y3
CMPQ CX, $128
JB less_than_128
XOR_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
SUBQ $128, CX
TESTQ CX, CX
JE done
ADDQ $128, SI
ADDQ $128, DI
JMP between_0_and_64
less_than_128:
XOR_UPPER_AVX2(DI, SI, 0, Y4, Y5, Y6, Y7, Y12, Y13)
EXTRACT_LOWER(BX, Y4, Y5, Y6, Y7, Y13)
ADDQ $64, SI
ADDQ $64, DI
SUBQ $64, CX
JMP finalize
between_0_and_64:
VMOVDQA X0, X4
VMOVDQA X1, X5
VMOVDQA X2, X6
VMOVDQA X3, X7
MOVQ DX, R9
chacha_loop_64:
CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
CHACHA_SHUFFLE_AVX(X5, X6, X7)
CHACHA_QROUND_AVX(X4, X5, X6, X7, X13, X14, X15)
CHACHA_SHUFFLE_AVX(X7, X6, X5)
SUBQ $2, R9
JA chacha_loop_64
VPADDD X0, X4, X4
VPADDD X1, X5, X5
VPADDD X2, X6, X6
VPADDD X3, X7, X7
VMOVDQU ·one<>(SB), X0
VPADDQ X0, X3, X3
CMPQ CX, $64
JB less_than_64
XOR_AVX(DI, SI, 0, X4, X5, X6, X7, X13)
SUBQ $64, CX
JMP done
less_than_64:
VMOVDQU X4, 0(BX)
VMOVDQU X5, 16(BX)
VMOVDQU X6, 32(BX)
VMOVDQU X7, 48(BX)
finalize:
XORQ R11, R11
XORQ R12, R12
MOVQ CX, BP
xor_loop:
MOVB 0(SI), R11
MOVB 0(BX), R12
XORQ R11, R12
MOVB R12, 0(DI)
INCQ SI
INCQ BX
INCQ DI
DECQ BP
JA xor_loop
done:
VMOVDQU X3, 48(AX)
VZEROUPPER
MOVQ R8, SP
MOVQ CX, ret+72(FP)
RET

60
vendor/github.com/aead/chacha20/chacha/chacha_386.go generated vendored Normal file
View file

@ -0,0 +1,60 @@
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build 386,!gccgo,!appengine,!nacl
package chacha
import (
"encoding/binary"
"golang.org/x/sys/cpu"
)
func init() {
useSSE2 = cpu.X86.HasSSE2
useSSSE3 = cpu.X86.HasSSSE3
useAVX = false
useAVX2 = false
}
func initialize(state *[64]byte, key []byte, nonce *[16]byte) {
binary.LittleEndian.PutUint32(state[0:], sigma[0])
binary.LittleEndian.PutUint32(state[4:], sigma[1])
binary.LittleEndian.PutUint32(state[8:], sigma[2])
binary.LittleEndian.PutUint32(state[12:], sigma[3])
copy(state[16:], key[:])
copy(state[48:], nonce[:])
}
// This function is implemented in chacha_386.s
//go:noescape
func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
// This function is implemented in chacha_386.s
//go:noescape
func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
// This function is implemented in chacha_386.s
//go:noescape
func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
switch {
case useSSSE3:
hChaCha20SSSE3(out, nonce, key)
case useSSE2:
hChaCha20SSE2(out, nonce, key)
default:
hChaCha20Generic(out, nonce, key)
}
}
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
if useSSE2 {
return xorKeyStreamSSE2(dst, src, block, state, rounds)
} else {
return xorKeyStreamGeneric(dst, src, block, state, rounds)
}
}

163
vendor/github.com/aead/chacha20/chacha/chacha_386.s generated vendored Normal file
View file

@ -0,0 +1,163 @@
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build 386,!gccgo,!appengine,!nacl
#include "const.s"
#include "macro.s"
// FINALIZE xors len bytes from src and block using
// the temp. registers t0 and t1 and writes the result
// to dst.
#define FINALIZE(dst, src, block, len, t0, t1) \
XORL t0, t0; \
XORL t1, t1; \
FINALIZE_LOOP:; \
MOVB 0(src), t0; \
MOVB 0(block), t1; \
XORL t0, t1; \
MOVB t1, 0(dst); \
INCL src; \
INCL block; \
INCL dst; \
DECL len; \
JG FINALIZE_LOOP \
#define Dst DI
#define Nonce AX
#define Key BX
#define Rounds DX
// func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
TEXT ·hChaCha20SSE2(SB), 4, $0-12
MOVL out+0(FP), Dst
MOVL nonce+4(FP), Nonce
MOVL key+8(FP), Key
MOVOU ·sigma<>(SB), X0
MOVOU 0*16(Key), X1
MOVOU 1*16(Key), X2
MOVOU 0*16(Nonce), X3
MOVL $20, Rounds
chacha_loop:
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
CHACHA_SHUFFLE_SSE(X1, X2, X3)
CHACHA_QROUND_SSE2(X0, X1, X2, X3, X4)
CHACHA_SHUFFLE_SSE(X3, X2, X1)
SUBL $2, Rounds
JNZ chacha_loop
MOVOU X0, 0*16(Dst)
MOVOU X3, 1*16(Dst)
RET
// func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
TEXT ·hChaCha20SSSE3(SB), 4, $0-12
MOVL out+0(FP), Dst
MOVL nonce+4(FP), Nonce
MOVL key+8(FP), Key
MOVOU ·sigma<>(SB), X0
MOVOU 0*16(Key), X1
MOVOU 1*16(Key), X2
MOVOU 0*16(Nonce), X3
MOVL $20, Rounds
MOVOU ·rol16<>(SB), X5
MOVOU ·rol8<>(SB), X6
chacha_loop:
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
CHACHA_SHUFFLE_SSE(X1, X2, X3)
CHACHA_QROUND_SSSE3(X0, X1, X2, X3, X4, X5, X6)
CHACHA_SHUFFLE_SSE(X3, X2, X1)
SUBL $2, Rounds
JNZ chacha_loop
MOVOU X0, 0*16(Dst)
MOVOU X3, 1*16(Dst)
RET
#undef Dst
#undef Nonce
#undef Key
#undef Rounds
#define State AX
#define Dst DI
#define Src SI
#define Len DX
#define Tmp0 BX
#define Tmp1 BP
// func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
TEXT ·xorKeyStreamSSE2(SB), 4, $0-40
MOVL dst_base+0(FP), Dst
MOVL src_base+12(FP), Src
MOVL state+28(FP), State
MOVL src_len+16(FP), Len
MOVL $0, ret+36(FP) // Number of bytes written to the keystream buffer - 0 iff len mod 64 == 0
MOVOU 0*16(State), X0
MOVOU 1*16(State), X1
MOVOU 2*16(State), X2
MOVOU 3*16(State), X3
TESTL Len, Len
JZ DONE
GENERATE_KEYSTREAM:
MOVO X0, X4
MOVO X1, X5
MOVO X2, X6
MOVO X3, X7
MOVL rounds+32(FP), Tmp0
CHACHA_LOOP:
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
CHACHA_SHUFFLE_SSE(X5, X6, X7)
CHACHA_QROUND_SSE2(X4, X5, X6, X7, X0)
CHACHA_SHUFFLE_SSE(X7, X6, X5)
SUBL $2, Tmp0
JA CHACHA_LOOP
MOVOU 0*16(State), X0 // Restore X0 from state
PADDL X0, X4
PADDL X1, X5
PADDL X2, X6
PADDL X3, X7
MOVOU ·one<>(SB), X0
PADDQ X0, X3
CMPL Len, $64
JL BUFFER_KEYSTREAM
XOR_SSE(Dst, Src, 0, X4, X5, X6, X7, X0)
MOVOU 0*16(State), X0 // Restore X0 from state
ADDL $64, Src
ADDL $64, Dst
SUBL $64, Len
JZ DONE
JMP GENERATE_KEYSTREAM // There is at least one more plaintext byte
BUFFER_KEYSTREAM:
MOVL block+24(FP), State
MOVOU X4, 0(State)
MOVOU X5, 16(State)
MOVOU X6, 32(State)
MOVOU X7, 48(State)
MOVL Len, ret+36(FP) // Number of bytes written to the keystream buffer - 0 < Len < 64
FINALIZE(Dst, Src, State, Len, Tmp0, Tmp1)
DONE:
MOVL state+28(FP), State
MOVOU X3, 3*16(State)
RET
#undef State
#undef Dst
#undef Src
#undef Len
#undef Tmp0
#undef Tmp1

76
vendor/github.com/aead/chacha20/chacha/chacha_amd64.go generated vendored Normal file
View file

@ -0,0 +1,76 @@
// Copyright (c) 2017 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build go1.7,amd64,!gccgo,!appengine,!nacl
package chacha
import "golang.org/x/sys/cpu"
func init() {
useSSE2 = cpu.X86.HasSSE2
useSSSE3 = cpu.X86.HasSSSE3
useAVX = cpu.X86.HasAVX
useAVX2 = cpu.X86.HasAVX2
}
// This function is implemented in chacha_amd64.s
//go:noescape
func initialize(state *[64]byte, key []byte, nonce *[16]byte)
// This function is implemented in chacha_amd64.s
//go:noescape
func hChaCha20SSE2(out *[32]byte, nonce *[16]byte, key *[32]byte)
// This function is implemented in chacha_amd64.s
//go:noescape
func hChaCha20SSSE3(out *[32]byte, nonce *[16]byte, key *[32]byte)
// This function is implemented in chachaAVX2_amd64.s
//go:noescape
func hChaCha20AVX(out *[32]byte, nonce *[16]byte, key *[32]byte)
// This function is implemented in chacha_amd64.s
//go:noescape
func xorKeyStreamSSE2(dst, src []byte, block, state *[64]byte, rounds int) int
// This function is implemented in chacha_amd64.s
//go:noescape
func xorKeyStreamSSSE3(dst, src []byte, block, state *[64]byte, rounds int) int
// This function is implemented in chacha_amd64.s
//go:noescape
func xorKeyStreamAVX(dst, src []byte, block, state *[64]byte, rounds int) int
// This function is implemented in chachaAVX2_amd64.s
//go:noescape
func xorKeyStreamAVX2(dst, src []byte, block, state *[64]byte, rounds int) int
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
switch {
case useAVX:
hChaCha20AVX(out, nonce, key)
case useSSSE3:
hChaCha20SSSE3(out, nonce, key)
case useSSE2:
hChaCha20SSE2(out, nonce, key)
default:
hChaCha20Generic(out, nonce, key)
}
}
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
switch {
case useAVX2:
return xorKeyStreamAVX2(dst, src, block, state, rounds)
case useAVX:
return xorKeyStreamAVX(dst, src, block, state, rounds)
case useSSSE3:
return xorKeyStreamSSSE3(dst, src, block, state, rounds)
case useSSE2:
return xorKeyStreamSSE2(dst, src, block, state, rounds)
default:
return xorKeyStreamGeneric(dst, src, block, state, rounds)
}
}

1072
vendor/github.com/aead/chacha20/chacha/chacha_amd64.s generated vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,319 @@
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
package chacha
import "encoding/binary"
var sigma = [4]uint32{0x61707865, 0x3320646e, 0x79622d32, 0x6b206574}
func xorKeyStreamGeneric(dst, src []byte, block, state *[64]byte, rounds int) int {
for len(src) >= 64 {
chachaGeneric(block, state, rounds)
for i, v := range block {
dst[i] = src[i] ^ v
}
src = src[64:]
dst = dst[64:]
}
n := len(src)
if n > 0 {
chachaGeneric(block, state, rounds)
for i, v := range src {
dst[i] = v ^ block[i]
}
}
return n
}
func chachaGeneric(dst *[64]byte, state *[64]byte, rounds int) {
v00 := binary.LittleEndian.Uint32(state[0:])
v01 := binary.LittleEndian.Uint32(state[4:])
v02 := binary.LittleEndian.Uint32(state[8:])
v03 := binary.LittleEndian.Uint32(state[12:])
v04 := binary.LittleEndian.Uint32(state[16:])
v05 := binary.LittleEndian.Uint32(state[20:])
v06 := binary.LittleEndian.Uint32(state[24:])
v07 := binary.LittleEndian.Uint32(state[28:])
v08 := binary.LittleEndian.Uint32(state[32:])
v09 := binary.LittleEndian.Uint32(state[36:])
v10 := binary.LittleEndian.Uint32(state[40:])
v11 := binary.LittleEndian.Uint32(state[44:])
v12 := binary.LittleEndian.Uint32(state[48:])
v13 := binary.LittleEndian.Uint32(state[52:])
v14 := binary.LittleEndian.Uint32(state[56:])
v15 := binary.LittleEndian.Uint32(state[60:])
s00, s01, s02, s03, s04, s05, s06, s07 := v00, v01, v02, v03, v04, v05, v06, v07
s08, s09, s10, s11, s12, s13, s14, s15 := v08, v09, v10, v11, v12, v13, v14, v15
for i := 0; i < rounds; i += 2 {
v00 += v04
v12 ^= v00
v12 = (v12 << 16) | (v12 >> 16)
v08 += v12
v04 ^= v08
v04 = (v04 << 12) | (v04 >> 20)
v00 += v04
v12 ^= v00
v12 = (v12 << 8) | (v12 >> 24)
v08 += v12
v04 ^= v08
v04 = (v04 << 7) | (v04 >> 25)
v01 += v05
v13 ^= v01
v13 = (v13 << 16) | (v13 >> 16)
v09 += v13
v05 ^= v09
v05 = (v05 << 12) | (v05 >> 20)
v01 += v05
v13 ^= v01
v13 = (v13 << 8) | (v13 >> 24)
v09 += v13
v05 ^= v09
v05 = (v05 << 7) | (v05 >> 25)
v02 += v06
v14 ^= v02
v14 = (v14 << 16) | (v14 >> 16)
v10 += v14
v06 ^= v10
v06 = (v06 << 12) | (v06 >> 20)
v02 += v06
v14 ^= v02
v14 = (v14 << 8) | (v14 >> 24)
v10 += v14
v06 ^= v10
v06 = (v06 << 7) | (v06 >> 25)
v03 += v07
v15 ^= v03
v15 = (v15 << 16) | (v15 >> 16)
v11 += v15
v07 ^= v11
v07 = (v07 << 12) | (v07 >> 20)
v03 += v07
v15 ^= v03
v15 = (v15 << 8) | (v15 >> 24)
v11 += v15
v07 ^= v11
v07 = (v07 << 7) | (v07 >> 25)
v00 += v05
v15 ^= v00
v15 = (v15 << 16) | (v15 >> 16)
v10 += v15
v05 ^= v10
v05 = (v05 << 12) | (v05 >> 20)
v00 += v05
v15 ^= v00
v15 = (v15 << 8) | (v15 >> 24)
v10 += v15
v05 ^= v10
v05 = (v05 << 7) | (v05 >> 25)
v01 += v06
v12 ^= v01
v12 = (v12 << 16) | (v12 >> 16)
v11 += v12
v06 ^= v11
v06 = (v06 << 12) | (v06 >> 20)
v01 += v06
v12 ^= v01
v12 = (v12 << 8) | (v12 >> 24)
v11 += v12
v06 ^= v11
v06 = (v06 << 7) | (v06 >> 25)
v02 += v07
v13 ^= v02
v13 = (v13 << 16) | (v13 >> 16)
v08 += v13
v07 ^= v08
v07 = (v07 << 12) | (v07 >> 20)
v02 += v07
v13 ^= v02
v13 = (v13 << 8) | (v13 >> 24)
v08 += v13
v07 ^= v08
v07 = (v07 << 7) | (v07 >> 25)
v03 += v04
v14 ^= v03
v14 = (v14 << 16) | (v14 >> 16)
v09 += v14
v04 ^= v09
v04 = (v04 << 12) | (v04 >> 20)
v03 += v04
v14 ^= v03
v14 = (v14 << 8) | (v14 >> 24)
v09 += v14
v04 ^= v09
v04 = (v04 << 7) | (v04 >> 25)
}
v00 += s00
v01 += s01
v02 += s02
v03 += s03
v04 += s04
v05 += s05
v06 += s06
v07 += s07
v08 += s08
v09 += s09
v10 += s10
v11 += s11
v12 += s12
v13 += s13
v14 += s14
v15 += s15
s12++
binary.LittleEndian.PutUint32(state[48:], s12)
if s12 == 0 { // indicates overflow
s13++
binary.LittleEndian.PutUint32(state[52:], s13)
}
binary.LittleEndian.PutUint32(dst[0:], v00)
binary.LittleEndian.PutUint32(dst[4:], v01)
binary.LittleEndian.PutUint32(dst[8:], v02)
binary.LittleEndian.PutUint32(dst[12:], v03)
binary.LittleEndian.PutUint32(dst[16:], v04)
binary.LittleEndian.PutUint32(dst[20:], v05)
binary.LittleEndian.PutUint32(dst[24:], v06)
binary.LittleEndian.PutUint32(dst[28:], v07)
binary.LittleEndian.PutUint32(dst[32:], v08)
binary.LittleEndian.PutUint32(dst[36:], v09)
binary.LittleEndian.PutUint32(dst[40:], v10)
binary.LittleEndian.PutUint32(dst[44:], v11)
binary.LittleEndian.PutUint32(dst[48:], v12)
binary.LittleEndian.PutUint32(dst[52:], v13)
binary.LittleEndian.PutUint32(dst[56:], v14)
binary.LittleEndian.PutUint32(dst[60:], v15)
}
func hChaCha20Generic(out *[32]byte, nonce *[16]byte, key *[32]byte) {
v00 := sigma[0]
v01 := sigma[1]
v02 := sigma[2]
v03 := sigma[3]
v04 := binary.LittleEndian.Uint32(key[0:])
v05 := binary.LittleEndian.Uint32(key[4:])
v06 := binary.LittleEndian.Uint32(key[8:])
v07 := binary.LittleEndian.Uint32(key[12:])
v08 := binary.LittleEndian.Uint32(key[16:])
v09 := binary.LittleEndian.Uint32(key[20:])
v10 := binary.LittleEndian.Uint32(key[24:])
v11 := binary.LittleEndian.Uint32(key[28:])
v12 := binary.LittleEndian.Uint32(nonce[0:])
v13 := binary.LittleEndian.Uint32(nonce[4:])
v14 := binary.LittleEndian.Uint32(nonce[8:])
v15 := binary.LittleEndian.Uint32(nonce[12:])
for i := 0; i < 20; i += 2 {
v00 += v04
v12 ^= v00
v12 = (v12 << 16) | (v12 >> 16)
v08 += v12
v04 ^= v08
v04 = (v04 << 12) | (v04 >> 20)
v00 += v04
v12 ^= v00
v12 = (v12 << 8) | (v12 >> 24)
v08 += v12
v04 ^= v08
v04 = (v04 << 7) | (v04 >> 25)
v01 += v05
v13 ^= v01
v13 = (v13 << 16) | (v13 >> 16)
v09 += v13
v05 ^= v09
v05 = (v05 << 12) | (v05 >> 20)
v01 += v05
v13 ^= v01
v13 = (v13 << 8) | (v13 >> 24)
v09 += v13
v05 ^= v09
v05 = (v05 << 7) | (v05 >> 25)
v02 += v06
v14 ^= v02
v14 = (v14 << 16) | (v14 >> 16)
v10 += v14
v06 ^= v10
v06 = (v06 << 12) | (v06 >> 20)
v02 += v06
v14 ^= v02
v14 = (v14 << 8) | (v14 >> 24)
v10 += v14
v06 ^= v10
v06 = (v06 << 7) | (v06 >> 25)
v03 += v07
v15 ^= v03
v15 = (v15 << 16) | (v15 >> 16)
v11 += v15
v07 ^= v11
v07 = (v07 << 12) | (v07 >> 20)
v03 += v07
v15 ^= v03
v15 = (v15 << 8) | (v15 >> 24)
v11 += v15
v07 ^= v11
v07 = (v07 << 7) | (v07 >> 25)
v00 += v05
v15 ^= v00
v15 = (v15 << 16) | (v15 >> 16)
v10 += v15
v05 ^= v10
v05 = (v05 << 12) | (v05 >> 20)
v00 += v05
v15 ^= v00
v15 = (v15 << 8) | (v15 >> 24)
v10 += v15
v05 ^= v10
v05 = (v05 << 7) | (v05 >> 25)
v01 += v06
v12 ^= v01
v12 = (v12 << 16) | (v12 >> 16)
v11 += v12
v06 ^= v11
v06 = (v06 << 12) | (v06 >> 20)
v01 += v06
v12 ^= v01
v12 = (v12 << 8) | (v12 >> 24)
v11 += v12
v06 ^= v11
v06 = (v06 << 7) | (v06 >> 25)
v02 += v07
v13 ^= v02
v13 = (v13 << 16) | (v13 >> 16)
v08 += v13
v07 ^= v08
v07 = (v07 << 12) | (v07 >> 20)
v02 += v07
v13 ^= v02
v13 = (v13 << 8) | (v13 >> 24)
v08 += v13
v07 ^= v08
v07 = (v07 << 7) | (v07 >> 25)
v03 += v04
v14 ^= v03
v14 = (v14 << 16) | (v14 >> 16)
v09 += v14
v04 ^= v09
v04 = (v04 << 12) | (v04 >> 20)
v03 += v04
v14 ^= v03
v14 = (v14 << 8) | (v14 >> 24)
v09 += v14
v04 ^= v09
v04 = (v04 << 7) | (v04 >> 25)
}
binary.LittleEndian.PutUint32(out[0:], v00)
binary.LittleEndian.PutUint32(out[4:], v01)
binary.LittleEndian.PutUint32(out[8:], v02)
binary.LittleEndian.PutUint32(out[12:], v03)
binary.LittleEndian.PutUint32(out[16:], v12)
binary.LittleEndian.PutUint32(out[20:], v13)
binary.LittleEndian.PutUint32(out[24:], v14)
binary.LittleEndian.PutUint32(out[28:], v15)
}

33
vendor/github.com/aead/chacha20/chacha/chacha_ref.go generated vendored Normal file
View file

@ -0,0 +1,33 @@
// Copyright (c) 2016 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build !amd64,!386 gccgo appengine nacl
package chacha
import "encoding/binary"
func init() {
useSSE2 = false
useSSSE3 = false
useAVX = false
useAVX2 = false
}
func initialize(state *[64]byte, key []byte, nonce *[16]byte) {
binary.LittleEndian.PutUint32(state[0:], sigma[0])
binary.LittleEndian.PutUint32(state[4:], sigma[1])
binary.LittleEndian.PutUint32(state[8:], sigma[2])
binary.LittleEndian.PutUint32(state[12:], sigma[3])
copy(state[16:], key[:])
copy(state[48:], nonce[:])
}
func xorKeyStream(dst, src []byte, block, state *[64]byte, rounds int) int {
return xorKeyStreamGeneric(dst, src, block, state, rounds)
}
func hChaCha20(out *[32]byte, nonce *[16]byte, key *[32]byte) {
hChaCha20Generic(out, nonce, key)
}

53
vendor/github.com/aead/chacha20/chacha/const.s generated vendored Normal file
View file

@ -0,0 +1,53 @@
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl
#include "textflag.h"
DATA ·sigma<>+0x00(SB)/4, $0x61707865
DATA ·sigma<>+0x04(SB)/4, $0x3320646e
DATA ·sigma<>+0x08(SB)/4, $0x79622d32
DATA ·sigma<>+0x0C(SB)/4, $0x6b206574
GLOBL ·sigma<>(SB), (NOPTR+RODATA), $16 // The 4 ChaCha initialization constants
// SSE2/SSE3/AVX constants
DATA ·one<>+0x00(SB)/8, $1
DATA ·one<>+0x08(SB)/8, $0
GLOBL ·one<>(SB), (NOPTR+RODATA), $16 // The constant 1 as 128 bit value
DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302
DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL ·rol16<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 16 bit left rotate constant
DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003
DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·rol8<>(SB), (NOPTR+RODATA), $16 // The PSHUFB 8 bit left rotate constant
// AVX2 constants
DATA ·one_AVX2<>+0x00(SB)/8, $0
DATA ·one_AVX2<>+0x08(SB)/8, $0
DATA ·one_AVX2<>+0x10(SB)/8, $1
DATA ·one_AVX2<>+0x18(SB)/8, $0
GLOBL ·one_AVX2<>(SB), (NOPTR+RODATA), $32 // The constant 1 as 256 bit value
DATA ·two_AVX2<>+0x00(SB)/8, $2
DATA ·two_AVX2<>+0x08(SB)/8, $0
DATA ·two_AVX2<>+0x10(SB)/8, $2
DATA ·two_AVX2<>+0x18(SB)/8, $0
GLOBL ·two_AVX2<>(SB), (NOPTR+RODATA), $32
DATA ·rol16_AVX2<>+0x00(SB)/8, $0x0504070601000302
DATA ·rol16_AVX2<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
DATA ·rol16_AVX2<>+0x10(SB)/8, $0x0504070601000302
DATA ·rol16_AVX2<>+0x18(SB)/8, $0x0D0C0F0E09080B0A
GLOBL ·rol16_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 16 bit left rotate constant
DATA ·rol8_AVX2<>+0x00(SB)/8, $0x0605040702010003
DATA ·rol8_AVX2<>+0x08(SB)/8, $0x0E0D0C0F0A09080B
DATA ·rol8_AVX2<>+0x10(SB)/8, $0x0605040702010003
DATA ·rol8_AVX2<>+0x18(SB)/8, $0x0E0D0C0F0A09080B
GLOBL ·rol8_AVX2<>(SB), (NOPTR+RODATA), $32 // The VPSHUFB 8 bit left rotate constant

163
vendor/github.com/aead/chacha20/chacha/macro.s generated vendored Normal file
View file

@ -0,0 +1,163 @@
// Copyright (c) 2018 Andreas Auernhammer. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
// +build 386,!gccgo,!appengine,!nacl amd64,!gccgo,!appengine,!nacl
// ROTL_SSE rotates all 4 32 bit values of the XMM register v
// left by n bits using SSE2 instructions (0 <= n <= 32).
// The XMM register t is used as a temp. register.
#define ROTL_SSE(n, t, v) \
MOVO v, t; \
PSLLL $n, t; \
PSRLL $(32-n), v; \
PXOR t, v
// ROTL_AVX rotates all 4/8 32 bit values of the AVX/AVX2 register v
// left by n bits using AVX/AVX2 instructions (0 <= n <= 32).
// The AVX/AVX2 register t is used as a temp. register.
#define ROTL_AVX(n, t, v) \
VPSLLD $n, v, t; \
VPSRLD $(32-n), v, v; \
VPXOR v, t, v
// CHACHA_QROUND_SSE2 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses only ROTL_SSE2 for
// rotations. The XMM register t is used as a temp. register.
#define CHACHA_QROUND_SSE2(v0, v1, v2, v3, t) \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE(16, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE(12, t, v1); \
PADDL v1, v0; \
PXOR v0, v3; \
ROTL_SSE(8, t, v3); \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE(7, t, v1)
// CHACHA_QROUND_SSSE3 performs a ChaCha quarter-round using the
// 4 XMM registers v0, v1, v2 and v3. It uses PSHUFB for 8/16 bit
// rotations. The XMM register t is used as a temp. register.
//
// r16 holds the PSHUFB constant for a 16 bit left rotate.
// r8 holds the PSHUFB constant for a 8 bit left rotate.
#define CHACHA_QROUND_SSSE3(v0, v1, v2, v3, t, r16, r8) \
PADDL v1, v0; \
PXOR v0, v3; \
PSHUFB r16, v3; \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE(12, t, v1); \
PADDL v1, v0; \
PXOR v0, v3; \
PSHUFB r8, v3; \
PADDL v3, v2; \
PXOR v2, v1; \
ROTL_SSE(7, t, v1)
// CHACHA_QROUND_AVX performs a ChaCha quarter-round using the
// 4 AVX/AVX2 registers v0, v1, v2 and v3. It uses VPSHUFB for 8/16 bit
// rotations. The AVX/AVX2 register t is used as a temp. register.
//
// r16 holds the VPSHUFB constant for a 16 bit left rotate.
// r8 holds the VPSHUFB constant for a 8 bit left rotate.
#define CHACHA_QROUND_AVX(v0, v1, v2, v3, t, r16, r8) \
VPADDD v0, v1, v0; \
VPXOR v3, v0, v3; \
VPSHUFB r16, v3, v3; \
VPADDD v2, v3, v2; \
VPXOR v1, v2, v1; \
ROTL_AVX(12, t, v1); \
VPADDD v0, v1, v0; \
VPXOR v3, v0, v3; \
VPSHUFB r8, v3, v3; \
VPADDD v2, v3, v2; \
VPXOR v1, v2, v1; \
ROTL_AVX(7, t, v1)
// CHACHA_SHUFFLE_SSE performs a ChaCha shuffle using the
// 3 XMM registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_SSE(v3, v2, v1).
#define CHACHA_SHUFFLE_SSE(v1, v2, v3) \
PSHUFL $0x39, v1, v1; \
PSHUFL $0x4E, v2, v2; \
PSHUFL $0x93, v3, v3
// CHACHA_SHUFFLE_AVX performs a ChaCha shuffle using the
// 3 AVX/AVX2 registers v1, v2 and v3. The inverse shuffle is
// performed by switching v1 and v3: CHACHA_SHUFFLE_AVX(v3, v2, v1).
#define CHACHA_SHUFFLE_AVX(v1, v2, v3) \
VPSHUFD $0x39, v1, v1; \
VPSHUFD $0x4E, v2, v2; \
VPSHUFD $0x93, v3, v3
// XOR_SSE extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding XMM
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_SSE(dst, src, off, v0, v1, v2, v3, t) \
MOVOU 0+off(src), t; \
PXOR v0, t; \
MOVOU t, 0+off(dst); \
MOVOU 16+off(src), t; \
PXOR v1, t; \
MOVOU t, 16+off(dst); \
MOVOU 32+off(src), t; \
PXOR v2, t; \
MOVOU t, 32+off(dst); \
MOVOU 48+off(src), t; \
PXOR v3, t; \
MOVOU t, 48+off(dst)
// XOR_AVX extracts 4x16 byte vectors from src at
// off, xors all vectors with the corresponding AVX
// register (v0 - v3) and writes the result to dst
// at off.
// The XMM register t is used as a temp. register.
#define XOR_AVX(dst, src, off, v0, v1, v2, v3, t) \
VPXOR 0+off(src), v0, t; \
VMOVDQU t, 0+off(dst); \
VPXOR 16+off(src), v1, t; \
VMOVDQU t, 16+off(dst); \
VPXOR 32+off(src), v2, t; \
VMOVDQU t, 32+off(dst); \
VPXOR 48+off(src), v3, t; \
VMOVDQU t, 48+off(dst)
#define XOR_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
VMOVDQU (0+off)(src), t0; \
VPERM2I128 $32, v1, v0, t1; \
VPXOR t0, t1, t0; \
VMOVDQU t0, (0+off)(dst); \
VMOVDQU (32+off)(src), t0; \
VPERM2I128 $32, v3, v2, t1; \
VPXOR t0, t1, t0; \
VMOVDQU t0, (32+off)(dst); \
VMOVDQU (64+off)(src), t0; \
VPERM2I128 $49, v1, v0, t1; \
VPXOR t0, t1, t0; \
VMOVDQU t0, (64+off)(dst); \
VMOVDQU (96+off)(src), t0; \
VPERM2I128 $49, v3, v2, t1; \
VPXOR t0, t1, t0; \
VMOVDQU t0, (96+off)(dst)
#define XOR_UPPER_AVX2(dst, src, off, v0, v1, v2, v3, t0, t1) \
VMOVDQU (0+off)(src), t0; \
VPERM2I128 $32, v1, v0, t1; \
VPXOR t0, t1, t0; \
VMOVDQU t0, (0+off)(dst); \
VMOVDQU (32+off)(src), t0; \
VPERM2I128 $32, v3, v2, t1; \
VPXOR t0, t1, t0; \
VMOVDQU t0, (32+off)(dst); \
#define EXTRACT_LOWER(dst, v0, v1, v2, v3, t0) \
VPERM2I128 $49, v1, v0, t0; \
VMOVDQU t0, 0(dst); \
VPERM2I128 $49, v3, v2, t0; \
VMOVDQU t0, 32(dst)

2
vendor/modules.txt vendored
View file

@ -15,6 +15,8 @@ blitter.com/go/kyber
blitter.com/go/mtwist
# blitter.com/go/newhope v0.0.0-20200130200750-192fc08a8aae
blitter.com/go/newhope
# github.com/aead/chacha20 v0.0.0-20180709150244-8b13a72661da
github.com/aead/chacha20/chacha
# github.com/jameskeane/bcrypt v0.0.0-20120420032655-c3cd44c1e20f
github.com/jameskeane/bcrypt
# github.com/klauspost/cpuid v1.2.2

View file

@ -624,7 +624,7 @@ func main() {
flag.BoolVar(&vopt, "v", false, "show version")
flag.BoolVar(&dbg, "d", false, "debug logging")
flag.StringVar(&cipherAlg, "c", "C_AES_256", "session `cipher` [C_AES_256 | C_TWOFISH_128 | C_BLOWFISH_64 | C_CRYPTMT1]")
flag.StringVar(&cipherAlg, "c", "C_AES_256", "session `cipher` [C_AES_256 | C_TWOFISH_128 | C_BLOWFISH_64 | C_CRYPTMT1 | C_CHACHA20_12]")
flag.StringVar(&hmacAlg, "m", "H_SHA256", "session `HMAC` [H_SHA256 | H_SHA512]")
flag.StringVar(&kexAlg, "k", "KEX_HERRADURA512", "KEx `alg` [KEX_HERRADURA{256/512/1024/2048} | KEX_KYBER{512/768/1024} | KEX_NEWHOPE | KEX_NEWHOPE_SIMPLE]")
flag.StringVar(&kcpMode, "K", "unused", "KCP `alg`, one of [KCP_NONE | KCP_AES | KCP_BLOWFISH | KCP_CAST5 | KCP_SM4 | KCP_SALSA20 | KCP_SIMPLEXOR | KCP_TEA | KCP_3DES | KCP_TWOFISH | KCP_XTEA] to use KCP (github.com/xtaci/kcp-go) reliable UDP instead of TCP")

View file

@ -21,6 +21,7 @@ import (
"log"
"blitter.com/go/cryptmt"
"github.com/aead/chacha20/chacha"
"golang.org/x/crypto/blowfish"
"golang.org/x/crypto/twofish"
@ -104,6 +105,18 @@ func (hc Conn) getStream(keymat []byte) (rc cipher.Stream, mc hash.Hash, err err
case CAlgCryptMT1:
rc = cryptmt.New(nil, nil, keymat)
log.Printf("[cipher CRYPTMT1 (%d)]\n", copts)
case CAlgChaCha20_12:
keymat = expandKeyMat(keymat, chacha.KeySize)
key = keymat[0:chacha.KeySize]
ivlen = chacha.INonceSize
iv = keymat[chacha.KeySize : chacha.KeySize+ivlen]
rc, err = chacha.NewCipher(iv, key, 20)
if err != nil {
log.Printf("[ChaCha20 config error]\n")
fmt.Printf("[ChaCha20 config error]\n")
}
// TODO: SetCounter() to something derived from key or nonce or extra keymat?
log.Printf("[cipher CHACHA20_12 (%d)]\n", copts)
default:
log.Printf("[invalid cipher (%d)]\n", copts)
fmt.Printf("DOOFUS SET A VALID CIPHER ALG (%d)\n", copts)

View file

@ -99,6 +99,7 @@ const (
CAlgTwofish128 // golang.org/x/crypto/twofish
CAlgBlowfish64 // golang.org/x/crypto/blowfish
CAlgCryptMT1 //cryptmt using mtwist64
CAlgChaCha20_12
CAlgNoneDisallowed
)

View file

@ -145,6 +145,8 @@ func (c *CSCipherAlg) String() string {
return "C_BLOWFISH_64"
case CAlgCryptMT1:
return "C_CRYPTMT1"
case CAlgChaCha20_12:
return "C_CHACHA20_12"
default:
return "C_ERR_UNK"
}
@ -280,6 +282,8 @@ func _new(kexAlg KEXAlg, conn *net.Conn) (hc *Conn, e error) {
hc.kex = KEX_HERRADURA512
log.Printf("[KEx alg %d ?? defaults to %d]\n", kexAlg, hc.kex)
}
//hc.logCipherText = true // !!! DEBUGGING ONLY !!! NEVER DEPLOY this uncommented !!!
return
}
@ -298,7 +302,7 @@ func _new(kexAlg KEXAlg, conn *net.Conn) (hc *Conn, e error) {
//
// Session (symmetric) crypto
//
// C_AES_256 C_TWOFISH_128 C_BLOWFISH_128 C_CRYPTMT1
// C_AES_256 C_TWOFISH_128 C_BLOWFISH_128 C_CRYPTMT1 C_CHACHA20_12
//
// Session HMACs
//
@ -322,6 +326,10 @@ func (hc *Conn) applyConnExtensions(extensions ...string) {
log.Println("[extension arg = C_CRYPTMT1]")
hc.cipheropts &= (0xFFFFFF00)
hc.cipheropts |= CAlgCryptMT1
case "C_CHACHA20_12":
log.Println("[extension arg = C_CHACHA20_12]")
hc.cipheropts &= (0xFFFFFF00)
hc.cipheropts |= CAlgChaCha20_12
case "H_SHA256":
log.Println("[extension arg = H_SHA256]")
hc.cipheropts &= (0xFFFF00FF)