From 4f11593f2618e4fadc2af5aa743d9a5b48fde37c Mon Sep 17 00:00:00 2001 From: "arraykeys@gmail.com" Date: Mon, 7 May 2018 16:21:58 +0800 Subject: [PATCH] Signed-off-by: arraykeys@gmail.com --- CHANGELOG | 5 + Godeps/Godeps.json | 140 +- config.go | 2 + services/args.go | 2 + services/sps.go | 70 +- utils/functions.go | 57 +- utils/serve-channel.go | 34 +- utils/ss/conn.go | 186 +++ utils/ss/encrypt.go | 274 ++++ utils/ss/leakybuf.go | 45 + utils/ss/pipe.go | 105 ++ utils/ss/util.go | 124 ++ utils/structs.go | 36 +- vendor/github.com/Yawning/chacha20/LICENSE | 122 ++ vendor/github.com/Yawning/chacha20/README.md | 14 + .../github.com/Yawning/chacha20/chacha20.go | 273 ++++ .../Yawning/chacha20/chacha20_amd64.go | 95 ++ .../Yawning/chacha20/chacha20_amd64.py | 1295 +++++++++++++++++ .../Yawning/chacha20/chacha20_amd64.s | 1180 +++++++++++++++ .../Yawning/chacha20/chacha20_ref.go | 394 +++++ .../Yawning/chacha20/chacha20_ref_go19.go | 395 +++++ 21 files changed, 4729 insertions(+), 119 deletions(-) create mode 100644 utils/ss/conn.go create mode 100644 utils/ss/encrypt.go create mode 100644 utils/ss/leakybuf.go create mode 100644 utils/ss/pipe.go create mode 100644 utils/ss/util.go create mode 100644 vendor/github.com/Yawning/chacha20/LICENSE create mode 100644 vendor/github.com/Yawning/chacha20/README.md create mode 100644 vendor/github.com/Yawning/chacha20/chacha20.go create mode 100644 vendor/github.com/Yawning/chacha20/chacha20_amd64.go create mode 100644 vendor/github.com/Yawning/chacha20/chacha20_amd64.py create mode 100644 vendor/github.com/Yawning/chacha20/chacha20_amd64.s create mode 100644 vendor/github.com/Yawning/chacha20/chacha20_ref.go create mode 100644 vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go diff --git a/CHANGELOG b/CHANGELOG index 923a68b..b2e0409 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,4 +1,9 @@ proxy更新日志 +v4.8 +1.优化了SPS连接HTTP上级的指令,避免了某些代理不响应的问题. + + + v4.7 1.增加了基于gomobile的sdk,对android/ios/windows/linux/mac提供SDK支持. 2.优化了bridge的日志,增加了client和server的掉线日志. diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 3757dd0..0f6e52d 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -1,11 +1,27 @@ { "ImportPath": "github.com/snail007/goproxy", - "GoVersion": "go1.9", + "GoVersion": "go1.8", "GodepVersion": "v80", "Packages": [ "./..." 
], "Deps": [ + { + "ImportPath": "github.com/Yawning/chacha20", + "Rev": "e3b1f968fc6397b51d963fee8ec8711a47bc0ce8" + }, + { + "ImportPath": "github.com/alecthomas/template", + "Rev": "a0175ee3bccc567396460bf5acd36800cb10c49c" + }, + { + "ImportPath": "github.com/alecthomas/template/parse", + "Rev": "a0175ee3bccc567396460bf5acd36800cb10c49c" + }, + { + "ImportPath": "github.com/alecthomas/units", + "Rev": "2efee857e7cfd4f3d0138cc3cbb1b4966962b93a" + }, { "ImportPath": "github.com/golang/snappy", "Rev": "553a641470496b2327abcac10b36396bd98e45c9" @@ -15,66 +31,15 @@ "Comment": "v1.0.4-1-g40b5202", "Rev": "40b520211179dbf7eaafaa7fe1ffaa1b7d929ee0" }, - { - "ImportPath": "github.com/xtaci/kcp-go", - "Comment": "v3.19-6-g21da33a", - "Rev": "21da33a6696d67c1bffb3c954366499d613097a6" - }, - { - "ImportPath": "github.com/xtaci/smux", - "Comment": "v1.0.6", - "Rev": "ebec7ef2574b42a7088cd7751176483e0a27d458" - }, - { - "ImportPath": "golang.org/x/crypto/pbkdf2", - "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" - }, - { - "ImportPath": "golang.org/x/crypto/ssh", - "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" - }, - { - "ImportPath": "golang.org/x/time/rate", - "Rev": "6dc17368e09b0e8634d71cac8168d853e869a0c7" - }, - { - "ImportPath": "gopkg.in/alecthomas/kingpin.v2", - "Comment": "v2.2.5", - "Rev": "1087e65c9441605df944fb12c33f0fe7072d18ca" - }, - { - "ImportPath": "golang.org/x/crypto/ed25519", - "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" - }, - { - "ImportPath": "golang.org/x/net/ipv4", - "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" - }, - { - "ImportPath": "golang.org/x/net/ipv6", - "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" - }, - { - "ImportPath": "golang.org/x/crypto/ed25519/internal/edwards25519", - "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" - }, - { - "ImportPath": "golang.org/x/net/bpf", - "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" - }, - { - "ImportPath": "golang.org/x/net/internal/iana", - "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" - }, - { - "ImportPath": "golang.org/x/net/internal/socket", - "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" - }, { "ImportPath": "github.com/pkg/errors", "Comment": "v0.8.0-6-g602255c", "Rev": "602255cdb6deaf1523ea53ac30eae5554ba7bee9" }, + { + "ImportPath": "github.com/templexxx/cpufeat", + "Rev": "3794dfbfb04749f896b521032f69383f24c3687e" + }, { "ImportPath": "github.com/templexxx/reedsolomon", "Comment": "0.1.1-4-g7092926", @@ -90,6 +55,16 @@ "Comment": "v1.0.1-3-g9d99fac", "Rev": "9d99face20b0dd300b7db50b3f69758de41c096a" }, + { + "ImportPath": "github.com/xtaci/kcp-go", + "Comment": "v3.19-6-g21da33a", + "Rev": "21da33a6696d67c1bffb3c954366499d613097a6" + }, + { + "ImportPath": "github.com/xtaci/smux", + "Comment": "v1.0.6", + "Rev": "ebec7ef2574b42a7088cd7751176483e0a27d458" + }, { "ImportPath": "golang.org/x/crypto/blowfish", "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" @@ -98,10 +73,34 @@ "ImportPath": "golang.org/x/crypto/cast5", "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" }, + { + "ImportPath": "golang.org/x/crypto/curve25519", + "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" + }, + { + "ImportPath": "golang.org/x/crypto/ed25519", + "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" + }, + { + "ImportPath": "golang.org/x/crypto/ed25519/internal/edwards25519", + "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" + }, + { + "ImportPath": "golang.org/x/crypto/pbkdf2", + "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" + }, { "ImportPath": "golang.org/x/crypto/salsa20", "Rev": 
"0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" }, + { + "ImportPath": "golang.org/x/crypto/salsa20/salsa", + "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" + }, + { + "ImportPath": "golang.org/x/crypto/ssh", + "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" + }, { "ImportPath": "golang.org/x/crypto/tea", "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" @@ -115,28 +114,33 @@ "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" }, { - "ImportPath": "github.com/templexxx/cpufeat", - "Rev": "3794dfbfb04749f896b521032f69383f24c3687e" + "ImportPath": "golang.org/x/net/bpf", + "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" }, { - "ImportPath": "golang.org/x/crypto/salsa20/salsa", - "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" + "ImportPath": "golang.org/x/net/internal/iana", + "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" }, { - "ImportPath": "golang.org/x/crypto/curve25519", - "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8" + "ImportPath": "golang.org/x/net/internal/socket", + "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" }, { - "ImportPath": "github.com/alecthomas/template", - "Rev": "a0175ee3bccc567396460bf5acd36800cb10c49c" + "ImportPath": "golang.org/x/net/ipv4", + "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" }, { - "ImportPath": "github.com/alecthomas/units", - "Rev": "2efee857e7cfd4f3d0138cc3cbb1b4966962b93a" + "ImportPath": "golang.org/x/net/ipv6", + "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec" }, { - "ImportPath": "github.com/alecthomas/template/parse", - "Rev": "a0175ee3bccc567396460bf5acd36800cb10c49c" + "ImportPath": "golang.org/x/time/rate", + "Rev": "6dc17368e09b0e8634d71cac8168d853e869a0c7" + }, + { + "ImportPath": "gopkg.in/alecthomas/kingpin.v2", + "Comment": "v2.2.5", + "Rev": "1087e65c9441605df944fb12c33f0fe7072d18ca" } ] } diff --git a/config.go b/config.go index 16e0079..f43728f 100755 --- a/config.go +++ b/config.go @@ -242,6 +242,8 @@ func initConfig() (err error) { spsArgs.ParentKey = sps.Flag("parent-key", "the password for auto encrypt/decrypt parent connection data").Short('Z').Default("").String() spsArgs.LocalCompress = sps.Flag("local-compress", "auto compress/decompress data on local connection").Short('m').Default("false").Bool() spsArgs.ParentCompress = sps.Flag("parent-compress", "auto compress/decompress data on parent connection").Short('M').Default("false").Bool() + spsArgs.SSMethod = sps.Hidden().Flag("ss-method", "").Short('h').Default("aes-256-cfb").String() + spsArgs.SSKey = sps.Hidden().Flag("ss-key", "").Short('j').Default("sspassword").String() //parse args serviceName := kingpin.MustParse(app.Parse(os.Args[1:])) diff --git a/services/args.go b/services/args.go index 3d097a2..59c4e0e 100644 --- a/services/args.go +++ b/services/args.go @@ -232,6 +232,8 @@ type SPSArgs struct { ParentKey *string LocalCompress *bool ParentCompress *bool + SSMethod *string + SSKey *string } func (a *SPSArgs) Protocol() string { diff --git a/services/sps.go b/services/sps.go index d41f72a..be3ff5a 100644 --- a/services/sps.go +++ b/services/sps.go @@ -16,6 +16,7 @@ import ( "github.com/snail007/goproxy/utils" "github.com/snail007/goproxy/utils/conncrypt" "github.com/snail007/goproxy/utils/socks" + "src/github.com/snail007/goproxy/utils/ss" ) type SPS struct { @@ -26,6 +27,7 @@ type SPS struct { serverChannels []*utils.ServerChannel userConns utils.ConcurrentMap log *logger.Logger + cipher *ss.Cipher } func NewSPS() Service { @@ -67,6 +69,13 @@ func (s *SPS) InitService() (err error) { (*s).domainResolver = 
utils.NewDomainResolver(*s.cfg.DNSAddress, *s.cfg.DNSTTL) } err = s.InitBasicAuth() + if *s.cfg.SSMethod != "" && *s.cfg.SSKey != "" { + s.cipher, err = ss.NewCipher(*s.cfg.SSMethod, *s.cfg.SSKey) + if err != nil { + s.log.Printf("error generating cipher : %s", err) + return + } + } return } func (s *SPS) InitOutConnPool() { @@ -171,39 +180,52 @@ func (s *SPS) callback(inConn net.Conn) { } } func (s *SPS) OutToTCP(inConn *net.Conn) (err error) { - buf := make([]byte, 1024) - n, err := (*inConn).Read(buf) - header := buf[:n] + bInConn := utils.NewBufferedConn(*inConn) + //important + //action read will regist read event to system, + //when data arrived , system call process + //so that we can get buffered bytes count + //otherwise Buffered() always return 0 + bInConn.ReadByte() + bInConn.UnreadByte() + + n := 8 + if n > bInConn.Buffered() { + n = bInConn.Buffered() + } + h, err := bInConn.Peek(n) if err != nil { - s.log.Printf("ERR:%s", err) - utils.CloseConn(inConn) + s.log.Printf("peek error %s ", err) + (*inConn).Close() return } + + *inConn = bInConn address := "" var auth socks.Auth var forwardBytes []byte //fmt.Printf("%v", header) - if header[0] == socks.VERSION_V5 { + if utils.IsSocks5(h) { //socks5 server var serverConn *socks.ServerConn if s.IsBasicAuth() { - serverConn = socks.NewServerConn(inConn, time.Millisecond*time.Duration(*s.cfg.Timeout), &s.basicAuth, "", header) + serverConn = socks.NewServerConn(inConn, time.Millisecond*time.Duration(*s.cfg.Timeout), &s.basicAuth, "", nil) } else { - serverConn = socks.NewServerConn(inConn, time.Millisecond*time.Duration(*s.cfg.Timeout), nil, "", header) + serverConn = socks.NewServerConn(inConn, time.Millisecond*time.Duration(*s.cfg.Timeout), nil, "", nil) } if err = serverConn.Handshake(); err != nil { return } address = serverConn.Target() auth = serverConn.AuthData() - } else if bytes.IndexByte(header, '\n') != -1 { + } else if utils.IsHTTP(h) { //http var request utils.HTTPRequest (*inConn).SetDeadline(time.Now().Add(time.Millisecond * time.Duration(*s.cfg.Timeout))) if s.IsBasicAuth() { - request, err = utils.NewHTTPRequest(inConn, 1024, true, &s.basicAuth, header) + request, err = utils.NewHTTPRequest(inConn, 1024, true, &s.basicAuth, nil) } else { - request, err = utils.NewHTTPRequest(inConn, 1024, false, nil, header) + request, err = utils.NewHTTPRequest(inConn, 1024, false, nil, nil) } (*inConn).SetDeadline(time.Time{}) if err != nil { @@ -211,7 +233,7 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) { utils.CloseConn(inConn) return } - if len(header) >= 7 && strings.ToLower(string(header[:7])) == "connect" { + if len(h) >= 7 && strings.ToLower(string(h[:7])) == "connect" { //https request.HTTPSReply() //s.log.Printf("https reply: %s", request.Host) @@ -231,7 +253,21 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) { } } } else { - s.log.Printf("unknown request from: %s,%s", (*inConn).RemoteAddr(), string(header)) + //ss + ssConn := ss.NewConn(*inConn, s.cipher.Copy()) + address, err = ss.GetRequest(ssConn) + if err != nil { + return + } + // ensure the host does not contain some illegal characters, NUL may panic on Win32 + if strings.ContainsRune(address, 0x00) { + err = errors.New("invalid domain name") + return + } + *inConn = ssConn + } + if err != nil { + s.log.Printf("unknown request from: %s,%s", (*inConn).RemoteAddr(), string(h)) utils.CloseConn(inConn) err = errors.New("unknown request") return @@ -256,7 +292,7 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) { if *s.cfg.ParentServiceType == 
"http" { //http parent pb := new(bytes.Buffer) - pb.Write([]byte(fmt.Sprintf("CONNECT %s HTTP/1.1\r\nProxy-Connection: Keep-Alive\r\n", address))) + pb.Write([]byte(fmt.Sprintf("CONNECT %s HTTP/1.1\r\nHost:%s\r\nProxy-Connection: Keep-Alive\r\n", address, address))) //Proxy-Authorization:\r\n u := "" if *s.cfg.ParentAuth != "" { @@ -305,12 +341,12 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) { err = fmt.Errorf("parent auth data format error") return } - clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), &socks.Auth{User: a[0], Password: a[1]}, header) + clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), &socks.Auth{User: a[0], Password: a[1]}, nil) } else { if !s.IsBasicAuth() && auth.Password != "" && auth.User != "" { - clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), &auth, header) + clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), &auth, nil) } else { - clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), nil, header) + clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), nil, nil) } } if err = clientConn.Handshake(); err != nil { diff --git a/utils/functions.go b/utils/functions.go index b5fd408..4774774 100755 --- a/utils/functions.go +++ b/utils/functions.go @@ -10,7 +10,6 @@ import ( "encoding/pem" "errors" "fmt" - "github.com/snail007/goproxy/services/kcpcfg" "io" "io/ioutil" "log" @@ -20,13 +19,16 @@ import ( "os" "os/exec" + "github.com/snail007/goproxy/services/kcpcfg" + "golang.org/x/crypto/pbkdf2" - "github.com/snail007/goproxy/utils/id" "strconv" "strings" "time" + "github.com/snail007/goproxy/utils/id" + kcp "github.com/xtaci/kcp-go" ) @@ -68,7 +70,9 @@ func IoBind(dst io.ReadWriteCloser, src io.ReadWriteCloser, fn func(err interfac } src.Close() dst.Close() - fn(err) + if fn != nil { + fn(err) + } }() } func ioCopy(dst io.ReadWriter, src io.ReadWriter) (err error) { @@ -171,33 +175,6 @@ func ConnectKCPHost(hostAndPort string, config kcpcfg.KCPConfigArgs) (conn net.C return NewCompStream(kcpconn), err } -func ListenTls(ip string, port int, certBytes, keyBytes, caCertBytes []byte) (ln *net.Listener, err error) { - - var cert tls.Certificate - cert, err = tls.X509KeyPair(certBytes, keyBytes) - if err != nil { - return - } - clientCertPool := x509.NewCertPool() - caBytes := certBytes - if caCertBytes != nil { - caBytes = caCertBytes - } - ok := clientCertPool.AppendCertsFromPEM(caBytes) - if !ok { - err = errors.New("failed to parse root certificate") - } - config := &tls.Config{ - ClientCAs: clientCertPool, - Certificates: []tls.Certificate{cert}, - ClientAuth: tls.RequireAndVerifyClientCert, - } - _ln, err := tls.Listen("tcp", fmt.Sprintf("%s:%d", ip, port), config) - if err == nil { - ln = &_ln - } - return -} func PathExists(_path string) bool { _, err := os.Stat(_path) if err != nil && os.IsNotExist(err) { @@ -624,6 +601,26 @@ func IsIternalIP(domainOrIP string) bool { } return false } +func IsHTTP(head []byte) bool { + keys := []string{"GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"} + for _, key := range keys { + if bytes.HasPrefix(head, []byte(key)) || bytes.HasPrefix(head, []byte(strings.ToLower(key))) { + return true + } + } + return false +} +func IsSocks5(head []byte) bool { + if len(head) < 3 
{ + return false + } + if head[0] == uint8(0x05) && 0 < int(head[1]) && int(head[1]) < 255 { + if len(head) == 2+int(head[1]) { + return true + } + } + return false +} // type sockaddr struct { // family uint16 diff --git a/utils/serve-channel.go b/utils/serve-channel.go index aefe543..1c1552d 100644 --- a/utils/serve-channel.go +++ b/utils/serve-channel.go @@ -1,13 +1,17 @@ package utils import ( + "crypto/tls" + "crypto/x509" + "errors" "fmt" - "github.com/snail007/goproxy/services/kcpcfg" "log" "net" "runtime/debug" "strconv" + "github.com/snail007/goproxy/services/kcpcfg" + kcp "github.com/xtaci/kcp-go" ) @@ -43,7 +47,7 @@ func (sc *ServerChannel) SetErrAcceptHandler(fn func(err error)) { sc.errAcceptHandler = fn } func (sc *ServerChannel) ListenTls(certBytes, keyBytes, caCertBytes []byte, fn func(conn net.Conn)) (err error) { - sc.Listener, err = ListenTls(sc.ip, sc.port, certBytes, keyBytes, caCertBytes) + sc.Listener, err = sc.listenTls(sc.ip, sc.port, certBytes, keyBytes, caCertBytes) if err == nil { go func() { defer func() { @@ -73,7 +77,33 @@ func (sc *ServerChannel) ListenTls(certBytes, keyBytes, caCertBytes []byte, fn f } return } +func (sc *ServerChannel) listenTls(ip string, port int, certBytes, keyBytes, caCertBytes []byte) (ln *net.Listener, err error) { + var cert tls.Certificate + cert, err = tls.X509KeyPair(certBytes, keyBytes) + if err != nil { + return + } + clientCertPool := x509.NewCertPool() + caBytes := certBytes + if caCertBytes != nil { + caBytes = caCertBytes + } + ok := clientCertPool.AppendCertsFromPEM(caBytes) + if !ok { + err = errors.New("failed to parse root certificate") + } + config := &tls.Config{ + ClientCAs: clientCertPool, + Certificates: []tls.Certificate{cert}, + ClientAuth: tls.RequireAndVerifyClientCert, + } + _ln, err := tls.Listen("tcp", fmt.Sprintf("%s:%d", ip, port), config) + if err == nil { + ln = &_ln + } + return +} func (sc *ServerChannel) ListenTCP(fn func(conn net.Conn)) (err error) { var l net.Listener l, err = net.Listen("tcp", fmt.Sprintf("%s:%d", sc.ip, sc.port)) diff --git a/utils/ss/conn.go b/utils/ss/conn.go new file mode 100644 index 0000000..46513f2 --- /dev/null +++ b/utils/ss/conn.go @@ -0,0 +1,186 @@ +package ss + +import ( + "encoding/binary" + "fmt" + "io" + "net" + "strconv" +) + +const ( + OneTimeAuthMask byte = 0x10 + AddrMask byte = 0xf +) + +type Conn struct { + net.Conn + *Cipher + readBuf []byte + writeBuf []byte + chunkId uint32 +} + +func NewConn(c net.Conn, cipher *Cipher) *Conn { + return &Conn{ + Conn: c, + Cipher: cipher, + readBuf: leakyBuf.Get(), + writeBuf: leakyBuf.Get()} +} + +func (c *Conn) Close() error { + leakyBuf.Put(c.readBuf) + leakyBuf.Put(c.writeBuf) + return c.Conn.Close() +} + +func RawAddr(addr string) (buf []byte, err error) { + host, portStr, err := net.SplitHostPort(addr) + if err != nil { + return nil, fmt.Errorf("ss: address error %s %v", addr, err) + } + port, err := strconv.Atoi(portStr) + if err != nil { + return nil, fmt.Errorf("ss: invalid port %s", addr) + } + + hostLen := len(host) + l := 1 + 1 + hostLen + 2 // addrType + lenByte + address + port + buf = make([]byte, l) + buf[0] = 3 // 3 means the address is domain name + buf[1] = byte(hostLen) // host address length followed by host address + copy(buf[2:], host) + binary.BigEndian.PutUint16(buf[2+hostLen:2+hostLen+2], uint16(port)) + return +} + +// This is intended for use by users implementing a local socks proxy. +// rawaddr shoud contain part of the data in socks request, starting from the +// ATYP field. 
(Refer to rfc1928 for more information.) +func DialWithRawAddr(rawaddr []byte, server string, cipher *Cipher) (c *Conn, err error) { + conn, err := net.Dial("tcp", server) + if err != nil { + return + } + c = NewConn(conn, cipher) + if cipher.ota { + if c.enc == nil { + if _, err = c.initEncrypt(); err != nil { + return + } + } + // since we have initEncrypt, we must send iv manually + conn.Write(cipher.iv) + rawaddr[0] |= OneTimeAuthMask + rawaddr = otaConnectAuth(cipher.iv, cipher.key, rawaddr) + } + if _, err = c.write(rawaddr); err != nil { + c.Close() + return nil, err + } + return +} + +// addr should be in the form of host:port +func Dial(addr, server string, cipher *Cipher) (c *Conn, err error) { + ra, err := RawAddr(addr) + if err != nil { + return + } + return DialWithRawAddr(ra, server, cipher) +} + +func (c *Conn) GetIv() (iv []byte) { + iv = make([]byte, len(c.iv)) + copy(iv, c.iv) + return +} + +func (c *Conn) GetKey() (key []byte) { + key = make([]byte, len(c.key)) + copy(key, c.key) + return +} + +func (c *Conn) IsOta() bool { + return c.ota +} + +func (c *Conn) GetAndIncrChunkId() (chunkId uint32) { + chunkId = c.chunkId + c.chunkId += 1 + return +} + +func (c *Conn) Read(b []byte) (n int, err error) { + if c.dec == nil { + iv := make([]byte, c.info.ivLen) + if _, err = io.ReadFull(c.Conn, iv); err != nil { + return + } + if err = c.initDecrypt(iv); err != nil { + return + } + if len(c.iv) == 0 { + c.iv = iv + } + } + + cipherData := c.readBuf + if len(b) > len(cipherData) { + cipherData = make([]byte, len(b)) + } else { + cipherData = cipherData[:len(b)] + } + + n, err = c.Conn.Read(cipherData) + if n > 0 { + c.decrypt(b[0:n], cipherData[0:n]) + } + return +} + +func (c *Conn) Write(b []byte) (n int, err error) { + nn := len(b) + if c.ota { + chunkId := c.GetAndIncrChunkId() + b = otaReqChunkAuth(c.iv, chunkId, b) + } + headerLen := len(b) - nn + + n, err = c.write(b) + // Make sure <= 0 <= len(b), where b is the slice passed in. + if n >= headerLen { + n -= headerLen + } + return +} + +func (c *Conn) write(b []byte) (n int, err error) { + var iv []byte + if c.enc == nil { + iv, err = c.initEncrypt() + if err != nil { + return + } + } + + cipherData := c.writeBuf + dataSize := len(b) + len(iv) + if dataSize > len(cipherData) { + cipherData = make([]byte, dataSize) + } else { + cipherData = cipherData[:dataSize] + } + + if iv != nil { + // Put initialization vector in buffer, do a single write to send both + // iv and data. + copy(cipherData, iv) + } + + c.encrypt(cipherData[len(iv):], b) + n, err = c.Conn.Write(cipherData) + return +} diff --git a/utils/ss/encrypt.go b/utils/ss/encrypt.go new file mode 100644 index 0000000..6553aa0 --- /dev/null +++ b/utils/ss/encrypt.go @@ -0,0 +1,274 @@ +package ss + +import ( + "crypto/aes" + "crypto/cipher" + "crypto/des" + "crypto/md5" + "crypto/rand" + "crypto/rc4" + "encoding/binary" + "errors" + "io" + "strings" + + "github.com/Yawning/chacha20" + "golang.org/x/crypto/blowfish" + "golang.org/x/crypto/cast5" + "golang.org/x/crypto/salsa20/salsa" +) + +var errEmptyPassword = errors.New("empty key") + +func md5sum(d []byte) []byte { + h := md5.New() + h.Write(d) + return h.Sum(nil) +} + +func evpBytesToKey(password string, keyLen int) (key []byte) { + const md5Len = 16 + + cnt := (keyLen-1)/md5Len + 1 + m := make([]byte, cnt*md5Len) + copy(m, md5sum([]byte(password))) + + // Repeatedly call md5 until bytes generated is enough. + // Each call to md5 uses data: prev md5 sum + password. 
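	// For illustration: the loop below matches OpenSSL's EVP_BytesToKey with MD5
	// and no salt (the scheme the shadowsocks protocol uses). With D1 = MD5(password)
	// and Di = MD5(D(i-1) || password), the derived key is the first keyLen bytes
	// of D1 || D2 || D3 || ...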
+ d := make([]byte, md5Len+len(password)) + start := 0 + for i := 1; i < cnt; i++ { + start += md5Len + copy(d, m[start-md5Len:start]) + copy(d[md5Len:], password) + copy(m[start:], md5sum(d)) + } + return m[:keyLen] +} + +type DecOrEnc int + +const ( + Decrypt DecOrEnc = iota + Encrypt +) + +func newStream(block cipher.Block, err error, key, iv []byte, + doe DecOrEnc) (cipher.Stream, error) { + if err != nil { + return nil, err + } + if doe == Encrypt { + return cipher.NewCFBEncrypter(block, iv), nil + } else { + return cipher.NewCFBDecrypter(block, iv), nil + } +} + +func newAESCFBStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + block, err := aes.NewCipher(key) + return newStream(block, err, key, iv, doe) +} + +func newAESCTRStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + block, err := aes.NewCipher(key) + if err != nil { + return nil, err + } + return cipher.NewCTR(block, iv), nil +} + +func newDESStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + block, err := des.NewCipher(key) + return newStream(block, err, key, iv, doe) +} + +func newBlowFishStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + block, err := blowfish.NewCipher(key) + return newStream(block, err, key, iv, doe) +} + +func newCast5Stream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) { + block, err := cast5.NewCipher(key) + return newStream(block, err, key, iv, doe) +} + +func newRC4MD5Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) { + h := md5.New() + h.Write(key) + h.Write(iv) + rc4key := h.Sum(nil) + + return rc4.NewCipher(rc4key) +} + +func newChaCha20Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) { + return chacha20.NewCipher(key, iv) +} + +func newChaCha20IETFStream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) { + return chacha20.NewCipher(key, iv) +} + +type salsaStreamCipher struct { + nonce [8]byte + key [32]byte + counter int +} + +func (c *salsaStreamCipher) XORKeyStream(dst, src []byte) { + var buf []byte + padLen := c.counter % 64 + dataSize := len(src) + padLen + if cap(dst) >= dataSize { + buf = dst[:dataSize] + } else if leakyBufSize >= dataSize { + buf = leakyBuf.Get() + defer leakyBuf.Put(buf) + buf = buf[:dataSize] + } else { + buf = make([]byte, dataSize) + } + + var subNonce [16]byte + copy(subNonce[:], c.nonce[:]) + binary.LittleEndian.PutUint64(subNonce[len(c.nonce):], uint64(c.counter/64)) + + // It's difficult to avoid data copy here. src or dst maybe slice from + // Conn.Read/Write, which can't have padding. 
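	// To resume mid-block, c.counter tracks how many keystream bytes have been
	// consumed so far: padLen = counter % 64 is the offset inside the current
	// 64-byte Salsa20 block, counter/64 (encoded into subNonce above) selects the
	// block, and the padLen leading bytes of buf are throw-away padding that
	// realigns src with the block boundary before XORing.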
+ copy(buf[padLen:], src[:]) + salsa.XORKeyStream(buf, buf, &subNonce, &c.key) + copy(dst, buf[padLen:]) + + c.counter += len(src) +} + +func newSalsa20Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) { + var c salsaStreamCipher + copy(c.nonce[:], iv[:8]) + copy(c.key[:], key[:32]) + return &c, nil +} + +type cipherInfo struct { + keyLen int + ivLen int + newStream func(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) +} + +var cipherMethod = map[string]*cipherInfo{ + "aes-128-cfb": {16, 16, newAESCFBStream}, + "aes-192-cfb": {24, 16, newAESCFBStream}, + "aes-256-cfb": {32, 16, newAESCFBStream}, + "aes-128-ctr": {16, 16, newAESCTRStream}, + "aes-192-ctr": {24, 16, newAESCTRStream}, + "aes-256-ctr": {32, 16, newAESCTRStream}, + "des-cfb": {8, 8, newDESStream}, + "bf-cfb": {16, 8, newBlowFishStream}, + "cast5-cfb": {16, 8, newCast5Stream}, + "rc4-md5": {16, 16, newRC4MD5Stream}, + "rc4-md5-6": {16, 6, newRC4MD5Stream}, + "chacha20": {32, 8, newChaCha20Stream}, + "chacha20-ietf": {32, 12, newChaCha20IETFStream}, + "salsa20": {32, 8, newSalsa20Stream}, +} + +func CheckCipherMethod(method string) error { + if method == "" { + method = "aes-256-cfb" + } + _, ok := cipherMethod[method] + if !ok { + return errors.New("Unsupported encryption method: " + method) + } + return nil +} + +type Cipher struct { + enc cipher.Stream + dec cipher.Stream + key []byte + info *cipherInfo + ota bool // one-time auth + iv []byte +} + +// NewCipher creates a cipher that can be used in Dial() etc. +// Use cipher.Copy() to create a new cipher with the same method and password +// to avoid the cost of repeated cipher initialization. +func NewCipher(method, password string) (c *Cipher, err error) { + if password == "" { + return nil, errEmptyPassword + } + var ota bool + if strings.HasSuffix(strings.ToLower(method), "-auth") { + method = method[:len(method)-5] // len("-auth") = 5 + ota = true + } else { + ota = false + } + mi, ok := cipherMethod[method] + if !ok { + return nil, errors.New("Unsupported encryption method: " + method) + } + + key := evpBytesToKey(password, mi.keyLen) + + c = &Cipher{key: key, info: mi} + + if err != nil { + return nil, err + } + c.ota = ota + return c, nil +} + +// Initializes the block cipher with CFB mode, returns IV. +func (c *Cipher) initEncrypt() (iv []byte, err error) { + if c.iv == nil { + iv = make([]byte, c.info.ivLen) + if _, err := io.ReadFull(rand.Reader, iv); err != nil { + return nil, err + } + c.iv = iv + } else { + iv = c.iv + } + c.enc, err = c.info.newStream(c.key, iv, Encrypt) + return +} + +func (c *Cipher) initDecrypt(iv []byte) (err error) { + c.dec, err = c.info.newStream(c.key, iv, Decrypt) + return +} + +func (c *Cipher) encrypt(dst, src []byte) { + c.enc.XORKeyStream(dst, src) +} + +func (c *Cipher) decrypt(dst, src []byte) { + c.dec.XORKeyStream(dst, src) +} + +// Copy creates a new cipher at it's initial state. +func (c *Cipher) Copy() *Cipher { + // This optimization maybe not necessary. But without this function, we + // need to maintain a table cache for newTableCipher and use lock to + // protect concurrent access to that cache. + + // AES and DES ciphers does not return specific types, so it's difficult + // to create copy. But their initizliation time is less than 4000ns on my + // 2.26 GHz Intel Core 2 Duo processor. So no need to worry. + + // Currently, blow-fish and cast5 initialization cost is an order of + // maganitude slower than other ciphers. 
(I'm not sure whether this is + // because the current implementation is not highly optimized, or this is + // the nature of the algorithm.) + + nc := *c + nc.enc = nil + nc.dec = nil + nc.ota = c.ota + return &nc +} diff --git a/utils/ss/leakybuf.go b/utils/ss/leakybuf.go new file mode 100644 index 0000000..029d93f --- /dev/null +++ b/utils/ss/leakybuf.go @@ -0,0 +1,45 @@ +// Provides leaky buffer, based on the example in Effective Go. +package ss + +type LeakyBuf struct { + bufSize int // size of each buffer + freeList chan []byte +} + +const leakyBufSize = 4108 // data.len(2) + hmacsha1(10) + data(4096) +const maxNBuf = 2048 + +var leakyBuf = NewLeakyBuf(maxNBuf, leakyBufSize) + +// NewLeakyBuf creates a leaky buffer which can hold at most n buffer, each +// with bufSize bytes. +func NewLeakyBuf(n, bufSize int) *LeakyBuf { + return &LeakyBuf{ + bufSize: bufSize, + freeList: make(chan []byte, n), + } +} + +// Get returns a buffer from the leaky buffer or create a new buffer. +func (lb *LeakyBuf) Get() (b []byte) { + select { + case b = <-lb.freeList: + default: + b = make([]byte, lb.bufSize) + } + return +} + +// Put add the buffer into the free buffer pool for reuse. Panic if the buffer +// size is not the same with the leaky buffer's. This is intended to expose +// error usage of leaky buffer. +func (lb *LeakyBuf) Put(b []byte) { + if len(b) != lb.bufSize { + panic("invalid buffer size that's put into leaky buffer") + } + select { + case lb.freeList <- b: + default: + } + return +} diff --git a/utils/ss/pipe.go b/utils/ss/pipe.go new file mode 100644 index 0000000..5f4645b --- /dev/null +++ b/utils/ss/pipe.go @@ -0,0 +1,105 @@ +package ss + +import ( + "bytes" + "encoding/binary" + "io" + "log" + "net" + "time" +) + +func SetReadTimeout(c net.Conn) { + c.SetReadDeadline(time.Now().Add(time.Second * 5)) +} + +// PipeThenClose copies data from src to dst, closes dst when done. +func PipeThenClose(src, dst net.Conn, addFlow func(int)) { + defer dst.Close() + buf := leakyBuf.Get() + defer leakyBuf.Put(buf) + for { + SetReadTimeout(src) + n, err := src.Read(buf) + if addFlow != nil { + addFlow(n) + } + // read may return EOF with n > 0 + // should always process n > 0 bytes before handling error + if n > 0 { + // Note: avoid overwrite err returned by Read. + if _, err := dst.Write(buf[0:n]); err != nil { + log.Println("write:", err) + break + } + } + if err != nil { + // Always "use of closed network connection", but no easy way to + // identify this specific error. So just leave the error along for now. + // More info here: https://code.google.com/p/go/issues/detail?id=4373 + /* + if bool(log.) && err != io.EOF { + log.Println("read:", err) + } + */ + break + } + } + return +} + +// PipeThenClose copies data from src to dst, closes dst when done, with ota verification. 
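// Each chunk on the wire is laid out as
//
//	[2-byte big-endian data length][10-byte HMAC-SHA1 tag][data]
//
// where the tag is HMAC-SHA1(iv || chunkId, data) truncated to 10 bytes, with
// chunkId encoded as a 4-byte big-endian counter (see otaReqChunkAuth in
// util.go). The loop below therefore reads the 12-byte header, then the
// payload, and verifies the tag before writing the payload to dst.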
+func PipeThenCloseOta(src *Conn, dst net.Conn, addFlow func(int)) { + const ( + dataLenLen = 2 + hmacSha1Len = 10 + idxData0 = dataLenLen + hmacSha1Len + ) + + defer func() { + dst.Close() + }() + // sometimes it have to fill large block + buf := leakyBuf.Get() + defer leakyBuf.Put(buf) + for i := 1; ; i += 1 { + SetReadTimeout(src) + if n, err := io.ReadFull(src, buf[:dataLenLen+hmacSha1Len]); err != nil { + if err == io.EOF { + break + } + log.Printf("conn=%p #%v read header error n=%v: %v", src, i, n, err) + break + } + dataLen := binary.BigEndian.Uint16(buf[:dataLenLen]) + expectedHmacSha1 := buf[dataLenLen:idxData0] + + var dataBuf []byte + if len(buf) < int(idxData0+dataLen) { + dataBuf = make([]byte, dataLen) + } else { + dataBuf = buf[idxData0 : idxData0+dataLen] + } + if n, err := io.ReadFull(src, dataBuf); err != nil { + if err == io.EOF { + break + } + log.Printf("conn=%p #%v read data error n=%v: %v", src, i, n, err) + break + } + addFlow(int(dataLen)) + chunkIdBytes := make([]byte, 4) + chunkId := src.GetAndIncrChunkId() + binary.BigEndian.PutUint32(chunkIdBytes, chunkId) + actualHmacSha1 := HmacSha1(append(src.GetIv(), chunkIdBytes...), dataBuf) + if !bytes.Equal(expectedHmacSha1, actualHmacSha1) { + log.Printf("conn=%p #%v read data hmac-sha1 mismatch, iv=%v chunkId=%v src=%v dst=%v len=%v expeced=%v actual=%v", src, i, src.GetIv(), chunkId, src.RemoteAddr(), dst.RemoteAddr(), dataLen, expectedHmacSha1, actualHmacSha1) + break + } + if n, err := dst.Write(dataBuf); err != nil { + log.Printf("conn=%p #%v write data error n=%v: %v", dst, i, n, err) + break + } + } + return +} diff --git a/utils/ss/util.go b/utils/ss/util.go new file mode 100644 index 0000000..290c4a7 --- /dev/null +++ b/utils/ss/util.go @@ -0,0 +1,124 @@ +package ss + +import ( + "crypto/hmac" + "crypto/sha1" + "encoding/binary" + "errors" + "fmt" + "io" + "net" + "os" + "strconv" +) + +func IsFileExists(path string) (bool, error) { + stat, err := os.Stat(path) + if err == nil { + if stat.Mode()&os.ModeType == 0 { + return true, nil + } + return false, errors.New(path + " exists but is not regular file") + } + if os.IsNotExist(err) { + return false, nil + } + return false, err +} + +func HmacSha1(key []byte, data []byte) []byte { + hmacSha1 := hmac.New(sha1.New, key) + hmacSha1.Write(data) + return hmacSha1.Sum(nil)[:10] +} + +func otaConnectAuth(iv, key, data []byte) []byte { + return append(data, HmacSha1(append(iv, key...), data)...) +} + +func otaReqChunkAuth(iv []byte, chunkId uint32, data []byte) []byte { + nb := make([]byte, 2) + binary.BigEndian.PutUint16(nb, uint16(len(data))) + chunkIdBytes := make([]byte, 4) + binary.BigEndian.PutUint32(chunkIdBytes, chunkId) + header := append(nb, HmacSha1(append(iv, chunkIdBytes...), data)...) + return append(header, data...) 
+} + +const ( + idType = 0 // address type index + idIP0 = 1 // ip addres start index + idDmLen = 1 // domain address length index + idDm0 = 2 // domain address start index + + typeIPv4 = 1 // type is ipv4 address + typeDm = 3 // type is domain address + typeIPv6 = 4 // type is ipv6 address + + lenIPv4 = net.IPv4len + 2 // ipv4 + 2port + lenIPv6 = net.IPv6len + 2 // ipv6 + 2port + lenDmBase = 2 // 1addrLen + 2port, plus addrLen + lenHmacSha1 = 10 +) + +func GetRequest(conn *Conn) (host string, err error) { + + // buf size should at least have the same size with the largest possible + // request size (when addrType is 3, domain name has at most 256 bytes) + // 1(addrType) + 1(lenByte) + 255(max length address) + 2(port) + 10(hmac-sha1) + buf := make([]byte, 269) + // read till we get possible domain length field + if _, err = io.ReadFull(conn, buf[:idType+1]); err != nil { + return + } + + var reqStart, reqEnd int + addrType := buf[idType] + switch addrType & AddrMask { + case typeIPv4: + reqStart, reqEnd = idIP0, idIP0+lenIPv4 + case typeIPv6: + reqStart, reqEnd = idIP0, idIP0+lenIPv6 + case typeDm: + if _, err = io.ReadFull(conn, buf[idType+1:idDmLen+1]); err != nil { + return + } + reqStart, reqEnd = idDm0, idDm0+int(buf[idDmLen])+lenDmBase + default: + err = fmt.Errorf("addr type %d not supported", addrType&AddrMask) + return + } + + if _, err = io.ReadFull(conn, buf[reqStart:reqEnd]); err != nil { + return + } + + // Return string for typeIP is not most efficient, but browsers (Chrome, + // Safari, Firefox) all seems using typeDm exclusively. So this is not a + // big problem. + switch addrType & AddrMask { + case typeIPv4: + host = net.IP(buf[idIP0 : idIP0+net.IPv4len]).String() + case typeIPv6: + host = net.IP(buf[idIP0 : idIP0+net.IPv6len]).String() + case typeDm: + host = string(buf[idDm0 : idDm0+int(buf[idDmLen])]) + } + // parse port + port := binary.BigEndian.Uint16(buf[reqEnd-2 : reqEnd]) + host = net.JoinHostPort(host, strconv.Itoa(int(port))) + + return +} + +type ClosedFlag struct { + flag bool +} + +func (flag *ClosedFlag) SetClosed() { + flag.flag = true +} + +func (flag *ClosedFlag) IsClosed() bool { + return flag.flag +} diff --git a/utils/structs.go b/utils/structs.go index acc4b45..d15bb6f 100644 --- a/utils/structs.go +++ b/utils/structs.go @@ -1,13 +1,12 @@ package utils import ( + "bufio" "bytes" "crypto/tls" "encoding/base64" "errors" "fmt" - "github.com/snail007/goproxy/services/kcpcfg" - "github.com/snail007/goproxy/utils/sni" "io" "io/ioutil" "log" @@ -17,6 +16,9 @@ import ( "sync" "time" + "github.com/snail007/goproxy/services/kcpcfg" + "github.com/snail007/goproxy/utils/sni" + "github.com/golang/snappy" "github.com/miekg/dns" ) @@ -792,3 +794,33 @@ func (c *CompStream) SetReadDeadline(t time.Time) error { func (c *CompStream) SetWriteDeadline(t time.Time) error { return c.conn.SetWriteDeadline(t) } + +type BufferedConn struct { + r *bufio.Reader + net.Conn // So that most methods are embedded +} + +func NewBufferedConn(c net.Conn) BufferedConn { + return BufferedConn{bufio.NewReader(c), c} +} + +func NewBufferedConnSize(c net.Conn, n int) BufferedConn { + return BufferedConn{bufio.NewReaderSize(c, n), c} +} + +func (b BufferedConn) Peek(n int) ([]byte, error) { + return b.r.Peek(n) +} + +func (b BufferedConn) Read(p []byte) (int, error) { + return b.r.Read(p) +} +func (b BufferedConn) ReadByte() (byte, error) { + return b.r.ReadByte() +} +func (b BufferedConn) UnreadByte() error { + return b.r.UnreadByte() +} +func (b BufferedConn) Buffered() int { + return 
b.r.Buffered() +} diff --git a/vendor/github.com/Yawning/chacha20/LICENSE b/vendor/github.com/Yawning/chacha20/LICENSE new file mode 100644 index 0000000..6ca207e --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/LICENSE @@ -0,0 +1,122 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. 
+ Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. + diff --git a/vendor/github.com/Yawning/chacha20/README.md b/vendor/github.com/Yawning/chacha20/README.md new file mode 100644 index 0000000..9080a84 --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/README.md @@ -0,0 +1,14 @@ +### chacha20 - ChaCha20 +#### Yawning Angel (yawning at schwanenlied dot me) + +Yet another Go ChaCha20 implementation. Everything else I found was slow, +didn't support all the variants I need to use, or relied on cgo to go fast. + +Features: + + * 20 round, 256 bit key only. Everything else is pointless and stupid. + * IETF 96 bit nonce variant. + * XChaCha 24 byte nonce variant. + * SSE2 and AVX2 support on amd64 targets. + * Incremental encrypt/decrypt support, unlike golang.org/x/crypto/salsa20. + diff --git a/vendor/github.com/Yawning/chacha20/chacha20.go b/vendor/github.com/Yawning/chacha20/chacha20.go new file mode 100644 index 0000000..07d5e4b --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20.go @@ -0,0 +1,273 @@ +// chacha20.go - A ChaCha stream cipher implementation. +// +// To the extent possible under law, Yawning Angel has waived all copyright +// and related or neighboring rights to chacha20, using the Creative +// Commons "CC0" public domain dedication. See LICENSE or +// for full details. + +package chacha20 + +import ( + "crypto/cipher" + "encoding/binary" + "errors" + "math" + "runtime" +) + +const ( + // KeySize is the ChaCha20 key size in bytes. + KeySize = 32 + + // NonceSize is the ChaCha20 nonce size in bytes. + NonceSize = 8 + + // INonceSize is the IETF ChaCha20 nonce size in bytes. + INonceSize = 12 + + // XNonceSize is the XChaCha20 nonce size in bytes. + XNonceSize = 24 + + // HNonceSize is the HChaCha20 nonce size in bytes. + HNonceSize = 16 + + // BlockSize is the ChaCha20 block size in bytes. + BlockSize = 64 + + stateSize = 16 + chachaRounds = 20 + + // The constant "expand 32-byte k" as little endian uint32s. + sigma0 = uint32(0x61707865) + sigma1 = uint32(0x3320646e) + sigma2 = uint32(0x79622d32) + sigma3 = uint32(0x6b206574) +) + +var ( + // ErrInvalidKey is the error returned when the key is invalid. + ErrInvalidKey = errors.New("key length must be KeySize bytes") + + // ErrInvalidNonce is the error returned when the nonce is invalid. + ErrInvalidNonce = errors.New("nonce length must be NonceSize/INonceSize/XNonceSize bytes") + + // ErrInvalidCounter is the error returned when the counter is invalid. + ErrInvalidCounter = errors.New("block counter is invalid (out of range)") + + useUnsafe = false + usingVectors = false + blocksFn = blocksRef +) + +// A Cipher is an instance of ChaCha20/XChaCha20 using a particular key and +// nonce. +type Cipher struct { + state [stateSize]uint32 + + buf [BlockSize]byte + off int + ietf bool +} + +// Reset zeros the key data so that it will no longer appear in the process's +// memory. +func (c *Cipher) Reset() { + for i := range c.state { + c.state[i] = 0 + } + for i := range c.buf { + c.buf[i] = 0 + } +} + +// XORKeyStream sets dst to the result of XORing src with the key stream. Dst +// and src may be the same slice but otherwise should not overlap. 
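// A minimal usage sketch (placeholder key/nonce, error handling elided):
//
//	key := make([]byte, KeySize)     // 32 bytes
//	nonce := make([]byte, NonceSize) // 8 bytes; INonceSize/XNonceSize for the IETF/XChaCha variants
//	c, _ := NewCipher(key, nonce)
//	c.XORKeyStream(dst, src)         // dst may alias src; decryption is the same call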
+func (c *Cipher) XORKeyStream(dst, src []byte) { + if len(dst) < len(src) { + src = src[:len(dst)] + } + + for remaining := len(src); remaining > 0; { + // Process multiple blocks at once. + if c.off == BlockSize { + nrBlocks := remaining / BlockSize + directBytes := nrBlocks * BlockSize + if nrBlocks > 0 { + blocksFn(&c.state, src, dst, nrBlocks, c.ietf) + remaining -= directBytes + if remaining == 0 { + return + } + dst = dst[directBytes:] + src = src[directBytes:] + } + + // If there's a partial block, generate 1 block of keystream into + // the internal buffer. + blocksFn(&c.state, nil, c.buf[:], 1, c.ietf) + c.off = 0 + } + + // Process partial blocks from the buffered keystream. + toXor := BlockSize - c.off + if remaining < toXor { + toXor = remaining + } + if toXor > 0 { + for i, v := range src[:toXor] { + dst[i] = v ^ c.buf[c.off+i] + } + dst = dst[toXor:] + src = src[toXor:] + + remaining -= toXor + c.off += toXor + } + } +} + +// KeyStream sets dst to the raw keystream. +func (c *Cipher) KeyStream(dst []byte) { + for remaining := len(dst); remaining > 0; { + // Process multiple blocks at once. + if c.off == BlockSize { + nrBlocks := remaining / BlockSize + directBytes := nrBlocks * BlockSize + if nrBlocks > 0 { + blocksFn(&c.state, nil, dst, nrBlocks, c.ietf) + remaining -= directBytes + if remaining == 0 { + return + } + dst = dst[directBytes:] + } + + // If there's a partial block, generate 1 block of keystream into + // the internal buffer. + blocksFn(&c.state, nil, c.buf[:], 1, c.ietf) + c.off = 0 + } + + // Process partial blocks from the buffered keystream. + toCopy := BlockSize - c.off + if remaining < toCopy { + toCopy = remaining + } + if toCopy > 0 { + copy(dst[:toCopy], c.buf[c.off:c.off+toCopy]) + dst = dst[toCopy:] + remaining -= toCopy + c.off += toCopy + } + } +} + +// ReKey reinitializes the ChaCha20/XChaCha20 instance with the provided key +// and nonce. +func (c *Cipher) ReKey(key, nonce []byte) error { + if len(key) != KeySize { + return ErrInvalidKey + } + + switch len(nonce) { + case NonceSize: + case INonceSize: + case XNonceSize: + var subkey [KeySize]byte + var subnonce [HNonceSize]byte + copy(subnonce[:], nonce[0:16]) + HChaCha(key, &subnonce, &subkey) + key = subkey[:] + nonce = nonce[16:24] + defer func() { + for i := range subkey { + subkey[i] = 0 + } + }() + default: + return ErrInvalidNonce + } + + c.Reset() + c.state[0] = sigma0 + c.state[1] = sigma1 + c.state[2] = sigma2 + c.state[3] = sigma3 + c.state[4] = binary.LittleEndian.Uint32(key[0:4]) + c.state[5] = binary.LittleEndian.Uint32(key[4:8]) + c.state[6] = binary.LittleEndian.Uint32(key[8:12]) + c.state[7] = binary.LittleEndian.Uint32(key[12:16]) + c.state[8] = binary.LittleEndian.Uint32(key[16:20]) + c.state[9] = binary.LittleEndian.Uint32(key[20:24]) + c.state[10] = binary.LittleEndian.Uint32(key[24:28]) + c.state[11] = binary.LittleEndian.Uint32(key[28:32]) + c.state[12] = 0 + if len(nonce) == INonceSize { + c.state[13] = binary.LittleEndian.Uint32(nonce[0:4]) + c.state[14] = binary.LittleEndian.Uint32(nonce[4:8]) + c.state[15] = binary.LittleEndian.Uint32(nonce[8:12]) + c.ietf = true + } else { + c.state[13] = 0 + c.state[14] = binary.LittleEndian.Uint32(nonce[0:4]) + c.state[15] = binary.LittleEndian.Uint32(nonce[4:8]) + c.ietf = false + } + c.off = BlockSize + return nil + +} + +// Seek sets the block counter to a given offset. 
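// The offset is measured in 64-byte blocks, not bytes: after Seek(n) the next
// keystream byte produced is byte n*BlockSize of the stream. For the IETF
// variant the counter is only 32 bits, so values above math.MaxUint32 return
// ErrInvalidCounter.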
+func (c *Cipher) Seek(blockCounter uint64) error { + if c.ietf { + if blockCounter > math.MaxUint32 { + return ErrInvalidCounter + } + c.state[12] = uint32(blockCounter) + } else { + c.state[12] = uint32(blockCounter) + c.state[13] = uint32(blockCounter >> 32) + } + c.off = BlockSize + return nil +} + +// NewCipher returns a new ChaCha20/XChaCha20 instance. +func NewCipher(key, nonce []byte) (*Cipher, error) { + c := new(Cipher) + if err := c.ReKey(key, nonce); err != nil { + return nil, err + } + return c, nil +} + +// HChaCha is the HChaCha20 hash function used to make XChaCha. +func HChaCha(key []byte, nonce *[HNonceSize]byte, out *[32]byte) { + var x [stateSize]uint32 // Last 4 slots unused, sigma hardcoded. + x[0] = binary.LittleEndian.Uint32(key[0:4]) + x[1] = binary.LittleEndian.Uint32(key[4:8]) + x[2] = binary.LittleEndian.Uint32(key[8:12]) + x[3] = binary.LittleEndian.Uint32(key[12:16]) + x[4] = binary.LittleEndian.Uint32(key[16:20]) + x[5] = binary.LittleEndian.Uint32(key[20:24]) + x[6] = binary.LittleEndian.Uint32(key[24:28]) + x[7] = binary.LittleEndian.Uint32(key[28:32]) + x[8] = binary.LittleEndian.Uint32(nonce[0:4]) + x[9] = binary.LittleEndian.Uint32(nonce[4:8]) + x[10] = binary.LittleEndian.Uint32(nonce[8:12]) + x[11] = binary.LittleEndian.Uint32(nonce[12:16]) + hChaChaRef(&x, out) +} + +func init() { + switch runtime.GOARCH { + case "386", "amd64": + // Abuse unsafe to skip calling binary.LittleEndian.PutUint32 + // in the critical path. This is a big boost on systems that are + // little endian and not overly picky about alignment. + useUnsafe = true + } +} + +var _ cipher.Stream = (*Cipher)(nil) diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.go b/vendor/github.com/Yawning/chacha20/chacha20_amd64.go new file mode 100644 index 0000000..05adad1 --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.go @@ -0,0 +1,95 @@ +// chacha20_amd64.go - AMD64 optimized chacha20. +// +// To the extent possible under law, Yawning Angel has waived all copyright +// and related or neighboring rights to chacha20, using the Creative +// Commons "CC0" public domain dedication. See LICENSE or +// for full details. + +// +build amd64,!gccgo,!appengine + +package chacha20 + +import ( + "math" +) + +var usingAVX2 = false + +func blocksAmd64SSE2(x *uint32, inp, outp *byte, nrBlocks uint) + +func blocksAmd64AVX2(x *uint32, inp, outp *byte, nrBlocks uint) + +func cpuidAmd64(cpuidParams *uint32) + +func xgetbv0Amd64(xcrVec *uint32) + +func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) { + // Probably unneeded, but stating this explicitly simplifies the assembly. + if nrBlocks == 0 { + return + } + + if isIetf { + var totalBlocks uint64 + totalBlocks = uint64(x[12]) + uint64(nrBlocks) + if totalBlocks > math.MaxUint32 { + panic("chacha20: Exceeded keystream per nonce limit") + } + } + + if in == nil { + for i := range out { + out[i] = 0 + } + in = out + } + + // Pointless to call the AVX2 code for just a single block, since half of + // the output gets discarded... + if usingAVX2 && nrBlocks > 1 { + blocksAmd64AVX2(&x[0], &in[0], &out[0], uint(nrBlocks)) + } else { + blocksAmd64SSE2(&x[0], &in[0], &out[0], uint(nrBlocks)) + } +} + +func supportsAVX2() bool { + // https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family + const ( + osXsaveBit = 1 << 27 + avx2Bit = 1 << 5 + ) + + // Check to see if CPUID actually supports the leaf that indicates AVX2. 
+ // CPUID.(EAX=0H, ECX=0H) >= 7 + regs := [4]uint32{0x00} + cpuidAmd64(®s[0]) + if regs[0] < 7 { + return false + } + + // Check to see if the OS knows how to save/restore XMM/YMM state. + // CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1 + regs = [4]uint32{0x01} + cpuidAmd64(®s[0]) + if regs[2]&osXsaveBit == 0 { + return false + } + xcrRegs := [2]uint32{} + xgetbv0Amd64(&xcrRegs[0]) + if xcrRegs[0]&6 != 6 { + return false + } + + // Check for AVX2 support. + // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1 + regs = [4]uint32{0x07} + cpuidAmd64(®s[0]) + return regs[1]&avx2Bit != 0 +} + +func init() { + blocksFn = blocksAmd64 + usingVectors = true + usingAVX2 = supportsAVX2() +} diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.py b/vendor/github.com/Yawning/chacha20/chacha20_amd64.py new file mode 100644 index 0000000..3bfebf4 --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.py @@ -0,0 +1,1295 @@ +#!/usr/bin/env python3 +# +# To the extent possible under law, Yawning Angel has waived all copyright +# and related or neighboring rights to chacha20, using the Creative +# Commons "CC0" public domain dedication. See LICENSE or +# for full details. + +# +# cgo sucks. Plan 9 assembly sucks. Real languages have SIMD intrinsics. +# The least terrible/retarded option is to use a Python code generator, so +# that's what I did. +# +# Code based on Ted Krovetz's vec128 C implementation, with corrections +# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and +# output pointers. +# +# Dependencies: https://github.com/Maratyszcza/PeachPy +# +# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py +# + +from peachpy import * +from peachpy.x86_64 import * + +x = Argument(ptr(uint32_t)) +inp = Argument(ptr(const_uint8_t)) +outp = Argument(ptr(uint8_t)) +nrBlocks = Argument(ptr(size_t)) + +# +# SSE2 helper functions. A temporary register is explicitly passed in because +# the main fast loop uses every single register (and even spills) so manual +# control is needed. +# +# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like +# in the C code, but the C code has the luxury of an optimizer reordering +# everything, while this does not. +# + +def ROTW16_sse2(tmp, d): + MOVDQA(tmp, d) + PSLLD(tmp, 16) + PSRLD(d, 16) + PXOR(d, tmp) + +def ROTW12_sse2(tmp, b): + MOVDQA(tmp, b) + PSLLD(tmp, 12) + PSRLD(b, 20) + PXOR(b, tmp) + +def ROTW8_sse2(tmp, d): + MOVDQA(tmp, d) + PSLLD(tmp, 8) + PSRLD(d, 24) + PXOR(d, tmp) + +def ROTW7_sse2(tmp, b): + MOVDQA(tmp, b) + PSLLD(tmp, 7) + PSRLD(b, 25) + PXOR(b, tmp) + +def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3): + MOVDQU(tmp, [inp+d]) + PXOR(tmp, v0) + MOVDQU([outp+d], tmp) + MOVDQU(tmp, [inp+d+16]) + PXOR(tmp, v1) + MOVDQU([outp+d+16], tmp) + MOVDQU(tmp, [inp+d+32]) + PXOR(tmp, v2) + MOVDQU([outp+d+32], tmp) + MOVDQU(tmp, [inp+d+48]) + PXOR(tmp, v3) + MOVDQU([outp+d+48], tmp) + +# SSE2 ChaCha20 (aka vec128). Does not handle partial blocks, and will +# process 4/2/1 blocks at a time. +with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)): + reg_x = GeneralPurposeRegister64() + reg_inp = GeneralPurposeRegister64() + reg_outp = GeneralPurposeRegister64() + reg_blocks = GeneralPurposeRegister64() + reg_sp_save = GeneralPurposeRegister64() + + LOAD.ARGUMENT(reg_x, x) + LOAD.ARGUMENT(reg_inp, inp) + LOAD.ARGUMENT(reg_outp, outp) + LOAD.ARGUMENT(reg_blocks, nrBlocks) + + # Align the stack to a 32 byte boundary. 
+ MOV(reg_sp_save, registers.rsp) + AND(registers.rsp, 0xffffffffffffffe0) + SUB(registers.rsp, 0x20) + + # Build the counter increment vector on the stack, and allocate the scratch + # space + xmm_v0 = XMMRegister() + PXOR(xmm_v0, xmm_v0) + SUB(registers.rsp, 16+16) + MOVDQA([registers.rsp], xmm_v0) + reg_tmp = GeneralPurposeRegister32() + MOV(reg_tmp, 0x00000001) + MOV([registers.rsp], reg_tmp) + mem_one = [registers.rsp] # (Stack) Counter increment vector + mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space. + + mem_s0 = [reg_x] # (Memory) Cipher state [0..3] + mem_s1 = [reg_x+16] # (Memory) Cipher state [4..7] + mem_s2 = [reg_x+32] # (Memory) Cipher state [8..11] + mem_s3 = [reg_x+48] # (Memory) Cipher state [12..15] + + # xmm_v0 allocated above... + xmm_v1 = XMMRegister() + xmm_v2 = XMMRegister() + xmm_v3 = XMMRegister() + + xmm_v4 = XMMRegister() + xmm_v5 = XMMRegister() + xmm_v6 = XMMRegister() + xmm_v7 = XMMRegister() + + xmm_v8 = XMMRegister() + xmm_v9 = XMMRegister() + xmm_v10 = XMMRegister() + xmm_v11 = XMMRegister() + + xmm_v12 = XMMRegister() + xmm_v13 = XMMRegister() + xmm_v14 = XMMRegister() + xmm_v15 = XMMRegister() + + xmm_tmp = xmm_v12 + + # + # 4 blocks at a time. + # + + reg_rounds = GeneralPurposeRegister64() + + vector_loop4 = Loop() + SUB(reg_blocks, 4) + JB(vector_loop4.end) + with vector_loop4: + MOVDQU(xmm_v0, mem_s0) + MOVDQU(xmm_v1, mem_s1) + MOVDQU(xmm_v2, mem_s2) + MOVDQU(xmm_v3, mem_s3) + + MOVDQA(xmm_v4, xmm_v0) + MOVDQA(xmm_v5, xmm_v1) + MOVDQA(xmm_v6, xmm_v2) + MOVDQA(xmm_v7, xmm_v3) + PADDQ(xmm_v7, mem_one) + + MOVDQA(xmm_v8, xmm_v0) + MOVDQA(xmm_v9, xmm_v1) + MOVDQA(xmm_v10, xmm_v2) + MOVDQA(xmm_v11, xmm_v7) + PADDQ(xmm_v11, mem_one) + + MOVDQA(xmm_v12, xmm_v0) + MOVDQA(xmm_v13, xmm_v1) + MOVDQA(xmm_v14, xmm_v2) + MOVDQA(xmm_v15, xmm_v11) + PADDQ(xmm_v15, mem_one) + + MOV(reg_rounds, 20) + rounds_loop4 = Loop() + with rounds_loop4: + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PADDD(xmm_v8, xmm_v9) + PADDD(xmm_v12, xmm_v13) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + PXOR(xmm_v11, xmm_v8) + PXOR(xmm_v15, xmm_v12) + + MOVDQA(mem_tmp0, xmm_tmp) # Save + + ROTW16_sse2(xmm_tmp, xmm_v3) + ROTW16_sse2(xmm_tmp, xmm_v7) + ROTW16_sse2(xmm_tmp, xmm_v11) + ROTW16_sse2(xmm_tmp, xmm_v15) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PADDD(xmm_v10, xmm_v11) + PADDD(xmm_v14, xmm_v15) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + PXOR(xmm_v9, xmm_v10) + PXOR(xmm_v13, xmm_v14) + ROTW12_sse2(xmm_tmp, xmm_v1) + ROTW12_sse2(xmm_tmp, xmm_v5) + ROTW12_sse2(xmm_tmp, xmm_v9) + ROTW12_sse2(xmm_tmp, xmm_v13) + + # a += b; d ^= a; d = ROTW8(d); + MOVDQA(xmm_tmp, mem_tmp0) # Restore + + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PADDD(xmm_v8, xmm_v9) + PADDD(xmm_v12, xmm_v13) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + PXOR(xmm_v11, xmm_v8) + PXOR(xmm_v15, xmm_v12) + + MOVDQA(mem_tmp0, xmm_tmp) # Save + + ROTW8_sse2(xmm_tmp, xmm_v3) + ROTW8_sse2(xmm_tmp, xmm_v7) + ROTW8_sse2(xmm_tmp, xmm_v11) + ROTW8_sse2(xmm_tmp, xmm_v15) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PADDD(xmm_v10, xmm_v11) + PADDD(xmm_v14, xmm_v15) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + PXOR(xmm_v9, xmm_v10) + PXOR(xmm_v13, xmm_v14) + ROTW7_sse2(xmm_tmp, xmm_v1) + ROTW7_sse2(xmm_tmp, xmm_v5) + ROTW7_sse2(xmm_tmp, xmm_v9) + ROTW7_sse2(xmm_tmp, xmm_v13) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x39) + PSHUFD(xmm_v5, xmm_v5, 0x39) + 
PSHUFD(xmm_v9, xmm_v9, 0x39) + PSHUFD(xmm_v13, xmm_v13, 0x39) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v6, xmm_v6, 0x4e) + PSHUFD(xmm_v10, xmm_v10, 0x4e) + PSHUFD(xmm_v14, xmm_v14, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x93) + PSHUFD(xmm_v7, xmm_v7, 0x93) + PSHUFD(xmm_v11, xmm_v11, 0x93) + PSHUFD(xmm_v15, xmm_v15, 0x93) + + MOVDQA(xmm_tmp, mem_tmp0) # Restore + + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PADDD(xmm_v8, xmm_v9) + PADDD(xmm_v12, xmm_v13) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + PXOR(xmm_v11, xmm_v8) + PXOR(xmm_v15, xmm_v12) + + MOVDQA(mem_tmp0, xmm_tmp) # Save + + ROTW16_sse2(xmm_tmp, xmm_v3) + ROTW16_sse2(xmm_tmp, xmm_v7) + ROTW16_sse2(xmm_tmp, xmm_v11) + ROTW16_sse2(xmm_tmp, xmm_v15) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PADDD(xmm_v10, xmm_v11) + PADDD(xmm_v14, xmm_v15) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + PXOR(xmm_v9, xmm_v10) + PXOR(xmm_v13, xmm_v14) + ROTW12_sse2(xmm_tmp, xmm_v1) + ROTW12_sse2(xmm_tmp, xmm_v5) + ROTW12_sse2(xmm_tmp, xmm_v9) + ROTW12_sse2(xmm_tmp, xmm_v13) + + # a += b; d ^= a; d = ROTW8(d); + MOVDQA(xmm_tmp, mem_tmp0) # Restore + + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PADDD(xmm_v8, xmm_v9) + PADDD(xmm_v12, xmm_v13) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + PXOR(xmm_v11, xmm_v8) + PXOR(xmm_v15, xmm_v12) + + MOVDQA(mem_tmp0, xmm_tmp) # Save + + ROTW8_sse2(xmm_tmp, xmm_v3) + ROTW8_sse2(xmm_tmp, xmm_v7) + ROTW8_sse2(xmm_tmp, xmm_v11) + ROTW8_sse2(xmm_tmp, xmm_v15) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PADDD(xmm_v10, xmm_v11) + PADDD(xmm_v14, xmm_v15) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + PXOR(xmm_v9, xmm_v10) + PXOR(xmm_v13, xmm_v14) + ROTW7_sse2(xmm_tmp, xmm_v1) + ROTW7_sse2(xmm_tmp, xmm_v5) + ROTW7_sse2(xmm_tmp, xmm_v9) + ROTW7_sse2(xmm_tmp, xmm_v13) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x93) + PSHUFD(xmm_v5, xmm_v5, 0x93) + PSHUFD(xmm_v9, xmm_v9, 0x93) + PSHUFD(xmm_v13, xmm_v13, 0x93) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v6, xmm_v6, 0x4e) + PSHUFD(xmm_v10, xmm_v10, 0x4e) + PSHUFD(xmm_v14, xmm_v14, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x39) + PSHUFD(xmm_v7, xmm_v7, 0x39) + PSHUFD(xmm_v11, xmm_v11, 0x39) + PSHUFD(xmm_v15, xmm_v15, 0x39) + + MOVDQA(xmm_tmp, mem_tmp0) # Restore + + SUB(reg_rounds, 2) + JNZ(rounds_loop4.begin) + + MOVDQA(mem_tmp0, xmm_tmp) + + PADDD(xmm_v0, mem_s0) + PADDD(xmm_v1, mem_s1) + PADDD(xmm_v2, mem_s2) + PADDD(xmm_v3, mem_s3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3) + MOVDQU(xmm_v3, mem_s3) + PADDQ(xmm_v3, mem_one) + + PADDD(xmm_v4, mem_s0) + PADDD(xmm_v5, mem_s1) + PADDD(xmm_v6, mem_s2) + PADDD(xmm_v7, xmm_v3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7) + PADDQ(xmm_v3, mem_one) + + PADDD(xmm_v8, mem_s0) + PADDD(xmm_v9, mem_s1) + PADDD(xmm_v10, mem_s2) + PADDD(xmm_v11, xmm_v3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11) + PADDQ(xmm_v3, mem_one) + + MOVDQA(xmm_tmp, mem_tmp0) + + PADDD(xmm_v12, mem_s0) + PADDD(xmm_v13, mem_s1) + PADDD(xmm_v14, mem_s2) + PADDD(xmm_v15, xmm_v3) + WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15) + PADDQ(xmm_v3, mem_one) + + MOVDQU(mem_s3, xmm_v3) + + ADD(reg_inp, 4 * 64) + ADD(reg_outp, 4 * 64) + + SUB(reg_blocks, 4) + JAE(vector_loop4.begin) + + ADD(reg_blocks, 4) + out = Label() + JZ(out) + + # Past this point, we no longer need to use every single 
register to hold + # the in progress state. + + xmm_s0 = xmm_v8 + xmm_s1 = xmm_v9 + xmm_s2 = xmm_v10 + xmm_s3 = xmm_v11 + xmm_one = xmm_v13 + MOVDQU(xmm_s0, mem_s0) + MOVDQU(xmm_s1, mem_s1) + MOVDQU(xmm_s2, mem_s2) + MOVDQU(xmm_s3, mem_s3) + MOVDQA(xmm_one, mem_one) + + # + # 2 blocks at a time. + # + + process_1_block = Label() + SUB(reg_blocks, 2) + JB(process_1_block) # < 2 blocks remaining. + + MOVDQA(xmm_v0, xmm_s0) + MOVDQA(xmm_v1, xmm_s1) + MOVDQA(xmm_v2, xmm_s2) + MOVDQA(xmm_v3, xmm_s3) + + MOVDQA(xmm_v4, xmm_v0) + MOVDQA(xmm_v5, xmm_v1) + MOVDQA(xmm_v6, xmm_v2) + MOVDQA(xmm_v7, xmm_v3) + PADDQ(xmm_v7, xmm_one) + + MOV(reg_rounds, 20) + rounds_loop2 = Loop() + with rounds_loop2: + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + ROTW16_sse2(xmm_tmp, xmm_v3) + ROTW16_sse2(xmm_tmp, xmm_v7) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + ROTW12_sse2(xmm_tmp, xmm_v1) + ROTW12_sse2(xmm_tmp, xmm_v5) + + # a += b; d ^= a; d = ROTW8(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + ROTW8_sse2(xmm_tmp, xmm_v3) + ROTW8_sse2(xmm_tmp, xmm_v7) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + ROTW7_sse2(xmm_tmp, xmm_v1) + ROTW7_sse2(xmm_tmp, xmm_v5) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x39) + PSHUFD(xmm_v5, xmm_v5, 0x39) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v6, xmm_v6, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x93) + PSHUFD(xmm_v7, xmm_v7, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + ROTW16_sse2(xmm_tmp, xmm_v3) + ROTW16_sse2(xmm_tmp, xmm_v7) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + ROTW12_sse2(xmm_tmp, xmm_v1) + ROTW12_sse2(xmm_tmp, xmm_v5) + + # a += b; d ^= a; d = ROTW8(d); + PADDD(xmm_v0, xmm_v1) + PADDD(xmm_v4, xmm_v5) + PXOR(xmm_v3, xmm_v0) + PXOR(xmm_v7, xmm_v4) + ROTW8_sse2(xmm_tmp, xmm_v3) + ROTW8_sse2(xmm_tmp, xmm_v7) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PADDD(xmm_v6, xmm_v7) + PXOR(xmm_v1, xmm_v2) + PXOR(xmm_v5, xmm_v6) + ROTW7_sse2(xmm_tmp, xmm_v1) + ROTW7_sse2(xmm_tmp, xmm_v5) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x93) + PSHUFD(xmm_v5, xmm_v5, 0x93) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v6, xmm_v6, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x39) + PSHUFD(xmm_v7, xmm_v7, 0x39) + + SUB(reg_rounds, 2) + JNZ(rounds_loop2.begin) + + PADDD(xmm_v0, xmm_s0) + PADDD(xmm_v1, xmm_s1) + PADDD(xmm_v2, xmm_s2) + PADDD(xmm_v3, xmm_s3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3) + PADDQ(xmm_s3, xmm_one) + + PADDD(xmm_v4, xmm_s0) + PADDD(xmm_v5, xmm_s1) + PADDD(xmm_v6, xmm_s2) + PADDD(xmm_v7, xmm_s3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7) + PADDQ(xmm_s3, xmm_one) + + ADD(reg_inp, 2 * 64) + ADD(reg_outp, 2 * 64) + SUB(reg_blocks, 2) + + LABEL(process_1_block) + ADD(reg_blocks, 2) + out_serial = Label() + JZ(out_serial) + + # + # 1 block at a time. Only executed once, because if there was > 1, + # the parallel code would have processed it already. 
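+    # As above, the 20 "rounds" are run as 10 iterations of a column round
+    # followed by a diagonal round (the PSHUFD lane rotations).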
+ # + + MOVDQA(xmm_v0, xmm_s0) + MOVDQA(xmm_v1, xmm_s1) + MOVDQA(xmm_v2, xmm_s2) + MOVDQA(xmm_v3, xmm_s3) + + MOV(reg_rounds, 20) + rounds_loop1 = Loop() + with rounds_loop1: + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PXOR(xmm_v3, xmm_v0) + ROTW16_sse2(xmm_tmp, xmm_v3) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PXOR(xmm_v1, xmm_v2) + ROTW12_sse2(xmm_tmp, xmm_v1) + + # a += b; d ^= a; d = ROTW8(d); + PADDD(xmm_v0, xmm_v1) + PXOR(xmm_v3, xmm_v0) + ROTW8_sse2(xmm_tmp, xmm_v3) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PXOR(xmm_v1, xmm_v2) + ROTW7_sse2(xmm_tmp, xmm_v1) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x39) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + PADDD(xmm_v0, xmm_v1) + PXOR(xmm_v3, xmm_v0) + ROTW16_sse2(xmm_tmp, xmm_v3) + + # c += d; b ^= c; b = ROTW12(b); + PADDD(xmm_v2, xmm_v3) + PXOR(xmm_v1, xmm_v2) + ROTW12_sse2(xmm_tmp, xmm_v1) + + # a += b; d ^= a; d = ROTW8(d); + PADDD(xmm_v0, xmm_v1) + PXOR(xmm_v3, xmm_v0) + ROTW8_sse2(xmm_tmp, xmm_v3) + + # c += d; b ^= c; b = ROTW7(b) + PADDD(xmm_v2, xmm_v3) + PXOR(xmm_v1, xmm_v2) + ROTW7_sse2(xmm_tmp, xmm_v1) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + PSHUFD(xmm_v1, xmm_v1, 0x93) + PSHUFD(xmm_v2, xmm_v2, 0x4e) + PSHUFD(xmm_v3, xmm_v3, 0x39) + + SUB(reg_rounds, 2) + JNZ(rounds_loop1.begin) + + PADDD(xmm_v0, xmm_s0) + PADDD(xmm_v1, xmm_s1) + PADDD(xmm_v2, xmm_s2) + PADDD(xmm_v3, xmm_s3) + WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3) + PADDQ(xmm_s3, xmm_one) + + LABEL(out_serial) + + # Write back the updated counter. Stoping at 2^70 bytes is the user's + # problem, not mine. (Skipped if there's exactly a multiple of 4 blocks + # because the counter is incremented in memory while looping.) + MOVDQU(mem_s3, xmm_s3) + + LABEL(out) + + # Paranoia, cleanse the scratch space. + PXOR(xmm_v0, xmm_v0) + MOVDQA(mem_tmp0, xmm_v0) + + # Remove our stack allocation. + MOV(registers.rsp, reg_sp_save) + + RETURN() + +# +# AVX2 helpers. Like the SSE2 equivalents, the scratch register is explicit, +# and more helpers are used to increase readability for destructive operations. +# +# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFFB. +# + +def ADD_avx2(dst, src): + VPADDD(dst, dst, src) + +def XOR_avx2(dst, src): + VPXOR(dst, dst, src) + +def ROTW16_avx2(tmp, d): + VPSLLD(tmp, d, 16) + VPSRLD(d, d, 16) + XOR_avx2(d, tmp) + +def ROTW12_avx2(tmp, b): + VPSLLD(tmp, b, 12) + VPSRLD(b, b, 20) + XOR_avx2(b, tmp) + +def ROTW8_avx2(tmp, d): + VPSLLD(tmp, d, 8) + VPSRLD(d, d, 24) + XOR_avx2(d, tmp) + +def ROTW7_avx2(tmp, b): + VPSLLD(tmp, b, 7) + VPSRLD(b, b, 25) + XOR_avx2(b, tmp) + +def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3): + # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20)); + VPERM2I128(tmp, v0, v1, 0x20) + VPXOR(tmp, tmp, [inp+d]) + VMOVDQU([outp+d], tmp) + + # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20)); + VPERM2I128(tmp, v2, v3, 0x20) + VPXOR(tmp, tmp, [inp+d+32]) + VMOVDQU([outp+d+32], tmp) + + # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31)); + VPERM2I128(tmp, v0, v1, 0x31) + VPXOR(tmp, tmp, [inp+d+64]) + VMOVDQU([outp+d+64], tmp) + + # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31)); + VPERM2I128(tmp, v2, v3, 0x31) + VPXOR(tmp, tmp, [inp+d+96]) + VMOVDQU([outp+d+96], tmp) + +# AVX2 ChaCha20 (aka avx2). Does not handle partial blocks, will process +# 8/4/2 blocks at a time. 
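+# Each YMM register carries the same 128-bit state row for two consecutive
+# blocks (loaded via VBROADCASTI128, with the block counters offset by one),
+# so the 8 block loop below is really tracking four such block pairs.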
+with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell): + reg_x = GeneralPurposeRegister64() + reg_inp = GeneralPurposeRegister64() + reg_outp = GeneralPurposeRegister64() + reg_blocks = GeneralPurposeRegister64() + reg_sp_save = GeneralPurposeRegister64() + + LOAD.ARGUMENT(reg_x, x) + LOAD.ARGUMENT(reg_inp, inp) + LOAD.ARGUMENT(reg_outp, outp) + LOAD.ARGUMENT(reg_blocks, nrBlocks) + + # Align the stack to a 32 byte boundary. + MOV(reg_sp_save, registers.rsp) + AND(registers.rsp, 0xffffffffffffffe0) + SUB(registers.rsp, 0x20) + + x_s0 = [reg_x] # (Memory) Cipher state [0..3] + x_s1 = [reg_x+16] # (Memory) Cipher state [4..7] + x_s2 = [reg_x+32] # (Memory) Cipher state [8..11] + x_s3 = [reg_x+48] # (Memory) Cipher state [12..15] + + ymm_v0 = YMMRegister() + ymm_v1 = YMMRegister() + ymm_v2 = YMMRegister() + ymm_v3 = YMMRegister() + + ymm_v4 = YMMRegister() + ymm_v5 = YMMRegister() + ymm_v6 = YMMRegister() + ymm_v7 = YMMRegister() + + ymm_v8 = YMMRegister() + ymm_v9 = YMMRegister() + ymm_v10 = YMMRegister() + ymm_v11 = YMMRegister() + + ymm_v12 = YMMRegister() + ymm_v13 = YMMRegister() + ymm_v14 = YMMRegister() + ymm_v15 = YMMRegister() + + ymm_tmp0 = ymm_v12 + + # Allocate the neccecary stack space for the counter vector and two ymm + # registers that we will spill. + SUB(registers.rsp, 96) + mem_tmp0 = [registers.rsp+64] # (Stack) Scratch space. + mem_s3 = [registers.rsp+32] # (Stack) Working copy of s3. (8x) + mem_inc = [registers.rsp] # (Stack) Counter increment vector. + + # Increment the counter for one side of the state vector. + VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0) + VMOVDQU(mem_inc, ymm_tmp0) + reg_tmp = GeneralPurposeRegister32() + MOV(reg_tmp, 0x00000001) + MOV([registers.rsp+16], reg_tmp) + VBROADCASTI128(ymm_v3, x_s3) + VPADDQ(ymm_v3, ymm_v3, [registers.rsp]) + VMOVDQA(mem_s3, ymm_v3) + + # As we process 2xN blocks at a time, so the counter increment for both + # sides of the state vector is 2. + MOV(reg_tmp, 0x00000002) + MOV([registers.rsp], reg_tmp) + MOV([registers.rsp+16], reg_tmp) + + out_write_even = Label() + out_write_odd = Label() + + # + # 8 blocks at a time. Ted Krovetz's avx2 code does not do this, but it's + # a decent gain despite all the pain... 
+ # + + reg_rounds = GeneralPurposeRegister64() + + vector_loop8 = Loop() + SUB(reg_blocks, 8) + JB(vector_loop8.end) + with vector_loop8: + VBROADCASTI128(ymm_v0, x_s0) + VBROADCASTI128(ymm_v1, x_s1) + VBROADCASTI128(ymm_v2, x_s2) + VMOVDQA(ymm_v3, mem_s3) + + VMOVDQA(ymm_v4, ymm_v0) + VMOVDQA(ymm_v5, ymm_v1) + VMOVDQA(ymm_v6, ymm_v2) + VPADDQ(ymm_v7, ymm_v3, mem_inc) + + VMOVDQA(ymm_v8, ymm_v0) + VMOVDQA(ymm_v9, ymm_v1) + VMOVDQA(ymm_v10, ymm_v2) + VPADDQ(ymm_v11, ymm_v7, mem_inc) + + VMOVDQA(ymm_v12, ymm_v0) + VMOVDQA(ymm_v13, ymm_v1) + VMOVDQA(ymm_v14, ymm_v2) + VPADDQ(ymm_v15, ymm_v11, mem_inc) + + MOV(reg_rounds, 20) + rounds_loop8 = Loop() + with rounds_loop8: + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + ADD_avx2(ymm_v8, ymm_v9) + ADD_avx2(ymm_v12, ymm_v13) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + XOR_avx2(ymm_v11, ymm_v8) + XOR_avx2(ymm_v15, ymm_v12) + + VMOVDQA(mem_tmp0, ymm_tmp0) # Save + + ROTW16_avx2(ymm_tmp0, ymm_v3) + ROTW16_avx2(ymm_tmp0, ymm_v7) + ROTW16_avx2(ymm_tmp0, ymm_v11) + ROTW16_avx2(ymm_tmp0, ymm_v15) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + ADD_avx2(ymm_v10, ymm_v11) + ADD_avx2(ymm_v14, ymm_v15) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + XOR_avx2(ymm_v9, ymm_v10) + XOR_avx2(ymm_v13, ymm_v14) + ROTW12_avx2(ymm_tmp0, ymm_v1) + ROTW12_avx2(ymm_tmp0, ymm_v5) + ROTW12_avx2(ymm_tmp0, ymm_v9) + ROTW12_avx2(ymm_tmp0, ymm_v13) + + # a += b; d ^= a; d = ROTW8(d); + VMOVDQA(ymm_tmp0, mem_tmp0) # Restore + + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + ADD_avx2(ymm_v8, ymm_v9) + ADD_avx2(ymm_v12, ymm_v13) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + XOR_avx2(ymm_v11, ymm_v8) + XOR_avx2(ymm_v15, ymm_v12) + + VMOVDQA(mem_tmp0, ymm_tmp0) # Save + + ROTW8_avx2(ymm_tmp0, ymm_v3) + ROTW8_avx2(ymm_tmp0, ymm_v7) + ROTW8_avx2(ymm_tmp0, ymm_v11) + ROTW8_avx2(ymm_tmp0, ymm_v15) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + ADD_avx2(ymm_v10, ymm_v11) + ADD_avx2(ymm_v14, ymm_v15) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + XOR_avx2(ymm_v9, ymm_v10) + XOR_avx2(ymm_v13, ymm_v14) + ROTW7_avx2(ymm_tmp0, ymm_v1) + ROTW7_avx2(ymm_tmp0, ymm_v5) + ROTW7_avx2(ymm_tmp0, ymm_v9) + ROTW7_avx2(ymm_tmp0, ymm_v13) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x39) + VPSHUFD(ymm_v5, ymm_v5, 0x39) + VPSHUFD(ymm_v9, ymm_v9, 0x39) + VPSHUFD(ymm_v13, ymm_v13, 0x39) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v6, ymm_v6, 0x4e) + VPSHUFD(ymm_v10, ymm_v10, 0x4e) + VPSHUFD(ymm_v14, ymm_v14, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x93) + VPSHUFD(ymm_v7, ymm_v7, 0x93) + VPSHUFD(ymm_v11, ymm_v11, 0x93) + VPSHUFD(ymm_v15, ymm_v15, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + VMOVDQA(ymm_tmp0, mem_tmp0) # Restore + + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + ADD_avx2(ymm_v8, ymm_v9) + ADD_avx2(ymm_v12, ymm_v13) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + XOR_avx2(ymm_v11, ymm_v8) + XOR_avx2(ymm_v15, ymm_v12) + + VMOVDQA(mem_tmp0, ymm_tmp0) # Save + + ROTW16_avx2(ymm_tmp0, ymm_v3) + ROTW16_avx2(ymm_tmp0, ymm_v7) + ROTW16_avx2(ymm_tmp0, ymm_v11) + ROTW16_avx2(ymm_tmp0, ymm_v15) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + ADD_avx2(ymm_v10, ymm_v11) + ADD_avx2(ymm_v14, ymm_v15) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + XOR_avx2(ymm_v9, ymm_v10) + XOR_avx2(ymm_v13, ymm_v14) + ROTW12_avx2(ymm_tmp0, ymm_v1) + 
ROTW12_avx2(ymm_tmp0, ymm_v5) + ROTW12_avx2(ymm_tmp0, ymm_v9) + ROTW12_avx2(ymm_tmp0, ymm_v13) + + # a += b; d ^= a; d = ROTW8(d); + VMOVDQA(ymm_tmp0, mem_tmp0) # Restore + + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + ADD_avx2(ymm_v8, ymm_v9) + ADD_avx2(ymm_v12, ymm_v13) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + XOR_avx2(ymm_v11, ymm_v8) + XOR_avx2(ymm_v15, ymm_v12) + + VMOVDQA(mem_tmp0, ymm_tmp0) # Save + + ROTW8_avx2(ymm_tmp0, ymm_v3) + ROTW8_avx2(ymm_tmp0, ymm_v7) + ROTW8_avx2(ymm_tmp0, ymm_v11) + ROTW8_avx2(ymm_tmp0, ymm_v15) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + ADD_avx2(ymm_v10, ymm_v11) + ADD_avx2(ymm_v14, ymm_v15) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + XOR_avx2(ymm_v9, ymm_v10) + XOR_avx2(ymm_v13, ymm_v14) + ROTW7_avx2(ymm_tmp0, ymm_v1) + ROTW7_avx2(ymm_tmp0, ymm_v5) + ROTW7_avx2(ymm_tmp0, ymm_v9) + ROTW7_avx2(ymm_tmp0, ymm_v13) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x93) + VPSHUFD(ymm_v5, ymm_v5, 0x93) + VPSHUFD(ymm_v9, ymm_v9, 0x93) + VPSHUFD(ymm_v13, ymm_v13, 0x93) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v6, ymm_v6, 0x4e) + VPSHUFD(ymm_v10, ymm_v10, 0x4e) + VPSHUFD(ymm_v14, ymm_v14, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x39) + VPSHUFD(ymm_v7, ymm_v7, 0x39) + VPSHUFD(ymm_v11, ymm_v11, 0x39) + VPSHUFD(ymm_v15, ymm_v15, 0x39) + + VMOVDQA(ymm_tmp0, mem_tmp0) # Restore + + SUB(reg_rounds, 2) + JNZ(rounds_loop8.begin) + + # ymm_v12 is in mem_tmp0 and is current.... + + # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA.... + VBROADCASTI128(ymm_tmp0, x_s0) + ADD_avx2(ymm_v0, ymm_tmp0) + ADD_avx2(ymm_v4, ymm_tmp0) + ADD_avx2(ymm_v8, ymm_tmp0) + ADD_avx2(ymm_tmp0, mem_tmp0) + VMOVDQA(mem_tmp0, ymm_tmp0) + + VBROADCASTI128(ymm_tmp0, x_s1) + ADD_avx2(ymm_v1, ymm_tmp0) + ADD_avx2(ymm_v5, ymm_tmp0) + ADD_avx2(ymm_v9, ymm_tmp0) + ADD_avx2(ymm_v13, ymm_tmp0) + + VBROADCASTI128(ymm_tmp0, x_s2) + ADD_avx2(ymm_v2, ymm_tmp0) + ADD_avx2(ymm_v6, ymm_tmp0) + ADD_avx2(ymm_v10, ymm_tmp0) + ADD_avx2(ymm_v14, ymm_tmp0) + + ADD_avx2(ymm_v3, mem_s3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3) + VMOVDQA(ymm_v3, mem_s3) + ADD_avx2(ymm_v3, mem_inc) + + ADD_avx2(ymm_v7, ymm_v3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7) + ADD_avx2(ymm_v3, mem_inc) + + ADD_avx2(ymm_v11, ymm_v3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11) + ADD_avx2(ymm_v3, mem_inc) + + VMOVDQA(ymm_v12, mem_tmp0) + ADD_avx2(ymm_v15, ymm_v3) + WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15) + ADD_avx2(ymm_v3, mem_inc) + + VMOVDQA(mem_s3, ymm_v3) + + ADD(reg_inp, 8 * 64) + ADD(reg_outp, 8 * 64) + + SUB(reg_blocks, 8) + JAE(vector_loop8.begin) + + # ymm_v3 contains a current copy of mem_s3 either from when it was built, + # or because the loop updates it. Copy this before we mess with the block + # counter in case we need to write it back and return. + ymm_s3 = ymm_v11 + VMOVDQA(ymm_s3, ymm_v3) + + ADD(reg_blocks, 8) + JZ(out_write_even) + + # We now actually can do everything in registers. + ymm_s0 = ymm_v8 + VBROADCASTI128(ymm_s0, x_s0) + ymm_s1 = ymm_v9 + VBROADCASTI128(ymm_s1, x_s1) + ymm_s2 = ymm_v10 + VBROADCASTI128(ymm_s2, x_s2) + ymm_inc = ymm_v14 + VMOVDQA(ymm_inc, mem_inc) + + # + # 4 blocks at a time. + # + + process_2_blocks = Label() + SUB(reg_blocks, 4) + JB(process_2_blocks) # < 4 blocks remaining. 
+ + VMOVDQA(ymm_v0, ymm_s0) + VMOVDQA(ymm_v1, ymm_s1) + VMOVDQA(ymm_v2, ymm_s2) + VMOVDQA(ymm_v3, ymm_s3) + + VMOVDQA(ymm_v4, ymm_v0) + VMOVDQA(ymm_v5, ymm_v1) + VMOVDQA(ymm_v6, ymm_v2) + VPADDQ(ymm_v7, ymm_v3, ymm_inc) + + MOV(reg_rounds, 20) + rounds_loop4 = Loop() + with rounds_loop4: + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + ROTW16_avx2(ymm_tmp0, ymm_v3) + ROTW16_avx2(ymm_tmp0, ymm_v7) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + ROTW12_avx2(ymm_tmp0, ymm_v1) + ROTW12_avx2(ymm_tmp0, ymm_v5) + + # a += b; d ^= a; d = ROTW8(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + ROTW8_avx2(ymm_tmp0, ymm_v3) + ROTW8_avx2(ymm_tmp0, ymm_v7) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + ROTW7_avx2(ymm_tmp0, ymm_v1) + ROTW7_avx2(ymm_tmp0, ymm_v5) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x39) + VPSHUFD(ymm_v5, ymm_v5, 0x39) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v6, ymm_v6, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x93) + VPSHUFD(ymm_v7, ymm_v7, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + ROTW16_avx2(ymm_tmp0, ymm_v3) + ROTW16_avx2(ymm_tmp0, ymm_v7) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + ROTW12_avx2(ymm_tmp0, ymm_v1) + ROTW12_avx2(ymm_tmp0, ymm_v5) + + # a += b; d ^= a; d = ROTW8(d); + ADD_avx2(ymm_v0, ymm_v1) + ADD_avx2(ymm_v4, ymm_v5) + XOR_avx2(ymm_v3, ymm_v0) + XOR_avx2(ymm_v7, ymm_v4) + ROTW8_avx2(ymm_tmp0, ymm_v3) + ROTW8_avx2(ymm_tmp0, ymm_v7) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + ADD_avx2(ymm_v6, ymm_v7) + XOR_avx2(ymm_v1, ymm_v2) + XOR_avx2(ymm_v5, ymm_v6) + ROTW7_avx2(ymm_tmp0, ymm_v1) + ROTW7_avx2(ymm_tmp0, ymm_v5) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x93) + VPSHUFD(ymm_v5, ymm_v5, 0x93) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v6, ymm_v6, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x39) + VPSHUFD(ymm_v7, ymm_v7, 0x39) + + SUB(reg_rounds, 2) + JNZ(rounds_loop4.begin) + + ADD_avx2(ymm_v0, ymm_s0) + ADD_avx2(ymm_v1, ymm_s1) + ADD_avx2(ymm_v2, ymm_s2) + ADD_avx2(ymm_v3, ymm_s3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3) + ADD_avx2(ymm_s3, ymm_inc) + + ADD_avx2(ymm_v4, ymm_s0) + ADD_avx2(ymm_v5, ymm_s1) + ADD_avx2(ymm_v6, ymm_s2) + ADD_avx2(ymm_v7, ymm_s3) + WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7) + ADD_avx2(ymm_s3, ymm_inc) + + ADD(reg_inp, 4 * 64) + ADD(reg_outp, 4 * 64) + SUB(reg_blocks, 4) + + LABEL(process_2_blocks) + ADD(reg_blocks, 4) + JZ(out_write_even) # 0 blocks left. + + # + # 2/1 blocks at a time. The two codepaths are unified because + # with AVX2 we do 2 blocks at a time anyway, and this only gets called + # if 3/2/1 blocks are remaining, so the extra branches don't hurt that + # much. 
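+    # For an odd trailing block only the low lanes are written out, and
+    # out_write_odd swaps the two 128-bit lanes of s3 so that the unused
+    # (higher) counter in the other lane is what gets written back.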
+ # + + vector_loop2 = Loop() + with vector_loop2: + VMOVDQA(ymm_v0, ymm_s0) + VMOVDQA(ymm_v1, ymm_s1) + VMOVDQA(ymm_v2, ymm_s2) + VMOVDQA(ymm_v3, ymm_s3) + + MOV(reg_rounds, 20) + rounds_loop2 = Loop() + with rounds_loop2: + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + XOR_avx2(ymm_v3, ymm_v0) + ROTW16_avx2(ymm_tmp0, ymm_v3) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + XOR_avx2(ymm_v1, ymm_v2) + ROTW12_avx2(ymm_tmp0, ymm_v1) + + # a += b; d ^= a; d = ROTW8(d); + ADD_avx2(ymm_v0, ymm_v1) + XOR_avx2(ymm_v3, ymm_v0) + ROTW8_avx2(ymm_tmp0, ymm_v3) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + XOR_avx2(ymm_v1, ymm_v2) + ROTW7_avx2(ymm_tmp0, ymm_v1) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x39) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x93) + + # a += b; d ^= a; d = ROTW16(d); + ADD_avx2(ymm_v0, ymm_v1) + XOR_avx2(ymm_v3, ymm_v0) + ROTW16_avx2(ymm_tmp0, ymm_v3) + + # c += d; b ^= c; b = ROTW12(b); + ADD_avx2(ymm_v2, ymm_v3) + XOR_avx2(ymm_v1, ymm_v2) + ROTW12_avx2(ymm_tmp0, ymm_v1) + + # a += b; d ^= a; d = ROTW8(d); + ADD_avx2(ymm_v0, ymm_v1) + XOR_avx2(ymm_v3, ymm_v0) + ROTW8_avx2(ymm_tmp0, ymm_v3) + + # c += d; b ^= c; b = ROTW7(b) + ADD_avx2(ymm_v2, ymm_v3) + XOR_avx2(ymm_v1, ymm_v2) + ROTW7_avx2(ymm_tmp0, ymm_v1) + + # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d); + VPSHUFD(ymm_v1, ymm_v1, 0x93) + VPSHUFD(ymm_v2, ymm_v2, 0x4e) + VPSHUFD(ymm_v3, ymm_v3, 0x39) + + SUB(reg_rounds, 2) + JNZ(rounds_loop2.begin) + + ADD_avx2(ymm_v0, ymm_s0) + ADD_avx2(ymm_v1, ymm_s1) + ADD_avx2(ymm_v2, ymm_s2) + ADD_avx2(ymm_v3, ymm_s3) + + # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20)); + VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20) + VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp]) + VMOVDQU([reg_outp], ymm_tmp0) + + # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20)); + VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20) + VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32]) + VMOVDQU([reg_outp+32], ymm_tmp0) + + SUB(reg_blocks, 1) + JZ(out_write_odd) + + ADD_avx2(ymm_s3, ymm_inc) + + # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31)); + VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31) + VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64]) + VMOVDQU([reg_outp+64], ymm_tmp0) + + # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31)); + VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31) + VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96]) + VMOVDQU([reg_outp+96], ymm_tmp0) + + SUB(reg_blocks, 1) + JZ(out_write_even) + + ADD(reg_inp, 2 * 64) + ADD(reg_outp, 2 * 64) + JMP(vector_loop2.begin) + + LABEL(out_write_odd) + VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks. + + LABEL(out_write_even) + VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_v3 + + # Paranoia, cleanse the scratch space. + VPXOR(ymm_v0, ymm_v0, ymm_v0) + VMOVDQA(mem_tmp0, ymm_v0) + VMOVDQA(mem_s3, ymm_v0) + + # Clear all YMM (and XMM) registers. + VZEROALL() + + # Remove our stack allocation. 
+ MOV(registers.rsp, reg_sp_save) + + RETURN() + +# +# CPUID +# + +cpuidParams = Argument(ptr(uint32_t)) + +with Function("cpuidAmd64", (cpuidParams,)): + reg_params = registers.r15 + LOAD.ARGUMENT(reg_params, cpuidParams) + + MOV(registers.eax, [reg_params]) + MOV(registers.ecx, [reg_params+8]) + + CPUID() + + MOV([reg_params], registers.eax) + MOV([reg_params+4], registers.ebx) + MOV([reg_params+8], registers.ecx) + MOV([reg_params+12], registers.edx) + + RETURN() + +# +# XGETBV (ECX = 0) +# + +xcrVec = Argument(ptr(uint32_t)) + +with Function("xgetbv0Amd64", (xcrVec,)): + reg_vec = GeneralPurposeRegister64() + + LOAD.ARGUMENT(reg_vec, xcrVec) + + XOR(registers.ecx, registers.ecx) + + XGETBV() + + MOV([reg_vec], registers.eax) + MOV([reg_vec+4], registers.edx) + + RETURN() diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.s b/vendor/github.com/Yawning/chacha20/chacha20_amd64.s new file mode 100644 index 0000000..e3792af --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.s @@ -0,0 +1,1180 @@ +// +build !noasm +// Generated by PeachPy 0.2.0 from chacha20_amd64.py + + +// func blocksAmd64SSE2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint) +TEXT ·blocksAmd64SSE2(SB),4,$0-32 + MOVQ x+0(FP), AX + MOVQ inp+8(FP), BX + MOVQ outp+16(FP), CX + MOVQ nrBlocks+24(FP), DX + MOVQ SP, DI + ANDQ $18446744073709551584, SP + SUBQ $32, SP + PXOR X0, X0 + SUBQ $32, SP + MOVO X0, 0(SP) + MOVL $1, SI + MOVL SI, 0(SP) + SUBQ $4, DX + JCS vector_loop4_end +vector_loop4_begin: + MOVOU 0(AX), X0 + MOVOU 16(AX), X1 + MOVOU 32(AX), X2 + MOVOU 48(AX), X3 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ 0(SP), X7 + MOVO X0, X8 + MOVO X1, X9 + MOVO X2, X10 + MOVO X7, X11 + PADDQ 0(SP), X11 + MOVO X0, X12 + MOVO X1, X13 + MOVO X2, X14 + MOVO X11, X15 + PADDQ 0(SP), X15 + MOVQ $20, SI +rounds_loop4_begin: + PADDL X1, X0 + PADDL X5, X4 + PADDL X9, X8 + PADDL X13, X12 + PXOR X0, X3 + PXOR X4, X7 + PXOR X8, X11 + PXOR X12, X15 + MOVO X12, 16(SP) + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $16, X12 + PSRLL $16, X7 + PXOR X12, X7 + MOVO X11, X12 + PSLLL $16, X12 + PSRLL $16, X11 + PXOR X12, X11 + MOVO X15, X12 + PSLLL $16, X12 + PSRLL $16, X15 + PXOR X12, X15 + PADDL X3, X2 + PADDL X7, X6 + PADDL X11, X10 + PADDL X15, X14 + PXOR X2, X1 + PXOR X6, X5 + PXOR X10, X9 + PXOR X14, X13 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $12, X12 + PSRLL $20, X5 + PXOR X12, X5 + MOVO X9, X12 + PSLLL $12, X12 + PSRLL $20, X9 + PXOR X12, X9 + MOVO X13, X12 + PSLLL $12, X12 + PSRLL $20, X13 + PXOR X12, X13 + MOVO 16(SP), X12 + PADDL X1, X0 + PADDL X5, X4 + PADDL X9, X8 + PADDL X13, X12 + PXOR X0, X3 + PXOR X4, X7 + PXOR X8, X11 + PXOR X12, X15 + MOVO X12, 16(SP) + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $8, X12 + PSRLL $24, X7 + PXOR X12, X7 + MOVO X11, X12 + PSLLL $8, X12 + PSRLL $24, X11 + PXOR X12, X11 + MOVO X15, X12 + PSLLL $8, X12 + PSRLL $24, X15 + PXOR X12, X15 + PADDL X3, X2 + PADDL X7, X6 + PADDL X11, X10 + PADDL X15, X14 + PXOR X2, X1 + PXOR X6, X5 + PXOR X10, X9 + PXOR X14, X13 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $7, X12 + PSRLL $25, X5 + PXOR X12, X5 + MOVO X9, X12 + PSLLL $7, X12 + PSRLL $25, X9 + PXOR X12, X9 + MOVO X13, X12 + PSLLL $7, X12 + PSRLL $25, X13 + PXOR X12, X13 + PSHUFL $57, X1, X1 + PSHUFL $57, X5, X5 + PSHUFL $57, X9, X9 + PSHUFL $57, X13, X13 + PSHUFL $78, X2, X2 + PSHUFL $78, X6, X6 + 
PSHUFL $78, X10, X10 + PSHUFL $78, X14, X14 + PSHUFL $147, X3, X3 + PSHUFL $147, X7, X7 + PSHUFL $147, X11, X11 + PSHUFL $147, X15, X15 + MOVO 16(SP), X12 + PADDL X1, X0 + PADDL X5, X4 + PADDL X9, X8 + PADDL X13, X12 + PXOR X0, X3 + PXOR X4, X7 + PXOR X8, X11 + PXOR X12, X15 + MOVO X12, 16(SP) + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $16, X12 + PSRLL $16, X7 + PXOR X12, X7 + MOVO X11, X12 + PSLLL $16, X12 + PSRLL $16, X11 + PXOR X12, X11 + MOVO X15, X12 + PSLLL $16, X12 + PSRLL $16, X15 + PXOR X12, X15 + PADDL X3, X2 + PADDL X7, X6 + PADDL X11, X10 + PADDL X15, X14 + PXOR X2, X1 + PXOR X6, X5 + PXOR X10, X9 + PXOR X14, X13 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $12, X12 + PSRLL $20, X5 + PXOR X12, X5 + MOVO X9, X12 + PSLLL $12, X12 + PSRLL $20, X9 + PXOR X12, X9 + MOVO X13, X12 + PSLLL $12, X12 + PSRLL $20, X13 + PXOR X12, X13 + MOVO 16(SP), X12 + PADDL X1, X0 + PADDL X5, X4 + PADDL X9, X8 + PADDL X13, X12 + PXOR X0, X3 + PXOR X4, X7 + PXOR X8, X11 + PXOR X12, X15 + MOVO X12, 16(SP) + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $8, X12 + PSRLL $24, X7 + PXOR X12, X7 + MOVO X11, X12 + PSLLL $8, X12 + PSRLL $24, X11 + PXOR X12, X11 + MOVO X15, X12 + PSLLL $8, X12 + PSRLL $24, X15 + PXOR X12, X15 + PADDL X3, X2 + PADDL X7, X6 + PADDL X11, X10 + PADDL X15, X14 + PXOR X2, X1 + PXOR X6, X5 + PXOR X10, X9 + PXOR X14, X13 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $7, X12 + PSRLL $25, X5 + PXOR X12, X5 + MOVO X9, X12 + PSLLL $7, X12 + PSRLL $25, X9 + PXOR X12, X9 + MOVO X13, X12 + PSLLL $7, X12 + PSRLL $25, X13 + PXOR X12, X13 + PSHUFL $147, X1, X1 + PSHUFL $147, X5, X5 + PSHUFL $147, X9, X9 + PSHUFL $147, X13, X13 + PSHUFL $78, X2, X2 + PSHUFL $78, X6, X6 + PSHUFL $78, X10, X10 + PSHUFL $78, X14, X14 + PSHUFL $57, X3, X3 + PSHUFL $57, X7, X7 + PSHUFL $57, X11, X11 + PSHUFL $57, X15, X15 + MOVO 16(SP), X12 + SUBQ $2, SI + JNE rounds_loop4_begin + MOVO X12, 16(SP) + PADDL 0(AX), X0 + PADDL 16(AX), X1 + PADDL 32(AX), X2 + PADDL 48(AX), X3 + MOVOU 0(BX), X12 + PXOR X0, X12 + MOVOU X12, 0(CX) + MOVOU 16(BX), X12 + PXOR X1, X12 + MOVOU X12, 16(CX) + MOVOU 32(BX), X12 + PXOR X2, X12 + MOVOU X12, 32(CX) + MOVOU 48(BX), X12 + PXOR X3, X12 + MOVOU X12, 48(CX) + MOVOU 48(AX), X3 + PADDQ 0(SP), X3 + PADDL 0(AX), X4 + PADDL 16(AX), X5 + PADDL 32(AX), X6 + PADDL X3, X7 + MOVOU 64(BX), X12 + PXOR X4, X12 + MOVOU X12, 64(CX) + MOVOU 80(BX), X12 + PXOR X5, X12 + MOVOU X12, 80(CX) + MOVOU 96(BX), X12 + PXOR X6, X12 + MOVOU X12, 96(CX) + MOVOU 112(BX), X12 + PXOR X7, X12 + MOVOU X12, 112(CX) + PADDQ 0(SP), X3 + PADDL 0(AX), X8 + PADDL 16(AX), X9 + PADDL 32(AX), X10 + PADDL X3, X11 + MOVOU 128(BX), X12 + PXOR X8, X12 + MOVOU X12, 128(CX) + MOVOU 144(BX), X12 + PXOR X9, X12 + MOVOU X12, 144(CX) + MOVOU 160(BX), X12 + PXOR X10, X12 + MOVOU X12, 160(CX) + MOVOU 176(BX), X12 + PXOR X11, X12 + MOVOU X12, 176(CX) + PADDQ 0(SP), X3 + MOVO 16(SP), X12 + PADDL 0(AX), X12 + PADDL 16(AX), X13 + PADDL 32(AX), X14 + PADDL X3, X15 + MOVOU 192(BX), X0 + PXOR X12, X0 + MOVOU X0, 192(CX) + MOVOU 208(BX), X0 + PXOR X13, X0 + MOVOU X0, 208(CX) + MOVOU 224(BX), X0 + PXOR X14, X0 + MOVOU X0, 224(CX) + MOVOU 240(BX), X0 + PXOR X15, X0 + MOVOU X0, 240(CX) + PADDQ 0(SP), X3 + MOVOU X3, 48(AX) + ADDQ $256, BX + ADDQ $256, CX + SUBQ $4, DX + JCC vector_loop4_begin +vector_loop4_end: + ADDQ $4, DX + JEQ out + MOVOU 0(AX), X8 + MOVOU 16(AX), X9 + MOVOU 32(AX), X10 + MOVOU 
48(AX), X11 + MOVO 0(SP), X13 + SUBQ $2, DX + JCS process_1_block + MOVO X8, X0 + MOVO X9, X1 + MOVO X10, X2 + MOVO X11, X3 + MOVO X0, X4 + MOVO X1, X5 + MOVO X2, X6 + MOVO X3, X7 + PADDQ X13, X7 + MOVQ $20, SI +rounds_loop2_begin: + PADDL X1, X0 + PADDL X5, X4 + PXOR X0, X3 + PXOR X4, X7 + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $16, X12 + PSRLL $16, X7 + PXOR X12, X7 + PADDL X3, X2 + PADDL X7, X6 + PXOR X2, X1 + PXOR X6, X5 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $12, X12 + PSRLL $20, X5 + PXOR X12, X5 + PADDL X1, X0 + PADDL X5, X4 + PXOR X0, X3 + PXOR X4, X7 + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $8, X12 + PSRLL $24, X7 + PXOR X12, X7 + PADDL X3, X2 + PADDL X7, X6 + PXOR X2, X1 + PXOR X6, X5 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $7, X12 + PSRLL $25, X5 + PXOR X12, X5 + PSHUFL $57, X1, X1 + PSHUFL $57, X5, X5 + PSHUFL $78, X2, X2 + PSHUFL $78, X6, X6 + PSHUFL $147, X3, X3 + PSHUFL $147, X7, X7 + PADDL X1, X0 + PADDL X5, X4 + PXOR X0, X3 + PXOR X4, X7 + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $16, X12 + PSRLL $16, X7 + PXOR X12, X7 + PADDL X3, X2 + PADDL X7, X6 + PXOR X2, X1 + PXOR X6, X5 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $12, X12 + PSRLL $20, X5 + PXOR X12, X5 + PADDL X1, X0 + PADDL X5, X4 + PXOR X0, X3 + PXOR X4, X7 + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + MOVO X7, X12 + PSLLL $8, X12 + PSRLL $24, X7 + PXOR X12, X7 + PADDL X3, X2 + PADDL X7, X6 + PXOR X2, X1 + PXOR X6, X5 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + MOVO X5, X12 + PSLLL $7, X12 + PSRLL $25, X5 + PXOR X12, X5 + PSHUFL $147, X1, X1 + PSHUFL $147, X5, X5 + PSHUFL $78, X2, X2 + PSHUFL $78, X6, X6 + PSHUFL $57, X3, X3 + PSHUFL $57, X7, X7 + SUBQ $2, SI + JNE rounds_loop2_begin + PADDL X8, X0 + PADDL X9, X1 + PADDL X10, X2 + PADDL X11, X3 + MOVOU 0(BX), X12 + PXOR X0, X12 + MOVOU X12, 0(CX) + MOVOU 16(BX), X12 + PXOR X1, X12 + MOVOU X12, 16(CX) + MOVOU 32(BX), X12 + PXOR X2, X12 + MOVOU X12, 32(CX) + MOVOU 48(BX), X12 + PXOR X3, X12 + MOVOU X12, 48(CX) + PADDQ X13, X11 + PADDL X8, X4 + PADDL X9, X5 + PADDL X10, X6 + PADDL X11, X7 + MOVOU 64(BX), X12 + PXOR X4, X12 + MOVOU X12, 64(CX) + MOVOU 80(BX), X12 + PXOR X5, X12 + MOVOU X12, 80(CX) + MOVOU 96(BX), X12 + PXOR X6, X12 + MOVOU X12, 96(CX) + MOVOU 112(BX), X12 + PXOR X7, X12 + MOVOU X12, 112(CX) + PADDQ X13, X11 + ADDQ $128, BX + ADDQ $128, CX + SUBQ $2, DX +process_1_block: + ADDQ $2, DX + JEQ out_serial + MOVO X8, X0 + MOVO X9, X1 + MOVO X10, X2 + MOVO X11, X3 + MOVQ $20, SI +rounds_loop1_begin: + PADDL X1, X0 + PXOR X0, X3 + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + PADDL X3, X2 + PXOR X2, X1 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + PADDL X1, X0 + PXOR X0, X3 + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + PADDL X3, X2 + PXOR X2, X1 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL $25, X1 + PXOR X12, X1 + PSHUFL $57, X1, X1 + PSHUFL $78, X2, X2 + PSHUFL $147, X3, X3 + PADDL X1, X0 + PXOR X0, X3 + MOVO X3, X12 + PSLLL $16, X12 + PSRLL $16, X3 + PXOR X12, X3 + PADDL X3, X2 + PXOR X2, X1 + MOVO X1, X12 + PSLLL $12, X12 + PSRLL $20, X1 + PXOR X12, X1 + PADDL X1, X0 + PXOR X0, X3 + MOVO X3, X12 + PSLLL $8, X12 + PSRLL $24, X3 + PXOR X12, X3 + PADDL X3, X2 + PXOR X2, X1 + MOVO X1, X12 + PSLLL $7, X12 + PSRLL 
$25, X1 + PXOR X12, X1 + PSHUFL $147, X1, X1 + PSHUFL $78, X2, X2 + PSHUFL $57, X3, X3 + SUBQ $2, SI + JNE rounds_loop1_begin + PADDL X8, X0 + PADDL X9, X1 + PADDL X10, X2 + PADDL X11, X3 + MOVOU 0(BX), X12 + PXOR X0, X12 + MOVOU X12, 0(CX) + MOVOU 16(BX), X12 + PXOR X1, X12 + MOVOU X12, 16(CX) + MOVOU 32(BX), X12 + PXOR X2, X12 + MOVOU X12, 32(CX) + MOVOU 48(BX), X12 + PXOR X3, X12 + MOVOU X12, 48(CX) + PADDQ X13, X11 +out_serial: + MOVOU X11, 48(AX) +out: + PXOR X0, X0 + MOVO X0, 16(SP) + MOVQ DI, SP + RET + +// func blocksAmd64AVX2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint) +TEXT ·blocksAmd64AVX2(SB),4,$0-32 + MOVQ x+0(FP), AX + MOVQ inp+8(FP), BX + MOVQ outp+16(FP), CX + MOVQ nrBlocks+24(FP), DX + MOVQ SP, DI + ANDQ $18446744073709551584, SP + SUBQ $32, SP + SUBQ $96, SP + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm0, ymm0, ymm0 + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x04; BYTE $0x24 // VMOVDQU [rsp], ymm0 + MOVL $1, SI + MOVL SI, 16(SP) + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x48; BYTE $0x30 // VBROADCASTI128 ymm1, [rax + 48] + BYTE $0xC5; BYTE $0xF5; BYTE $0xD4; BYTE $0x0C; BYTE $0x24 // VPADDQ ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1 + MOVL $2, SI + MOVL SI, 0(SP) + MOVL SI, 16(SP) + SUBQ $8, DX + JCS vector_loop8_end +vector_loop8_begin: + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x10 // VBROADCASTI128 ymm2, [rax] + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x10 // VBROADCASTI128 ymm3, [rax + 16] + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x60; BYTE $0x20 // VBROADCASTI128 ymm4, [rax + 32] + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32] + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4 + BYTE $0xC5; BYTE $0x75; BYTE $0xD4; BYTE $0x04; BYTE $0x24 // VPADDQ ymm8, ymm1, [rsp] + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xCA // VMOVDQA ymm9, ymm2 + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xD3 // VMOVDQA ymm10, ymm3 + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xDC // VMOVDQA ymm11, ymm4 + BYTE $0xC5; BYTE $0x3D; BYTE $0xD4; BYTE $0x24; BYTE $0x24 // VPADDQ ymm12, ymm8, [rsp] + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xC2 // VMOVDQA ymm0, ymm2 + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xEB // VMOVDQA ymm13, ymm3 + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xF4 // VMOVDQA ymm14, ymm4 + BYTE $0xC5; BYTE $0x1D; BYTE $0xD4; BYTE $0x3C; BYTE $0x24 // VPADDQ ymm15, ymm12, [rsp] + MOVQ $20, SI +rounds_loop8_begin: + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE 
$0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16 + BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12 + BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12 + BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20 + BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12 + BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20 + BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE 
$0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24 + BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12 + BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7 + BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25 + BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7 + BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25 + BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x39 // VPSHUFD ymm10, ymm10, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm13, ymm13, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD 
ymm14, ymm14, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x93 // VPSHUFD ymm12, ymm12, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm15, ymm15, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16 + BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12 + BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE 
$0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12 + BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20 + BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12 + BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20 + BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24 + BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24 + BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12 + BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 
// VPXOR ymm6, ymm6, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7 + BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25 + BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7 + BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25 + BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x93 // VPSHUFD ymm10, ymm10, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm13, ymm13, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x39 // VPSHUFD ymm12, ymm12, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm15, ymm15, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + SUBQ $2, SI + JNE rounds_loop8_begin + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x00 // VBROADCASTI128 ymm0, [rax] + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm2, ymm2, ymm0 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm5, ymm5, ymm0 + BYTE $0xC5; BYTE $0x35; BYTE $0xFE; BYTE $0xC8 // VPADDD ymm9, ymm9, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0xFE; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VPADDD ymm0, ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0 + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x10 // VBROADCASTI128 ymm0, [rax + 16] + BYTE $0xC5; BYTE $0xE5; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0x2D; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm10, ymm10, ymm0 + BYTE $0xC5; BYTE $0x15; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm13, ymm13, ymm0 + BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x20 // VBROADCASTI128 ymm0, [rax + 32] + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE0 // VPADDD ymm4, ymm4, ymm0 + BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm0 + BYTE $0xC5; BYTE $0x25; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm11, ymm11, ymm0 + BYTE $0xC5; BYTE $0x0D; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm14, ymm14, ymm0 + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VPADDD ymm1, ymm1, [rsp + 32] + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE 
$0x01 // VMOVDQU [rcx], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32] + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm1 + BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 // VPERM2I128 ymm0, ymm5, ymm6, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0 + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0x1D; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm12, ymm12, ymm1 + BYTE $0xC4; BYTE $0xC3; BYTE $0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x20 // VPERM2I128 ymm0, ymm9, ymm10, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 256] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 256], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x20 // VPERM2I128 ymm0, ymm11, ymm12, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 288] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 288], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE 
$0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x31 // VPERM2I128 ymm0, ymm9, ymm10, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 320] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 320], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x31 // VPERM2I128 ymm0, ymm11, ymm12, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 352] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 352], ymm0 + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64] + BYTE $0xC5; BYTE $0x05; BYTE $0xFE; BYTE $0xF9 // VPADDD ymm15, ymm15, ymm1 + BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x20 // VPERM2I128 ymm2, ymm0, ymm13, 32 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 384] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 384], ymm2 + BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x20 // VPERM2I128 ymm2, ymm14, ymm15, 32 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 416] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 416], ymm2 + BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x31 // VPERM2I128 ymm2, ymm0, ymm13, 49 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 448] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 448], ymm2 + BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x31 // VPERM2I128 ymm2, ymm14, ymm15, 49 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 480] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 480], ymm2 + BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp] + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1 + ADDQ $512, BX + ADDQ $512, CX + SUBQ $8, DX + JCC vector_loop8_begin +vector_loop8_end: + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xE1 // VMOVDQA ymm12, ymm1 + ADDQ $8, DX + JEQ out_write_even + BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x08 // VBROADCASTI128 ymm9, [rax] + BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x50; BYTE $0x10 // VBROADCASTI128 ymm10, [rax + 16] + BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x20 // VBROADCASTI128 ymm11, [rax + 32] + BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0x34; BYTE $0x24 // VMOVDQA ymm14, [rsp] + SUBQ $4, DX + JCS process_2_blocks + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12 + BYTE $0xC5; BYTE 
$0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3 + BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4 + BYTE $0xC4; BYTE $0x41; BYTE $0x75; BYTE $0xD4; BYTE $0xC6 // VPADDQ ymm8, ymm1, ymm14 + MOVQ $20, SI +rounds_loop4_begin: + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR 
ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8 + BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24 + BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, 
ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7 + BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25 + BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57 + BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57 + SUBQ $2, SI + JNE rounds_loop4_begin + BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9 + BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11 + BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14 + BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm9 + BYTE $0xC4; BYTE $0xC1; BYTE $0x4D; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm11 + BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC4 // VPADDD ymm8, ymm8, ymm12 + BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 
// VPERM2I128 ymm0, ymm5, ymm6, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0 + BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0 + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14 + ADDQ $256, BX + ADDQ $256, CX + SUBQ $4, DX +process_2_blocks: + ADDQ $4, DX + JEQ out_write_even +vector_loop2_begin: + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11 + BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12 + MOVQ $20, SI +rounds_loop2_begin: + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; 
BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8 + BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24 + BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0 + BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4 + BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7 + BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25 + BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78 + BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57 + SUBQ $2, SI + JNE rounds_loop2_begin + BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9 + BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10 + BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11 + BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0 + SUBQ $1, DX + JEQ out_write_odd + BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14 + BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0 + BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49 + BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96] + BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0 + SUBQ $1, DX + JEQ out_write_even + ADDQ $128, BX + ADDQ $128, CX + JMP vector_loop2_begin +out_write_odd: + BYTE $0xC4; BYTE $0x43; BYTE $0x1D; BYTE $0x46; BYTE $0xE4; BYTE $0x01 // VPERM2I128 ymm12, ymm12, ymm12, 1 +out_write_even: + BYTE $0xC5; BYTE $0x7A; BYTE $0x7F; BYTE $0x60; BYTE $0x30 // VMOVDQU [rax + 48], xmm12 + BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0xD2 // VPXOR ymm2, ymm2, ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm2 + BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE 
$0x20 // VMOVDQA [rsp + 32], ymm2 + BYTE $0xC5; BYTE $0xFC; BYTE $0x77 // VZEROALL + MOVQ DI, SP + RET + +// func cpuidAmd64(cpuidParams *uint32) +TEXT ·cpuidAmd64(SB),4,$0-8 + MOVQ cpuidParams+0(FP), R15 + MOVL 0(R15), AX + MOVL 8(R15), CX + CPUID + MOVL AX, 0(R15) + MOVL BX, 4(R15) + MOVL CX, 8(R15) + MOVL DX, 12(R15) + RET + +// func xgetbv0Amd64(xcrVec *uint32) +TEXT ·xgetbv0Amd64(SB),4,$0-8 + MOVQ xcrVec+0(FP), BX + XORL CX, CX + BYTE $0x0F; BYTE $0x01; BYTE $0xD0 // XGETBV + MOVL AX, 0(BX) + MOVL DX, 4(BX) + RET diff --git a/vendor/github.com/Yawning/chacha20/chacha20_ref.go b/vendor/github.com/Yawning/chacha20/chacha20_ref.go new file mode 100644 index 0000000..fcdc8c6 --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20_ref.go @@ -0,0 +1,394 @@ +// chacha20_ref.go - Reference ChaCha20. +// +// To the extent possible under law, Yawning Angel has waived all copyright +// and related or neighboring rights to chacha20, using the Creative +// Commons "CC0" public domain dedication. See LICENSE or +// for full details. + +// +build !go1.9 + +package chacha20 + +import ( + "encoding/binary" + "math" + "unsafe" +) + +func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) { + if isIetf { + var totalBlocks uint64 + totalBlocks = uint64(x[12]) + uint64(nrBlocks) + if totalBlocks > math.MaxUint32 { + panic("chacha20: Exceeded keystream per nonce limit") + } + } + + // This routine ignores x[0]...x[4] in favor the const values since it's + // ever so slightly faster. + + for n := 0; n < nrBlocks; n++ { + x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3 + x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + + for i := chachaRounds; i > 0; i -= 2 { + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ^= x0 + x12 = (x12 << 16) | (x12 >> 16) + x8 += x12 + x4 ^= x8 + x4 = (x4 << 12) | (x4 >> 20) + x0 += x4 + x12 ^= x0 + x12 = (x12 << 8) | (x12 >> 24) + x8 += x12 + x4 ^= x8 + x4 = (x4 << 7) | (x4 >> 25) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ^= x1 + x13 = (x13 << 16) | (x13 >> 16) + x9 += x13 + x5 ^= x9 + x5 = (x5 << 12) | (x5 >> 20) + x1 += x5 + x13 ^= x1 + x13 = (x13 << 8) | (x13 >> 24) + x9 += x13 + x5 ^= x9 + x5 = (x5 << 7) | (x5 >> 25) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ^= x2 + x14 = (x14 << 16) | (x14 >> 16) + x10 += x14 + x6 ^= x10 + x6 = (x6 << 12) | (x6 >> 20) + x2 += x6 + x14 ^= x2 + x14 = (x14 << 8) | (x14 >> 24) + x10 += x14 + x6 ^= x10 + x6 = (x6 << 7) | (x6 >> 25) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ^= x3 + x15 = (x15 << 16) | (x15 >> 16) + x11 += x15 + x7 ^= x11 + x7 = (x7 << 12) | (x7 >> 20) + x3 += x7 + x15 ^= x3 + x15 = (x15 << 8) | (x15 >> 24) + x11 += x15 + x7 ^= x11 + x7 = (x7 << 7) | (x7 >> 25) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ^= x0 + x15 = (x15 << 16) | (x15 >> 16) + x10 += x15 + x5 ^= x10 + x5 = (x5 << 12) | (x5 >> 20) + x0 += x5 + x15 ^= x0 + x15 = (x15 << 8) | (x15 >> 24) + x10 += x15 + x5 ^= x10 + x5 = (x5 << 7) | (x5 >> 25) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ^= x1 + x12 = (x12 << 16) | (x12 >> 16) + x11 += x12 + x6 ^= x11 + x6 = (x6 << 12) | (x6 >> 20) + x1 += x6 + x12 ^= x1 + x12 = (x12 << 8) | (x12 >> 24) + x11 += x12 + x6 ^= x11 + x6 = (x6 << 7) | (x6 >> 25) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ^= x2 + x13 = (x13 << 16) | (x13 >> 16) + x8 += x13 + x7 ^= x8 + x7 = (x7 << 12) | (x7 >> 20) + x2 += x7 + x13 ^= x2 + x13 = (x13 << 8) | (x13 >> 24) + 
x8 += x13 + x7 ^= x8 + x7 = (x7 << 7) | (x7 >> 25) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ^= x3 + x14 = (x14 << 16) | (x14 >> 16) + x9 += x14 + x4 ^= x9 + x4 = (x4 << 12) | (x4 >> 20) + x3 += x4 + x14 ^= x3 + x14 = (x14 << 8) | (x14 >> 24) + x9 += x14 + x4 ^= x9 + x4 = (x4 << 7) | (x4 >> 25) + } + + // On amd64 at least, this is a rather big boost. + if useUnsafe { + if in != nil { + inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize])) + outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize])) + outArr[0] = inArr[0] ^ (x0 + sigma0) + outArr[1] = inArr[1] ^ (x1 + sigma1) + outArr[2] = inArr[2] ^ (x2 + sigma2) + outArr[3] = inArr[3] ^ (x3 + sigma3) + outArr[4] = inArr[4] ^ (x4 + x[4]) + outArr[5] = inArr[5] ^ (x5 + x[5]) + outArr[6] = inArr[6] ^ (x6 + x[6]) + outArr[7] = inArr[7] ^ (x7 + x[7]) + outArr[8] = inArr[8] ^ (x8 + x[8]) + outArr[9] = inArr[9] ^ (x9 + x[9]) + outArr[10] = inArr[10] ^ (x10 + x[10]) + outArr[11] = inArr[11] ^ (x11 + x[11]) + outArr[12] = inArr[12] ^ (x12 + x[12]) + outArr[13] = inArr[13] ^ (x13 + x[13]) + outArr[14] = inArr[14] ^ (x14 + x[14]) + outArr[15] = inArr[15] ^ (x15 + x[15]) + } else { + outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize])) + outArr[0] = x0 + sigma0 + outArr[1] = x1 + sigma1 + outArr[2] = x2 + sigma2 + outArr[3] = x3 + sigma3 + outArr[4] = x4 + x[4] + outArr[5] = x5 + x[5] + outArr[6] = x6 + x[6] + outArr[7] = x7 + x[7] + outArr[8] = x8 + x[8] + outArr[9] = x9 + x[9] + outArr[10] = x10 + x[10] + outArr[11] = x11 + x[11] + outArr[12] = x12 + x[12] + outArr[13] = x13 + x[13] + outArr[14] = x14 + x[14] + outArr[15] = x15 + x[15] + } + } else { + // Slow path, either the architecture cares about alignment, or is not little endian. + x0 += sigma0 + x1 += sigma1 + x2 += sigma2 + x3 += sigma3 + x4 += x[4] + x5 += x[5] + x6 += x[6] + x7 += x[7] + x8 += x[8] + x9 += x[9] + x10 += x[10] + x11 += x[11] + x12 += x[12] + x13 += x[13] + x14 += x[14] + x15 += x[15] + if in != nil { + binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0) + binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1) + binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2) + binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3) + binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4) + binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5) + binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6) + binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7) + binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8) + binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9) + binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10) + binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11) + binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12) + binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13) + binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14) + binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15) + in = in[BlockSize:] + } else { + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + 
binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x4) + binary.LittleEndian.PutUint32(out[20:24], x5) + binary.LittleEndian.PutUint32(out[24:28], x6) + binary.LittleEndian.PutUint32(out[28:32], x7) + binary.LittleEndian.PutUint32(out[32:36], x8) + binary.LittleEndian.PutUint32(out[36:40], x9) + binary.LittleEndian.PutUint32(out[40:44], x10) + binary.LittleEndian.PutUint32(out[44:48], x11) + binary.LittleEndian.PutUint32(out[48:52], x12) + binary.LittleEndian.PutUint32(out[52:56], x13) + binary.LittleEndian.PutUint32(out[56:60], x14) + binary.LittleEndian.PutUint32(out[60:64], x15) + } + out = out[BlockSize:] + } + + // Stoping at 2^70 bytes per nonce is the user's responsibility. + ctr := uint64(x[13])<<32 | uint64(x[12]) + ctr++ + x[12] = uint32(ctr) + x[13] = uint32(ctr >> 32) + } +} + +func hChaChaRef(x *[stateSize]uint32, out *[32]byte) { + x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3 + x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11] + + for i := chachaRounds; i > 0; i -= 2 { + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ^= x0 + x12 = (x12 << 16) | (x12 >> 16) + x8 += x12 + x4 ^= x8 + x4 = (x4 << 12) | (x4 >> 20) + x0 += x4 + x12 ^= x0 + x12 = (x12 << 8) | (x12 >> 24) + x8 += x12 + x4 ^= x8 + x4 = (x4 << 7) | (x4 >> 25) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ^= x1 + x13 = (x13 << 16) | (x13 >> 16) + x9 += x13 + x5 ^= x9 + x5 = (x5 << 12) | (x5 >> 20) + x1 += x5 + x13 ^= x1 + x13 = (x13 << 8) | (x13 >> 24) + x9 += x13 + x5 ^= x9 + x5 = (x5 << 7) | (x5 >> 25) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ^= x2 + x14 = (x14 << 16) | (x14 >> 16) + x10 += x14 + x6 ^= x10 + x6 = (x6 << 12) | (x6 >> 20) + x2 += x6 + x14 ^= x2 + x14 = (x14 << 8) | (x14 >> 24) + x10 += x14 + x6 ^= x10 + x6 = (x6 << 7) | (x6 >> 25) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ^= x3 + x15 = (x15 << 16) | (x15 >> 16) + x11 += x15 + x7 ^= x11 + x7 = (x7 << 12) | (x7 >> 20) + x3 += x7 + x15 ^= x3 + x15 = (x15 << 8) | (x15 >> 24) + x11 += x15 + x7 ^= x11 + x7 = (x7 << 7) | (x7 >> 25) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ^= x0 + x15 = (x15 << 16) | (x15 >> 16) + x10 += x15 + x5 ^= x10 + x5 = (x5 << 12) | (x5 >> 20) + x0 += x5 + x15 ^= x0 + x15 = (x15 << 8) | (x15 >> 24) + x10 += x15 + x5 ^= x10 + x5 = (x5 << 7) | (x5 >> 25) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ^= x1 + x12 = (x12 << 16) | (x12 >> 16) + x11 += x12 + x6 ^= x11 + x6 = (x6 << 12) | (x6 >> 20) + x1 += x6 + x12 ^= x1 + x12 = (x12 << 8) | (x12 >> 24) + x11 += x12 + x6 ^= x11 + x6 = (x6 << 7) | (x6 >> 25) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ^= x2 + x13 = (x13 << 16) | (x13 >> 16) + x8 += x13 + x7 ^= x8 + x7 = (x7 << 12) | (x7 >> 20) + x2 += x7 + x13 ^= x2 + x13 = (x13 << 8) | (x13 >> 24) + x8 += x13 + x7 ^= x8 + x7 = (x7 << 7) | (x7 >> 25) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ^= x3 + x14 = (x14 << 16) | (x14 >> 16) + x9 += x14 + x4 ^= x9 + x4 = (x4 << 12) | (x4 >> 20) + x3 += x4 + x14 ^= x3 + x14 = (x14 << 8) | (x14 >> 24) + x9 += x14 + x4 ^= x9 + x4 = (x4 << 7) | (x4 >> 25) + } + + // HChaCha returns x0...x3 | x12...x15, which corresponds to the + // indexes of the ChaCha constant and the indexes of the IV. 
+ if useUnsafe { + outArr := (*[16]uint32)(unsafe.Pointer(&out[0])) + outArr[0] = x0 + outArr[1] = x1 + outArr[2] = x2 + outArr[3] = x3 + outArr[4] = x12 + outArr[5] = x13 + outArr[6] = x14 + outArr[7] = x15 + } else { + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x12) + binary.LittleEndian.PutUint32(out[20:24], x13) + binary.LittleEndian.PutUint32(out[24:28], x14) + binary.LittleEndian.PutUint32(out[28:32], x15) + } + return +} diff --git a/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go b/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go new file mode 100644 index 0000000..8405c22 --- /dev/null +++ b/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go @@ -0,0 +1,395 @@ +// chacha20_ref.go - Reference ChaCha20. +// +// To the extent possible under law, Yawning Angel has waived all copyright +// and related or neighboring rights to chacha20, using the Creative +// Commons "CC0" public domain dedication. See LICENSE or +// for full details. + +// +build go1.9 + +package chacha20 + +import ( + "encoding/binary" + "math" + "math/bits" + "unsafe" +) + +func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) { + if isIetf { + var totalBlocks uint64 + totalBlocks = uint64(x[12]) + uint64(nrBlocks) + if totalBlocks > math.MaxUint32 { + panic("chacha20: Exceeded keystream per nonce limit") + } + } + + // This routine ignores x[0]...x[4] in favor the const values since it's + // ever so slightly faster. + + for n := 0; n < nrBlocks; n++ { + x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3 + x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15] + + for i := chachaRounds; i > 0; i -= 2 { + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ^= x0 + x12 = bits.RotateLeft32(x12, 16) + x8 += x12 + x4 ^= x8 + x4 = bits.RotateLeft32(x4, 12) + x0 += x4 + x12 ^= x0 + x12 = bits.RotateLeft32(x12, 8) + x8 += x12 + x4 ^= x8 + x4 = bits.RotateLeft32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ^= x1 + x13 = bits.RotateLeft32(x13, 16) + x9 += x13 + x5 ^= x9 + x5 = bits.RotateLeft32(x5, 12) + x1 += x5 + x13 ^= x1 + x13 = bits.RotateLeft32(x13, 8) + x9 += x13 + x5 ^= x9 + x5 = bits.RotateLeft32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ^= x2 + x14 = bits.RotateLeft32(x14, 16) + x10 += x14 + x6 ^= x10 + x6 = bits.RotateLeft32(x6, 12) + x2 += x6 + x14 ^= x2 + x14 = bits.RotateLeft32(x14, 8) + x10 += x14 + x6 ^= x10 + x6 = bits.RotateLeft32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ^= x3 + x15 = bits.RotateLeft32(x15, 16) + x11 += x15 + x7 ^= x11 + x7 = bits.RotateLeft32(x7, 12) + x3 += x7 + x15 ^= x3 + x15 = bits.RotateLeft32(x15, 8) + x11 += x15 + x7 ^= x11 + x7 = bits.RotateLeft32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ^= x0 + x15 = bits.RotateLeft32(x15, 16) + x10 += x15 + x5 ^= x10 + x5 = bits.RotateLeft32(x5, 12) + x0 += x5 + x15 ^= x0 + x15 = bits.RotateLeft32(x15, 8) + x10 += x15 + x5 ^= x10 + x5 = bits.RotateLeft32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ^= x1 + x12 = bits.RotateLeft32(x12, 16) + x11 += x12 + x6 ^= x11 + x6 = bits.RotateLeft32(x6, 12) + x1 += x6 + x12 ^= x1 + x12 = bits.RotateLeft32(x12, 8) + x11 += x12 + x6 ^= x11 + x6 = bits.RotateLeft32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 
+ x13 ^= x2 + x13 = bits.RotateLeft32(x13, 16) + x8 += x13 + x7 ^= x8 + x7 = bits.RotateLeft32(x7, 12) + x2 += x7 + x13 ^= x2 + x13 = bits.RotateLeft32(x13, 8) + x8 += x13 + x7 ^= x8 + x7 = bits.RotateLeft32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ^= x3 + x14 = bits.RotateLeft32(x14, 16) + x9 += x14 + x4 ^= x9 + x4 = bits.RotateLeft32(x4, 12) + x3 += x4 + x14 ^= x3 + x14 = bits.RotateLeft32(x14, 8) + x9 += x14 + x4 ^= x9 + x4 = bits.RotateLeft32(x4, 7) + } + + // On amd64 at least, this is a rather big boost. + if useUnsafe { + if in != nil { + inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize])) + outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize])) + outArr[0] = inArr[0] ^ (x0 + sigma0) + outArr[1] = inArr[1] ^ (x1 + sigma1) + outArr[2] = inArr[2] ^ (x2 + sigma2) + outArr[3] = inArr[3] ^ (x3 + sigma3) + outArr[4] = inArr[4] ^ (x4 + x[4]) + outArr[5] = inArr[5] ^ (x5 + x[5]) + outArr[6] = inArr[6] ^ (x6 + x[6]) + outArr[7] = inArr[7] ^ (x7 + x[7]) + outArr[8] = inArr[8] ^ (x8 + x[8]) + outArr[9] = inArr[9] ^ (x9 + x[9]) + outArr[10] = inArr[10] ^ (x10 + x[10]) + outArr[11] = inArr[11] ^ (x11 + x[11]) + outArr[12] = inArr[12] ^ (x12 + x[12]) + outArr[13] = inArr[13] ^ (x13 + x[13]) + outArr[14] = inArr[14] ^ (x14 + x[14]) + outArr[15] = inArr[15] ^ (x15 + x[15]) + } else { + outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize])) + outArr[0] = x0 + sigma0 + outArr[1] = x1 + sigma1 + outArr[2] = x2 + sigma2 + outArr[3] = x3 + sigma3 + outArr[4] = x4 + x[4] + outArr[5] = x5 + x[5] + outArr[6] = x6 + x[6] + outArr[7] = x7 + x[7] + outArr[8] = x8 + x[8] + outArr[9] = x9 + x[9] + outArr[10] = x10 + x[10] + outArr[11] = x11 + x[11] + outArr[12] = x12 + x[12] + outArr[13] = x13 + x[13] + outArr[14] = x14 + x[14] + outArr[15] = x15 + x[15] + } + } else { + // Slow path, either the architecture cares about alignment, or is not little endian. 
+ x0 += sigma0 + x1 += sigma1 + x2 += sigma2 + x3 += sigma3 + x4 += x[4] + x5 += x[5] + x6 += x[6] + x7 += x[7] + x8 += x[8] + x9 += x[9] + x10 += x[10] + x11 += x[11] + x12 += x[12] + x13 += x[13] + x14 += x[14] + x15 += x[15] + if in != nil { + binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0) + binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1) + binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2) + binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3) + binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4) + binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5) + binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6) + binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7) + binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8) + binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9) + binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10) + binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11) + binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12) + binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13) + binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14) + binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15) + in = in[BlockSize:] + } else { + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x4) + binary.LittleEndian.PutUint32(out[20:24], x5) + binary.LittleEndian.PutUint32(out[24:28], x6) + binary.LittleEndian.PutUint32(out[28:32], x7) + binary.LittleEndian.PutUint32(out[32:36], x8) + binary.LittleEndian.PutUint32(out[36:40], x9) + binary.LittleEndian.PutUint32(out[40:44], x10) + binary.LittleEndian.PutUint32(out[44:48], x11) + binary.LittleEndian.PutUint32(out[48:52], x12) + binary.LittleEndian.PutUint32(out[52:56], x13) + binary.LittleEndian.PutUint32(out[56:60], x14) + binary.LittleEndian.PutUint32(out[60:64], x15) + } + out = out[BlockSize:] + } + + // Stoping at 2^70 bytes per nonce is the user's responsibility. 
+ ctr := uint64(x[13])<<32 | uint64(x[12]) + ctr++ + x[12] = uint32(ctr) + x[13] = uint32(ctr >> 32) + } +} + +func hChaChaRef(x *[stateSize]uint32, out *[32]byte) { + x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3 + x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11] + + for i := chachaRounds; i > 0; i -= 2 { + // quarterround(x, 0, 4, 8, 12) + x0 += x4 + x12 ^= x0 + x12 = bits.RotateLeft32(x12, 16) + x8 += x12 + x4 ^= x8 + x4 = bits.RotateLeft32(x4, 12) + x0 += x4 + x12 ^= x0 + x12 = bits.RotateLeft32(x12, 8) + x8 += x12 + x4 ^= x8 + x4 = bits.RotateLeft32(x4, 7) + + // quarterround(x, 1, 5, 9, 13) + x1 += x5 + x13 ^= x1 + x13 = bits.RotateLeft32(x13, 16) + x9 += x13 + x5 ^= x9 + x5 = bits.RotateLeft32(x5, 12) + x1 += x5 + x13 ^= x1 + x13 = bits.RotateLeft32(x13, 8) + x9 += x13 + x5 ^= x9 + x5 = bits.RotateLeft32(x5, 7) + + // quarterround(x, 2, 6, 10, 14) + x2 += x6 + x14 ^= x2 + x14 = bits.RotateLeft32(x14, 16) + x10 += x14 + x6 ^= x10 + x6 = bits.RotateLeft32(x6, 12) + x2 += x6 + x14 ^= x2 + x14 = bits.RotateLeft32(x14, 8) + x10 += x14 + x6 ^= x10 + x6 = bits.RotateLeft32(x6, 7) + + // quarterround(x, 3, 7, 11, 15) + x3 += x7 + x15 ^= x3 + x15 = bits.RotateLeft32(x15, 16) + x11 += x15 + x7 ^= x11 + x7 = bits.RotateLeft32(x7, 12) + x3 += x7 + x15 ^= x3 + x15 = bits.RotateLeft32(x15, 8) + x11 += x15 + x7 ^= x11 + x7 = bits.RotateLeft32(x7, 7) + + // quarterround(x, 0, 5, 10, 15) + x0 += x5 + x15 ^= x0 + x15 = bits.RotateLeft32(x15, 16) + x10 += x15 + x5 ^= x10 + x5 = bits.RotateLeft32(x5, 12) + x0 += x5 + x15 ^= x0 + x15 = bits.RotateLeft32(x15, 8) + x10 += x15 + x5 ^= x10 + x5 = bits.RotateLeft32(x5, 7) + + // quarterround(x, 1, 6, 11, 12) + x1 += x6 + x12 ^= x1 + x12 = bits.RotateLeft32(x12, 16) + x11 += x12 + x6 ^= x11 + x6 = bits.RotateLeft32(x6, 12) + x1 += x6 + x12 ^= x1 + x12 = bits.RotateLeft32(x12, 8) + x11 += x12 + x6 ^= x11 + x6 = bits.RotateLeft32(x6, 7) + + // quarterround(x, 2, 7, 8, 13) + x2 += x7 + x13 ^= x2 + x13 = bits.RotateLeft32(x13, 16) + x8 += x13 + x7 ^= x8 + x7 = bits.RotateLeft32(x7, 12) + x2 += x7 + x13 ^= x2 + x13 = bits.RotateLeft32(x13, 8) + x8 += x13 + x7 ^= x8 + x7 = bits.RotateLeft32(x7, 7) + + // quarterround(x, 3, 4, 9, 14) + x3 += x4 + x14 ^= x3 + x14 = bits.RotateLeft32(x14, 16) + x9 += x14 + x4 ^= x9 + x4 = bits.RotateLeft32(x4, 12) + x3 += x4 + x14 ^= x3 + x14 = bits.RotateLeft32(x14, 8) + x9 += x14 + x4 ^= x9 + x4 = bits.RotateLeft32(x4, 7) + } + + // HChaCha returns x0...x3 | x12...x15, which corresponds to the + // indexes of the ChaCha constant and the indexes of the IV. + if useUnsafe { + outArr := (*[16]uint32)(unsafe.Pointer(&out[0])) + outArr[0] = x0 + outArr[1] = x1 + outArr[2] = x2 + outArr[3] = x3 + outArr[4] = x12 + outArr[5] = x13 + outArr[6] = x14 + outArr[7] = x15 + } else { + binary.LittleEndian.PutUint32(out[0:4], x0) + binary.LittleEndian.PutUint32(out[4:8], x1) + binary.LittleEndian.PutUint32(out[8:12], x2) + binary.LittleEndian.PutUint32(out[12:16], x3) + binary.LittleEndian.PutUint32(out[16:20], x12) + binary.LittleEndian.PutUint32(out[20:24], x13) + binary.LittleEndian.PutUint32(out[24:28], x14) + binary.LittleEndian.PutUint32(out[28:32], x15) + } + return +}
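The vendored reference code above unrolls the ChaCha20 quarter-round eight times per double round and advances a 64-bit block counter held in state words 12 and 13. The following standalone Go sketch is not part of the patch or of the vendored package; quarterRound and bumpCounter are illustrative names introduced here. It restates those two pieces in isolation and checks the quarter-round against the test vector from RFC 7539, section 2.1.1.

// quarterround_sketch.go - a minimal standalone sketch of the ChaCha20
// quarter-round and counter handling that the vendored reference code
// above unrolls, using math/bits as the go1.9 variant does.
package main

import (
	"fmt"
	"math/bits"
)

// quarterRound applies the ChaCha quarter-round to four state words.
// The rotation amounts 16, 12, 8, 7 match the unrolled sequences in
// chacha20_ref_go19.go (a=x0, b=x4, c=x8, d=x12 in the first column).
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d ^= a
	d = bits.RotateLeft32(d, 16)
	c += d
	b ^= c
	b = bits.RotateLeft32(b, 12)
	a += b
	d ^= a
	d = bits.RotateLeft32(d, 8)
	c += d
	b ^= c
	b = bits.RotateLeft32(b, 7)
	return a, b, c, d
}

// bumpCounter mirrors how blocksRef advances the 64-bit block counter
// stored in state words 12 (low) and 13 (high) after each block.
func bumpCounter(x *[16]uint32) {
	ctr := uint64(x[13])<<32 | uint64(x[12])
	ctr++
	x[12] = uint32(ctr)
	x[13] = uint32(ctr >> 32)
}

func main() {
	// Quarter-round test vector from RFC 7539, section 2.1.1.
	a, b, c, d := quarterRound(0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567)
	fmt.Printf("%08x %08x %08x %08x\n", a, b, c, d)
	// Expected: ea2a92f4 cb1cf8ce 4581472e 5881c4bb

	// Counter carry from the low word into the high word.
	var x [16]uint32
	x[12] = 0xffffffff
	bumpCounter(&x)
	fmt.Println(x[12], x[13]) // 0 1
}

The same rotation amounts (16, 12, 8, 7) also appear in the AVX2 assembly above, where each rotate is emulated with a VPSLLD/VPSRLD pair followed by VPXOR, since AVX2 has no 32-bit vector rotate instruction.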