diff --git a/CHANGELOG b/CHANGELOG
index 923a68b..17d8c4a 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,4 +1,13 @@
proxy更新日志
+v4.8
+1.优化了SPS连接HTTP上级的指令,避免了某些代理不响应的问题.
+2.SPS功能增加了参数:
+--disable-http:禁用http(s)代理
+--disable-socks:禁用socks代理.
+默认都是false(开启).
+3.重构了部分代码的日志部分,保证了日志按照预期输出.
+
+
v4.7
1.增加了基于gomobile的sdk,对android/ios/windows/linux/mac提供SDK支持.
2.优化了bridge的日志,增加了client和server的掉线日志.
diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json
index 3757dd0..0f6e52d 100644
--- a/Godeps/Godeps.json
+++ b/Godeps/Godeps.json
@@ -1,11 +1,27 @@
{
"ImportPath": "github.com/snail007/goproxy",
- "GoVersion": "go1.9",
+ "GoVersion": "go1.8",
"GodepVersion": "v80",
"Packages": [
"./..."
],
"Deps": [
+ {
+ "ImportPath": "github.com/Yawning/chacha20",
+ "Rev": "e3b1f968fc6397b51d963fee8ec8711a47bc0ce8"
+ },
+ {
+ "ImportPath": "github.com/alecthomas/template",
+ "Rev": "a0175ee3bccc567396460bf5acd36800cb10c49c"
+ },
+ {
+ "ImportPath": "github.com/alecthomas/template/parse",
+ "Rev": "a0175ee3bccc567396460bf5acd36800cb10c49c"
+ },
+ {
+ "ImportPath": "github.com/alecthomas/units",
+ "Rev": "2efee857e7cfd4f3d0138cc3cbb1b4966962b93a"
+ },
{
"ImportPath": "github.com/golang/snappy",
"Rev": "553a641470496b2327abcac10b36396bd98e45c9"
@@ -15,66 +31,15 @@
"Comment": "v1.0.4-1-g40b5202",
"Rev": "40b520211179dbf7eaafaa7fe1ffaa1b7d929ee0"
},
- {
- "ImportPath": "github.com/xtaci/kcp-go",
- "Comment": "v3.19-6-g21da33a",
- "Rev": "21da33a6696d67c1bffb3c954366499d613097a6"
- },
- {
- "ImportPath": "github.com/xtaci/smux",
- "Comment": "v1.0.6",
- "Rev": "ebec7ef2574b42a7088cd7751176483e0a27d458"
- },
- {
- "ImportPath": "golang.org/x/crypto/pbkdf2",
- "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
- },
- {
- "ImportPath": "golang.org/x/crypto/ssh",
- "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
- },
- {
- "ImportPath": "golang.org/x/time/rate",
- "Rev": "6dc17368e09b0e8634d71cac8168d853e869a0c7"
- },
- {
- "ImportPath": "gopkg.in/alecthomas/kingpin.v2",
- "Comment": "v2.2.5",
- "Rev": "1087e65c9441605df944fb12c33f0fe7072d18ca"
- },
- {
- "ImportPath": "golang.org/x/crypto/ed25519",
- "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
- },
- {
- "ImportPath": "golang.org/x/net/ipv4",
- "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
- },
- {
- "ImportPath": "golang.org/x/net/ipv6",
- "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
- },
- {
- "ImportPath": "golang.org/x/crypto/ed25519/internal/edwards25519",
- "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
- },
- {
- "ImportPath": "golang.org/x/net/bpf",
- "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
- },
- {
- "ImportPath": "golang.org/x/net/internal/iana",
- "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
- },
- {
- "ImportPath": "golang.org/x/net/internal/socket",
- "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
- },
{
"ImportPath": "github.com/pkg/errors",
"Comment": "v0.8.0-6-g602255c",
"Rev": "602255cdb6deaf1523ea53ac30eae5554ba7bee9"
},
+ {
+ "ImportPath": "github.com/templexxx/cpufeat",
+ "Rev": "3794dfbfb04749f896b521032f69383f24c3687e"
+ },
{
"ImportPath": "github.com/templexxx/reedsolomon",
"Comment": "0.1.1-4-g7092926",
@@ -90,6 +55,16 @@
"Comment": "v1.0.1-3-g9d99fac",
"Rev": "9d99face20b0dd300b7db50b3f69758de41c096a"
},
+ {
+ "ImportPath": "github.com/xtaci/kcp-go",
+ "Comment": "v3.19-6-g21da33a",
+ "Rev": "21da33a6696d67c1bffb3c954366499d613097a6"
+ },
+ {
+ "ImportPath": "github.com/xtaci/smux",
+ "Comment": "v1.0.6",
+ "Rev": "ebec7ef2574b42a7088cd7751176483e0a27d458"
+ },
{
"ImportPath": "golang.org/x/crypto/blowfish",
"Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
@@ -98,10 +73,34 @@
"ImportPath": "golang.org/x/crypto/cast5",
"Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
},
+ {
+ "ImportPath": "golang.org/x/crypto/curve25519",
+ "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
+ },
+ {
+ "ImportPath": "golang.org/x/crypto/ed25519",
+ "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
+ },
+ {
+ "ImportPath": "golang.org/x/crypto/ed25519/internal/edwards25519",
+ "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
+ },
+ {
+ "ImportPath": "golang.org/x/crypto/pbkdf2",
+ "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
+ },
{
"ImportPath": "golang.org/x/crypto/salsa20",
"Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
},
+ {
+ "ImportPath": "golang.org/x/crypto/salsa20/salsa",
+ "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
+ },
+ {
+ "ImportPath": "golang.org/x/crypto/ssh",
+ "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
+ },
{
"ImportPath": "golang.org/x/crypto/tea",
"Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
@@ -115,28 +114,33 @@
"Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
},
{
- "ImportPath": "github.com/templexxx/cpufeat",
- "Rev": "3794dfbfb04749f896b521032f69383f24c3687e"
+ "ImportPath": "golang.org/x/net/bpf",
+ "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
},
{
- "ImportPath": "golang.org/x/crypto/salsa20/salsa",
- "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
+ "ImportPath": "golang.org/x/net/internal/iana",
+ "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
},
{
- "ImportPath": "golang.org/x/crypto/curve25519",
- "Rev": "0fcca4842a8d74bfddc2c96a073bd2a4d2a7a2e8"
+ "ImportPath": "golang.org/x/net/internal/socket",
+ "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
},
{
- "ImportPath": "github.com/alecthomas/template",
- "Rev": "a0175ee3bccc567396460bf5acd36800cb10c49c"
+ "ImportPath": "golang.org/x/net/ipv4",
+ "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
},
{
- "ImportPath": "github.com/alecthomas/units",
- "Rev": "2efee857e7cfd4f3d0138cc3cbb1b4966962b93a"
+ "ImportPath": "golang.org/x/net/ipv6",
+ "Rev": "5ccada7d0a7ba9aeb5d3aca8d3501b4c2a509fec"
},
{
- "ImportPath": "github.com/alecthomas/template/parse",
- "Rev": "a0175ee3bccc567396460bf5acd36800cb10c49c"
+ "ImportPath": "golang.org/x/time/rate",
+ "Rev": "6dc17368e09b0e8634d71cac8168d853e869a0c7"
+ },
+ {
+ "ImportPath": "gopkg.in/alecthomas/kingpin.v2",
+ "Comment": "v2.2.5",
+ "Rev": "1087e65c9441605df944fb12c33f0fe7072d18ca"
}
]
}
diff --git a/README_ZH.md b/README_ZH.md
index 5f97120..3ecfe94 100644
--- a/README_ZH.md
+++ b/README_ZH.md
@@ -137,7 +137,8 @@ Proxy是golang实现的高性能http,https,websocket,tcp,udp,socks5代理服务
- [6.6 认证功能](#66-认证功能)
- [6.7 自定义加密](#67-自定义加密)
- [6.8 压缩传输](#68-压缩传输)
- - [6.9 查看帮助](#69-查看帮助)
+ - [6.9 禁用协议](#69-禁用协议)
+ - [6.10 查看帮助](#610-查看帮助)
- [7. KCP配置](#7kcp配置)
- [7.1 配置介绍](#71-配置介绍)
- [7.2 详细配置](#72-详细配置)
@@ -956,7 +957,16 @@ proxy的sps代理在tcp之上可以通过自定义加密和tls标准加密以及
这样通过本地代理8080访问网站的时候就是通过与上级压缩传输访问目标网站.
-#### **6.9 查看帮助**
+#### **6.9 禁用协议**
+SPS默认情况下一个端口支持http(s)和socks5两种代理协议,我们可以通过参数禁用某个协议
+比如:
+1.禁用HTTP(S)代理功能只保留SOCKS5代理功能,参数:`--disable-http`.
+`proxy sps -T tcp -P 3.3.3.3:8888 -M -t tcp -p :8080 --disable-http`
+
2.禁用SOCKS5代理功能只保留HTTP(S)代理功能,参数:`--disable-socks`.
`proxy sps -T tcp -P 3.3.3.3:8888 -M -t tcp -p :8080 --disable-socks`
+
+#### **6.10 查看帮助**
`./proxy help sps`
### **7.KCP配置**
diff --git a/config.go b/config.go
index 16e0079..5c2e80a 100755
--- a/config.go
+++ b/config.go
@@ -242,6 +242,11 @@ func initConfig() (err error) {
spsArgs.ParentKey = sps.Flag("parent-key", "the password for auto encrypt/decrypt parent connection data").Short('Z').Default("").String()
spsArgs.LocalCompress = sps.Flag("local-compress", "auto compress/decompress data on local connection").Short('m').Default("false").Bool()
spsArgs.ParentCompress = sps.Flag("parent-compress", "auto compress/decompress data on parent connection").Short('M').Default("false").Bool()
+ spsArgs.SSMethod = sps.Flag("ss-method", "").Hidden().Short('h').Default("aes-256-cfb").String()
+ spsArgs.SSKey = sps.Flag("ss-key", "").Hidden().Short('j').Default("sspassword").String()
+ spsArgs.DisableHTTP = sps.Flag("disable-http", "disable http(s) proxy").Default("false").Bool()
+ spsArgs.DisableSocks5 = sps.Flag("disable-socks", "disable socks proxy").Default("false").Bool()
+ spsArgs.DisableSS = sps.Flag("disable-ss", "").Hidden().Default("false").Bool()
//parse args
serviceName := kingpin.MustParse(app.Parse(os.Args[1:]))
diff --git a/services/args.go b/services/args.go
index 3d097a2..c54c5c8 100644
--- a/services/args.go
+++ b/services/args.go
@@ -232,6 +232,11 @@ type SPSArgs struct {
ParentKey *string
LocalCompress *bool
ParentCompress *bool
+ SSMethod *string
+ SSKey *string
+ DisableHTTP *bool
+ DisableSocks5 *bool
+ DisableSS *bool
}
func (a *SPSArgs) Protocol() string {
diff --git a/services/http.go b/services/http.go
index 5584585..abb8f86 100644
--- a/services/http.go
+++ b/services/http.go
@@ -97,10 +97,10 @@ func (s *HTTP) CheckArgs() (err error) {
func (s *HTTP) InitService() (err error) {
s.InitBasicAuth()
if *s.cfg.Parent != "" {
- s.checker = utils.NewChecker(*s.cfg.HTTPTimeout, int64(*s.cfg.Interval), *s.cfg.Blocked, *s.cfg.Direct)
+ s.checker = utils.NewChecker(*s.cfg.HTTPTimeout, int64(*s.cfg.Interval), *s.cfg.Blocked, *s.cfg.Direct, s.log)
}
if *s.cfg.DNSAddress != "" {
- (*s).domainResolver = utils.NewDomainResolver(*s.cfg.DNSAddress, *s.cfg.DNSTTL)
+ (*s).domainResolver = utils.NewDomainResolver(*s.cfg.DNSAddress, *s.cfg.DNSTTL, s.log)
}
if *s.cfg.ParentType == "ssh" {
err = s.ConnectSSH()
@@ -178,13 +178,13 @@ func (s *HTTP) Start(args interface{}, log *logger.Logger) (err error) {
if addr != "" {
host, port, _ := net.SplitHostPort(addr)
p, _ := strconv.Atoi(port)
- sc := utils.NewServerChannel(host, p)
+ sc := utils.NewServerChannel(host, p, s.log)
if *s.cfg.LocalType == TYPE_TCP {
err = sc.ListenTCP(s.callback)
} else if *s.cfg.LocalType == TYPE_TLS {
err = sc.ListenTls(s.cfg.CertBytes, s.cfg.KeyBytes, s.cfg.CaCertBytes, s.callback)
} else if *s.cfg.LocalType == TYPE_KCP {
- err = sc.ListenKCP(s.cfg.KCP, s.callback)
+ err = sc.ListenKCP(s.cfg.KCP, s.callback, s.log)
}
if err != nil {
return
@@ -215,7 +215,7 @@ func (s *HTTP) callback(inConn net.Conn) {
}
var err interface{}
var req utils.HTTPRequest
- req, err = utils.NewHTTPRequest(&inConn, 4096, s.IsBasicAuth(), &s.basicAuth)
+ req, err = utils.NewHTTPRequest(&inConn, 4096, s.IsBasicAuth(), &s.basicAuth, s.log)
if err != nil {
if err != io.EOF {
s.log.Printf("decoder error , from %s, ERR:%s", inConn.RemoteAddr(), err)
@@ -321,7 +321,7 @@ func (s *HTTP) OutToTCP(useProxy bool, address string, inConn *net.Conn, req *ut
utils.IoBind((*inConn), outConn, func(err interface{}) {
s.log.Printf("conn %s - %s released [%s]", inAddr, outAddr, req.Host)
s.userConns.Remove(inAddr)
- })
+ }, s.log)
s.log.Printf("conn %s - %s connected [%s]", inAddr, outAddr, req.Host)
if c, ok := s.userConns.Get(inAddr); ok {
(*c.(*net.Conn)).Close()
@@ -403,9 +403,9 @@ func (s *HTTP) InitOutConnPool() {
}
func (s *HTTP) InitBasicAuth() (err error) {
if *s.cfg.DNSAddress != "" {
- s.basicAuth = utils.NewBasicAuth(&(*s).domainResolver)
+ s.basicAuth = utils.NewBasicAuth(&(*s).domainResolver, s.log)
} else {
- s.basicAuth = utils.NewBasicAuth(nil)
+ s.basicAuth = utils.NewBasicAuth(nil, s.log)
}
if *s.cfg.AuthURL != "" {
s.basicAuth.SetAuthURL(*s.cfg.AuthURL, *s.cfg.AuthURLOkCode, *s.cfg.AuthURLTimeout, *s.cfg.AuthURLRetry)
diff --git a/services/mux_bridge.go b/services/mux_bridge.go
index 63302a6..ad8921c 100644
--- a/services/mux_bridge.go
+++ b/services/mux_bridge.go
@@ -90,13 +90,13 @@ func (s *MuxBridge) Start(args interface{}, log *logger.Logger) (err error) {
host, port, _ := net.SplitHostPort(*s.cfg.Local)
p, _ := strconv.Atoi(port)
- sc := utils.NewServerChannel(host, p)
+ sc := utils.NewServerChannel(host, p, s.log)
if *s.cfg.LocalType == TYPE_TCP {
err = sc.ListenTCP(s.handler)
} else if *s.cfg.LocalType == TYPE_TLS {
err = sc.ListenTls(s.cfg.CertBytes, s.cfg.KeyBytes, nil, s.handler)
} else if *s.cfg.LocalType == TYPE_KCP {
- err = sc.ListenKCP(s.cfg.KCP, s.handler)
+ err = sc.ListenKCP(s.cfg.KCP, s.handler, s.log)
}
if err != nil {
return
diff --git a/services/mux_client.go b/services/mux_client.go
index 54eed3c..9b3efcb 100644
--- a/services/mux_client.go
+++ b/services/mux_client.go
@@ -300,6 +300,6 @@ func (s *MuxClient) ServeConn(inConn *smux.Stream, localAddr, ID string) {
} else {
utils.IoBind(inConn, outConn, func(err interface{}) {
s.log.Printf("stream %s released", ID)
- })
+ }, s.log)
}
}
diff --git a/services/mux_server.go b/services/mux_server.go
index 4fd9b1e..776429c 100644
--- a/services/mux_server.go
+++ b/services/mux_server.go
@@ -200,7 +200,7 @@ func (s *MuxServer) Start(args interface{}, log *logger.Logger) (err error) {
}
host, port, _ := net.SplitHostPort(*s.cfg.Local)
p, _ := strconv.Atoi(port)
- s.sc = utils.NewServerChannel(host, p)
+ s.sc = utils.NewServerChannel(host, p, s.log)
if *s.cfg.IsUDP {
err = s.sc.ListenUDP(func(packet []byte, localAddr, srcAddr *net.UDPAddr) {
s.udpChn <- MuxUDPItem{
@@ -258,7 +258,7 @@ func (s *MuxServer) Start(args interface{}, log *logger.Logger) (err error) {
} else {
utils.IoBind(inConn, outConn, func(err interface{}) {
s.log.Printf("%s stream %s released", *s.cfg.Key, ID)
- })
+ }, s.log)
}
})
if err != nil {
diff --git a/services/socks.go b/services/socks.go
index d585408..16cccc5 100644
--- a/services/socks.go
+++ b/services/socks.go
@@ -110,9 +110,9 @@ func (s *Socks) CheckArgs() (err error) {
func (s *Socks) InitService() (err error) {
s.InitBasicAuth()
if *s.cfg.DNSAddress != "" {
- (*s).domainResolver = utils.NewDomainResolver(*s.cfg.DNSAddress, *s.cfg.DNSTTL)
+ (*s).domainResolver = utils.NewDomainResolver(*s.cfg.DNSAddress, *s.cfg.DNSTTL, s.log)
}
- s.checker = utils.NewChecker(*s.cfg.Timeout, int64(*s.cfg.Interval), *s.cfg.Blocked, *s.cfg.Direct)
+ s.checker = utils.NewChecker(*s.cfg.Timeout, int64(*s.cfg.Interval), *s.cfg.Blocked, *s.cfg.Direct, s.log)
if *s.cfg.ParentType == "ssh" {
e := s.ConnectSSH()
if e != nil {
@@ -147,7 +147,7 @@ func (s *Socks) InitService() (err error) {
if *s.cfg.ParentType == "ssh" {
s.log.Printf("warn: socks udp not suppored for ssh")
} else {
- s.udpSC = utils.NewServerChannelHost(*s.cfg.UDPLocal)
+ s.udpSC = utils.NewServerChannelHost(*s.cfg.UDPLocal, s.log)
e := s.udpSC.ListenUDP(s.udpCallback)
if e != nil {
err = fmt.Errorf("init udp service fail, ERR: %s", e)
@@ -197,13 +197,13 @@ func (s *Socks) Start(args interface{}, log *logger.Logger) (err error) {
if *s.cfg.UDPParent != "" {
s.log.Printf("use socks udp parent %s", *s.cfg.UDPParent)
}
- sc := utils.NewServerChannelHost(*s.cfg.Local)
+ sc := utils.NewServerChannelHost(*s.cfg.Local, s.log)
if *s.cfg.LocalType == TYPE_TCP {
err = sc.ListenTCP(s.socksConnCallback)
} else if *s.cfg.LocalType == TYPE_TLS {
err = sc.ListenTls(s.cfg.CertBytes, s.cfg.KeyBytes, nil, s.socksConnCallback)
} else if *s.cfg.LocalType == TYPE_KCP {
- err = sc.ListenKCP(s.cfg.KCP, s.socksConnCallback)
+ err = sc.ListenKCP(s.cfg.KCP, s.socksConnCallback, s.log)
}
if err != nil {
return
@@ -557,7 +557,7 @@ func (s *Socks) proxyTCP(inConn *net.Conn, methodReq socks.MethodsRequest, reque
s.log.Printf("conn %s - %s connected", inAddr, request.Addr())
utils.IoBind(*inConn, outConn, func(err interface{}) {
s.log.Printf("conn %s - %s released", inAddr, request.Addr())
- })
+ }, s.log)
if c, ok := s.userConns.Get(inAddr); ok {
(*c.(*net.Conn)).Close()
s.userConns.Remove(inAddr)
@@ -689,9 +689,9 @@ func (s *Socks) ConnectSSH() (err error) {
}
func (s *Socks) InitBasicAuth() (err error) {
if *s.cfg.DNSAddress != "" {
- s.basicAuth = utils.NewBasicAuth(&(*s).domainResolver)
+ s.basicAuth = utils.NewBasicAuth(&(*s).domainResolver, s.log)
} else {
- s.basicAuth = utils.NewBasicAuth(nil)
+ s.basicAuth = utils.NewBasicAuth(nil, s.log)
}
if *s.cfg.AuthURL != "" {
s.basicAuth.SetAuthURL(*s.cfg.AuthURL, *s.cfg.AuthURLOkCode, *s.cfg.AuthURLTimeout, *s.cfg.AuthURLRetry)
diff --git a/services/sps.go b/services/sps.go
index b10aabc..ea0193b 100644
--- a/services/sps.go
+++ b/services/sps.go
@@ -16,6 +16,7 @@ import (
"github.com/snail007/goproxy/utils"
"github.com/snail007/goproxy/utils/conncrypt"
"github.com/snail007/goproxy/utils/socks"
+ "github.com/snail007/goproxy/utils/ss"
)
type SPS struct {
@@ -26,6 +27,7 @@ type SPS struct {
serverChannels []*utils.ServerChannel
userConns utils.ConcurrentMap
log *logger.Logger
+ cipher *ss.Cipher
}
func NewSPS() Service {
@@ -64,9 +66,16 @@ func (s *SPS) CheckArgs() (err error) {
func (s *SPS) InitService() (err error) {
s.InitOutConnPool()
if *s.cfg.DNSAddress != "" {
- (*s).domainResolver = utils.NewDomainResolver(*s.cfg.DNSAddress, *s.cfg.DNSTTL)
+ (*s).domainResolver = utils.NewDomainResolver(*s.cfg.DNSAddress, *s.cfg.DNSTTL, s.log)
}
err = s.InitBasicAuth()
+ if *s.cfg.SSMethod != "" && *s.cfg.SSKey != "" {
+ s.cipher, err = ss.NewCipher(*s.cfg.SSMethod, *s.cfg.SSKey)
+ if err != nil {
+ s.log.Printf("error generating cipher : %s", err)
+ return
+ }
+ }
return
}
func (s *SPS) InitOutConnPool() {
@@ -102,7 +111,12 @@ func (s *SPS) StopService() {
}
}
for _, c := range s.userConns.Items() {
- (*c.(*net.Conn)).Close()
+ if _,ok:=c.(*net.Conn);ok{
+ (*c.(*net.Conn)).Close()
+ }
+ if _,ok:=c.(**net.Conn);ok{
+ (*(*c.(**net.Conn))).Close()
+ }
}
}
func (s *SPS) Start(args interface{}, log *logger.Logger) (err error) {
@@ -119,13 +133,13 @@ func (s *SPS) Start(args interface{}, log *logger.Logger) (err error) {
if addr != "" {
host, port, _ := net.SplitHostPort(*s.cfg.Local)
p, _ := strconv.Atoi(port)
- sc := utils.NewServerChannel(host, p)
+ sc := utils.NewServerChannel(host, p, s.log)
if *s.cfg.LocalType == TYPE_TCP {
err = sc.ListenTCP(s.callback)
} else if *s.cfg.LocalType == TYPE_TLS {
err = sc.ListenTls(s.cfg.CertBytes, s.cfg.KeyBytes, s.cfg.CaCertBytes, s.callback)
} else if *s.cfg.LocalType == TYPE_KCP {
- err = sc.ListenKCP(s.cfg.KCP, s.callback)
+ err = sc.ListenKCP(s.cfg.KCP, s.callback, s.log)
}
if err != nil {
return
@@ -171,39 +185,60 @@ func (s *SPS) callback(inConn net.Conn) {
}
}
func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
- buf := make([]byte, 1024)
- n, err := (*inConn).Read(buf)
- header := buf[:n]
+ bInConn := utils.NewBufferedConn(*inConn)
+ //important
+ //action read will regist read event to system,
+ //when data arrived , system call process
+ //so that we can get buffered bytes count
+ //otherwise Buffered() always return 0
+ bInConn.ReadByte()
+ bInConn.UnreadByte()
+
+ n := 8
+ if n > bInConn.Buffered() {
+ n = bInConn.Buffered()
+ }
+ h, err := bInConn.Peek(n)
if err != nil {
- s.log.Printf("ERR:%s", err)
- utils.CloseConn(inConn)
+ s.log.Printf("peek error %s ", err)
+ (*inConn).Close()
return
}
+
+ *inConn = bInConn
address := ""
var auth socks.Auth
var forwardBytes []byte
//fmt.Printf("%v", header)
- if header[0] == socks.VERSION_V5 {
+ if utils.IsSocks5(h) {
+ if *s.cfg.DisableSocks5 {
+ (*inConn).Close()
+ return
+ }
//socks5 server
var serverConn *socks.ServerConn
if s.IsBasicAuth() {
- serverConn = socks.NewServerConn(inConn, time.Millisecond*time.Duration(*s.cfg.Timeout), &s.basicAuth, "", header)
+ serverConn = socks.NewServerConn(inConn, time.Millisecond*time.Duration(*s.cfg.Timeout), &s.basicAuth, "", nil)
} else {
- serverConn = socks.NewServerConn(inConn, time.Millisecond*time.Duration(*s.cfg.Timeout), nil, "", header)
+ serverConn = socks.NewServerConn(inConn, time.Millisecond*time.Duration(*s.cfg.Timeout), nil, "", nil)
}
if err = serverConn.Handshake(); err != nil {
return
}
address = serverConn.Target()
auth = serverConn.AuthData()
- } else if bytes.IndexByte(header, '\n') != -1 {
+ } else if utils.IsHTTP(h) {
+ if *s.cfg.DisableHTTP {
+ (*inConn).Close()
+ return
+ }
//http
var request utils.HTTPRequest
(*inConn).SetDeadline(time.Now().Add(time.Millisecond * time.Duration(*s.cfg.Timeout)))
if s.IsBasicAuth() {
- request, err = utils.NewHTTPRequest(inConn, 1024, true, &s.basicAuth, header)
+ request, err = utils.NewHTTPRequest(inConn, 1024, true, &s.basicAuth, s.log)
} else {
- request, err = utils.NewHTTPRequest(inConn, 1024, false, nil, header)
+ request, err = utils.NewHTTPRequest(inConn, 1024, false, nil, s.log)
}
(*inConn).SetDeadline(time.Time{})
if err != nil {
@@ -211,12 +246,13 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
utils.CloseConn(inConn)
return
}
- if len(header) >= 7 && strings.ToLower(string(header[:7])) == "connect" {
+ if len(h) >= 7 && strings.ToLower(string(h[:7])) == "connect" {
//https
request.HTTPSReply()
//s.log.Printf("https reply: %s", request.Host)
} else {
- forwardBytes = request.HeadBuf
+ //forwardBytes = bytes.TrimRight(request.HeadBuf,"\r\n")
+ forwardBytes = bytes.TrimRight(request.HeadBuf,"\r\n")
}
address = request.Host
var userpass string
@@ -231,7 +267,28 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
}
}
} else {
- s.log.Printf("unknown request from: %s,%s", (*inConn).RemoteAddr(), string(header))
+ //ss
+ if *s.cfg.DisableSS {
+ (*inConn).Close()
+ return
+ }
+ (*inConn).SetDeadline(time.Now().Add(time.Second * 5))
+ ssConn := ss.NewConn(*inConn, s.cipher.Copy())
+ address, err = ss.GetRequest(ssConn)
+ (*inConn).SetDeadline(time.Time{})
+ if err != nil {
+ return
+ }
+ // ensure the host does not contain some illegal characters, NUL may panic on Win32
+ if strings.ContainsRune(address, 0x00) {
+ err = errors.New("invalid domain name")
+ return
+ }
+ *inConn = ssConn
+ }
+ if err != nil || address == "" {
+ s.log.Printf("unknown request from: %s,%s", (*inConn).RemoteAddr(), string(h))
+ (*inConn).Close()
utils.CloseConn(inConn)
err = errors.New("unknown request")
return
@@ -255,8 +312,16 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
//ask parent for connect to target address
if *s.cfg.ParentServiceType == "http" {
//http parent
+ isHTTPS:=false
pb := new(bytes.Buffer)
- pb.Write([]byte(fmt.Sprintf("CONNECT %s HTTP/1.1\r\nProxy-Connection: Keep-Alive\r\n", address)))
+ if len(forwardBytes) == 0 {
+ isHTTPS=true
+ pb.Write([]byte(fmt.Sprintf("CONNECT %s HTTP/1.1", address)))
+ }
+ pb.WriteString("\r\nProxy-Connection: Keep-Alive\r\n")
+
+ s.log.Printf("before bytes:%s",string(forwardBytes))
+
//Proxy-Authorization:\r\n
u := ""
if *s.cfg.ParentAuth != "" {
@@ -274,7 +339,22 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
if u != "" {
pb.Write([]byte(fmt.Sprintf("Proxy-Authorization:Basic %s\r\n", base64.StdEncoding.EncodeToString([]byte(u)))))
}
- pb.Write([]byte("\r\n"))
+
+ if isHTTPS{
+ pb.Write([]byte("\r\n"))
+ }else{
+ //do remove some headers with forwardBytes
+ //forwardBytes
+
+ forwardBytes=bytes.Replace(forwardBytes,[]byte("\r\n"),[]byte(pb.Bytes()),1)
+ pb.Reset()
+ pb.Write(forwardBytes)
+ if !bytes.Contains(forwardBytes,[]byte("\r\n\r\n\r\n")){
+ pb.Write([]byte("\r\n\r\n"))
+ }
+ forwardBytes=nil
+ }
+
outConn.SetDeadline(time.Now().Add(time.Millisecond * time.Duration(*s.cfg.Timeout)))
_, err = outConn.Write(pb.Bytes())
outConn.SetDeadline(time.Time{})
@@ -284,17 +364,23 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
utils.CloseConn(&outConn)
return
}
- reply := make([]byte, 1024)
- outConn.SetDeadline(time.Now().Add(time.Millisecond * time.Duration(*s.cfg.Timeout)))
- _, err = outConn.Read(reply)
- outConn.SetDeadline(time.Time{})
- if err != nil {
- s.log.Printf("read reply from %s , err:%s", *s.cfg.Parent, err)
- utils.CloseConn(inConn)
- utils.CloseConn(&outConn)
- return
+
+ s.log.Printf("ishttps:%v",isHTTPS)
+ s.log.Printf("bytes:%s",string(pb.Bytes()))
+
+ if isHTTPS{
+ reply := make([]byte, 1024)
+ outConn.SetDeadline(time.Now().Add(time.Millisecond * time.Duration(*s.cfg.Timeout)))
+ _, err = outConn.Read(reply)
+ outConn.SetDeadline(time.Time{})
+ if err != nil {
+ s.log.Printf("read reply from %s , err:%s", *s.cfg.Parent, err)
+ utils.CloseConn(inConn)
+ utils.CloseConn(&outConn)
+ return
+ }
+ //s.log.Printf("reply: %s", string(reply[:n]))
}
- //s.log.Printf("reply: %s", string(reply[:n]))
} else {
s.log.Printf("connect %s", address)
//socks client
@@ -305,12 +391,12 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
err = fmt.Errorf("parent auth data format error")
return
}
- clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), &socks.Auth{User: a[0], Password: a[1]}, header)
+ clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), &socks.Auth{User: a[0], Password: a[1]}, nil)
} else {
if !s.IsBasicAuth() && auth.Password != "" && auth.User != "" {
- clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), &auth, header)
+ clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), &auth, nil)
} else {
- clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), nil, header)
+ clientConn = socks.NewClientConn(&outConn, "tcp", address, time.Millisecond*time.Duration(*s.cfg.Timeout), nil, nil)
}
}
if err = clientConn.Handshake(); err != nil {
@@ -327,7 +413,7 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
utils.IoBind((*inConn), outConn, func(err interface{}) {
s.log.Printf("conn %s - %s released", inAddr, outAddr)
s.userConns.Remove(inAddr)
- })
+ }, s.log)
s.log.Printf("conn %s - %s connected", inAddr, outAddr)
if c, ok := s.userConns.Get(inAddr); ok {
(*c.(*net.Conn)).Close()
@@ -337,9 +423,9 @@ func (s *SPS) OutToTCP(inConn *net.Conn) (err error) {
}
func (s *SPS) InitBasicAuth() (err error) {
if *s.cfg.DNSAddress != "" {
- s.basicAuth = utils.NewBasicAuth(&(*s).domainResolver)
+ s.basicAuth = utils.NewBasicAuth(&(*s).domainResolver, s.log)
} else {
- s.basicAuth = utils.NewBasicAuth(nil)
+ s.basicAuth = utils.NewBasicAuth(nil, s.log)
}
if *s.cfg.AuthURL != "" {
s.basicAuth.SetAuthURL(*s.cfg.AuthURL, *s.cfg.AuthURLOkCode, *s.cfg.AuthURLTimeout, *s.cfg.AuthURLRetry)
diff --git a/services/tcp.go b/services/tcp.go
index 52d436c..eab2830 100644
--- a/services/tcp.go
+++ b/services/tcp.go
@@ -84,14 +84,14 @@ func (s *TCP) Start(args interface{}, log *logger.Logger) (err error) {
s.log.Printf("use %s parent %s", *s.cfg.ParentType, *s.cfg.Parent)
host, port, _ := net.SplitHostPort(*s.cfg.Local)
p, _ := strconv.Atoi(port)
- sc := utils.NewServerChannel(host, p)
+ sc := utils.NewServerChannel(host, p, s.log)
if *s.cfg.LocalType == TYPE_TCP {
err = sc.ListenTCP(s.callback)
} else if *s.cfg.LocalType == TYPE_TLS {
err = sc.ListenTls(s.cfg.CertBytes, s.cfg.KeyBytes, nil, s.callback)
} else if *s.cfg.LocalType == TYPE_KCP {
- err = sc.ListenKCP(s.cfg.KCP, s.callback)
+ err = sc.ListenKCP(s.cfg.KCP, s.callback, s.log)
}
if err != nil {
return
@@ -143,7 +143,7 @@ func (s *TCP) OutToTCP(inConn *net.Conn) (err error) {
utils.IoBind((*inConn), outConn, func(err interface{}) {
s.log.Printf("conn %s - %s released", inAddr, outAddr)
s.userConns.Remove(inAddr)
- })
+ }, s.log)
s.log.Printf("conn %s - %s connected", inAddr, outAddr)
if c, ok := s.userConns.Get(inAddr); ok {
(*c.(*net.Conn)).Close()
diff --git a/services/tunnel_bridge.go b/services/tunnel_bridge.go
index 952286d..333523d 100644
--- a/services/tunnel_bridge.go
+++ b/services/tunnel_bridge.go
@@ -73,7 +73,7 @@ func (s *TunnelBridge) Start(args interface{}, log *logger.Logger) (err error) {
}
host, port, _ := net.SplitHostPort(*s.cfg.Local)
p, _ := strconv.Atoi(port)
- sc := utils.NewServerChannel(host, p)
+ sc := utils.NewServerChannel(host, p, s.log)
err = sc.ListenTls(s.cfg.CertBytes, s.cfg.KeyBytes, nil, s.callback)
if err != nil {
@@ -173,7 +173,7 @@ func (s *TunnelBridge) callback(inConn net.Conn) {
// s.cmClient.RemoveOne(key, ID)
// s.cmServer.RemoveOne(serverID, ID)
s.log.Printf("conn %s released", ID)
- })
+ }, s.log)
// s.cmClient.Add(key, ID, &inConn)
s.log.Printf("conn %s created", ID)
diff --git a/services/tunnel_client.go b/services/tunnel_client.go
index ef82024..29af4be 100644
--- a/services/tunnel_client.go
+++ b/services/tunnel_client.go
@@ -284,7 +284,7 @@ func (s *TunnelClient) ServeConn(localAddr, ID, serverID string) {
utils.IoBind(inConn, outConn, func(err interface{}) {
s.log.Printf("conn %s released", ID)
s.userConns.Remove(inAddr)
- })
+ }, s.log)
if c, ok := s.userConns.Get(inAddr); ok {
(*c.(*net.Conn)).Close()
}
diff --git a/services/tunnel_server.go b/services/tunnel_server.go
index 3e84199..6c4086e 100644
--- a/services/tunnel_server.go
+++ b/services/tunnel_server.go
@@ -206,7 +206,7 @@ func (s *TunnelServer) Start(args interface{}, log *logger.Logger) (err error) {
}
host, port, _ := net.SplitHostPort(*s.cfg.Local)
p, _ := strconv.Atoi(port)
- s.sc = utils.NewServerChannel(host, p)
+ s.sc = utils.NewServerChannel(host, p, s.log)
if *s.cfg.IsUDP {
err = s.sc.ListenUDP(func(packet []byte, localAddr, srcAddr *net.UDPAddr) {
s.udpChn <- UDPItem{
@@ -246,7 +246,7 @@ func (s *TunnelServer) Start(args interface{}, log *logger.Logger) (err error) {
utils.IoBind(inConn, outConn, func(err interface{}) {
s.userConns.Remove(inAddr)
s.log.Printf("%s conn %s released", *s.cfg.Key, ID)
- })
+ }, s.log)
if c, ok := s.userConns.Get(inAddr); ok {
(*c.(*net.Conn)).Close()
}
diff --git a/services/udp.go b/services/udp.go
index 240eb57..57cdd59 100644
--- a/services/udp.go
+++ b/services/udp.go
@@ -84,7 +84,7 @@ func (s *UDP) Start(args interface{}, log *logger.Logger) (err error) {
}
host, port, _ := net.SplitHostPort(*s.cfg.Local)
p, _ := strconv.Atoi(port)
- sc := utils.NewServerChannel(host, p)
+ sc := utils.NewServerChannel(host, p, s.log)
s.sc = &sc
err = sc.ListenUDP(s.callback)
if err != nil {
diff --git a/utils/functions.go b/utils/functions.go
index b5fd408..21ca84f 100755
--- a/utils/functions.go
+++ b/utils/functions.go
@@ -10,27 +10,29 @@ import (
"encoding/pem"
"errors"
"fmt"
- "github.com/snail007/goproxy/services/kcpcfg"
"io"
"io/ioutil"
- "log"
+ logger "log"
"math/rand"
"net"
"net/http"
"os"
"os/exec"
+ "github.com/snail007/goproxy/services/kcpcfg"
+
"golang.org/x/crypto/pbkdf2"
- "github.com/snail007/goproxy/utils/id"
"strconv"
"strings"
"time"
+ "github.com/snail007/goproxy/utils/id"
+
kcp "github.com/xtaci/kcp-go"
)
-func IoBind(dst io.ReadWriteCloser, src io.ReadWriteCloser, fn func(err interface{})) {
+func IoBind(dst io.ReadWriteCloser, src io.ReadWriteCloser, fn func(err interface{}), log *logger.Logger) {
go func() {
defer func() {
if err := recover(); err != nil {
@@ -68,7 +70,9 @@ func IoBind(dst io.ReadWriteCloser, src io.ReadWriteCloser, fn func(err interfac
}
src.Close()
dst.Close()
- fn(err)
+ if fn != nil {
+ fn(err)
+ }
}()
}
func ioCopy(dst io.ReadWriter, src io.ReadWriter) (err error) {
@@ -171,33 +175,6 @@ func ConnectKCPHost(hostAndPort string, config kcpcfg.KCPConfigArgs) (conn net.C
return NewCompStream(kcpconn), err
}
-func ListenTls(ip string, port int, certBytes, keyBytes, caCertBytes []byte) (ln *net.Listener, err error) {
-
- var cert tls.Certificate
- cert, err = tls.X509KeyPair(certBytes, keyBytes)
- if err != nil {
- return
- }
- clientCertPool := x509.NewCertPool()
- caBytes := certBytes
- if caCertBytes != nil {
- caBytes = caCertBytes
- }
- ok := clientCertPool.AppendCertsFromPEM(caBytes)
- if !ok {
- err = errors.New("failed to parse root certificate")
- }
- config := &tls.Config{
- ClientCAs: clientCertPool,
- Certificates: []tls.Certificate{cert},
- ClientAuth: tls.RequireAndVerifyClientCert,
- }
- _ln, err := tls.Listen("tcp", fmt.Sprintf("%s:%d", ip, port), config)
- if err == nil {
- ln = &_ln
- }
- return
-}
func PathExists(_path string) bool {
_, err := os.Stat(_path)
if err != nil && os.IsNotExist(err) {
@@ -245,7 +222,7 @@ func Keygen() (err error) {
cmd := exec.Command("sh", "-c", "openssl genrsa -out ca.key 2048")
out, err = cmd.CombinedOutput()
if err != nil {
- log.Printf("err:%s", err)
+ logger.Printf("err:%s", err)
return
}
fmt.Println(string(out))
@@ -254,7 +231,7 @@ func Keygen() (err error) {
cmd = exec.Command("sh", "-c", cmdStr)
out, err = cmd.CombinedOutput()
if err != nil {
- log.Printf("err:%s", err)
+ logger.Printf("err:%s", err)
return
}
fmt.Println(string(out))
@@ -273,7 +250,7 @@ func Keygen() (err error) {
cmd := exec.Command("sh", "-c", "openssl genrsa -out "+name+".key 2048")
out, err = cmd.CombinedOutput()
if err != nil {
- log.Printf("err:%s", err)
+ logger.Printf("err:%s", err)
return
}
fmt.Println(string(out))
@@ -283,7 +260,7 @@ func Keygen() (err error) {
cmd = exec.Command("sh", "-c", cmdStr)
out, err = cmd.CombinedOutput()
if err != nil {
- log.Printf("err:%s", err)
+ logger.Printf("err:%s", err)
return
}
fmt.Println(string(out))
@@ -293,7 +270,7 @@ func Keygen() (err error) {
cmd = exec.Command("sh", "-c", cmdStr)
out, err = cmd.CombinedOutput()
if err != nil {
- log.Printf("err:%s", err)
+ logger.Printf("err:%s", err)
return
}
@@ -307,7 +284,7 @@ proxy keygen ca client0 30 //generate client0.crt client0.key and use ca.crt sig
cmd := exec.Command("sh", "-c", "openssl genrsa -out proxy.key 2048")
out, err = cmd.CombinedOutput()
if err != nil {
- log.Printf("err:%s", err)
+ logger.Printf("err:%s", err)
return
}
fmt.Println(string(out))
@@ -316,7 +293,7 @@ proxy keygen ca client0 30 //generate client0.crt client0.key and use ca.crt sig
cmd = exec.Command("sh", "-c", cmdStr)
out, err = cmd.CombinedOutput()
if err != nil {
- log.Printf("err:%s", err)
+ logger.Printf("err:%s", err)
return
}
fmt.Println(string(out))
@@ -624,6 +601,26 @@ func IsIternalIP(domainOrIP string) bool {
}
return false
}
+func IsHTTP(head []byte) bool {
+ keys := []string{"GET", "HEAD", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"}
+ for _, key := range keys {
+ if bytes.HasPrefix(head, []byte(key)) || bytes.HasPrefix(head, []byte(strings.ToLower(key))) {
+ return true
+ }
+ }
+ return false
+}
+func IsSocks5(head []byte) bool {
+ if len(head) < 3 {
+ return false
+ }
+ if head[0] == uint8(0x05) && 0 < int(head[1]) && int(head[1]) < 255 {
+ if len(head) == 2+int(head[1]) {
+ return true
+ }
+ }
+ return false
+}
// type sockaddr struct {
// family uint16
diff --git a/utils/serve-channel.go b/utils/serve-channel.go
index aefe543..80d6fcc 100644
--- a/utils/serve-channel.go
+++ b/utils/serve-channel.go
@@ -1,13 +1,17 @@
package utils
import (
+ "crypto/tls"
+ "crypto/x509"
+ "errors"
"fmt"
- "github.com/snail007/goproxy/services/kcpcfg"
- "log"
+ logger "log"
"net"
"runtime/debug"
"strconv"
+ "github.com/snail007/goproxy/services/kcpcfg"
+
kcp "github.com/xtaci/kcp-go"
)
@@ -17,23 +21,26 @@ type ServerChannel struct {
Listener *net.Listener
UDPListener *net.UDPConn
errAcceptHandler func(err error)
+ log *logger.Logger
}
-func NewServerChannel(ip string, port int) ServerChannel {
+func NewServerChannel(ip string, port int, log *logger.Logger) ServerChannel {
return ServerChannel{
ip: ip,
port: port,
+ log: log,
errAcceptHandler: func(err error) {
log.Printf("accept error , ERR:%s", err)
},
}
}
-func NewServerChannelHost(host string) ServerChannel {
+func NewServerChannelHost(host string, log *logger.Logger) ServerChannel {
h, port, _ := net.SplitHostPort(host)
p, _ := strconv.Atoi(port)
return ServerChannel{
ip: h,
port: p,
+ log: log,
errAcceptHandler: func(err error) {
log.Printf("accept error , ERR:%s", err)
},
@@ -43,12 +50,12 @@ func (sc *ServerChannel) SetErrAcceptHandler(fn func(err error)) {
sc.errAcceptHandler = fn
}
func (sc *ServerChannel) ListenTls(certBytes, keyBytes, caCertBytes []byte, fn func(conn net.Conn)) (err error) {
- sc.Listener, err = ListenTls(sc.ip, sc.port, certBytes, keyBytes, caCertBytes)
+ sc.Listener, err = sc.listenTls(sc.ip, sc.port, certBytes, keyBytes, caCertBytes)
if err == nil {
go func() {
defer func() {
if e := recover(); e != nil {
- log.Printf("ListenTls crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
+ sc.log.Printf("ListenTls crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
}
}()
for {
@@ -58,7 +65,7 @@ func (sc *ServerChannel) ListenTls(certBytes, keyBytes, caCertBytes []byte, fn f
go func() {
defer func() {
if e := recover(); e != nil {
- log.Printf("tls connection handler crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
+ sc.log.Printf("tls connection handler crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
}
}()
fn(conn)
@@ -73,7 +80,33 @@ func (sc *ServerChannel) ListenTls(certBytes, keyBytes, caCertBytes []byte, fn f
}
return
}
+func (sc *ServerChannel) listenTls(ip string, port int, certBytes, keyBytes, caCertBytes []byte) (ln *net.Listener, err error) {
+ var cert tls.Certificate
+ cert, err = tls.X509KeyPair(certBytes, keyBytes)
+ if err != nil {
+ return
+ }
+ clientCertPool := x509.NewCertPool()
+ caBytes := certBytes
+ if caCertBytes != nil {
+ caBytes = caCertBytes
+ }
+ ok := clientCertPool.AppendCertsFromPEM(caBytes)
+ if !ok {
+ err = errors.New("failed to parse root certificate")
+ }
+ config := &tls.Config{
+ ClientCAs: clientCertPool,
+ Certificates: []tls.Certificate{cert},
+ ClientAuth: tls.RequireAndVerifyClientCert,
+ }
+ _ln, err := tls.Listen("tcp", fmt.Sprintf("%s:%d", ip, port), config)
+ if err == nil {
+ ln = &_ln
+ }
+ return
+}
func (sc *ServerChannel) ListenTCP(fn func(conn net.Conn)) (err error) {
var l net.Listener
l, err = net.Listen("tcp", fmt.Sprintf("%s:%d", sc.ip, sc.port))
@@ -82,7 +115,7 @@ func (sc *ServerChannel) ListenTCP(fn func(conn net.Conn)) (err error) {
go func() {
defer func() {
if e := recover(); e != nil {
- log.Printf("ListenTCP crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
+ sc.log.Printf("ListenTCP crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
}
}()
for {
@@ -92,7 +125,7 @@ func (sc *ServerChannel) ListenTCP(fn func(conn net.Conn)) (err error) {
go func() {
defer func() {
if e := recover(); e != nil {
- log.Printf("tcp connection handler crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
+ sc.log.Printf("tcp connection handler crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
}
}()
fn(conn)
@@ -114,7 +147,7 @@ func (sc *ServerChannel) ListenUDP(fn func(packet []byte, localAddr, srcAddr *ne
go func() {
defer func() {
if e := recover(); e != nil {
- log.Printf("ListenUDP crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
+ sc.log.Printf("ListenUDP crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
}
}()
for {
@@ -125,7 +158,7 @@ func (sc *ServerChannel) ListenUDP(fn func(packet []byte, localAddr, srcAddr *ne
go func() {
defer func() {
if e := recover(); e != nil {
- log.Printf("udp data handler crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
+ sc.log.Printf("udp data handler crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
}
}()
fn(packet, addr, srcAddr)
@@ -139,7 +172,7 @@ func (sc *ServerChannel) ListenUDP(fn func(packet []byte, localAddr, srcAddr *ne
}
return
}
-func (sc *ServerChannel) ListenKCP(config kcpcfg.KCPConfigArgs, fn func(conn net.Conn)) (err error) {
+func (sc *ServerChannel) ListenKCP(config kcpcfg.KCPConfigArgs, fn func(conn net.Conn), log *logger.Logger) (err error) {
lis, err := kcp.ListenWithOptions(fmt.Sprintf("%s:%d", sc.ip, sc.port), config.Block, *config.DataShard, *config.ParityShard)
if err == nil {
if err = lis.SetDSCP(*config.DSCP); err != nil {
@@ -159,7 +192,7 @@ func (sc *ServerChannel) ListenKCP(config kcpcfg.KCPConfigArgs, fn func(conn net
go func() {
defer func() {
if e := recover(); e != nil {
- log.Printf("ListenKCP crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
+ sc.log.Printf("ListenKCP crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
}
}()
for {
@@ -169,7 +202,7 @@ func (sc *ServerChannel) ListenKCP(config kcpcfg.KCPConfigArgs, fn func(conn net
go func() {
defer func() {
if e := recover(); e != nil {
- log.Printf("kcp connection handler crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
+ sc.log.Printf("kcp connection handler crashed , err : %s , \ntrace:%s", e, string(debug.Stack()))
}
}()
conn.SetStreamMode(true)
diff --git a/utils/ss/conn.go b/utils/ss/conn.go
new file mode 100644
index 0000000..46513f2
--- /dev/null
+++ b/utils/ss/conn.go
@@ -0,0 +1,186 @@
+package ss
+
+import (
+ "encoding/binary"
+ "fmt"
+ "io"
+ "net"
+ "strconv"
+)
+
+const (
+ OneTimeAuthMask byte = 0x10
+ AddrMask byte = 0xf
+)
+
+type Conn struct {
+ net.Conn
+ *Cipher
+ readBuf []byte
+ writeBuf []byte
+ chunkId uint32
+}
+
+func NewConn(c net.Conn, cipher *Cipher) *Conn {
+ return &Conn{
+ Conn: c,
+ Cipher: cipher,
+ readBuf: leakyBuf.Get(),
+ writeBuf: leakyBuf.Get()}
+}
+
+func (c *Conn) Close() error {
+ leakyBuf.Put(c.readBuf)
+ leakyBuf.Put(c.writeBuf)
+ return c.Conn.Close()
+}
+
+func RawAddr(addr string) (buf []byte, err error) {
+ host, portStr, err := net.SplitHostPort(addr)
+ if err != nil {
+ return nil, fmt.Errorf("ss: address error %s %v", addr, err)
+ }
+ port, err := strconv.Atoi(portStr)
+ if err != nil {
+ return nil, fmt.Errorf("ss: invalid port %s", addr)
+ }
+
+ hostLen := len(host)
+ l := 1 + 1 + hostLen + 2 // addrType + lenByte + address + port
+ buf = make([]byte, l)
+ buf[0] = 3 // 3 means the address is domain name
+ buf[1] = byte(hostLen) // host address length followed by host address
+ copy(buf[2:], host)
+ binary.BigEndian.PutUint16(buf[2+hostLen:2+hostLen+2], uint16(port))
+ return
+}
+
+// This is intended for use by users implementing a local socks proxy.
+// rawaddr should contain part of the data in socks request, starting from the
+// ATYP field. (Refer to rfc1928 for more information.)
+func DialWithRawAddr(rawaddr []byte, server string, cipher *Cipher) (c *Conn, err error) {
+ conn, err := net.Dial("tcp", server)
+ if err != nil {
+ return
+ }
+ c = NewConn(conn, cipher)
+ if cipher.ota {
+ if c.enc == nil {
+ if _, err = c.initEncrypt(); err != nil {
+ return
+ }
+ }
+ // since we have initEncrypt, we must send iv manually
+ conn.Write(cipher.iv)
+ rawaddr[0] |= OneTimeAuthMask
+ rawaddr = otaConnectAuth(cipher.iv, cipher.key, rawaddr)
+ }
+ if _, err = c.write(rawaddr); err != nil {
+ c.Close()
+ return nil, err
+ }
+ return
+}
+
+// addr should be in the form of host:port
+func Dial(addr, server string, cipher *Cipher) (c *Conn, err error) {
+ ra, err := RawAddr(addr)
+ if err != nil {
+ return
+ }
+ return DialWithRawAddr(ra, server, cipher)
+}
+
+func (c *Conn) GetIv() (iv []byte) {
+ iv = make([]byte, len(c.iv))
+ copy(iv, c.iv)
+ return
+}
+
+func (c *Conn) GetKey() (key []byte) {
+ key = make([]byte, len(c.key))
+ copy(key, c.key)
+ return
+}
+
+func (c *Conn) IsOta() bool {
+ return c.ota
+}
+
+func (c *Conn) GetAndIncrChunkId() (chunkId uint32) {
+ chunkId = c.chunkId
+ c.chunkId += 1
+ return
+}
+
+func (c *Conn) Read(b []byte) (n int, err error) {
+ if c.dec == nil {
+ iv := make([]byte, c.info.ivLen)
+ if _, err = io.ReadFull(c.Conn, iv); err != nil {
+ return
+ }
+ if err = c.initDecrypt(iv); err != nil {
+ return
+ }
+ if len(c.iv) == 0 {
+ c.iv = iv
+ }
+ }
+
+ cipherData := c.readBuf
+ if len(b) > len(cipherData) {
+ cipherData = make([]byte, len(b))
+ } else {
+ cipherData = cipherData[:len(b)]
+ }
+
+ n, err = c.Conn.Read(cipherData)
+ if n > 0 {
+ c.decrypt(b[0:n], cipherData[0:n])
+ }
+ return
+}
+
+func (c *Conn) Write(b []byte) (n int, err error) {
+ nn := len(b)
+ if c.ota {
+ chunkId := c.GetAndIncrChunkId()
+ b = otaReqChunkAuth(c.iv, chunkId, b)
+ }
+ headerLen := len(b) - nn
+
+ n, err = c.write(b)
+	// Make sure 0 <= n <= len(b), where b is the slice passed in.
+ if n >= headerLen {
+ n -= headerLen
+ }
+ return
+}
+
+func (c *Conn) write(b []byte) (n int, err error) {
+ var iv []byte
+ if c.enc == nil {
+ iv, err = c.initEncrypt()
+ if err != nil {
+ return
+ }
+ }
+
+ cipherData := c.writeBuf
+ dataSize := len(b) + len(iv)
+ if dataSize > len(cipherData) {
+ cipherData = make([]byte, dataSize)
+ } else {
+ cipherData = cipherData[:dataSize]
+ }
+
+ if iv != nil {
+ // Put initialization vector in buffer, do a single write to send both
+ // iv and data.
+ copy(cipherData, iv)
+ }
+
+ c.encrypt(cipherData[len(iv):], b)
+ n, err = c.Conn.Write(cipherData)
+ return
+}
diff --git a/utils/ss/encrypt.go b/utils/ss/encrypt.go
new file mode 100644
index 0000000..6553aa0
--- /dev/null
+++ b/utils/ss/encrypt.go
@@ -0,0 +1,274 @@
+package ss
+
+import (
+ "crypto/aes"
+ "crypto/cipher"
+ "crypto/des"
+ "crypto/md5"
+ "crypto/rand"
+ "crypto/rc4"
+ "encoding/binary"
+ "errors"
+ "io"
+ "strings"
+
+ "github.com/Yawning/chacha20"
+ "golang.org/x/crypto/blowfish"
+ "golang.org/x/crypto/cast5"
+ "golang.org/x/crypto/salsa20/salsa"
+)
+
+var errEmptyPassword = errors.New("empty key")
+
+func md5sum(d []byte) []byte {
+ h := md5.New()
+ h.Write(d)
+ return h.Sum(nil)
+}
+
+func evpBytesToKey(password string, keyLen int) (key []byte) {
+ const md5Len = 16
+
+ cnt := (keyLen-1)/md5Len + 1
+ m := make([]byte, cnt*md5Len)
+ copy(m, md5sum([]byte(password)))
+
+ // Repeatedly call md5 until bytes generated is enough.
+ // Each call to md5 uses data: prev md5 sum + password.
+ d := make([]byte, md5Len+len(password))
+ start := 0
+ for i := 1; i < cnt; i++ {
+ start += md5Len
+ copy(d, m[start-md5Len:start])
+ copy(d[md5Len:], password)
+ copy(m[start:], md5sum(d))
+ }
+ return m[:keyLen]
+}
+
+type DecOrEnc int
+
+const (
+ Decrypt DecOrEnc = iota
+ Encrypt
+)
+
+func newStream(block cipher.Block, err error, key, iv []byte,
+ doe DecOrEnc) (cipher.Stream, error) {
+ if err != nil {
+ return nil, err
+ }
+ if doe == Encrypt {
+ return cipher.NewCFBEncrypter(block, iv), nil
+ } else {
+ return cipher.NewCFBDecrypter(block, iv), nil
+ }
+}
+
+func newAESCFBStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+ block, err := aes.NewCipher(key)
+ return newStream(block, err, key, iv, doe)
+}
+
+func newAESCTRStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+ block, err := aes.NewCipher(key)
+ if err != nil {
+ return nil, err
+ }
+ return cipher.NewCTR(block, iv), nil
+}
+
+func newDESStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+ block, err := des.NewCipher(key)
+ return newStream(block, err, key, iv, doe)
+}
+
+func newBlowFishStream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+ block, err := blowfish.NewCipher(key)
+ return newStream(block, err, key, iv, doe)
+}
+
+func newCast5Stream(key, iv []byte, doe DecOrEnc) (cipher.Stream, error) {
+ block, err := cast5.NewCipher(key)
+ return newStream(block, err, key, iv, doe)
+}
+
+func newRC4MD5Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
+ h := md5.New()
+ h.Write(key)
+ h.Write(iv)
+ rc4key := h.Sum(nil)
+
+ return rc4.NewCipher(rc4key)
+}
+
+func newChaCha20Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
+ return chacha20.NewCipher(key, iv)
+}
+
+func newChaCha20IETFStream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
+ return chacha20.NewCipher(key, iv)
+}
+
+type salsaStreamCipher struct {
+ nonce [8]byte
+ key [32]byte
+ counter int
+}
+
+func (c *salsaStreamCipher) XORKeyStream(dst, src []byte) {
+ var buf []byte
+ padLen := c.counter % 64
+ dataSize := len(src) + padLen
+ if cap(dst) >= dataSize {
+ buf = dst[:dataSize]
+ } else if leakyBufSize >= dataSize {
+ buf = leakyBuf.Get()
+ defer leakyBuf.Put(buf)
+ buf = buf[:dataSize]
+ } else {
+ buf = make([]byte, dataSize)
+ }
+
+ var subNonce [16]byte
+ copy(subNonce[:], c.nonce[:])
+ binary.LittleEndian.PutUint64(subNonce[len(c.nonce):], uint64(c.counter/64))
+
+ // It's difficult to avoid data copy here. src or dst maybe slice from
+ // Conn.Read/Write, which can't have padding.
+ copy(buf[padLen:], src[:])
+ salsa.XORKeyStream(buf, buf, &subNonce, &c.key)
+ copy(dst, buf[padLen:])
+
+ c.counter += len(src)
+}
+
+func newSalsa20Stream(key, iv []byte, _ DecOrEnc) (cipher.Stream, error) {
+ var c salsaStreamCipher
+ copy(c.nonce[:], iv[:8])
+ copy(c.key[:], key[:32])
+ return &c, nil
+}
+
+type cipherInfo struct {
+ keyLen int
+ ivLen int
+ newStream func(key, iv []byte, doe DecOrEnc) (cipher.Stream, error)
+}
+
+var cipherMethod = map[string]*cipherInfo{
+ "aes-128-cfb": {16, 16, newAESCFBStream},
+ "aes-192-cfb": {24, 16, newAESCFBStream},
+ "aes-256-cfb": {32, 16, newAESCFBStream},
+ "aes-128-ctr": {16, 16, newAESCTRStream},
+ "aes-192-ctr": {24, 16, newAESCTRStream},
+ "aes-256-ctr": {32, 16, newAESCTRStream},
+ "des-cfb": {8, 8, newDESStream},
+ "bf-cfb": {16, 8, newBlowFishStream},
+ "cast5-cfb": {16, 8, newCast5Stream},
+ "rc4-md5": {16, 16, newRC4MD5Stream},
+ "rc4-md5-6": {16, 6, newRC4MD5Stream},
+ "chacha20": {32, 8, newChaCha20Stream},
+ "chacha20-ietf": {32, 12, newChaCha20IETFStream},
+ "salsa20": {32, 8, newSalsa20Stream},
+}
+
+func CheckCipherMethod(method string) error {
+ if method == "" {
+ method = "aes-256-cfb"
+ }
+ _, ok := cipherMethod[method]
+ if !ok {
+ return errors.New("Unsupported encryption method: " + method)
+ }
+ return nil
+}
+
+type Cipher struct {
+ enc cipher.Stream
+ dec cipher.Stream
+ key []byte
+ info *cipherInfo
+ ota bool // one-time auth
+ iv []byte
+}
+
+// NewCipher creates a cipher that can be used in Dial() etc.
+// Use cipher.Copy() to create a new cipher with the same method and password
+// to avoid the cost of repeated cipher initialization.
+func NewCipher(method, password string) (c *Cipher, err error) {
+ if password == "" {
+ return nil, errEmptyPassword
+ }
+ var ota bool
+ if strings.HasSuffix(strings.ToLower(method), "-auth") {
+ method = method[:len(method)-5] // len("-auth") = 5
+ ota = true
+ } else {
+ ota = false
+ }
+ mi, ok := cipherMethod[method]
+ if !ok {
+ return nil, errors.New("Unsupported encryption method: " + method)
+ }
+
+ key := evpBytesToKey(password, mi.keyLen)
+
+ c = &Cipher{key: key, info: mi}
+
+ if err != nil {
+ return nil, err
+ }
+ c.ota = ota
+ return c, nil
+}
+
+// Initializes the block cipher with CFB mode, returns IV.
+func (c *Cipher) initEncrypt() (iv []byte, err error) {
+ if c.iv == nil {
+ iv = make([]byte, c.info.ivLen)
+ if _, err := io.ReadFull(rand.Reader, iv); err != nil {
+ return nil, err
+ }
+ c.iv = iv
+ } else {
+ iv = c.iv
+ }
+ c.enc, err = c.info.newStream(c.key, iv, Encrypt)
+ return
+}
+
+func (c *Cipher) initDecrypt(iv []byte) (err error) {
+ c.dec, err = c.info.newStream(c.key, iv, Decrypt)
+ return
+}
+
+func (c *Cipher) encrypt(dst, src []byte) {
+ c.enc.XORKeyStream(dst, src)
+}
+
+func (c *Cipher) decrypt(dst, src []byte) {
+ c.dec.XORKeyStream(dst, src)
+}
+
+// Copy creates a new cipher at its initial state.
+func (c *Cipher) Copy() *Cipher {
+ // This optimization maybe not necessary. But without this function, we
+ // need to maintain a table cache for newTableCipher and use lock to
+ // protect concurrent access to that cache.
+
+	// AES and DES ciphers do not return specific types, so it's difficult
+	// to create a copy. But their initialization time is less than 4000ns on my
+ // 2.26 GHz Intel Core 2 Duo processor. So no need to worry.
+
+ // Currently, blow-fish and cast5 initialization cost is an order of
+	// magnitude slower than other ciphers. (I'm not sure whether this is
+ // because the current implementation is not highly optimized, or this is
+ // the nature of the algorithm.)
+
+ nc := *c
+ nc.enc = nil
+ nc.dec = nil
+ nc.ota = c.ota
+ return &nc
+}
diff --git a/utils/ss/leakybuf.go b/utils/ss/leakybuf.go
new file mode 100644
index 0000000..029d93f
--- /dev/null
+++ b/utils/ss/leakybuf.go
@@ -0,0 +1,45 @@
+// Provides leaky buffer, based on the example in Effective Go.
+package ss
+
+type LeakyBuf struct {
+ bufSize int // size of each buffer
+ freeList chan []byte
+}
+
+const leakyBufSize = 4108 // data.len(2) + hmacsha1(10) + data(4096)
+const maxNBuf = 2048
+
+var leakyBuf = NewLeakyBuf(maxNBuf, leakyBufSize)
+
+// NewLeakyBuf creates a leaky buffer which can hold at most n buffers, each
+// with bufSize bytes.
+func NewLeakyBuf(n, bufSize int) *LeakyBuf {
+ return &LeakyBuf{
+ bufSize: bufSize,
+ freeList: make(chan []byte, n),
+ }
+}
+
+// Get returns a buffer from the leaky buffer or create a new buffer.
+func (lb *LeakyBuf) Get() (b []byte) {
+ select {
+ case b = <-lb.freeList:
+ default:
+ b = make([]byte, lb.bufSize)
+ }
+ return
+}
+
+// Put adds the buffer into the free buffer pool for reuse. It panics if the
+// buffer size is not the same as the leaky buffer's. This is intended to
+// expose erroneous usage of the leaky buffer.
+func (lb *LeakyBuf) Put(b []byte) {
+ if len(b) != lb.bufSize {
+ panic("invalid buffer size that's put into leaky buffer")
+ }
+ select {
+ case lb.freeList <- b:
+ default:
+ }
+ return
+}
diff --git a/utils/ss/util.go b/utils/ss/util.go
new file mode 100644
index 0000000..290c4a7
--- /dev/null
+++ b/utils/ss/util.go
@@ -0,0 +1,124 @@
+package ss
+
+import (
+ "crypto/hmac"
+ "crypto/sha1"
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "io"
+ "net"
+ "os"
+ "strconv"
+)
+
+func IsFileExists(path string) (bool, error) {
+ stat, err := os.Stat(path)
+ if err == nil {
+ if stat.Mode()&os.ModeType == 0 {
+ return true, nil
+ }
+ return false, errors.New(path + " exists but is not regular file")
+ }
+ if os.IsNotExist(err) {
+ return false, nil
+ }
+ return false, err
+}
+
+func HmacSha1(key []byte, data []byte) []byte {
+ hmacSha1 := hmac.New(sha1.New, key)
+ hmacSha1.Write(data)
+ return hmacSha1.Sum(nil)[:10]
+}
+
+func otaConnectAuth(iv, key, data []byte) []byte {
+ return append(data, HmacSha1(append(iv, key...), data)...)
+}
+
+func otaReqChunkAuth(iv []byte, chunkId uint32, data []byte) []byte {
+ nb := make([]byte, 2)
+ binary.BigEndian.PutUint16(nb, uint16(len(data)))
+ chunkIdBytes := make([]byte, 4)
+ binary.BigEndian.PutUint32(chunkIdBytes, chunkId)
+ header := append(nb, HmacSha1(append(iv, chunkIdBytes...), data)...)
+ return append(header, data...)
+}
+
+const (
+ idType = 0 // address type index
+	idIP0    = 1 // ip address start index
+ idDmLen = 1 // domain address length index
+ idDm0 = 2 // domain address start index
+
+ typeIPv4 = 1 // type is ipv4 address
+ typeDm = 3 // type is domain address
+ typeIPv6 = 4 // type is ipv6 address
+
+ lenIPv4 = net.IPv4len + 2 // ipv4 + 2port
+ lenIPv6 = net.IPv6len + 2 // ipv6 + 2port
+ lenDmBase = 2 // 1addrLen + 2port, plus addrLen
+ lenHmacSha1 = 10
+)
+
+func GetRequest(conn *Conn) (host string, err error) {
+
+ // buf size should at least have the same size with the largest possible
+ // request size (when addrType is 3, domain name has at most 256 bytes)
+ // 1(addrType) + 1(lenByte) + 255(max length address) + 2(port) + 10(hmac-sha1)
+ buf := make([]byte, 269)
+ // read till we get possible domain length field
+ if _, err = io.ReadFull(conn, buf[:idType+1]); err != nil {
+ return
+ }
+
+ var reqStart, reqEnd int
+ addrType := buf[idType]
+ switch addrType & AddrMask {
+ case typeIPv4:
+ reqStart, reqEnd = idIP0, idIP0+lenIPv4
+ case typeIPv6:
+ reqStart, reqEnd = idIP0, idIP0+lenIPv6
+ case typeDm:
+ if _, err = io.ReadFull(conn, buf[idType+1:idDmLen+1]); err != nil {
+ return
+ }
+ reqStart, reqEnd = idDm0, idDm0+int(buf[idDmLen])+lenDmBase
+ default:
+ err = fmt.Errorf("addr type %d not supported", addrType&AddrMask)
+ return
+ }
+
+ if _, err = io.ReadFull(conn, buf[reqStart:reqEnd]); err != nil {
+ return
+ }
+
+ // Return string for typeIP is not most efficient, but browsers (Chrome,
+// Safari, Firefox) all seem to use typeDm exclusively. So this is not a
+ // big problem.
+ switch addrType & AddrMask {
+ case typeIPv4:
+ host = net.IP(buf[idIP0 : idIP0+net.IPv4len]).String()
+ case typeIPv6:
+ host = net.IP(buf[idIP0 : idIP0+net.IPv6len]).String()
+ case typeDm:
+ host = string(buf[idDm0 : idDm0+int(buf[idDmLen])])
+ }
+ // parse port
+ port := binary.BigEndian.Uint16(buf[reqEnd-2 : reqEnd])
+ host = net.JoinHostPort(host, strconv.Itoa(int(port)))
+
+ return
+}
+
+type ClosedFlag struct {
+ flag bool
+}
+
+func (flag *ClosedFlag) SetClosed() {
+ flag.flag = true
+}
+
+func (flag *ClosedFlag) IsClosed() bool {
+ return flag.flag
+}
diff --git a/utils/structs.go b/utils/structs.go
index acc4b45..69f415c 100644
--- a/utils/structs.go
+++ b/utils/structs.go
@@ -1,22 +1,24 @@
package utils
import (
+ "bufio"
"bytes"
"crypto/tls"
"encoding/base64"
"errors"
"fmt"
- "github.com/snail007/goproxy/services/kcpcfg"
- "github.com/snail007/goproxy/utils/sni"
"io"
"io/ioutil"
- "log"
+ logger "log"
"net"
"net/url"
"strings"
"sync"
"time"
+ "github.com/snail007/goproxy/services/kcpcfg"
+ "github.com/snail007/goproxy/utils/sni"
+
"github.com/golang/snappy"
"github.com/miekg/dns"
)
@@ -28,6 +30,7 @@ type Checker struct {
interval int64
timeout int
isStop bool
+ log *logger.Logger
}
type CheckerItem struct {
IsHTTPS bool
@@ -43,12 +46,13 @@ type CheckerItem struct {
//NewChecker args:
//timeout : tcp timeout milliseconds ,connect to host
//interval: recheck domain interval seconds
-func NewChecker(timeout int, interval int64, blockedFile, directFile string) Checker {
+func NewChecker(timeout int, interval int64, blockedFile, directFile string, log *logger.Logger) Checker {
ch := Checker{
data: NewConcurrentMap(),
interval: interval,
timeout: timeout,
isStop: false,
+ log: log,
}
ch.blockedMap = ch.loadMap(blockedFile)
ch.directMap = ch.loadMap(directFile)
@@ -70,7 +74,7 @@ func (c *Checker) loadMap(f string) (dataMap ConcurrentMap) {
if PathExists(f) {
_contents, err := ioutil.ReadFile(f)
if err != nil {
- log.Printf("load file err:%s", err)
+ c.log.Printf("load file err:%s", err)
return
}
for _, line := range strings.Split(string(_contents), "\n") {
@@ -149,7 +153,7 @@ func (c *Checker) IsBlocked(address string) (blocked bool, failN, successN uint)
func (c *Checker) domainIsInMap(address string, blockedMap bool) bool {
u, err := url.Parse("http://" + address)
if err != nil {
- log.Printf("blocked check , url parse err:%s", err)
+ c.log.Printf("blocked check , url parse err:%s", err)
return true
}
domainSlice := strings.Split(u.Hostname(), ".")
@@ -187,12 +191,14 @@ type BasicAuth struct {
authTimeout int
authRetry int
dns *DomainResolver
+ log *logger.Logger
}
-func NewBasicAuth(dns *DomainResolver) BasicAuth {
+func NewBasicAuth(dns *DomainResolver, log *logger.Logger) BasicAuth {
return BasicAuth{
data: NewConcurrentMap(),
dns: dns,
+ log: log,
}
}
func (ba *BasicAuth) SetAuthURL(URL string, code, timeout, retry int) {
@@ -245,7 +251,7 @@ func (ba *BasicAuth) Check(userpass string, ip, target string) (ok bool) {
if err == nil {
return true
}
- log.Printf("%s", err)
+ ba.log.Printf("%s", err)
}
return false
}
@@ -294,7 +300,7 @@ func (ba *BasicAuth) checkFromURL(userpass, ip, target string) (err error) {
err = fmt.Errorf("auth fail from url %s,resonse code: %d, except: %d , %s , %s", URL, code, ba.authOkCode, ip, b)
}
if err != nil && tryCount < ba.authRetry {
- log.Print(err)
+ ba.log.Print(err)
time.Sleep(time.Second * 2)
}
tryCount++
@@ -320,13 +326,15 @@ type HTTPRequest struct {
hostOrURL string
isBasicAuth bool
basicAuth *BasicAuth
+ log *logger.Logger
}
-func NewHTTPRequest(inConn *net.Conn, bufSize int, isBasicAuth bool, basicAuth *BasicAuth, header ...[]byte) (req HTTPRequest, err error) {
+func NewHTTPRequest(inConn *net.Conn, bufSize int, isBasicAuth bool, basicAuth *BasicAuth, log *logger.Logger, header ...[]byte) (req HTTPRequest, err error) {
buf := make([]byte, bufSize)
n := 0
req = HTTPRequest{
conn: inConn,
+ log: log,
}
if header != nil && len(header) == 1 && len(header[0]) > 1 {
buf = header[0]
@@ -546,12 +554,14 @@ func (op *OutConn) Get() (conn net.Conn, err error) {
type ConnManager struct {
pool ConcurrentMap
l *sync.Mutex
+ log *logger.Logger
}
-func NewConnManager() ConnManager {
+func NewConnManager(log *logger.Logger) ConnManager {
cm := ConnManager{
pool: NewConcurrentMap(),
l: &sync.Mutex{},
+ log: log,
}
return cm
}
@@ -568,7 +578,7 @@ func (cm *ConnManager) Add(key, ID string, conn *net.Conn) {
(*v.(*net.Conn)).Close()
}
conns.Set(ID, conn)
- log.Printf("%s conn added", key)
+ cm.log.Printf("%s conn added", key)
return conns
})
}
@@ -579,7 +589,7 @@ func (cm *ConnManager) Remove(key string) {
conns.IterCb(func(key string, v interface{}) {
CloseConn(v.(*net.Conn))
})
- log.Printf("%s conns closed", key)
+ cm.log.Printf("%s conns closed", key)
}
cm.pool.Remove(key)
}
@@ -594,7 +604,7 @@ func (cm *ConnManager) RemoveOne(key string, ID string) {
(*v.(*net.Conn)).Close()
conns.Remove(ID)
cm.pool.Set(key, conns)
- log.Printf("%s %s conn closed", key, ID)
+ cm.log.Printf("%s %s conn closed", key, ID)
}
}
}
@@ -650,6 +660,7 @@ type DomainResolver struct {
ttl int
dnsAddrress string
data ConcurrentMap
+ log *logger.Logger
}
type DomainResolverItem struct {
ip string
@@ -657,12 +668,13 @@ type DomainResolverItem struct {
expiredAt int64
}
-func NewDomainResolver(dnsAddrress string, ttl int) DomainResolver {
+func NewDomainResolver(dnsAddrress string, ttl int, log *logger.Logger) DomainResolver {
return DomainResolver{
ttl: ttl,
dnsAddrress: dnsAddrress,
data: NewConcurrentMap(),
+ log: log,
}
}
func (a *DomainResolver) MustResolve(address string) (ip string) {
@@ -677,7 +689,7 @@ func (a *DomainResolver) Resolve(address string) (ip string, err error) {
if port != "" {
ip = net.JoinHostPort(ip, port)
}
- log.Printf("dns:%s->%s,cache:%s", address, ip, fromCache)
+ a.log.Printf("dns:%s->%s,cache:%s", address, ip, fromCache)
//a.PrintData()
}()
if strings.Contains(domain, ":") {
@@ -792,3 +804,33 @@ func (c *CompStream) SetReadDeadline(t time.Time) error {
func (c *CompStream) SetWriteDeadline(t time.Time) error {
return c.conn.SetWriteDeadline(t)
}
+
+type BufferedConn struct {
+ r *bufio.Reader
+ net.Conn // So that most methods are embedded
+}
+
+func NewBufferedConn(c net.Conn) BufferedConn {
+ return BufferedConn{bufio.NewReader(c), c}
+}
+
+func NewBufferedConnSize(c net.Conn, n int) BufferedConn {
+ return BufferedConn{bufio.NewReaderSize(c, n), c}
+}
+
+func (b BufferedConn) Peek(n int) ([]byte, error) {
+ return b.r.Peek(n)
+}
+
+func (b BufferedConn) Read(p []byte) (int, error) {
+ return b.r.Read(p)
+}
+func (b BufferedConn) ReadByte() (byte, error) {
+ return b.r.ReadByte()
+}
+func (b BufferedConn) UnreadByte() error {
+ return b.r.UnreadByte()
+}
+func (b BufferedConn) Buffered() int {
+ return b.r.Buffered()
+}
diff --git a/vendor/github.com/Yawning/chacha20/LICENSE b/vendor/github.com/Yawning/chacha20/LICENSE
new file mode 100644
index 0000000..6ca207e
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/LICENSE
@@ -0,0 +1,122 @@
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+ CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+ LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+ ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+ INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+ REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+ PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+ THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+ HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+ i. the right to reproduce, adapt, distribute, perform, display,
+ communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+ likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+ subject to the limitations in paragraph 4(a), below;
+ v. rights protecting the extraction, dissemination, use and reuse of data
+ in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+ European Parliament and of the Council of 11 March 1996 on the legal
+ protection of databases, and under any national implementation
+ thereof, including any amended or successor version of such
+ directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+ world based on applicable law or treaty, and any national
+ implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+ surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+ warranties of any kind concerning the Work, express, implied,
+ statutory or otherwise, including without limitation warranties of
+ title, merchantability, fitness for a particular purpose, non
+ infringement, or the absence of latent or other defects, accuracy, or
+ the present or absence of errors, whether or not discoverable, all to
+ the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+ that may apply to the Work or any use thereof, including without
+ limitation any person's Copyright and Related Rights in the Work.
+ Further, Affirmer disclaims responsibility for obtaining any necessary
+ consents, permissions or other rights required for any use of the
+ Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+ party to this document and has no duty or obligation with respect to
+ this CC0 or use of the Work.
+
diff --git a/vendor/github.com/Yawning/chacha20/README.md b/vendor/github.com/Yawning/chacha20/README.md
new file mode 100644
index 0000000..9080a84
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/README.md
@@ -0,0 +1,14 @@
+### chacha20 - ChaCha20
+#### Yawning Angel (yawning at schwanenlied dot me)
+
+Yet another Go ChaCha20 implementation. Everything else I found was slow,
+didn't support all the variants I need to use, or relied on cgo to go fast.
+
+Features:
+
+ * 20 round, 256 bit key only. Everything else is pointless and stupid.
+ * IETF 96 bit nonce variant.
+ * XChaCha 24 byte nonce variant.
+ * SSE2 and AVX2 support on amd64 targets.
+ * Incremental encrypt/decrypt support, unlike golang.org/x/crypto/salsa20.
+
diff --git a/vendor/github.com/Yawning/chacha20/chacha20.go b/vendor/github.com/Yawning/chacha20/chacha20.go
new file mode 100644
index 0000000..07d5e4b
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/chacha20.go
@@ -0,0 +1,273 @@
+// chacha20.go - A ChaCha stream cipher implementation.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// https://creativecommons.org/publicdomain/zero/1.0/ for full details.
+
+package chacha20
+
+import (
+ "crypto/cipher"
+ "encoding/binary"
+ "errors"
+ "math"
+ "runtime"
+)
+
+const (
+ // KeySize is the ChaCha20 key size in bytes.
+ KeySize = 32
+
+ // NonceSize is the ChaCha20 nonce size in bytes.
+ NonceSize = 8
+
+ // INonceSize is the IETF ChaCha20 nonce size in bytes.
+ INonceSize = 12
+
+ // XNonceSize is the XChaCha20 nonce size in bytes.
+ XNonceSize = 24
+
+ // HNonceSize is the HChaCha20 nonce size in bytes.
+ HNonceSize = 16
+
+ // BlockSize is the ChaCha20 block size in bytes.
+ BlockSize = 64
+
+ stateSize = 16
+ chachaRounds = 20
+
+ // The constant "expand 32-byte k" as little endian uint32s.
+ sigma0 = uint32(0x61707865)
+ sigma1 = uint32(0x3320646e)
+ sigma2 = uint32(0x79622d32)
+ sigma3 = uint32(0x6b206574)
+)
+
+var (
+ // ErrInvalidKey is the error returned when the key is invalid.
+ ErrInvalidKey = errors.New("key length must be KeySize bytes")
+
+ // ErrInvalidNonce is the error returned when the nonce is invalid.
+ ErrInvalidNonce = errors.New("nonce length must be NonceSize/INonceSize/XNonceSize bytes")
+
+ // ErrInvalidCounter is the error returned when the counter is invalid.
+ ErrInvalidCounter = errors.New("block counter is invalid (out of range)")
+
+ useUnsafe = false
+ usingVectors = false
+ blocksFn = blocksRef
+)
+
+// A Cipher is an instance of ChaCha20/XChaCha20 using a particular key and
+// nonce.
+type Cipher struct {
+ state [stateSize]uint32
+
+ buf [BlockSize]byte
+ off int
+ ietf bool
+}
+
+// Reset zeros the key data so that it will no longer appear in the process's
+// memory.
+func (c *Cipher) Reset() {
+ for i := range c.state {
+ c.state[i] = 0
+ }
+ for i := range c.buf {
+ c.buf[i] = 0
+ }
+}
+
+// XORKeyStream sets dst to the result of XORing src with the key stream. Dst
+// and src may be the same slice but otherwise should not overlap.
+func (c *Cipher) XORKeyStream(dst, src []byte) {
+ if len(dst) < len(src) {
+ src = src[:len(dst)]
+ }
+
+ for remaining := len(src); remaining > 0; {
+ // Process multiple blocks at once.
+ if c.off == BlockSize {
+ nrBlocks := remaining / BlockSize
+ directBytes := nrBlocks * BlockSize
+ if nrBlocks > 0 {
+ blocksFn(&c.state, src, dst, nrBlocks, c.ietf)
+ remaining -= directBytes
+ if remaining == 0 {
+ return
+ }
+ dst = dst[directBytes:]
+ src = src[directBytes:]
+ }
+
+ // If there's a partial block, generate 1 block of keystream into
+ // the internal buffer.
+ blocksFn(&c.state, nil, c.buf[:], 1, c.ietf)
+ c.off = 0
+ }
+
+ // Process partial blocks from the buffered keystream.
+ toXor := BlockSize - c.off
+ if remaining < toXor {
+ toXor = remaining
+ }
+ if toXor > 0 {
+ for i, v := range src[:toXor] {
+ dst[i] = v ^ c.buf[c.off+i]
+ }
+ dst = dst[toXor:]
+ src = src[toXor:]
+
+ remaining -= toXor
+ c.off += toXor
+ }
+ }
+}
+
+// KeyStream sets dst to the raw keystream.
+func (c *Cipher) KeyStream(dst []byte) {
+ for remaining := len(dst); remaining > 0; {
+ // Process multiple blocks at once.
+ if c.off == BlockSize {
+ nrBlocks := remaining / BlockSize
+ directBytes := nrBlocks * BlockSize
+ if nrBlocks > 0 {
+ blocksFn(&c.state, nil, dst, nrBlocks, c.ietf)
+ remaining -= directBytes
+ if remaining == 0 {
+ return
+ }
+ dst = dst[directBytes:]
+ }
+
+ // If there's a partial block, generate 1 block of keystream into
+ // the internal buffer.
+ blocksFn(&c.state, nil, c.buf[:], 1, c.ietf)
+ c.off = 0
+ }
+
+ // Process partial blocks from the buffered keystream.
+ toCopy := BlockSize - c.off
+ if remaining < toCopy {
+ toCopy = remaining
+ }
+ if toCopy > 0 {
+ copy(dst[:toCopy], c.buf[c.off:c.off+toCopy])
+ dst = dst[toCopy:]
+ remaining -= toCopy
+ c.off += toCopy
+ }
+ }
+}
+
+// ReKey reinitializes the ChaCha20/XChaCha20 instance with the provided key
+// and nonce.
+func (c *Cipher) ReKey(key, nonce []byte) error {
+ if len(key) != KeySize {
+ return ErrInvalidKey
+ }
+
+ switch len(nonce) {
+ case NonceSize:
+ case INonceSize:
+ case XNonceSize:
+ var subkey [KeySize]byte
+ var subnonce [HNonceSize]byte
+ copy(subnonce[:], nonce[0:16])
+ HChaCha(key, &subnonce, &subkey)
+ key = subkey[:]
+ nonce = nonce[16:24]
+ defer func() {
+ for i := range subkey {
+ subkey[i] = 0
+ }
+ }()
+ default:
+ return ErrInvalidNonce
+ }
+
+ c.Reset()
+ c.state[0] = sigma0
+ c.state[1] = sigma1
+ c.state[2] = sigma2
+ c.state[3] = sigma3
+ c.state[4] = binary.LittleEndian.Uint32(key[0:4])
+ c.state[5] = binary.LittleEndian.Uint32(key[4:8])
+ c.state[6] = binary.LittleEndian.Uint32(key[8:12])
+ c.state[7] = binary.LittleEndian.Uint32(key[12:16])
+ c.state[8] = binary.LittleEndian.Uint32(key[16:20])
+ c.state[9] = binary.LittleEndian.Uint32(key[20:24])
+ c.state[10] = binary.LittleEndian.Uint32(key[24:28])
+ c.state[11] = binary.LittleEndian.Uint32(key[28:32])
+ c.state[12] = 0
+ if len(nonce) == INonceSize {
+ c.state[13] = binary.LittleEndian.Uint32(nonce[0:4])
+ c.state[14] = binary.LittleEndian.Uint32(nonce[4:8])
+ c.state[15] = binary.LittleEndian.Uint32(nonce[8:12])
+ c.ietf = true
+ } else {
+ c.state[13] = 0
+ c.state[14] = binary.LittleEndian.Uint32(nonce[0:4])
+ c.state[15] = binary.LittleEndian.Uint32(nonce[4:8])
+ c.ietf = false
+ }
+ c.off = BlockSize
+ return nil
+
+}
+
+// Seek sets the block counter to a given offset.
+func (c *Cipher) Seek(blockCounter uint64) error {
+ if c.ietf {
+ if blockCounter > math.MaxUint32 {
+ return ErrInvalidCounter
+ }
+ c.state[12] = uint32(blockCounter)
+ } else {
+ c.state[12] = uint32(blockCounter)
+ c.state[13] = uint32(blockCounter >> 32)
+ }
+ c.off = BlockSize
+ return nil
+}
+
+// NewCipher returns a new ChaCha20/XChaCha20 instance.
+func NewCipher(key, nonce []byte) (*Cipher, error) {
+ c := new(Cipher)
+ if err := c.ReKey(key, nonce); err != nil {
+ return nil, err
+ }
+ return c, nil
+}
+
+// HChaCha is the HChaCha20 hash function used to make XChaCha.
+func HChaCha(key []byte, nonce *[HNonceSize]byte, out *[32]byte) {
+ var x [stateSize]uint32 // Last 4 slots unused, sigma hardcoded.
+ x[0] = binary.LittleEndian.Uint32(key[0:4])
+ x[1] = binary.LittleEndian.Uint32(key[4:8])
+ x[2] = binary.LittleEndian.Uint32(key[8:12])
+ x[3] = binary.LittleEndian.Uint32(key[12:16])
+ x[4] = binary.LittleEndian.Uint32(key[16:20])
+ x[5] = binary.LittleEndian.Uint32(key[20:24])
+ x[6] = binary.LittleEndian.Uint32(key[24:28])
+ x[7] = binary.LittleEndian.Uint32(key[28:32])
+ x[8] = binary.LittleEndian.Uint32(nonce[0:4])
+ x[9] = binary.LittleEndian.Uint32(nonce[4:8])
+ x[10] = binary.LittleEndian.Uint32(nonce[8:12])
+ x[11] = binary.LittleEndian.Uint32(nonce[12:16])
+ hChaChaRef(&x, out)
+}
+
+func init() {
+ switch runtime.GOARCH {
+ case "386", "amd64":
+ // Abuse unsafe to skip calling binary.LittleEndian.PutUint32
+ // in the critical path. This is a big boost on systems that are
+ // little endian and not overly picky about alignment.
+ useUnsafe = true
+ }
+}
+
+var _ cipher.Stream = (*Cipher)(nil)
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.go b/vendor/github.com/Yawning/chacha20/chacha20_amd64.go
new file mode 100644
index 0000000..05adad1
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.go
@@ -0,0 +1,95 @@
+// chacha20_amd64.go - AMD64 optimized chacha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// for full details.
+
+// +build amd64,!gccgo,!appengine
+
+package chacha20
+
+import (
+ "math"
+)
+
+var usingAVX2 = false
+
+func blocksAmd64SSE2(x *uint32, inp, outp *byte, nrBlocks uint)
+
+func blocksAmd64AVX2(x *uint32, inp, outp *byte, nrBlocks uint)
+
+func cpuidAmd64(cpuidParams *uint32)
+
+func xgetbv0Amd64(xcrVec *uint32)
+
+func blocksAmd64(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
+ // Probably unneeded, but stating this explicitly simplifies the assembly.
+ if nrBlocks == 0 {
+ return
+ }
+
+ if isIetf {
+ var totalBlocks uint64
+ totalBlocks = uint64(x[12]) + uint64(nrBlocks)
+ if totalBlocks > math.MaxUint32 {
+ panic("chacha20: Exceeded keystream per nonce limit")
+ }
+ }
+
+ if in == nil {
+ for i := range out {
+ out[i] = 0
+ }
+ in = out
+ }
+
+ // Pointless to call the AVX2 code for just a single block, since half of
+ // the output gets discarded...
+ if usingAVX2 && nrBlocks > 1 {
+ blocksAmd64AVX2(&x[0], &in[0], &out[0], uint(nrBlocks))
+ } else {
+ blocksAmd64SSE2(&x[0], &in[0], &out[0], uint(nrBlocks))
+ }
+}
+
+func supportsAVX2() bool {
+ // https://software.intel.com/en-us/articles/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family
+ const (
+ osXsaveBit = 1 << 27
+ avx2Bit = 1 << 5
+ )
+
+ // Check to see if CPUID actually supports the leaf that indicates AVX2.
+ // CPUID.(EAX=0H, ECX=0H) >= 7
+ regs := [4]uint32{0x00}
+ cpuidAmd64(®s[0])
+ if regs[0] < 7 {
+ return false
+ }
+
+ // Check to see if the OS knows how to save/restore XMM/YMM state.
+ // CPUID.(EAX=01H, ECX=0H):ECX.OSXSAVE[bit 27]==1
+ regs = [4]uint32{0x01}
+ cpuidAmd64(®s[0])
+ if regs[2]&osXsaveBit == 0 {
+ return false
+ }
+ xcrRegs := [2]uint32{}
+ xgetbv0Amd64(&xcrRegs[0])
+ if xcrRegs[0]&6 != 6 {
+ return false
+ }
+
+ // Check for AVX2 support.
+ // CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5]==1
+ regs = [4]uint32{0x07}
+ cpuidAmd64(®s[0])
+ return regs[1]&avx2Bit != 0
+}
+
+func init() {
+ blocksFn = blocksAmd64
+ usingVectors = true
+ usingAVX2 = supportsAVX2()
+}
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.py b/vendor/github.com/Yawning/chacha20/chacha20_amd64.py
new file mode 100644
index 0000000..3bfebf4
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.py
@@ -0,0 +1,1295 @@
+#!/usr/bin/env python3
+#
+# To the extent possible under law, Yawning Angel has waived all copyright
+# and related or neighboring rights to chacha20, using the Creative
+# Commons "CC0" public domain dedication. See LICENSE or
+# https://creativecommons.org/publicdomain/zero/1.0/ for full details.
+
+#
+# cgo sucks. Plan 9 assembly sucks. Real languages have SIMD intrinsics.
+# The least terrible/retarded option is to use a Python code generator, so
+# that's what I did.
+#
+# Code based on Ted Krovetz's vec128 C implementation, with corrections
+# to use a 64 bit counter instead of 32 bit, and to allow unaligned input and
+# output pointers.
+#
+# Dependencies: https://github.com/Maratyszcza/PeachPy
+#
+# python3 -m peachpy.x86_64 -mabi=goasm -S -o chacha20_amd64.s chacha20_amd64.py
+#
+
+from peachpy import *
+from peachpy.x86_64 import *
+
+x = Argument(ptr(uint32_t))
+inp = Argument(ptr(const_uint8_t))
+outp = Argument(ptr(uint8_t))
+nrBlocks = Argument(ptr(size_t))
+
+#
+# SSE2 helper functions. A temporary register is explicitly passed in because
+# the main fast loop uses every single register (and even spills) so manual
+# control is needed.
+#
+# This used to also have a DQROUNDS helper that did 2 rounds of ChaCha like
+# in the C code, but the C code has the luxury of an optimizer reordering
+# everything, while this does not.
+#
+
+def ROTW16_sse2(tmp, d):
+ MOVDQA(tmp, d)
+ PSLLD(tmp, 16)
+ PSRLD(d, 16)
+ PXOR(d, tmp)
+
+def ROTW12_sse2(tmp, b):
+ MOVDQA(tmp, b)
+ PSLLD(tmp, 12)
+ PSRLD(b, 20)
+ PXOR(b, tmp)
+
+def ROTW8_sse2(tmp, d):
+ MOVDQA(tmp, d)
+ PSLLD(tmp, 8)
+ PSRLD(d, 24)
+ PXOR(d, tmp)
+
+def ROTW7_sse2(tmp, b):
+ MOVDQA(tmp, b)
+ PSLLD(tmp, 7)
+ PSRLD(b, 25)
+ PXOR(b, tmp)
+
+def WriteXor_sse2(tmp, inp, outp, d, v0, v1, v2, v3):
+ MOVDQU(tmp, [inp+d])
+ PXOR(tmp, v0)
+ MOVDQU([outp+d], tmp)
+ MOVDQU(tmp, [inp+d+16])
+ PXOR(tmp, v1)
+ MOVDQU([outp+d+16], tmp)
+ MOVDQU(tmp, [inp+d+32])
+ PXOR(tmp, v2)
+ MOVDQU([outp+d+32], tmp)
+ MOVDQU(tmp, [inp+d+48])
+ PXOR(tmp, v3)
+ MOVDQU([outp+d+48], tmp)
+
+# SSE2 ChaCha20 (aka vec128). Does not handle partial blocks, and will
+# process 4/2/1 blocks at a time.
+with Function("blocksAmd64SSE2", (x, inp, outp, nrBlocks)):
+ reg_x = GeneralPurposeRegister64()
+ reg_inp = GeneralPurposeRegister64()
+ reg_outp = GeneralPurposeRegister64()
+ reg_blocks = GeneralPurposeRegister64()
+ reg_sp_save = GeneralPurposeRegister64()
+
+ LOAD.ARGUMENT(reg_x, x)
+ LOAD.ARGUMENT(reg_inp, inp)
+ LOAD.ARGUMENT(reg_outp, outp)
+ LOAD.ARGUMENT(reg_blocks, nrBlocks)
+
+ # Align the stack to a 32 byte boundary.
+ MOV(reg_sp_save, registers.rsp)
+ AND(registers.rsp, 0xffffffffffffffe0)
+ SUB(registers.rsp, 0x20)
+
+ # Build the counter increment vector on the stack, and allocate the scratch
+ # space
+ xmm_v0 = XMMRegister()
+ PXOR(xmm_v0, xmm_v0)
+ SUB(registers.rsp, 16+16)
+ MOVDQA([registers.rsp], xmm_v0)
+ reg_tmp = GeneralPurposeRegister32()
+ MOV(reg_tmp, 0x00000001)
+ MOV([registers.rsp], reg_tmp)
+ mem_one = [registers.rsp] # (Stack) Counter increment vector
+ mem_tmp0 = [registers.rsp+16] # (Stack) Scratch space.
+
+ mem_s0 = [reg_x] # (Memory) Cipher state [0..3]
+ mem_s1 = [reg_x+16] # (Memory) Cipher state [4..7]
+ mem_s2 = [reg_x+32] # (Memory) Cipher state [8..11]
+ mem_s3 = [reg_x+48] # (Memory) Cipher state [12..15]
+
+ # xmm_v0 allocated above...
+ xmm_v1 = XMMRegister()
+ xmm_v2 = XMMRegister()
+ xmm_v3 = XMMRegister()
+
+ xmm_v4 = XMMRegister()
+ xmm_v5 = XMMRegister()
+ xmm_v6 = XMMRegister()
+ xmm_v7 = XMMRegister()
+
+ xmm_v8 = XMMRegister()
+ xmm_v9 = XMMRegister()
+ xmm_v10 = XMMRegister()
+ xmm_v11 = XMMRegister()
+
+ xmm_v12 = XMMRegister()
+ xmm_v13 = XMMRegister()
+ xmm_v14 = XMMRegister()
+ xmm_v15 = XMMRegister()
+
+ xmm_tmp = xmm_v12
+
+ #
+ # 4 blocks at a time.
+ #
+
+ reg_rounds = GeneralPurposeRegister64()
+
+ vector_loop4 = Loop()
+ SUB(reg_blocks, 4)
+ JB(vector_loop4.end)
+ with vector_loop4:
+ MOVDQU(xmm_v0, mem_s0)
+ MOVDQU(xmm_v1, mem_s1)
+ MOVDQU(xmm_v2, mem_s2)
+ MOVDQU(xmm_v3, mem_s3)
+
+ MOVDQA(xmm_v4, xmm_v0)
+ MOVDQA(xmm_v5, xmm_v1)
+ MOVDQA(xmm_v6, xmm_v2)
+ MOVDQA(xmm_v7, xmm_v3)
+ PADDQ(xmm_v7, mem_one)
+
+ MOVDQA(xmm_v8, xmm_v0)
+ MOVDQA(xmm_v9, xmm_v1)
+ MOVDQA(xmm_v10, xmm_v2)
+ MOVDQA(xmm_v11, xmm_v7)
+ PADDQ(xmm_v11, mem_one)
+
+ MOVDQA(xmm_v12, xmm_v0)
+ MOVDQA(xmm_v13, xmm_v1)
+ MOVDQA(xmm_v14, xmm_v2)
+ MOVDQA(xmm_v15, xmm_v11)
+ PADDQ(xmm_v15, mem_one)
+
+ MOV(reg_rounds, 20)
+ rounds_loop4 = Loop()
+ with rounds_loop4:
+ # a += b; d ^= a; d = ROTW16(d);
+ PADDD(xmm_v0, xmm_v1)
+ PADDD(xmm_v4, xmm_v5)
+ PADDD(xmm_v8, xmm_v9)
+ PADDD(xmm_v12, xmm_v13)
+ PXOR(xmm_v3, xmm_v0)
+ PXOR(xmm_v7, xmm_v4)
+ PXOR(xmm_v11, xmm_v8)
+ PXOR(xmm_v15, xmm_v12)
+
+ MOVDQA(mem_tmp0, xmm_tmp) # Save
+
+ ROTW16_sse2(xmm_tmp, xmm_v3)
+ ROTW16_sse2(xmm_tmp, xmm_v7)
+ ROTW16_sse2(xmm_tmp, xmm_v11)
+ ROTW16_sse2(xmm_tmp, xmm_v15)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ PADDD(xmm_v2, xmm_v3)
+ PADDD(xmm_v6, xmm_v7)
+ PADDD(xmm_v10, xmm_v11)
+ PADDD(xmm_v14, xmm_v15)
+ PXOR(xmm_v1, xmm_v2)
+ PXOR(xmm_v5, xmm_v6)
+ PXOR(xmm_v9, xmm_v10)
+ PXOR(xmm_v13, xmm_v14)
+ ROTW12_sse2(xmm_tmp, xmm_v1)
+ ROTW12_sse2(xmm_tmp, xmm_v5)
+ ROTW12_sse2(xmm_tmp, xmm_v9)
+ ROTW12_sse2(xmm_tmp, xmm_v13)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ MOVDQA(xmm_tmp, mem_tmp0) # Restore
+
+ PADDD(xmm_v0, xmm_v1)
+ PADDD(xmm_v4, xmm_v5)
+ PADDD(xmm_v8, xmm_v9)
+ PADDD(xmm_v12, xmm_v13)
+ PXOR(xmm_v3, xmm_v0)
+ PXOR(xmm_v7, xmm_v4)
+ PXOR(xmm_v11, xmm_v8)
+ PXOR(xmm_v15, xmm_v12)
+
+ MOVDQA(mem_tmp0, xmm_tmp) # Save
+
+ ROTW8_sse2(xmm_tmp, xmm_v3)
+ ROTW8_sse2(xmm_tmp, xmm_v7)
+ ROTW8_sse2(xmm_tmp, xmm_v11)
+ ROTW8_sse2(xmm_tmp, xmm_v15)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ PADDD(xmm_v2, xmm_v3)
+ PADDD(xmm_v6, xmm_v7)
+ PADDD(xmm_v10, xmm_v11)
+ PADDD(xmm_v14, xmm_v15)
+ PXOR(xmm_v1, xmm_v2)
+ PXOR(xmm_v5, xmm_v6)
+ PXOR(xmm_v9, xmm_v10)
+ PXOR(xmm_v13, xmm_v14)
+ ROTW7_sse2(xmm_tmp, xmm_v1)
+ ROTW7_sse2(xmm_tmp, xmm_v5)
+ ROTW7_sse2(xmm_tmp, xmm_v9)
+ ROTW7_sse2(xmm_tmp, xmm_v13)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ PSHUFD(xmm_v1, xmm_v1, 0x39)
+ PSHUFD(xmm_v5, xmm_v5, 0x39)
+ PSHUFD(xmm_v9, xmm_v9, 0x39)
+ PSHUFD(xmm_v13, xmm_v13, 0x39)
+ PSHUFD(xmm_v2, xmm_v2, 0x4e)
+ PSHUFD(xmm_v6, xmm_v6, 0x4e)
+ PSHUFD(xmm_v10, xmm_v10, 0x4e)
+ PSHUFD(xmm_v14, xmm_v14, 0x4e)
+ PSHUFD(xmm_v3, xmm_v3, 0x93)
+ PSHUFD(xmm_v7, xmm_v7, 0x93)
+ PSHUFD(xmm_v11, xmm_v11, 0x93)
+ PSHUFD(xmm_v15, xmm_v15, 0x93)
+
+ MOVDQA(xmm_tmp, mem_tmp0) # Restore
+
+ # a += b; d ^= a; d = ROTW16(d);
+ PADDD(xmm_v0, xmm_v1)
+ PADDD(xmm_v4, xmm_v5)
+ PADDD(xmm_v8, xmm_v9)
+ PADDD(xmm_v12, xmm_v13)
+ PXOR(xmm_v3, xmm_v0)
+ PXOR(xmm_v7, xmm_v4)
+ PXOR(xmm_v11, xmm_v8)
+ PXOR(xmm_v15, xmm_v12)
+
+ MOVDQA(mem_tmp0, xmm_tmp) # Save
+
+ ROTW16_sse2(xmm_tmp, xmm_v3)
+ ROTW16_sse2(xmm_tmp, xmm_v7)
+ ROTW16_sse2(xmm_tmp, xmm_v11)
+ ROTW16_sse2(xmm_tmp, xmm_v15)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ PADDD(xmm_v2, xmm_v3)
+ PADDD(xmm_v6, xmm_v7)
+ PADDD(xmm_v10, xmm_v11)
+ PADDD(xmm_v14, xmm_v15)
+ PXOR(xmm_v1, xmm_v2)
+ PXOR(xmm_v5, xmm_v6)
+ PXOR(xmm_v9, xmm_v10)
+ PXOR(xmm_v13, xmm_v14)
+ ROTW12_sse2(xmm_tmp, xmm_v1)
+ ROTW12_sse2(xmm_tmp, xmm_v5)
+ ROTW12_sse2(xmm_tmp, xmm_v9)
+ ROTW12_sse2(xmm_tmp, xmm_v13)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ MOVDQA(xmm_tmp, mem_tmp0) # Restore
+
+ PADDD(xmm_v0, xmm_v1)
+ PADDD(xmm_v4, xmm_v5)
+ PADDD(xmm_v8, xmm_v9)
+ PADDD(xmm_v12, xmm_v13)
+ PXOR(xmm_v3, xmm_v0)
+ PXOR(xmm_v7, xmm_v4)
+ PXOR(xmm_v11, xmm_v8)
+ PXOR(xmm_v15, xmm_v12)
+
+ MOVDQA(mem_tmp0, xmm_tmp) # Save
+
+ ROTW8_sse2(xmm_tmp, xmm_v3)
+ ROTW8_sse2(xmm_tmp, xmm_v7)
+ ROTW8_sse2(xmm_tmp, xmm_v11)
+ ROTW8_sse2(xmm_tmp, xmm_v15)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ PADDD(xmm_v2, xmm_v3)
+ PADDD(xmm_v6, xmm_v7)
+ PADDD(xmm_v10, xmm_v11)
+ PADDD(xmm_v14, xmm_v15)
+ PXOR(xmm_v1, xmm_v2)
+ PXOR(xmm_v5, xmm_v6)
+ PXOR(xmm_v9, xmm_v10)
+ PXOR(xmm_v13, xmm_v14)
+ ROTW7_sse2(xmm_tmp, xmm_v1)
+ ROTW7_sse2(xmm_tmp, xmm_v5)
+ ROTW7_sse2(xmm_tmp, xmm_v9)
+ ROTW7_sse2(xmm_tmp, xmm_v13)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ PSHUFD(xmm_v1, xmm_v1, 0x93)
+ PSHUFD(xmm_v5, xmm_v5, 0x93)
+ PSHUFD(xmm_v9, xmm_v9, 0x93)
+ PSHUFD(xmm_v13, xmm_v13, 0x93)
+ PSHUFD(xmm_v2, xmm_v2, 0x4e)
+ PSHUFD(xmm_v6, xmm_v6, 0x4e)
+ PSHUFD(xmm_v10, xmm_v10, 0x4e)
+ PSHUFD(xmm_v14, xmm_v14, 0x4e)
+ PSHUFD(xmm_v3, xmm_v3, 0x39)
+ PSHUFD(xmm_v7, xmm_v7, 0x39)
+ PSHUFD(xmm_v11, xmm_v11, 0x39)
+ PSHUFD(xmm_v15, xmm_v15, 0x39)
+
+ MOVDQA(xmm_tmp, mem_tmp0) # Restore
+
+ SUB(reg_rounds, 2)
+ JNZ(rounds_loop4.begin)
+
+ MOVDQA(mem_tmp0, xmm_tmp)
+
+ PADDD(xmm_v0, mem_s0)
+ PADDD(xmm_v1, mem_s1)
+ PADDD(xmm_v2, mem_s2)
+ PADDD(xmm_v3, mem_s3)
+ WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+ MOVDQU(xmm_v3, mem_s3)
+ PADDQ(xmm_v3, mem_one)
+
+ PADDD(xmm_v4, mem_s0)
+ PADDD(xmm_v5, mem_s1)
+ PADDD(xmm_v6, mem_s2)
+ PADDD(xmm_v7, xmm_v3)
+ WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
+ PADDQ(xmm_v3, mem_one)
+
+ PADDD(xmm_v8, mem_s0)
+ PADDD(xmm_v9, mem_s1)
+ PADDD(xmm_v10, mem_s2)
+ PADDD(xmm_v11, xmm_v3)
+ WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 128, xmm_v8, xmm_v9, xmm_v10, xmm_v11)
+ PADDQ(xmm_v3, mem_one)
+
+ MOVDQA(xmm_tmp, mem_tmp0)
+
+ PADDD(xmm_v12, mem_s0)
+ PADDD(xmm_v13, mem_s1)
+ PADDD(xmm_v14, mem_s2)
+ PADDD(xmm_v15, xmm_v3)
+ WriteXor_sse2(xmm_v0, reg_inp, reg_outp, 192, xmm_v12, xmm_v13, xmm_v14, xmm_v15)
+ PADDQ(xmm_v3, mem_one)
+
+ MOVDQU(mem_s3, xmm_v3)
+
+ ADD(reg_inp, 4 * 64)
+ ADD(reg_outp, 4 * 64)
+
+ SUB(reg_blocks, 4)
+ JAE(vector_loop4.begin)
+
+ ADD(reg_blocks, 4)
+ out = Label()
+ JZ(out)
+
+ # Past this point, we no longer need to use every single register to hold
+ # the in progress state.
+
+ xmm_s0 = xmm_v8
+ xmm_s1 = xmm_v9
+ xmm_s2 = xmm_v10
+ xmm_s3 = xmm_v11
+ xmm_one = xmm_v13
+ MOVDQU(xmm_s0, mem_s0)
+ MOVDQU(xmm_s1, mem_s1)
+ MOVDQU(xmm_s2, mem_s2)
+ MOVDQU(xmm_s3, mem_s3)
+ MOVDQA(xmm_one, mem_one)
+
+ #
+ # 2 blocks at a time.
+ #
+
+ process_1_block = Label()
+ SUB(reg_blocks, 2)
+ JB(process_1_block) # < 2 blocks remaining.
+
+ MOVDQA(xmm_v0, xmm_s0)
+ MOVDQA(xmm_v1, xmm_s1)
+ MOVDQA(xmm_v2, xmm_s2)
+ MOVDQA(xmm_v3, xmm_s3)
+
+ MOVDQA(xmm_v4, xmm_v0)
+ MOVDQA(xmm_v5, xmm_v1)
+ MOVDQA(xmm_v6, xmm_v2)
+ MOVDQA(xmm_v7, xmm_v3)
+ PADDQ(xmm_v7, xmm_one)
+
+ MOV(reg_rounds, 20)
+ rounds_loop2 = Loop()
+ with rounds_loop2:
+ # a += b; d ^= a; d = ROTW16(d);
+ PADDD(xmm_v0, xmm_v1)
+ PADDD(xmm_v4, xmm_v5)
+ PXOR(xmm_v3, xmm_v0)
+ PXOR(xmm_v7, xmm_v4)
+ ROTW16_sse2(xmm_tmp, xmm_v3)
+ ROTW16_sse2(xmm_tmp, xmm_v7)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ PADDD(xmm_v2, xmm_v3)
+ PADDD(xmm_v6, xmm_v7)
+ PXOR(xmm_v1, xmm_v2)
+ PXOR(xmm_v5, xmm_v6)
+ ROTW12_sse2(xmm_tmp, xmm_v1)
+ ROTW12_sse2(xmm_tmp, xmm_v5)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ PADDD(xmm_v0, xmm_v1)
+ PADDD(xmm_v4, xmm_v5)
+ PXOR(xmm_v3, xmm_v0)
+ PXOR(xmm_v7, xmm_v4)
+ ROTW8_sse2(xmm_tmp, xmm_v3)
+ ROTW8_sse2(xmm_tmp, xmm_v7)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ PADDD(xmm_v2, xmm_v3)
+ PADDD(xmm_v6, xmm_v7)
+ PXOR(xmm_v1, xmm_v2)
+ PXOR(xmm_v5, xmm_v6)
+ ROTW7_sse2(xmm_tmp, xmm_v1)
+ ROTW7_sse2(xmm_tmp, xmm_v5)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ PSHUFD(xmm_v1, xmm_v1, 0x39)
+ PSHUFD(xmm_v5, xmm_v5, 0x39)
+ PSHUFD(xmm_v2, xmm_v2, 0x4e)
+ PSHUFD(xmm_v6, xmm_v6, 0x4e)
+ PSHUFD(xmm_v3, xmm_v3, 0x93)
+ PSHUFD(xmm_v7, xmm_v7, 0x93)
+
+ # a += b; d ^= a; d = ROTW16(d);
+ PADDD(xmm_v0, xmm_v1)
+ PADDD(xmm_v4, xmm_v5)
+ PXOR(xmm_v3, xmm_v0)
+ PXOR(xmm_v7, xmm_v4)
+ ROTW16_sse2(xmm_tmp, xmm_v3)
+ ROTW16_sse2(xmm_tmp, xmm_v7)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ PADDD(xmm_v2, xmm_v3)
+ PADDD(xmm_v6, xmm_v7)
+ PXOR(xmm_v1, xmm_v2)
+ PXOR(xmm_v5, xmm_v6)
+ ROTW12_sse2(xmm_tmp, xmm_v1)
+ ROTW12_sse2(xmm_tmp, xmm_v5)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ PADDD(xmm_v0, xmm_v1)
+ PADDD(xmm_v4, xmm_v5)
+ PXOR(xmm_v3, xmm_v0)
+ PXOR(xmm_v7, xmm_v4)
+ ROTW8_sse2(xmm_tmp, xmm_v3)
+ ROTW8_sse2(xmm_tmp, xmm_v7)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ PADDD(xmm_v2, xmm_v3)
+ PADDD(xmm_v6, xmm_v7)
+ PXOR(xmm_v1, xmm_v2)
+ PXOR(xmm_v5, xmm_v6)
+ ROTW7_sse2(xmm_tmp, xmm_v1)
+ ROTW7_sse2(xmm_tmp, xmm_v5)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ PSHUFD(xmm_v1, xmm_v1, 0x93)
+ PSHUFD(xmm_v5, xmm_v5, 0x93)
+ PSHUFD(xmm_v2, xmm_v2, 0x4e)
+ PSHUFD(xmm_v6, xmm_v6, 0x4e)
+ PSHUFD(xmm_v3, xmm_v3, 0x39)
+ PSHUFD(xmm_v7, xmm_v7, 0x39)
+
+ SUB(reg_rounds, 2)
+ JNZ(rounds_loop2.begin)
+
+ PADDD(xmm_v0, xmm_s0)
+ PADDD(xmm_v1, xmm_s1)
+ PADDD(xmm_v2, xmm_s2)
+ PADDD(xmm_v3, xmm_s3)
+ WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+ PADDQ(xmm_s3, xmm_one)
+
+ PADDD(xmm_v4, xmm_s0)
+ PADDD(xmm_v5, xmm_s1)
+ PADDD(xmm_v6, xmm_s2)
+ PADDD(xmm_v7, xmm_s3)
+ WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 64, xmm_v4, xmm_v5, xmm_v6, xmm_v7)
+ PADDQ(xmm_s3, xmm_one)
+
+ ADD(reg_inp, 2 * 64)
+ ADD(reg_outp, 2 * 64)
+ SUB(reg_blocks, 2)
+
+ LABEL(process_1_block)
+ ADD(reg_blocks, 2)
+ out_serial = Label()
+ JZ(out_serial)
+
+ #
+ # 1 block at a time. Only executed once, because if there was > 1,
+ # the parallel code would have processed it already.
+ #
+
+ MOVDQA(xmm_v0, xmm_s0)
+ MOVDQA(xmm_v1, xmm_s1)
+ MOVDQA(xmm_v2, xmm_s2)
+ MOVDQA(xmm_v3, xmm_s3)
+
+ MOV(reg_rounds, 20)
+ rounds_loop1 = Loop()
+ with rounds_loop1:
+ # a += b; d ^= a; d = ROTW16(d);
+ PADDD(xmm_v0, xmm_v1)
+ PXOR(xmm_v3, xmm_v0)
+ ROTW16_sse2(xmm_tmp, xmm_v3)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ PADDD(xmm_v2, xmm_v3)
+ PXOR(xmm_v1, xmm_v2)
+ ROTW12_sse2(xmm_tmp, xmm_v1)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ PADDD(xmm_v0, xmm_v1)
+ PXOR(xmm_v3, xmm_v0)
+ ROTW8_sse2(xmm_tmp, xmm_v3)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ PADDD(xmm_v2, xmm_v3)
+ PXOR(xmm_v1, xmm_v2)
+ ROTW7_sse2(xmm_tmp, xmm_v1)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ PSHUFD(xmm_v1, xmm_v1, 0x39)
+ PSHUFD(xmm_v2, xmm_v2, 0x4e)
+ PSHUFD(xmm_v3, xmm_v3, 0x93)
+
+ # a += b; d ^= a; d = ROTW16(d);
+ PADDD(xmm_v0, xmm_v1)
+ PXOR(xmm_v3, xmm_v0)
+ ROTW16_sse2(xmm_tmp, xmm_v3)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ PADDD(xmm_v2, xmm_v3)
+ PXOR(xmm_v1, xmm_v2)
+ ROTW12_sse2(xmm_tmp, xmm_v1)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ PADDD(xmm_v0, xmm_v1)
+ PXOR(xmm_v3, xmm_v0)
+ ROTW8_sse2(xmm_tmp, xmm_v3)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ PADDD(xmm_v2, xmm_v3)
+ PXOR(xmm_v1, xmm_v2)
+ ROTW7_sse2(xmm_tmp, xmm_v1)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ PSHUFD(xmm_v1, xmm_v1, 0x93)
+ PSHUFD(xmm_v2, xmm_v2, 0x4e)
+ PSHUFD(xmm_v3, xmm_v3, 0x39)
+
+ SUB(reg_rounds, 2)
+ JNZ(rounds_loop1.begin)
+
+ PADDD(xmm_v0, xmm_s0)
+ PADDD(xmm_v1, xmm_s1)
+ PADDD(xmm_v2, xmm_s2)
+ PADDD(xmm_v3, xmm_s3)
+ WriteXor_sse2(xmm_tmp, reg_inp, reg_outp, 0, xmm_v0, xmm_v1, xmm_v2, xmm_v3)
+ PADDQ(xmm_s3, xmm_one)
+
+ LABEL(out_serial)
+
+ # Write back the updated counter. Stoping at 2^70 bytes is the user's
+ # problem, not mine. (Skipped if there's exactly a multiple of 4 blocks
+ # because the counter is incremented in memory while looping.)
+ MOVDQU(mem_s3, xmm_s3)
+
+ LABEL(out)
+
+ # Paranoia, cleanse the scratch space.
+ PXOR(xmm_v0, xmm_v0)
+ MOVDQA(mem_tmp0, xmm_v0)
+
+ # Remove our stack allocation.
+ MOV(registers.rsp, reg_sp_save)
+
+ RETURN()
+
+#
+# AVX2 helpers. Like the SSE2 equivalents, the scratch register is explicit,
+# and more helpers are used to increase readability for destructive operations.
+#
+# XXX/Performance: ROTW16_avx2/ROTW8_avx2 both can use VPSHUFFB.
+#
+
+def ADD_avx2(dst, src):
+ VPADDD(dst, dst, src)
+
+def XOR_avx2(dst, src):
+ VPXOR(dst, dst, src)
+
+def ROTW16_avx2(tmp, d):
+ VPSLLD(tmp, d, 16)
+ VPSRLD(d, d, 16)
+ XOR_avx2(d, tmp)
+
+def ROTW12_avx2(tmp, b):
+ VPSLLD(tmp, b, 12)
+ VPSRLD(b, b, 20)
+ XOR_avx2(b, tmp)
+
+def ROTW8_avx2(tmp, d):
+ VPSLLD(tmp, d, 8)
+ VPSRLD(d, d, 24)
+ XOR_avx2(d, tmp)
+
+def ROTW7_avx2(tmp, b):
+ VPSLLD(tmp, b, 7)
+ VPSRLD(b, b, 25)
+ XOR_avx2(b, tmp)
+
+def WriteXor_avx2(tmp, inp, outp, d, v0, v1, v2, v3):
+ # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
+ VPERM2I128(tmp, v0, v1, 0x20)
+ VPXOR(tmp, tmp, [inp+d])
+ VMOVDQU([outp+d], tmp)
+
+ # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
+ VPERM2I128(tmp, v2, v3, 0x20)
+ VPXOR(tmp, tmp, [inp+d+32])
+ VMOVDQU([outp+d+32], tmp)
+
+ # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
+ VPERM2I128(tmp, v0, v1, 0x31)
+ VPXOR(tmp, tmp, [inp+d+64])
+ VMOVDQU([outp+d+64], tmp)
+
+ # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
+ VPERM2I128(tmp, v2, v3, 0x31)
+ VPXOR(tmp, tmp, [inp+d+96])
+ VMOVDQU([outp+d+96], tmp)
+
+# AVX2 ChaCha20 (aka avx2). Does not handle partial blocks, will process
+# 8/4/2 blocks at a time.
+with Function("blocksAmd64AVX2", (x, inp, outp, nrBlocks), target=uarch.broadwell):
+ reg_x = GeneralPurposeRegister64()
+ reg_inp = GeneralPurposeRegister64()
+ reg_outp = GeneralPurposeRegister64()
+ reg_blocks = GeneralPurposeRegister64()
+ reg_sp_save = GeneralPurposeRegister64()
+
+ LOAD.ARGUMENT(reg_x, x)
+ LOAD.ARGUMENT(reg_inp, inp)
+ LOAD.ARGUMENT(reg_outp, outp)
+ LOAD.ARGUMENT(reg_blocks, nrBlocks)
+
+ # Align the stack to a 32 byte boundary.
+ MOV(reg_sp_save, registers.rsp)
+ AND(registers.rsp, 0xffffffffffffffe0)
+ SUB(registers.rsp, 0x20)
+
+ x_s0 = [reg_x] # (Memory) Cipher state [0..3]
+ x_s1 = [reg_x+16] # (Memory) Cipher state [4..7]
+ x_s2 = [reg_x+32] # (Memory) Cipher state [8..11]
+ x_s3 = [reg_x+48] # (Memory) Cipher state [12..15]
+
+ ymm_v0 = YMMRegister()
+ ymm_v1 = YMMRegister()
+ ymm_v2 = YMMRegister()
+ ymm_v3 = YMMRegister()
+
+ ymm_v4 = YMMRegister()
+ ymm_v5 = YMMRegister()
+ ymm_v6 = YMMRegister()
+ ymm_v7 = YMMRegister()
+
+ ymm_v8 = YMMRegister()
+ ymm_v9 = YMMRegister()
+ ymm_v10 = YMMRegister()
+ ymm_v11 = YMMRegister()
+
+ ymm_v12 = YMMRegister()
+ ymm_v13 = YMMRegister()
+ ymm_v14 = YMMRegister()
+ ymm_v15 = YMMRegister()
+
+ ymm_tmp0 = ymm_v12
+
+ # Allocate the neccecary stack space for the counter vector and two ymm
+ # registers that we will spill.
+ SUB(registers.rsp, 96)
+ mem_tmp0 = [registers.rsp+64] # (Stack) Scratch space.
+ mem_s3 = [registers.rsp+32] # (Stack) Working copy of s3. (8x)
+ mem_inc = [registers.rsp] # (Stack) Counter increment vector.
+
+ # Increment the counter for one side of the state vector.
+ VPXOR(ymm_tmp0, ymm_tmp0, ymm_tmp0)
+ VMOVDQU(mem_inc, ymm_tmp0)
+ reg_tmp = GeneralPurposeRegister32()
+ MOV(reg_tmp, 0x00000001)
+ MOV([registers.rsp+16], reg_tmp)
+ VBROADCASTI128(ymm_v3, x_s3)
+ VPADDQ(ymm_v3, ymm_v3, [registers.rsp])
+ VMOVDQA(mem_s3, ymm_v3)
+
+ # As we process 2xN blocks at a time, so the counter increment for both
+ # sides of the state vector is 2.
+ MOV(reg_tmp, 0x00000002)
+ MOV([registers.rsp], reg_tmp)
+ MOV([registers.rsp+16], reg_tmp)
+
+ out_write_even = Label()
+ out_write_odd = Label()
+
+ #
+ # 8 blocks at a time. Ted Krovetz's avx2 code does not do this, but it's
+ # a decent gain despite all the pain...
+ #
+
+ reg_rounds = GeneralPurposeRegister64()
+
+ vector_loop8 = Loop()
+ SUB(reg_blocks, 8)
+ JB(vector_loop8.end)
+ with vector_loop8:
+ VBROADCASTI128(ymm_v0, x_s0)
+ VBROADCASTI128(ymm_v1, x_s1)
+ VBROADCASTI128(ymm_v2, x_s2)
+ VMOVDQA(ymm_v3, mem_s3)
+
+ VMOVDQA(ymm_v4, ymm_v0)
+ VMOVDQA(ymm_v5, ymm_v1)
+ VMOVDQA(ymm_v6, ymm_v2)
+ VPADDQ(ymm_v7, ymm_v3, mem_inc)
+
+ VMOVDQA(ymm_v8, ymm_v0)
+ VMOVDQA(ymm_v9, ymm_v1)
+ VMOVDQA(ymm_v10, ymm_v2)
+ VPADDQ(ymm_v11, ymm_v7, mem_inc)
+
+ VMOVDQA(ymm_v12, ymm_v0)
+ VMOVDQA(ymm_v13, ymm_v1)
+ VMOVDQA(ymm_v14, ymm_v2)
+ VPADDQ(ymm_v15, ymm_v11, mem_inc)
+
+ MOV(reg_rounds, 20)
+ rounds_loop8 = Loop()
+ with rounds_loop8:
+ # a += b; d ^= a; d = ROTW16(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ ADD_avx2(ymm_v4, ymm_v5)
+ ADD_avx2(ymm_v8, ymm_v9)
+ ADD_avx2(ymm_v12, ymm_v13)
+ XOR_avx2(ymm_v3, ymm_v0)
+ XOR_avx2(ymm_v7, ymm_v4)
+ XOR_avx2(ymm_v11, ymm_v8)
+ XOR_avx2(ymm_v15, ymm_v12)
+
+ VMOVDQA(mem_tmp0, ymm_tmp0) # Save
+
+ ROTW16_avx2(ymm_tmp0, ymm_v3)
+ ROTW16_avx2(ymm_tmp0, ymm_v7)
+ ROTW16_avx2(ymm_tmp0, ymm_v11)
+ ROTW16_avx2(ymm_tmp0, ymm_v15)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ ADD_avx2(ymm_v2, ymm_v3)
+ ADD_avx2(ymm_v6, ymm_v7)
+ ADD_avx2(ymm_v10, ymm_v11)
+ ADD_avx2(ymm_v14, ymm_v15)
+ XOR_avx2(ymm_v1, ymm_v2)
+ XOR_avx2(ymm_v5, ymm_v6)
+ XOR_avx2(ymm_v9, ymm_v10)
+ XOR_avx2(ymm_v13, ymm_v14)
+ ROTW12_avx2(ymm_tmp0, ymm_v1)
+ ROTW12_avx2(ymm_tmp0, ymm_v5)
+ ROTW12_avx2(ymm_tmp0, ymm_v9)
+ ROTW12_avx2(ymm_tmp0, ymm_v13)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
+
+ ADD_avx2(ymm_v0, ymm_v1)
+ ADD_avx2(ymm_v4, ymm_v5)
+ ADD_avx2(ymm_v8, ymm_v9)
+ ADD_avx2(ymm_v12, ymm_v13)
+ XOR_avx2(ymm_v3, ymm_v0)
+ XOR_avx2(ymm_v7, ymm_v4)
+ XOR_avx2(ymm_v11, ymm_v8)
+ XOR_avx2(ymm_v15, ymm_v12)
+
+ VMOVDQA(mem_tmp0, ymm_tmp0) # Save
+
+ ROTW8_avx2(ymm_tmp0, ymm_v3)
+ ROTW8_avx2(ymm_tmp0, ymm_v7)
+ ROTW8_avx2(ymm_tmp0, ymm_v11)
+ ROTW8_avx2(ymm_tmp0, ymm_v15)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ ADD_avx2(ymm_v2, ymm_v3)
+ ADD_avx2(ymm_v6, ymm_v7)
+ ADD_avx2(ymm_v10, ymm_v11)
+ ADD_avx2(ymm_v14, ymm_v15)
+ XOR_avx2(ymm_v1, ymm_v2)
+ XOR_avx2(ymm_v5, ymm_v6)
+ XOR_avx2(ymm_v9, ymm_v10)
+ XOR_avx2(ymm_v13, ymm_v14)
+ ROTW7_avx2(ymm_tmp0, ymm_v1)
+ ROTW7_avx2(ymm_tmp0, ymm_v5)
+ ROTW7_avx2(ymm_tmp0, ymm_v9)
+ ROTW7_avx2(ymm_tmp0, ymm_v13)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ VPSHUFD(ymm_v1, ymm_v1, 0x39)
+ VPSHUFD(ymm_v5, ymm_v5, 0x39)
+ VPSHUFD(ymm_v9, ymm_v9, 0x39)
+ VPSHUFD(ymm_v13, ymm_v13, 0x39)
+ VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+ VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+ VPSHUFD(ymm_v10, ymm_v10, 0x4e)
+ VPSHUFD(ymm_v14, ymm_v14, 0x4e)
+ VPSHUFD(ymm_v3, ymm_v3, 0x93)
+ VPSHUFD(ymm_v7, ymm_v7, 0x93)
+ VPSHUFD(ymm_v11, ymm_v11, 0x93)
+ VPSHUFD(ymm_v15, ymm_v15, 0x93)
+
+ # a += b; d ^= a; d = ROTW16(d);
+ VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
+
+ ADD_avx2(ymm_v0, ymm_v1)
+ ADD_avx2(ymm_v4, ymm_v5)
+ ADD_avx2(ymm_v8, ymm_v9)
+ ADD_avx2(ymm_v12, ymm_v13)
+ XOR_avx2(ymm_v3, ymm_v0)
+ XOR_avx2(ymm_v7, ymm_v4)
+ XOR_avx2(ymm_v11, ymm_v8)
+ XOR_avx2(ymm_v15, ymm_v12)
+
+ VMOVDQA(mem_tmp0, ymm_tmp0) # Save
+
+ ROTW16_avx2(ymm_tmp0, ymm_v3)
+ ROTW16_avx2(ymm_tmp0, ymm_v7)
+ ROTW16_avx2(ymm_tmp0, ymm_v11)
+ ROTW16_avx2(ymm_tmp0, ymm_v15)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ ADD_avx2(ymm_v2, ymm_v3)
+ ADD_avx2(ymm_v6, ymm_v7)
+ ADD_avx2(ymm_v10, ymm_v11)
+ ADD_avx2(ymm_v14, ymm_v15)
+ XOR_avx2(ymm_v1, ymm_v2)
+ XOR_avx2(ymm_v5, ymm_v6)
+ XOR_avx2(ymm_v9, ymm_v10)
+ XOR_avx2(ymm_v13, ymm_v14)
+ ROTW12_avx2(ymm_tmp0, ymm_v1)
+ ROTW12_avx2(ymm_tmp0, ymm_v5)
+ ROTW12_avx2(ymm_tmp0, ymm_v9)
+ ROTW12_avx2(ymm_tmp0, ymm_v13)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
+
+ ADD_avx2(ymm_v0, ymm_v1)
+ ADD_avx2(ymm_v4, ymm_v5)
+ ADD_avx2(ymm_v8, ymm_v9)
+ ADD_avx2(ymm_v12, ymm_v13)
+ XOR_avx2(ymm_v3, ymm_v0)
+ XOR_avx2(ymm_v7, ymm_v4)
+ XOR_avx2(ymm_v11, ymm_v8)
+ XOR_avx2(ymm_v15, ymm_v12)
+
+ VMOVDQA(mem_tmp0, ymm_tmp0) # Save
+
+ ROTW8_avx2(ymm_tmp0, ymm_v3)
+ ROTW8_avx2(ymm_tmp0, ymm_v7)
+ ROTW8_avx2(ymm_tmp0, ymm_v11)
+ ROTW8_avx2(ymm_tmp0, ymm_v15)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ ADD_avx2(ymm_v2, ymm_v3)
+ ADD_avx2(ymm_v6, ymm_v7)
+ ADD_avx2(ymm_v10, ymm_v11)
+ ADD_avx2(ymm_v14, ymm_v15)
+ XOR_avx2(ymm_v1, ymm_v2)
+ XOR_avx2(ymm_v5, ymm_v6)
+ XOR_avx2(ymm_v9, ymm_v10)
+ XOR_avx2(ymm_v13, ymm_v14)
+ ROTW7_avx2(ymm_tmp0, ymm_v1)
+ ROTW7_avx2(ymm_tmp0, ymm_v5)
+ ROTW7_avx2(ymm_tmp0, ymm_v9)
+ ROTW7_avx2(ymm_tmp0, ymm_v13)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ VPSHUFD(ymm_v1, ymm_v1, 0x93)
+ VPSHUFD(ymm_v5, ymm_v5, 0x93)
+ VPSHUFD(ymm_v9, ymm_v9, 0x93)
+ VPSHUFD(ymm_v13, ymm_v13, 0x93)
+ VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+ VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+ VPSHUFD(ymm_v10, ymm_v10, 0x4e)
+ VPSHUFD(ymm_v14, ymm_v14, 0x4e)
+ VPSHUFD(ymm_v3, ymm_v3, 0x39)
+ VPSHUFD(ymm_v7, ymm_v7, 0x39)
+ VPSHUFD(ymm_v11, ymm_v11, 0x39)
+ VPSHUFD(ymm_v15, ymm_v15, 0x39)
+
+ VMOVDQA(ymm_tmp0, mem_tmp0) # Restore
+
+ SUB(reg_rounds, 2)
+ JNZ(rounds_loop8.begin)
+
+ # ymm_v12 is in mem_tmp0 and is current....
+
+ # XXX: I assume VBROADCASTI128 is about as fast as VMOVDQA....
+ VBROADCASTI128(ymm_tmp0, x_s0)
+ ADD_avx2(ymm_v0, ymm_tmp0)
+ ADD_avx2(ymm_v4, ymm_tmp0)
+ ADD_avx2(ymm_v8, ymm_tmp0)
+ ADD_avx2(ymm_tmp0, mem_tmp0)
+ VMOVDQA(mem_tmp0, ymm_tmp0)
+
+ VBROADCASTI128(ymm_tmp0, x_s1)
+ ADD_avx2(ymm_v1, ymm_tmp0)
+ ADD_avx2(ymm_v5, ymm_tmp0)
+ ADD_avx2(ymm_v9, ymm_tmp0)
+ ADD_avx2(ymm_v13, ymm_tmp0)
+
+ VBROADCASTI128(ymm_tmp0, x_s2)
+ ADD_avx2(ymm_v2, ymm_tmp0)
+ ADD_avx2(ymm_v6, ymm_tmp0)
+ ADD_avx2(ymm_v10, ymm_tmp0)
+ ADD_avx2(ymm_v14, ymm_tmp0)
+
+ ADD_avx2(ymm_v3, mem_s3)
+ WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
+ VMOVDQA(ymm_v3, mem_s3)
+ ADD_avx2(ymm_v3, mem_inc)
+
+ ADD_avx2(ymm_v7, ymm_v3)
+ WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
+ ADD_avx2(ymm_v3, mem_inc)
+
+ ADD_avx2(ymm_v11, ymm_v3)
+ WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 256, ymm_v8, ymm_v9, ymm_v10, ymm_v11)
+ ADD_avx2(ymm_v3, mem_inc)
+
+ VMOVDQA(ymm_v12, mem_tmp0)
+ ADD_avx2(ymm_v15, ymm_v3)
+ WriteXor_avx2(ymm_v0, reg_inp, reg_outp, 384, ymm_v12, ymm_v13, ymm_v14, ymm_v15)
+ ADD_avx2(ymm_v3, mem_inc)
+
+ VMOVDQA(mem_s3, ymm_v3)
+
+ ADD(reg_inp, 8 * 64)
+ ADD(reg_outp, 8 * 64)
+
+ SUB(reg_blocks, 8)
+ JAE(vector_loop8.begin)
+
+ # ymm_v3 contains a current copy of mem_s3 either from when it was built,
+ # or because the loop updates it. Copy this before we mess with the block
+ # counter in case we need to write it back and return.
+ ymm_s3 = ymm_v11
+ VMOVDQA(ymm_s3, ymm_v3)
+
+ ADD(reg_blocks, 8)
+ JZ(out_write_even)
+
+ # We now actually can do everything in registers.
+ ymm_s0 = ymm_v8
+ VBROADCASTI128(ymm_s0, x_s0)
+ ymm_s1 = ymm_v9
+ VBROADCASTI128(ymm_s1, x_s1)
+ ymm_s2 = ymm_v10
+ VBROADCASTI128(ymm_s2, x_s2)
+ ymm_inc = ymm_v14
+ VMOVDQA(ymm_inc, mem_inc)
+
+ #
+ # 4 blocks at a time.
+ #
+
+ process_2_blocks = Label()
+ SUB(reg_blocks, 4)
+ JB(process_2_blocks) # < 4 blocks remaining.
+
+ VMOVDQA(ymm_v0, ymm_s0)
+ VMOVDQA(ymm_v1, ymm_s1)
+ VMOVDQA(ymm_v2, ymm_s2)
+ VMOVDQA(ymm_v3, ymm_s3)
+
+ VMOVDQA(ymm_v4, ymm_v0)
+ VMOVDQA(ymm_v5, ymm_v1)
+ VMOVDQA(ymm_v6, ymm_v2)
+ VPADDQ(ymm_v7, ymm_v3, ymm_inc)
+
+ MOV(reg_rounds, 20)
+ rounds_loop4 = Loop()
+ with rounds_loop4:
+ # a += b; d ^= a; d = ROTW16(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ ADD_avx2(ymm_v4, ymm_v5)
+ XOR_avx2(ymm_v3, ymm_v0)
+ XOR_avx2(ymm_v7, ymm_v4)
+ ROTW16_avx2(ymm_tmp0, ymm_v3)
+ ROTW16_avx2(ymm_tmp0, ymm_v7)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ ADD_avx2(ymm_v2, ymm_v3)
+ ADD_avx2(ymm_v6, ymm_v7)
+ XOR_avx2(ymm_v1, ymm_v2)
+ XOR_avx2(ymm_v5, ymm_v6)
+ ROTW12_avx2(ymm_tmp0, ymm_v1)
+ ROTW12_avx2(ymm_tmp0, ymm_v5)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ ADD_avx2(ymm_v4, ymm_v5)
+ XOR_avx2(ymm_v3, ymm_v0)
+ XOR_avx2(ymm_v7, ymm_v4)
+ ROTW8_avx2(ymm_tmp0, ymm_v3)
+ ROTW8_avx2(ymm_tmp0, ymm_v7)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ ADD_avx2(ymm_v2, ymm_v3)
+ ADD_avx2(ymm_v6, ymm_v7)
+ XOR_avx2(ymm_v1, ymm_v2)
+ XOR_avx2(ymm_v5, ymm_v6)
+ ROTW7_avx2(ymm_tmp0, ymm_v1)
+ ROTW7_avx2(ymm_tmp0, ymm_v5)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ VPSHUFD(ymm_v1, ymm_v1, 0x39)
+ VPSHUFD(ymm_v5, ymm_v5, 0x39)
+ VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+ VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+ VPSHUFD(ymm_v3, ymm_v3, 0x93)
+ VPSHUFD(ymm_v7, ymm_v7, 0x93)
+
+ # a += b; d ^= a; d = ROTW16(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ ADD_avx2(ymm_v4, ymm_v5)
+ XOR_avx2(ymm_v3, ymm_v0)
+ XOR_avx2(ymm_v7, ymm_v4)
+ ROTW16_avx2(ymm_tmp0, ymm_v3)
+ ROTW16_avx2(ymm_tmp0, ymm_v7)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ ADD_avx2(ymm_v2, ymm_v3)
+ ADD_avx2(ymm_v6, ymm_v7)
+ XOR_avx2(ymm_v1, ymm_v2)
+ XOR_avx2(ymm_v5, ymm_v6)
+ ROTW12_avx2(ymm_tmp0, ymm_v1)
+ ROTW12_avx2(ymm_tmp0, ymm_v5)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ ADD_avx2(ymm_v4, ymm_v5)
+ XOR_avx2(ymm_v3, ymm_v0)
+ XOR_avx2(ymm_v7, ymm_v4)
+ ROTW8_avx2(ymm_tmp0, ymm_v3)
+ ROTW8_avx2(ymm_tmp0, ymm_v7)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ ADD_avx2(ymm_v2, ymm_v3)
+ ADD_avx2(ymm_v6, ymm_v7)
+ XOR_avx2(ymm_v1, ymm_v2)
+ XOR_avx2(ymm_v5, ymm_v6)
+ ROTW7_avx2(ymm_tmp0, ymm_v1)
+ ROTW7_avx2(ymm_tmp0, ymm_v5)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ VPSHUFD(ymm_v1, ymm_v1, 0x93)
+ VPSHUFD(ymm_v5, ymm_v5, 0x93)
+ VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+ VPSHUFD(ymm_v6, ymm_v6, 0x4e)
+ VPSHUFD(ymm_v3, ymm_v3, 0x39)
+ VPSHUFD(ymm_v7, ymm_v7, 0x39)
+
+ SUB(reg_rounds, 2)
+ JNZ(rounds_loop4.begin)
+
+ ADD_avx2(ymm_v0, ymm_s0)
+ ADD_avx2(ymm_v1, ymm_s1)
+ ADD_avx2(ymm_v2, ymm_s2)
+ ADD_avx2(ymm_v3, ymm_s3)
+ WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 0, ymm_v0, ymm_v1, ymm_v2, ymm_v3)
+ ADD_avx2(ymm_s3, ymm_inc)
+
+ ADD_avx2(ymm_v4, ymm_s0)
+ ADD_avx2(ymm_v5, ymm_s1)
+ ADD_avx2(ymm_v6, ymm_s2)
+ ADD_avx2(ymm_v7, ymm_s3)
+ WriteXor_avx2(ymm_tmp0, reg_inp, reg_outp, 128, ymm_v4, ymm_v5, ymm_v6, ymm_v7)
+ ADD_avx2(ymm_s3, ymm_inc)
+
+ ADD(reg_inp, 4 * 64)
+ ADD(reg_outp, 4 * 64)
+ SUB(reg_blocks, 4)
+
+ LABEL(process_2_blocks)
+ ADD(reg_blocks, 4)
+ JZ(out_write_even) # 0 blocks left.
+
+ #
+ # 2/1 blocks at a time. The two codepaths are unified because
+ # with AVX2 we do 2 blocks at a time anyway, and this only gets called
+ # if 3/2/1 blocks are remaining, so the extra branches don't hurt that
+ # much.
+ #
+
+ vector_loop2 = Loop()
+ with vector_loop2:
+ VMOVDQA(ymm_v0, ymm_s0)
+ VMOVDQA(ymm_v1, ymm_s1)
+ VMOVDQA(ymm_v2, ymm_s2)
+ VMOVDQA(ymm_v3, ymm_s3)
+
+ MOV(reg_rounds, 20)
+ rounds_loop2 = Loop()
+ with rounds_loop2:
+ # a += b; d ^= a; d = ROTW16(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ XOR_avx2(ymm_v3, ymm_v0)
+ ROTW16_avx2(ymm_tmp0, ymm_v3)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ ADD_avx2(ymm_v2, ymm_v3)
+ XOR_avx2(ymm_v1, ymm_v2)
+ ROTW12_avx2(ymm_tmp0, ymm_v1)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ XOR_avx2(ymm_v3, ymm_v0)
+ ROTW8_avx2(ymm_tmp0, ymm_v3)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ ADD_avx2(ymm_v2, ymm_v3)
+ XOR_avx2(ymm_v1, ymm_v2)
+ ROTW7_avx2(ymm_tmp0, ymm_v1)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ VPSHUFD(ymm_v1, ymm_v1, 0x39)
+ VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+ VPSHUFD(ymm_v3, ymm_v3, 0x93)
+
+ # a += b; d ^= a; d = ROTW16(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ XOR_avx2(ymm_v3, ymm_v0)
+ ROTW16_avx2(ymm_tmp0, ymm_v3)
+
+ # c += d; b ^= c; b = ROTW12(b);
+ ADD_avx2(ymm_v2, ymm_v3)
+ XOR_avx2(ymm_v1, ymm_v2)
+ ROTW12_avx2(ymm_tmp0, ymm_v1)
+
+ # a += b; d ^= a; d = ROTW8(d);
+ ADD_avx2(ymm_v0, ymm_v1)
+ XOR_avx2(ymm_v3, ymm_v0)
+ ROTW8_avx2(ymm_tmp0, ymm_v3)
+
+ # c += d; b ^= c; b = ROTW7(b)
+ ADD_avx2(ymm_v2, ymm_v3)
+ XOR_avx2(ymm_v1, ymm_v2)
+ ROTW7_avx2(ymm_tmp0, ymm_v1)
+
+ # b = ROTV1(b); c = ROTV2(c); d = ROTV3(d);
+ VPSHUFD(ymm_v1, ymm_v1, 0x93)
+ VPSHUFD(ymm_v2, ymm_v2, 0x4e)
+ VPSHUFD(ymm_v3, ymm_v3, 0x39)
+
+ SUB(reg_rounds, 2)
+ JNZ(rounds_loop2.begin)
+
+ ADD_avx2(ymm_v0, ymm_s0)
+ ADD_avx2(ymm_v1, ymm_s1)
+ ADD_avx2(ymm_v2, ymm_s2)
+ ADD_avx2(ymm_v3, ymm_s3)
+
+ # XOR_WRITE(out+ 0, in+ 0, _mm256_permute2x128_si256(v0,v1,0x20));
+ VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x20)
+ VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp])
+ VMOVDQU([reg_outp], ymm_tmp0)
+
+ # XOR_WRITE(out+32, in+32, _mm256_permute2x128_si256(v2,v3,0x20));
+ VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x20)
+ VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+32])
+ VMOVDQU([reg_outp+32], ymm_tmp0)
+
+ SUB(reg_blocks, 1)
+ JZ(out_write_odd)
+
+ ADD_avx2(ymm_s3, ymm_inc)
+
+ # XOR_WRITE(out+64, in+64, _mm256_permute2x128_si256(v0,v1,0x31));
+ VPERM2I128(ymm_tmp0, ymm_v0, ymm_v1, 0x31)
+ VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+64])
+ VMOVDQU([reg_outp+64], ymm_tmp0)
+
+ # XOR_WRITE(out+96, in+96, _mm256_permute2x128_si256(v2,v3,0x31));
+ VPERM2I128(ymm_tmp0, ymm_v2, ymm_v3, 0x31)
+ VPXOR(ymm_tmp0, ymm_tmp0, [reg_inp+96])
+ VMOVDQU([reg_outp+96], ymm_tmp0)
+
+ SUB(reg_blocks, 1)
+ JZ(out_write_even)
+
+ ADD(reg_inp, 2 * 64)
+ ADD(reg_outp, 2 * 64)
+ JMP(vector_loop2.begin)
+
+ LABEL(out_write_odd)
+ VPERM2I128(ymm_s3, ymm_s3, ymm_s3, 0x01) # Odd number of blocks.
+
+ LABEL(out_write_even)
+ VMOVDQU(x_s3, ymm_s3.as_xmm) # Write back ymm_s3 to x_v3
+
+ # Paranoia, cleanse the scratch space.
+ VPXOR(ymm_v0, ymm_v0, ymm_v0)
+ VMOVDQA(mem_tmp0, ymm_v0)
+ VMOVDQA(mem_s3, ymm_v0)
+
+ # Clear all YMM (and XMM) registers.
+ VZEROALL()
+
+ # Remove our stack allocation.
+ MOV(registers.rsp, reg_sp_save)
+
+ RETURN()
+
+#
+# CPUID
+#
+
+cpuidParams = Argument(ptr(uint32_t))
+
+with Function("cpuidAmd64", (cpuidParams,)):
+ reg_params = registers.r15
+ LOAD.ARGUMENT(reg_params, cpuidParams)
+
+ MOV(registers.eax, [reg_params])
+ MOV(registers.ecx, [reg_params+8])
+
+ CPUID()
+
+ MOV([reg_params], registers.eax)
+ MOV([reg_params+4], registers.ebx)
+ MOV([reg_params+8], registers.ecx)
+ MOV([reg_params+12], registers.edx)
+
+ RETURN()
+
+#
+# XGETBV (ECX = 0)
+#
+
+xcrVec = Argument(ptr(uint32_t))
+
+with Function("xgetbv0Amd64", (xcrVec,)):
+ reg_vec = GeneralPurposeRegister64()
+
+ LOAD.ARGUMENT(reg_vec, xcrVec)
+
+ XOR(registers.ecx, registers.ecx)
+
+ XGETBV()
+
+ MOV([reg_vec], registers.eax)
+ MOV([reg_vec+4], registers.edx)
+
+ RETURN()
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_amd64.s b/vendor/github.com/Yawning/chacha20/chacha20_amd64.s
new file mode 100644
index 0000000..e3792af
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/chacha20_amd64.s
@@ -0,0 +1,1180 @@
+// +build !noasm
+// Generated by PeachPy 0.2.0 from chacha20_amd64.py
+
+
+// func blocksAmd64SSE2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
+TEXT ·blocksAmd64SSE2(SB),4,$0-32
+ MOVQ x+0(FP), AX
+ MOVQ inp+8(FP), BX
+ MOVQ outp+16(FP), CX
+ MOVQ nrBlocks+24(FP), DX
+ MOVQ SP, DI
+ ANDQ $18446744073709551584, SP
+ SUBQ $32, SP
+ PXOR X0, X0
+ SUBQ $32, SP
+ MOVO X0, 0(SP)
+ MOVL $1, SI
+ MOVL SI, 0(SP)
+ SUBQ $4, DX
+ JCS vector_loop4_end
+vector_loop4_begin:
+ MOVOU 0(AX), X0
+ MOVOU 16(AX), X1
+ MOVOU 32(AX), X2
+ MOVOU 48(AX), X3
+ MOVO X0, X4
+ MOVO X1, X5
+ MOVO X2, X6
+ MOVO X3, X7
+ PADDQ 0(SP), X7
+ MOVO X0, X8
+ MOVO X1, X9
+ MOVO X2, X10
+ MOVO X7, X11
+ PADDQ 0(SP), X11
+ MOVO X0, X12
+ MOVO X1, X13
+ MOVO X2, X14
+ MOVO X11, X15
+ PADDQ 0(SP), X15
+ MOVQ $20, SI
+rounds_loop4_begin:
+ PADDL X1, X0
+ PADDL X5, X4
+ PADDL X9, X8
+ PADDL X13, X12
+ PXOR X0, X3
+ PXOR X4, X7
+ PXOR X8, X11
+ PXOR X12, X15
+ MOVO X12, 16(SP)
+ MOVO X3, X12
+ PSLLL $16, X12
+ PSRLL $16, X3
+ PXOR X12, X3
+ MOVO X7, X12
+ PSLLL $16, X12
+ PSRLL $16, X7
+ PXOR X12, X7
+ MOVO X11, X12
+ PSLLL $16, X12
+ PSRLL $16, X11
+ PXOR X12, X11
+ MOVO X15, X12
+ PSLLL $16, X12
+ PSRLL $16, X15
+ PXOR X12, X15
+ PADDL X3, X2
+ PADDL X7, X6
+ PADDL X11, X10
+ PADDL X15, X14
+ PXOR X2, X1
+ PXOR X6, X5
+ PXOR X10, X9
+ PXOR X14, X13
+ MOVO X1, X12
+ PSLLL $12, X12
+ PSRLL $20, X1
+ PXOR X12, X1
+ MOVO X5, X12
+ PSLLL $12, X12
+ PSRLL $20, X5
+ PXOR X12, X5
+ MOVO X9, X12
+ PSLLL $12, X12
+ PSRLL $20, X9
+ PXOR X12, X9
+ MOVO X13, X12
+ PSLLL $12, X12
+ PSRLL $20, X13
+ PXOR X12, X13
+ MOVO 16(SP), X12
+ PADDL X1, X0
+ PADDL X5, X4
+ PADDL X9, X8
+ PADDL X13, X12
+ PXOR X0, X3
+ PXOR X4, X7
+ PXOR X8, X11
+ PXOR X12, X15
+ MOVO X12, 16(SP)
+ MOVO X3, X12
+ PSLLL $8, X12
+ PSRLL $24, X3
+ PXOR X12, X3
+ MOVO X7, X12
+ PSLLL $8, X12
+ PSRLL $24, X7
+ PXOR X12, X7
+ MOVO X11, X12
+ PSLLL $8, X12
+ PSRLL $24, X11
+ PXOR X12, X11
+ MOVO X15, X12
+ PSLLL $8, X12
+ PSRLL $24, X15
+ PXOR X12, X15
+ PADDL X3, X2
+ PADDL X7, X6
+ PADDL X11, X10
+ PADDL X15, X14
+ PXOR X2, X1
+ PXOR X6, X5
+ PXOR X10, X9
+ PXOR X14, X13
+ MOVO X1, X12
+ PSLLL $7, X12
+ PSRLL $25, X1
+ PXOR X12, X1
+ MOVO X5, X12
+ PSLLL $7, X12
+ PSRLL $25, X5
+ PXOR X12, X5
+ MOVO X9, X12
+ PSLLL $7, X12
+ PSRLL $25, X9
+ PXOR X12, X9
+ MOVO X13, X12
+ PSLLL $7, X12
+ PSRLL $25, X13
+ PXOR X12, X13
+ PSHUFL $57, X1, X1
+ PSHUFL $57, X5, X5
+ PSHUFL $57, X9, X9
+ PSHUFL $57, X13, X13
+ PSHUFL $78, X2, X2
+ PSHUFL $78, X6, X6
+ PSHUFL $78, X10, X10
+ PSHUFL $78, X14, X14
+ PSHUFL $147, X3, X3
+ PSHUFL $147, X7, X7
+ PSHUFL $147, X11, X11
+ PSHUFL $147, X15, X15
+ MOVO 16(SP), X12
+ PADDL X1, X0
+ PADDL X5, X4
+ PADDL X9, X8
+ PADDL X13, X12
+ PXOR X0, X3
+ PXOR X4, X7
+ PXOR X8, X11
+ PXOR X12, X15
+ MOVO X12, 16(SP)
+ MOVO X3, X12
+ PSLLL $16, X12
+ PSRLL $16, X3
+ PXOR X12, X3
+ MOVO X7, X12
+ PSLLL $16, X12
+ PSRLL $16, X7
+ PXOR X12, X7
+ MOVO X11, X12
+ PSLLL $16, X12
+ PSRLL $16, X11
+ PXOR X12, X11
+ MOVO X15, X12
+ PSLLL $16, X12
+ PSRLL $16, X15
+ PXOR X12, X15
+ PADDL X3, X2
+ PADDL X7, X6
+ PADDL X11, X10
+ PADDL X15, X14
+ PXOR X2, X1
+ PXOR X6, X5
+ PXOR X10, X9
+ PXOR X14, X13
+ MOVO X1, X12
+ PSLLL $12, X12
+ PSRLL $20, X1
+ PXOR X12, X1
+ MOVO X5, X12
+ PSLLL $12, X12
+ PSRLL $20, X5
+ PXOR X12, X5
+ MOVO X9, X12
+ PSLLL $12, X12
+ PSRLL $20, X9
+ PXOR X12, X9
+ MOVO X13, X12
+ PSLLL $12, X12
+ PSRLL $20, X13
+ PXOR X12, X13
+ MOVO 16(SP), X12
+ PADDL X1, X0
+ PADDL X5, X4
+ PADDL X9, X8
+ PADDL X13, X12
+ PXOR X0, X3
+ PXOR X4, X7
+ PXOR X8, X11
+ PXOR X12, X15
+ MOVO X12, 16(SP)
+ MOVO X3, X12
+ PSLLL $8, X12
+ PSRLL $24, X3
+ PXOR X12, X3
+ MOVO X7, X12
+ PSLLL $8, X12
+ PSRLL $24, X7
+ PXOR X12, X7
+ MOVO X11, X12
+ PSLLL $8, X12
+ PSRLL $24, X11
+ PXOR X12, X11
+ MOVO X15, X12
+ PSLLL $8, X12
+ PSRLL $24, X15
+ PXOR X12, X15
+ PADDL X3, X2
+ PADDL X7, X6
+ PADDL X11, X10
+ PADDL X15, X14
+ PXOR X2, X1
+ PXOR X6, X5
+ PXOR X10, X9
+ PXOR X14, X13
+ MOVO X1, X12
+ PSLLL $7, X12
+ PSRLL $25, X1
+ PXOR X12, X1
+ MOVO X5, X12
+ PSLLL $7, X12
+ PSRLL $25, X5
+ PXOR X12, X5
+ MOVO X9, X12
+ PSLLL $7, X12
+ PSRLL $25, X9
+ PXOR X12, X9
+ MOVO X13, X12
+ PSLLL $7, X12
+ PSRLL $25, X13
+ PXOR X12, X13
+ PSHUFL $147, X1, X1
+ PSHUFL $147, X5, X5
+ PSHUFL $147, X9, X9
+ PSHUFL $147, X13, X13
+ PSHUFL $78, X2, X2
+ PSHUFL $78, X6, X6
+ PSHUFL $78, X10, X10
+ PSHUFL $78, X14, X14
+ PSHUFL $57, X3, X3
+ PSHUFL $57, X7, X7
+ PSHUFL $57, X11, X11
+ PSHUFL $57, X15, X15
+ MOVO 16(SP), X12
+ SUBQ $2, SI
+ JNE rounds_loop4_begin
+ MOVO X12, 16(SP)
+ PADDL 0(AX), X0
+ PADDL 16(AX), X1
+ PADDL 32(AX), X2
+ PADDL 48(AX), X3
+ MOVOU 0(BX), X12
+ PXOR X0, X12
+ MOVOU X12, 0(CX)
+ MOVOU 16(BX), X12
+ PXOR X1, X12
+ MOVOU X12, 16(CX)
+ MOVOU 32(BX), X12
+ PXOR X2, X12
+ MOVOU X12, 32(CX)
+ MOVOU 48(BX), X12
+ PXOR X3, X12
+ MOVOU X12, 48(CX)
+ MOVOU 48(AX), X3
+ PADDQ 0(SP), X3
+ PADDL 0(AX), X4
+ PADDL 16(AX), X5
+ PADDL 32(AX), X6
+ PADDL X3, X7
+ MOVOU 64(BX), X12
+ PXOR X4, X12
+ MOVOU X12, 64(CX)
+ MOVOU 80(BX), X12
+ PXOR X5, X12
+ MOVOU X12, 80(CX)
+ MOVOU 96(BX), X12
+ PXOR X6, X12
+ MOVOU X12, 96(CX)
+ MOVOU 112(BX), X12
+ PXOR X7, X12
+ MOVOU X12, 112(CX)
+ PADDQ 0(SP), X3
+ PADDL 0(AX), X8
+ PADDL 16(AX), X9
+ PADDL 32(AX), X10
+ PADDL X3, X11
+ MOVOU 128(BX), X12
+ PXOR X8, X12
+ MOVOU X12, 128(CX)
+ MOVOU 144(BX), X12
+ PXOR X9, X12
+ MOVOU X12, 144(CX)
+ MOVOU 160(BX), X12
+ PXOR X10, X12
+ MOVOU X12, 160(CX)
+ MOVOU 176(BX), X12
+ PXOR X11, X12
+ MOVOU X12, 176(CX)
+ PADDQ 0(SP), X3
+ MOVO 16(SP), X12
+ PADDL 0(AX), X12
+ PADDL 16(AX), X13
+ PADDL 32(AX), X14
+ PADDL X3, X15
+ MOVOU 192(BX), X0
+ PXOR X12, X0
+ MOVOU X0, 192(CX)
+ MOVOU 208(BX), X0
+ PXOR X13, X0
+ MOVOU X0, 208(CX)
+ MOVOU 224(BX), X0
+ PXOR X14, X0
+ MOVOU X0, 224(CX)
+ MOVOU 240(BX), X0
+ PXOR X15, X0
+ MOVOU X0, 240(CX)
+ PADDQ 0(SP), X3
+ MOVOU X3, 48(AX)
+ ADDQ $256, BX
+ ADDQ $256, CX
+ SUBQ $4, DX
+ JCC vector_loop4_begin
+vector_loop4_end:
+ ADDQ $4, DX
+ JEQ out
+ MOVOU 0(AX), X8
+ MOVOU 16(AX), X9
+ MOVOU 32(AX), X10
+ MOVOU 48(AX), X11
+ MOVO 0(SP), X13
+ SUBQ $2, DX
+ JCS process_1_block
+ MOVO X8, X0
+ MOVO X9, X1
+ MOVO X10, X2
+ MOVO X11, X3
+ MOVO X0, X4
+ MOVO X1, X5
+ MOVO X2, X6
+ MOVO X3, X7
+ PADDQ X13, X7
+ MOVQ $20, SI
+rounds_loop2_begin:
+ PADDL X1, X0
+ PADDL X5, X4
+ PXOR X0, X3
+ PXOR X4, X7
+ MOVO X3, X12
+ PSLLL $16, X12
+ PSRLL $16, X3
+ PXOR X12, X3
+ MOVO X7, X12
+ PSLLL $16, X12
+ PSRLL $16, X7
+ PXOR X12, X7
+ PADDL X3, X2
+ PADDL X7, X6
+ PXOR X2, X1
+ PXOR X6, X5
+ MOVO X1, X12
+ PSLLL $12, X12
+ PSRLL $20, X1
+ PXOR X12, X1
+ MOVO X5, X12
+ PSLLL $12, X12
+ PSRLL $20, X5
+ PXOR X12, X5
+ PADDL X1, X0
+ PADDL X5, X4
+ PXOR X0, X3
+ PXOR X4, X7
+ MOVO X3, X12
+ PSLLL $8, X12
+ PSRLL $24, X3
+ PXOR X12, X3
+ MOVO X7, X12
+ PSLLL $8, X12
+ PSRLL $24, X7
+ PXOR X12, X7
+ PADDL X3, X2
+ PADDL X7, X6
+ PXOR X2, X1
+ PXOR X6, X5
+ MOVO X1, X12
+ PSLLL $7, X12
+ PSRLL $25, X1
+ PXOR X12, X1
+ MOVO X5, X12
+ PSLLL $7, X12
+ PSRLL $25, X5
+ PXOR X12, X5
+ PSHUFL $57, X1, X1
+ PSHUFL $57, X5, X5
+ PSHUFL $78, X2, X2
+ PSHUFL $78, X6, X6
+ PSHUFL $147, X3, X3
+ PSHUFL $147, X7, X7
+ PADDL X1, X0
+ PADDL X5, X4
+ PXOR X0, X3
+ PXOR X4, X7
+ MOVO X3, X12
+ PSLLL $16, X12
+ PSRLL $16, X3
+ PXOR X12, X3
+ MOVO X7, X12
+ PSLLL $16, X12
+ PSRLL $16, X7
+ PXOR X12, X7
+ PADDL X3, X2
+ PADDL X7, X6
+ PXOR X2, X1
+ PXOR X6, X5
+ MOVO X1, X12
+ PSLLL $12, X12
+ PSRLL $20, X1
+ PXOR X12, X1
+ MOVO X5, X12
+ PSLLL $12, X12
+ PSRLL $20, X5
+ PXOR X12, X5
+ PADDL X1, X0
+ PADDL X5, X4
+ PXOR X0, X3
+ PXOR X4, X7
+ MOVO X3, X12
+ PSLLL $8, X12
+ PSRLL $24, X3
+ PXOR X12, X3
+ MOVO X7, X12
+ PSLLL $8, X12
+ PSRLL $24, X7
+ PXOR X12, X7
+ PADDL X3, X2
+ PADDL X7, X6
+ PXOR X2, X1
+ PXOR X6, X5
+ MOVO X1, X12
+ PSLLL $7, X12
+ PSRLL $25, X1
+ PXOR X12, X1
+ MOVO X5, X12
+ PSLLL $7, X12
+ PSRLL $25, X5
+ PXOR X12, X5
+ PSHUFL $147, X1, X1
+ PSHUFL $147, X5, X5
+ PSHUFL $78, X2, X2
+ PSHUFL $78, X6, X6
+ PSHUFL $57, X3, X3
+ PSHUFL $57, X7, X7
+ SUBQ $2, SI
+ JNE rounds_loop2_begin
+ PADDL X8, X0
+ PADDL X9, X1
+ PADDL X10, X2
+ PADDL X11, X3
+ MOVOU 0(BX), X12
+ PXOR X0, X12
+ MOVOU X12, 0(CX)
+ MOVOU 16(BX), X12
+ PXOR X1, X12
+ MOVOU X12, 16(CX)
+ MOVOU 32(BX), X12
+ PXOR X2, X12
+ MOVOU X12, 32(CX)
+ MOVOU 48(BX), X12
+ PXOR X3, X12
+ MOVOU X12, 48(CX)
+ PADDQ X13, X11
+ PADDL X8, X4
+ PADDL X9, X5
+ PADDL X10, X6
+ PADDL X11, X7
+ MOVOU 64(BX), X12
+ PXOR X4, X12
+ MOVOU X12, 64(CX)
+ MOVOU 80(BX), X12
+ PXOR X5, X12
+ MOVOU X12, 80(CX)
+ MOVOU 96(BX), X12
+ PXOR X6, X12
+ MOVOU X12, 96(CX)
+ MOVOU 112(BX), X12
+ PXOR X7, X12
+ MOVOU X12, 112(CX)
+ PADDQ X13, X11
+ ADDQ $128, BX
+ ADDQ $128, CX
+ SUBQ $2, DX
+process_1_block:
+ ADDQ $2, DX
+ JEQ out_serial
+ MOVO X8, X0
+ MOVO X9, X1
+ MOVO X10, X2
+ MOVO X11, X3
+ MOVQ $20, SI
+rounds_loop1_begin:
+ PADDL X1, X0
+ PXOR X0, X3
+ MOVO X3, X12
+ PSLLL $16, X12
+ PSRLL $16, X3
+ PXOR X12, X3
+ PADDL X3, X2
+ PXOR X2, X1
+ MOVO X1, X12
+ PSLLL $12, X12
+ PSRLL $20, X1
+ PXOR X12, X1
+ PADDL X1, X0
+ PXOR X0, X3
+ MOVO X3, X12
+ PSLLL $8, X12
+ PSRLL $24, X3
+ PXOR X12, X3
+ PADDL X3, X2
+ PXOR X2, X1
+ MOVO X1, X12
+ PSLLL $7, X12
+ PSRLL $25, X1
+ PXOR X12, X1
+ PSHUFL $57, X1, X1
+ PSHUFL $78, X2, X2
+ PSHUFL $147, X3, X3
+ PADDL X1, X0
+ PXOR X0, X3
+ MOVO X3, X12
+ PSLLL $16, X12
+ PSRLL $16, X3
+ PXOR X12, X3
+ PADDL X3, X2
+ PXOR X2, X1
+ MOVO X1, X12
+ PSLLL $12, X12
+ PSRLL $20, X1
+ PXOR X12, X1
+ PADDL X1, X0
+ PXOR X0, X3
+ MOVO X3, X12
+ PSLLL $8, X12
+ PSRLL $24, X3
+ PXOR X12, X3
+ PADDL X3, X2
+ PXOR X2, X1
+ MOVO X1, X12
+ PSLLL $7, X12
+ PSRLL $25, X1
+ PXOR X12, X1
+ PSHUFL $147, X1, X1
+ PSHUFL $78, X2, X2
+ PSHUFL $57, X3, X3
+ SUBQ $2, SI
+ JNE rounds_loop1_begin
+ PADDL X8, X0
+ PADDL X9, X1
+ PADDL X10, X2
+ PADDL X11, X3
+ MOVOU 0(BX), X12
+ PXOR X0, X12
+ MOVOU X12, 0(CX)
+ MOVOU 16(BX), X12
+ PXOR X1, X12
+ MOVOU X12, 16(CX)
+ MOVOU 32(BX), X12
+ PXOR X2, X12
+ MOVOU X12, 32(CX)
+ MOVOU 48(BX), X12
+ PXOR X3, X12
+ MOVOU X12, 48(CX)
+ PADDQ X13, X11
+out_serial:
+ MOVOU X11, 48(AX)
+out:
+ PXOR X0, X0
+ MOVO X0, 16(SP)
+ MOVQ DI, SP
+ RET
+
+// func blocksAmd64AVX2(x *uint32, inp *uint8, outp *uint8, nrBlocks *uint)
+TEXT ·blocksAmd64AVX2(SB),4,$0-32
+ MOVQ x+0(FP), AX
+ MOVQ inp+8(FP), BX
+ MOVQ outp+16(FP), CX
+ MOVQ nrBlocks+24(FP), DX
+ MOVQ SP, DI
+ ANDQ $18446744073709551584, SP
+ SUBQ $32, SP
+ SUBQ $96, SP
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm0, ymm0, ymm0
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x04; BYTE $0x24 // VMOVDQU [rsp], ymm0
+ MOVL $1, SI
+ MOVL SI, 16(SP)
+ BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x48; BYTE $0x30 // VBROADCASTI128 ymm1, [rax + 48]
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xD4; BYTE $0x0C; BYTE $0x24 // VPADDQ ymm1, ymm1, [rsp]
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1
+ MOVL $2, SI
+ MOVL SI, 0(SP)
+ MOVL SI, 16(SP)
+ SUBQ $8, DX
+ JCS vector_loop8_end
+vector_loop8_begin:
+ BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x10 // VBROADCASTI128 ymm2, [rax]
+ BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x10 // VBROADCASTI128 ymm3, [rax + 16]
+ BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x60; BYTE $0x20 // VBROADCASTI128 ymm4, [rax + 32]
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32]
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4
+ BYTE $0xC5; BYTE $0x75; BYTE $0xD4; BYTE $0x04; BYTE $0x24 // VPADDQ ymm8, ymm1, [rsp]
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xCA // VMOVDQA ymm9, ymm2
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xD3 // VMOVDQA ymm10, ymm3
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xDC // VMOVDQA ymm11, ymm4
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xD4; BYTE $0x24; BYTE $0x24 // VPADDQ ymm12, ymm8, [rsp]
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xC2 // VMOVDQA ymm0, ymm2
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xEB // VMOVDQA ymm13, ymm3
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xF4 // VMOVDQA ymm14, ymm4
+ BYTE $0xC5; BYTE $0x1D; BYTE $0xD4; BYTE $0x3C; BYTE $0x24 // VPADDQ ymm15, ymm12, [rsp]
+ MOVQ $20, SI
+rounds_loop8_begin:
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+ BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+ BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
+ BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16
+ BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16
+ BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+ BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
+ BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+ BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
+ BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
+ BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20
+ BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20
+ BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+ BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+ BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
+ BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24
+ BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24
+ BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+ BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
+ BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+ BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
+ BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
+ BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25
+ BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25
+ BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x39 // VPSHUFD ymm10, ymm10, 57
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x39 // VPSHUFD ymm13, ymm13, 57
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x93 // VPSHUFD ymm12, ymm12, 147
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x93 // VPSHUFD ymm15, ymm15, 147
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+ BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+ BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
+ BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x10 // VPSLLD ymm0, ymm12, 16
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x10 // VPSRLD ymm12, ymm12, 16
+ BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x10 // VPSLLD ymm0, ymm15, 16
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x10 // VPSRLD ymm15, ymm15, 16
+ BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+ BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
+ BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+ BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
+ BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
+ BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x0C // VPSLLD ymm0, ymm10, 12
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x14 // VPSRLD ymm10, ymm10, 20
+ BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x0C // VPSLLD ymm0, ymm13, 12
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x14 // VPSRLD ymm13, ymm13, 20
+ BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+ BYTE $0xC4; BYTE $0x41; BYTE $0x35; BYTE $0xFE; BYTE $0xCA // VPADDD ymm9, ymm9, ymm10
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0xFE; BYTE $0xC5 // VPADDD ymm0, ymm0, ymm13
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+ BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xEF; BYTE $0xE1 // VPXOR ymm12, ymm12, ymm9
+ BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF4; BYTE $0x08 // VPSLLD ymm0, ymm12, 8
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x1D; BYTE $0x72; BYTE $0xD4; BYTE $0x18 // VPSRLD ymm12, ymm12, 24
+ BYTE $0xC5; BYTE $0x1D; BYTE $0xEF; BYTE $0xE0 // VPXOR ymm12, ymm12, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF7; BYTE $0x08 // VPSLLD ymm0, ymm15, 8
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x05; BYTE $0x72; BYTE $0xD7; BYTE $0x18 // VPSRLD ymm15, ymm15, 24
+ BYTE $0xC5; BYTE $0x05; BYTE $0xEF; BYTE $0xF8 // VPXOR ymm15, ymm15, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+ BYTE $0xC4; BYTE $0x41; BYTE $0x25; BYTE $0xFE; BYTE $0xDC // VPADDD ymm11, ymm11, ymm12
+ BYTE $0xC4; BYTE $0x41; BYTE $0x0D; BYTE $0xFE; BYTE $0xF7 // VPADDD ymm14, ymm14, ymm15
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+ BYTE $0xC4; BYTE $0x41; BYTE $0x2D; BYTE $0xEF; BYTE $0xD3 // VPXOR ymm10, ymm10, ymm11
+ BYTE $0xC4; BYTE $0x41; BYTE $0x15; BYTE $0xEF; BYTE $0xEE // VPXOR ymm13, ymm13, ymm14
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
+ BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF2; BYTE $0x07 // VPSLLD ymm0, ymm10, 7
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x2D; BYTE $0x72; BYTE $0xD2; BYTE $0x19 // VPSRLD ymm10, ymm10, 25
+ BYTE $0xC5; BYTE $0x2D; BYTE $0xEF; BYTE $0xD0 // VPXOR ymm10, ymm10, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF5; BYTE $0x07 // VPSLLD ymm0, ymm13, 7
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x15; BYTE $0x72; BYTE $0xD5; BYTE $0x19 // VPSRLD ymm13, ymm13, 25
+ BYTE $0xC5; BYTE $0x15; BYTE $0xEF; BYTE $0xE8 // VPXOR ymm13, ymm13, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xD2; BYTE $0x93 // VPSHUFD ymm10, ymm10, 147
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xED; BYTE $0x93 // VPSHUFD ymm13, ymm13, 147
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xDB; BYTE $0x4E // VPSHUFD ymm11, ymm11, 78
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xF6; BYTE $0x4E // VPSHUFD ymm14, ymm14, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xE4; BYTE $0x39 // VPSHUFD ymm12, ymm12, 57
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xFF; BYTE $0x39 // VPSHUFD ymm15, ymm15, 57
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+ SUBQ $2, SI
+ JNE rounds_loop8_begin
+ BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x00 // VBROADCASTI128 ymm0, [rax]
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm2, ymm2, ymm0
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm5, ymm5, ymm0
+ BYTE $0xC5; BYTE $0x35; BYTE $0xFE; BYTE $0xC8 // VPADDD ymm9, ymm9, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xFE; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VPADDD ymm0, ymm0, [rsp + 64]
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm0
+ BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x10 // VBROADCASTI128 ymm0, [rax + 16]
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm6, ymm6, ymm0
+ BYTE $0xC5; BYTE $0x2D; BYTE $0xFE; BYTE $0xD0 // VPADDD ymm10, ymm10, ymm0
+ BYTE $0xC5; BYTE $0x15; BYTE $0xFE; BYTE $0xE8 // VPADDD ymm13, ymm13, ymm0
+ BYTE $0xC4; BYTE $0xE2; BYTE $0x7D; BYTE $0x5A; BYTE $0x40; BYTE $0x20 // VBROADCASTI128 ymm0, [rax + 32]
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE0 // VPADDD ymm4, ymm4, ymm0
+ BYTE $0xC5; BYTE $0xC5; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm0
+ BYTE $0xC5; BYTE $0x25; BYTE $0xFE; BYTE $0xD8 // VPADDD ymm11, ymm11, ymm0
+ BYTE $0xC5; BYTE $0x0D; BYTE $0xFE; BYTE $0xF0 // VPADDD ymm14, ymm14, ymm0
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VPADDD ymm1, ymm1, [rsp + 32]
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA ymm1, [rsp + 32]
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xFE; BYTE $0xC1 // VPADDD ymm8, ymm8, ymm1
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 // VPERM2I128 ymm0, ymm5, ymm6, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
+ BYTE $0xC5; BYTE $0x1D; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm12, ymm12, ymm1
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x20 // VPERM2I128 ymm0, ymm9, ymm10, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 256]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x00; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 256], ymm0
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x20 // VPERM2I128 ymm0, ymm11, ymm12, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 288]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x20; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 288], ymm0
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x35; BYTE $0x46; BYTE $0xC2; BYTE $0x31 // VPERM2I128 ymm0, ymm9, ymm10, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 320]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x40; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 320], ymm0
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x25; BYTE $0x46; BYTE $0xC4; BYTE $0x31 // VPERM2I128 ymm0, ymm11, ymm12, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 352]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x60; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 352], ymm0
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0x44; BYTE $0x24; BYTE $0x40 // VMOVDQA ymm0, [rsp + 64]
+ BYTE $0xC5; BYTE $0x05; BYTE $0xFE; BYTE $0xF9 // VPADDD ymm15, ymm15, ymm1
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x20 // VPERM2I128 ymm2, ymm0, ymm13, 32
+ BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 384]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0x80; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 384], ymm2
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x20 // VPERM2I128 ymm2, ymm14, ymm15, 32
+ BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 416]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xA0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 416], ymm2
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x7D; BYTE $0x46; BYTE $0xD5; BYTE $0x31 // VPERM2I128 ymm2, ymm0, ymm13, 49
+ BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 448]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xC0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 448], ymm2
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x0D; BYTE $0x46; BYTE $0xD7; BYTE $0x31 // VPERM2I128 ymm2, ymm14, ymm15, 49
+ BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0x93; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VPXOR ymm2, ymm2, [rbx + 480]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x91; BYTE $0xE0; BYTE $0x01; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 480], ymm2
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xFE; BYTE $0x0C; BYTE $0x24 // VPADDD ymm1, ymm1, [rsp]
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x4C; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm1
+ ADDQ $512, BX
+ ADDQ $512, CX
+ SUBQ $8, DX
+ JCC vector_loop8_begin
+vector_loop8_end:
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0xE1 // VMOVDQA ymm12, ymm1
+ ADDQ $8, DX
+ JEQ out_write_even
+ BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x08 // VBROADCASTI128 ymm9, [rax]
+ BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x50; BYTE $0x10 // VBROADCASTI128 ymm10, [rax + 16]
+ BYTE $0xC4; BYTE $0x62; BYTE $0x7D; BYTE $0x5A; BYTE $0x58; BYTE $0x20 // VBROADCASTI128 ymm11, [rax + 32]
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x6F; BYTE $0x34; BYTE $0x24 // VMOVDQA ymm14, [rsp]
+ SUBQ $4, DX
+ JCS process_2_blocks
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xEA // VMOVDQA ymm5, ymm2
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xF3 // VMOVDQA ymm6, ymm3
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x6F; BYTE $0xFC // VMOVDQA ymm7, ymm4
+ BYTE $0xC4; BYTE $0x41; BYTE $0x75; BYTE $0xD4; BYTE $0xC6 // VPADDQ ymm8, ymm1, ymm14
+ MOVQ $20, SI
+rounds_loop4_begin:
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
+ BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
+ BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x39 // VPSHUFD ymm6, ymm6, 57
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x93 // VPSHUFD ymm8, ymm8, 147
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x10 // VPSLLD ymm0, ymm8, 16
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x10 // VPSRLD ymm8, ymm8, 16
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x0C // VPSLLD ymm0, ymm6, 12
+ BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x14 // VPSRLD ymm6, ymm6, 20
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xD5; BYTE $0xFE; BYTE $0xEE // VPADDD ymm5, ymm5, ymm6
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC5 // VPXOR ymm8, ymm8, ymm5
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x7D; BYTE $0x72; BYTE $0xF0; BYTE $0x08 // VPSLLD ymm0, ymm8, 8
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x3D; BYTE $0x72; BYTE $0xD0; BYTE $0x18 // VPSRLD ymm8, ymm8, 24
+ BYTE $0xC5; BYTE $0x3D; BYTE $0xEF; BYTE $0xC0 // VPXOR ymm8, ymm8, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xF8 // VPADDD ymm7, ymm7, ymm8
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF7 // VPXOR ymm6, ymm6, ymm7
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF6; BYTE $0x07 // VPSLLD ymm0, ymm6, 7
+ BYTE $0xC5; BYTE $0xCD; BYTE $0x72; BYTE $0xD6; BYTE $0x19 // VPSRLD ymm6, ymm6, 25
+ BYTE $0xC5; BYTE $0xCD; BYTE $0xEF; BYTE $0xF0 // VPXOR ymm6, ymm6, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xF6; BYTE $0x93 // VPSHUFD ymm6, ymm6, 147
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xFF; BYTE $0x4E // VPSHUFD ymm7, ymm7, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
+ BYTE $0xC4; BYTE $0x41; BYTE $0x7D; BYTE $0x70; BYTE $0xC0; BYTE $0x39 // VPSHUFD ymm8, ymm8, 57
+ SUBQ $2, SI
+ JNE rounds_loop4_begin
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
+ BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x55; BYTE $0xFE; BYTE $0xE9 // VPADDD ymm5, ymm5, ymm9
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x4D; BYTE $0xFE; BYTE $0xF2 // VPADDD ymm6, ymm6, ymm10
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x45; BYTE $0xFE; BYTE $0xFB // VPADDD ymm7, ymm7, ymm11
+ BYTE $0xC4; BYTE $0x41; BYTE $0x3D; BYTE $0xFE; BYTE $0xC4 // VPADDD ymm8, ymm8, ymm12
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x20 // VPERM2I128 ymm0, ymm5, ymm6, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 128]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0x80; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 128], ymm0
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x20 // VPERM2I128 ymm0, ymm7, ymm8, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 160]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xA0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 160], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x55; BYTE $0x46; BYTE $0xC6; BYTE $0x31 // VPERM2I128 ymm0, ymm5, ymm6, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 192]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xC0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 192], ymm0
+ BYTE $0xC4; BYTE $0xC3; BYTE $0x45; BYTE $0x46; BYTE $0xC0; BYTE $0x31 // VPERM2I128 ymm0, ymm7, ymm8, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x83; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VPXOR ymm0, ymm0, [rbx + 224]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x81; BYTE $0xE0; BYTE $0x00; BYTE $0x00; BYTE $0x00 // VMOVDQU [rcx + 224], ymm0
+ BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
+ ADDQ $256, BX
+ ADDQ $256, CX
+ SUBQ $4, DX
+process_2_blocks:
+ ADDQ $4, DX
+ JEQ out_write_even
+vector_loop2_begin:
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xCA // VMOVDQA ymm2, ymm9
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xD3 // VMOVDQA ymm3, ymm10
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xDC // VMOVDQA ymm4, ymm11
+ BYTE $0xC5; BYTE $0x7D; BYTE $0x7F; BYTE $0xE1 // VMOVDQA ymm1, ymm12
+ MOVQ $20, SI
+rounds_loop2_begin:
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x39 // VPSHUFD ymm3, ymm3, 57
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x93 // VPSHUFD ymm1, ymm1, 147
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x10 // VPSLLD ymm0, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x10 // VPSRLD ymm1, ymm1, 16
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x0C // VPSLLD ymm0, ymm3, 12
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x14 // VPSRLD ymm3, ymm3, 20
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xED; BYTE $0xFE; BYTE $0xD3 // VPADDD ymm2, ymm2, ymm3
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xCA // VPXOR ymm1, ymm1, ymm2
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF1; BYTE $0x08 // VPSLLD ymm0, ymm1, 8
+ BYTE $0xC5; BYTE $0xF5; BYTE $0x72; BYTE $0xD1; BYTE $0x18 // VPSRLD ymm1, ymm1, 24
+ BYTE $0xC5; BYTE $0xF5; BYTE $0xEF; BYTE $0xC8 // VPXOR ymm1, ymm1, ymm0
+ BYTE $0xC5; BYTE $0xDD; BYTE $0xFE; BYTE $0xE1 // VPADDD ymm4, ymm4, ymm1
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xDC // VPXOR ymm3, ymm3, ymm4
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x72; BYTE $0xF3; BYTE $0x07 // VPSLLD ymm0, ymm3, 7
+ BYTE $0xC5; BYTE $0xE5; BYTE $0x72; BYTE $0xD3; BYTE $0x19 // VPSRLD ymm3, ymm3, 25
+ BYTE $0xC5; BYTE $0xE5; BYTE $0xEF; BYTE $0xD8 // VPXOR ymm3, ymm3, ymm0
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xDB; BYTE $0x93 // VPSHUFD ymm3, ymm3, 147
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xE4; BYTE $0x4E // VPSHUFD ymm4, ymm4, 78
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x70; BYTE $0xC9; BYTE $0x39 // VPSHUFD ymm1, ymm1, 57
+ SUBQ $2, SI
+ JNE rounds_loop2_begin
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x6D; BYTE $0xFE; BYTE $0xD1 // VPADDD ymm2, ymm2, ymm9
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x65; BYTE $0xFE; BYTE $0xDA // VPADDD ymm3, ymm3, ymm10
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x5D; BYTE $0xFE; BYTE $0xE3 // VPADDD ymm4, ymm4, ymm11
+ BYTE $0xC4; BYTE $0xC1; BYTE $0x75; BYTE $0xFE; BYTE $0xCC // VPADDD ymm1, ymm1, ymm12
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x20 // VPERM2I128 ymm0, ymm2, ymm3, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x03 // VPXOR ymm0, ymm0, [rbx]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x01 // VMOVDQU [rcx], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x20 // VPERM2I128 ymm0, ymm4, ymm1, 32
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x20 // VPXOR ymm0, ymm0, [rbx + 32]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x20 // VMOVDQU [rcx + 32], ymm0
+ SUBQ $1, DX
+ JEQ out_write_odd
+ BYTE $0xC4; BYTE $0x41; BYTE $0x1D; BYTE $0xFE; BYTE $0xE6 // VPADDD ymm12, ymm12, ymm14
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x6D; BYTE $0x46; BYTE $0xC3; BYTE $0x31 // VPERM2I128 ymm0, ymm2, ymm3, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x40 // VPXOR ymm0, ymm0, [rbx + 64]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x40 // VMOVDQU [rcx + 64], ymm0
+ BYTE $0xC4; BYTE $0xE3; BYTE $0x5D; BYTE $0x46; BYTE $0xC1; BYTE $0x31 // VPERM2I128 ymm0, ymm4, ymm1, 49
+ BYTE $0xC5; BYTE $0xFD; BYTE $0xEF; BYTE $0x43; BYTE $0x60 // VPXOR ymm0, ymm0, [rbx + 96]
+ BYTE $0xC5; BYTE $0xFE; BYTE $0x7F; BYTE $0x41; BYTE $0x60 // VMOVDQU [rcx + 96], ymm0
+ SUBQ $1, DX
+ JEQ out_write_even
+ ADDQ $128, BX
+ ADDQ $128, CX
+ JMP vector_loop2_begin
+out_write_odd:
+ BYTE $0xC4; BYTE $0x43; BYTE $0x1D; BYTE $0x46; BYTE $0xE4; BYTE $0x01 // VPERM2I128 ymm12, ymm12, ymm12, 1
+out_write_even:
+ BYTE $0xC5; BYTE $0x7A; BYTE $0x7F; BYTE $0x60; BYTE $0x30 // VMOVDQU [rax + 48], xmm12
+ BYTE $0xC5; BYTE $0xED; BYTE $0xEF; BYTE $0xD2 // VPXOR ymm2, ymm2, ymm2
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE $0x40 // VMOVDQA [rsp + 64], ymm2
+ BYTE $0xC5; BYTE $0xFD; BYTE $0x7F; BYTE $0x54; BYTE $0x24; BYTE $0x20 // VMOVDQA [rsp + 32], ymm2
+ BYTE $0xC5; BYTE $0xFC; BYTE $0x77 // VZEROALL
+ MOVQ DI, SP
+ RET
+
+// func cpuidAmd64(cpuidParams *uint32)
+TEXT ·cpuidAmd64(SB),4,$0-8 // flags 4 = NOSPLIT, no locals, 8-byte arg
+	MOVQ cpuidParams+0(FP), R15 // R15 = cpuidParams (array of 4 uint32)
+	MOVL 0(R15), AX // EAX = requested CPUID leaf (cpuidParams[0])
+	MOVL 8(R15), CX // ECX = sub-leaf (cpuidParams[2])
+	CPUID
+	MOVL AX, 0(R15) // store EAX/EBX/ECX/EDX results back in order
+	MOVL BX, 4(R15)
+	MOVL CX, 8(R15)
+	MOVL DX, 12(R15)
+	RET
+
+// func xgetbv0Amd64(xcrVec *uint32)
+TEXT ·xgetbv0Amd64(SB),4,$0-8 // flags 4 = NOSPLIT, no locals, 8-byte arg
+	MOVQ xcrVec+0(FP), BX // BX = xcrVec (2 x uint32 for XCR0 lo/hi)
+	XORL CX, CX // ECX = 0 selects XCR0 (OS-enabled XSAVE feature mask)
+	BYTE $0x0F; BYTE $0x01; BYTE $0xD0 // XGETBV (encoded; not in older assemblers)
+	MOVL AX, 0(BX) // low 32 bits of XCR0
+	MOVL DX, 4(BX) // high 32 bits of XCR0
+	RET
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_ref.go b/vendor/github.com/Yawning/chacha20/chacha20_ref.go
new file mode 100644
index 0000000..fcdc8c6
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/chacha20_ref.go
@@ -0,0 +1,394 @@
+// chacha20_ref.go - Reference ChaCha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// https://creativecommons.org/publicdomain/zero/1.0/ for full details.
+
+// +build !go1.9
+
+package chacha20
+
+import (
+ "encoding/binary"
+ "math"
+ "unsafe"
+)
+
+func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
+ if isIetf {
+ var totalBlocks uint64
+ totalBlocks = uint64(x[12]) + uint64(nrBlocks)
+ if totalBlocks > math.MaxUint32 {
+ panic("chacha20: Exceeded keystream per nonce limit")
+ }
+ }
+
+	// This routine ignores x[0]...x[3] in favor of the const values since it's
+	// ever so slightly faster.
+
+ for n := 0; n < nrBlocks; n++ {
+ x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+ x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+ for i := chachaRounds; i > 0; i -= 2 {
+ // quarterround(x, 0, 4, 8, 12)
+ x0 += x4
+ x12 ^= x0
+ x12 = (x12 << 16) | (x12 >> 16)
+ x8 += x12
+ x4 ^= x8
+ x4 = (x4 << 12) | (x4 >> 20)
+ x0 += x4
+ x12 ^= x0
+ x12 = (x12 << 8) | (x12 >> 24)
+ x8 += x12
+ x4 ^= x8
+ x4 = (x4 << 7) | (x4 >> 25)
+
+ // quarterround(x, 1, 5, 9, 13)
+ x1 += x5
+ x13 ^= x1
+ x13 = (x13 << 16) | (x13 >> 16)
+ x9 += x13
+ x5 ^= x9
+ x5 = (x5 << 12) | (x5 >> 20)
+ x1 += x5
+ x13 ^= x1
+ x13 = (x13 << 8) | (x13 >> 24)
+ x9 += x13
+ x5 ^= x9
+ x5 = (x5 << 7) | (x5 >> 25)
+
+ // quarterround(x, 2, 6, 10, 14)
+ x2 += x6
+ x14 ^= x2
+ x14 = (x14 << 16) | (x14 >> 16)
+ x10 += x14
+ x6 ^= x10
+ x6 = (x6 << 12) | (x6 >> 20)
+ x2 += x6
+ x14 ^= x2
+ x14 = (x14 << 8) | (x14 >> 24)
+ x10 += x14
+ x6 ^= x10
+ x6 = (x6 << 7) | (x6 >> 25)
+
+ // quarterround(x, 3, 7, 11, 15)
+ x3 += x7
+ x15 ^= x3
+ x15 = (x15 << 16) | (x15 >> 16)
+ x11 += x15
+ x7 ^= x11
+ x7 = (x7 << 12) | (x7 >> 20)
+ x3 += x7
+ x15 ^= x3
+ x15 = (x15 << 8) | (x15 >> 24)
+ x11 += x15
+ x7 ^= x11
+ x7 = (x7 << 7) | (x7 >> 25)
+
+ // quarterround(x, 0, 5, 10, 15)
+ x0 += x5
+ x15 ^= x0
+ x15 = (x15 << 16) | (x15 >> 16)
+ x10 += x15
+ x5 ^= x10
+ x5 = (x5 << 12) | (x5 >> 20)
+ x0 += x5
+ x15 ^= x0
+ x15 = (x15 << 8) | (x15 >> 24)
+ x10 += x15
+ x5 ^= x10
+ x5 = (x5 << 7) | (x5 >> 25)
+
+ // quarterround(x, 1, 6, 11, 12)
+ x1 += x6
+ x12 ^= x1
+ x12 = (x12 << 16) | (x12 >> 16)
+ x11 += x12
+ x6 ^= x11
+ x6 = (x6 << 12) | (x6 >> 20)
+ x1 += x6
+ x12 ^= x1
+ x12 = (x12 << 8) | (x12 >> 24)
+ x11 += x12
+ x6 ^= x11
+ x6 = (x6 << 7) | (x6 >> 25)
+
+ // quarterround(x, 2, 7, 8, 13)
+ x2 += x7
+ x13 ^= x2
+ x13 = (x13 << 16) | (x13 >> 16)
+ x8 += x13
+ x7 ^= x8
+ x7 = (x7 << 12) | (x7 >> 20)
+ x2 += x7
+ x13 ^= x2
+ x13 = (x13 << 8) | (x13 >> 24)
+ x8 += x13
+ x7 ^= x8
+ x7 = (x7 << 7) | (x7 >> 25)
+
+ // quarterround(x, 3, 4, 9, 14)
+ x3 += x4
+ x14 ^= x3
+ x14 = (x14 << 16) | (x14 >> 16)
+ x9 += x14
+ x4 ^= x9
+ x4 = (x4 << 12) | (x4 >> 20)
+ x3 += x4
+ x14 ^= x3
+ x14 = (x14 << 8) | (x14 >> 24)
+ x9 += x14
+ x4 ^= x9
+ x4 = (x4 << 7) | (x4 >> 25)
+ }
+
+ // On amd64 at least, this is a rather big boost.
+ if useUnsafe {
+ if in != nil {
+ inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize]))
+ outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+ outArr[0] = inArr[0] ^ (x0 + sigma0)
+ outArr[1] = inArr[1] ^ (x1 + sigma1)
+ outArr[2] = inArr[2] ^ (x2 + sigma2)
+ outArr[3] = inArr[3] ^ (x3 + sigma3)
+ outArr[4] = inArr[4] ^ (x4 + x[4])
+ outArr[5] = inArr[5] ^ (x5 + x[5])
+ outArr[6] = inArr[6] ^ (x6 + x[6])
+ outArr[7] = inArr[7] ^ (x7 + x[7])
+ outArr[8] = inArr[8] ^ (x8 + x[8])
+ outArr[9] = inArr[9] ^ (x9 + x[9])
+ outArr[10] = inArr[10] ^ (x10 + x[10])
+ outArr[11] = inArr[11] ^ (x11 + x[11])
+ outArr[12] = inArr[12] ^ (x12 + x[12])
+ outArr[13] = inArr[13] ^ (x13 + x[13])
+ outArr[14] = inArr[14] ^ (x14 + x[14])
+ outArr[15] = inArr[15] ^ (x15 + x[15])
+ } else {
+ outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+ outArr[0] = x0 + sigma0
+ outArr[1] = x1 + sigma1
+ outArr[2] = x2 + sigma2
+ outArr[3] = x3 + sigma3
+ outArr[4] = x4 + x[4]
+ outArr[5] = x5 + x[5]
+ outArr[6] = x6 + x[6]
+ outArr[7] = x7 + x[7]
+ outArr[8] = x8 + x[8]
+ outArr[9] = x9 + x[9]
+ outArr[10] = x10 + x[10]
+ outArr[11] = x11 + x[11]
+ outArr[12] = x12 + x[12]
+ outArr[13] = x13 + x[13]
+ outArr[14] = x14 + x[14]
+ outArr[15] = x15 + x[15]
+ }
+ } else {
+ // Slow path, either the architecture cares about alignment, or is not little endian.
+ x0 += sigma0
+ x1 += sigma1
+ x2 += sigma2
+ x3 += sigma3
+ x4 += x[4]
+ x5 += x[5]
+ x6 += x[6]
+ x7 += x[7]
+ x8 += x[8]
+ x9 += x[9]
+ x10 += x[10]
+ x11 += x[11]
+ x12 += x[12]
+ x13 += x[13]
+ x14 += x[14]
+ x15 += x[15]
+ if in != nil {
+ binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
+ binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
+ binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
+ binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
+ binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
+ binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
+ binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
+ binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
+ binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
+ binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
+ binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
+ binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
+ binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
+ binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
+ binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
+ binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
+ in = in[BlockSize:]
+ } else {
+ binary.LittleEndian.PutUint32(out[0:4], x0)
+ binary.LittleEndian.PutUint32(out[4:8], x1)
+ binary.LittleEndian.PutUint32(out[8:12], x2)
+ binary.LittleEndian.PutUint32(out[12:16], x3)
+ binary.LittleEndian.PutUint32(out[16:20], x4)
+ binary.LittleEndian.PutUint32(out[20:24], x5)
+ binary.LittleEndian.PutUint32(out[24:28], x6)
+ binary.LittleEndian.PutUint32(out[28:32], x7)
+ binary.LittleEndian.PutUint32(out[32:36], x8)
+ binary.LittleEndian.PutUint32(out[36:40], x9)
+ binary.LittleEndian.PutUint32(out[40:44], x10)
+ binary.LittleEndian.PutUint32(out[44:48], x11)
+ binary.LittleEndian.PutUint32(out[48:52], x12)
+ binary.LittleEndian.PutUint32(out[52:56], x13)
+ binary.LittleEndian.PutUint32(out[56:60], x14)
+ binary.LittleEndian.PutUint32(out[60:64], x15)
+ }
+ out = out[BlockSize:]
+ }
+
+		// Stopping at 2^70 bytes per nonce is the user's responsibility.
+ ctr := uint64(x[13])<<32 | uint64(x[12])
+ ctr++
+ x[12] = uint32(ctr)
+ x[13] = uint32(ctr >> 32)
+ }
+}
+
+func hChaChaRef(x *[stateSize]uint32, out *[32]byte) {
+ x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+ x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
+
+ for i := chachaRounds; i > 0; i -= 2 {
+ // quarterround(x, 0, 4, 8, 12)
+ x0 += x4
+ x12 ^= x0
+ x12 = (x12 << 16) | (x12 >> 16)
+ x8 += x12
+ x4 ^= x8
+ x4 = (x4 << 12) | (x4 >> 20)
+ x0 += x4
+ x12 ^= x0
+ x12 = (x12 << 8) | (x12 >> 24)
+ x8 += x12
+ x4 ^= x8
+ x4 = (x4 << 7) | (x4 >> 25)
+
+ // quarterround(x, 1, 5, 9, 13)
+ x1 += x5
+ x13 ^= x1
+ x13 = (x13 << 16) | (x13 >> 16)
+ x9 += x13
+ x5 ^= x9
+ x5 = (x5 << 12) | (x5 >> 20)
+ x1 += x5
+ x13 ^= x1
+ x13 = (x13 << 8) | (x13 >> 24)
+ x9 += x13
+ x5 ^= x9
+ x5 = (x5 << 7) | (x5 >> 25)
+
+ // quarterround(x, 2, 6, 10, 14)
+ x2 += x6
+ x14 ^= x2
+ x14 = (x14 << 16) | (x14 >> 16)
+ x10 += x14
+ x6 ^= x10
+ x6 = (x6 << 12) | (x6 >> 20)
+ x2 += x6
+ x14 ^= x2
+ x14 = (x14 << 8) | (x14 >> 24)
+ x10 += x14
+ x6 ^= x10
+ x6 = (x6 << 7) | (x6 >> 25)
+
+ // quarterround(x, 3, 7, 11, 15)
+ x3 += x7
+ x15 ^= x3
+ x15 = (x15 << 16) | (x15 >> 16)
+ x11 += x15
+ x7 ^= x11
+ x7 = (x7 << 12) | (x7 >> 20)
+ x3 += x7
+ x15 ^= x3
+ x15 = (x15 << 8) | (x15 >> 24)
+ x11 += x15
+ x7 ^= x11
+ x7 = (x7 << 7) | (x7 >> 25)
+
+ // quarterround(x, 0, 5, 10, 15)
+ x0 += x5
+ x15 ^= x0
+ x15 = (x15 << 16) | (x15 >> 16)
+ x10 += x15
+ x5 ^= x10
+ x5 = (x5 << 12) | (x5 >> 20)
+ x0 += x5
+ x15 ^= x0
+ x15 = (x15 << 8) | (x15 >> 24)
+ x10 += x15
+ x5 ^= x10
+ x5 = (x5 << 7) | (x5 >> 25)
+
+ // quarterround(x, 1, 6, 11, 12)
+ x1 += x6
+ x12 ^= x1
+ x12 = (x12 << 16) | (x12 >> 16)
+ x11 += x12
+ x6 ^= x11
+ x6 = (x6 << 12) | (x6 >> 20)
+ x1 += x6
+ x12 ^= x1
+ x12 = (x12 << 8) | (x12 >> 24)
+ x11 += x12
+ x6 ^= x11
+ x6 = (x6 << 7) | (x6 >> 25)
+
+ // quarterround(x, 2, 7, 8, 13)
+ x2 += x7
+ x13 ^= x2
+ x13 = (x13 << 16) | (x13 >> 16)
+ x8 += x13
+ x7 ^= x8
+ x7 = (x7 << 12) | (x7 >> 20)
+ x2 += x7
+ x13 ^= x2
+ x13 = (x13 << 8) | (x13 >> 24)
+ x8 += x13
+ x7 ^= x8
+ x7 = (x7 << 7) | (x7 >> 25)
+
+ // quarterround(x, 3, 4, 9, 14)
+ x3 += x4
+ x14 ^= x3
+ x14 = (x14 << 16) | (x14 >> 16)
+ x9 += x14
+ x4 ^= x9
+ x4 = (x4 << 12) | (x4 >> 20)
+ x3 += x4
+ x14 ^= x3
+ x14 = (x14 << 8) | (x14 >> 24)
+ x9 += x14
+ x4 ^= x9
+ x4 = (x4 << 7) | (x4 >> 25)
+ }
+
+ // HChaCha returns x0...x3 | x12...x15, which corresponds to the
+ // indexes of the ChaCha constant and the indexes of the IV.
+ if useUnsafe {
+ outArr := (*[16]uint32)(unsafe.Pointer(&out[0]))
+ outArr[0] = x0
+ outArr[1] = x1
+ outArr[2] = x2
+ outArr[3] = x3
+ outArr[4] = x12
+ outArr[5] = x13
+ outArr[6] = x14
+ outArr[7] = x15
+ } else {
+ binary.LittleEndian.PutUint32(out[0:4], x0)
+ binary.LittleEndian.PutUint32(out[4:8], x1)
+ binary.LittleEndian.PutUint32(out[8:12], x2)
+ binary.LittleEndian.PutUint32(out[12:16], x3)
+ binary.LittleEndian.PutUint32(out[16:20], x12)
+ binary.LittleEndian.PutUint32(out[20:24], x13)
+ binary.LittleEndian.PutUint32(out[24:28], x14)
+ binary.LittleEndian.PutUint32(out[28:32], x15)
+ }
+ return
+}
diff --git a/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go b/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go
new file mode 100644
index 0000000..8405c22
--- /dev/null
+++ b/vendor/github.com/Yawning/chacha20/chacha20_ref_go19.go
@@ -0,0 +1,395 @@
+// chacha20_ref.go - Reference ChaCha20.
+//
+// To the extent possible under law, Yawning Angel has waived all copyright
+// and related or neighboring rights to chacha20, using the Creative
+// Commons "CC0" public domain dedication. See LICENSE or
+// https://creativecommons.org/publicdomain/zero/1.0/ for full details.
+
+// +build go1.9
+
+package chacha20
+
+import (
+ "encoding/binary"
+ "math"
+ "math/bits"
+ "unsafe"
+)
+
+func blocksRef(x *[stateSize]uint32, in []byte, out []byte, nrBlocks int, isIetf bool) {
+ if isIetf {
+ var totalBlocks uint64
+ totalBlocks = uint64(x[12]) + uint64(nrBlocks)
+ if totalBlocks > math.MaxUint32 {
+ panic("chacha20: Exceeded keystream per nonce limit")
+ }
+ }
+
+	// This routine ignores x[0]...x[3] in favor of the const values since it's
+	// ever so slightly faster.
+
+ for n := 0; n < nrBlocks; n++ {
+ x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+ x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11], x[12], x[13], x[14], x[15]
+
+ for i := chachaRounds; i > 0; i -= 2 {
+ // quarterround(x, 0, 4, 8, 12)
+ x0 += x4
+ x12 ^= x0
+ x12 = bits.RotateLeft32(x12, 16)
+ x8 += x12
+ x4 ^= x8
+ x4 = bits.RotateLeft32(x4, 12)
+ x0 += x4
+ x12 ^= x0
+ x12 = bits.RotateLeft32(x12, 8)
+ x8 += x12
+ x4 ^= x8
+ x4 = bits.RotateLeft32(x4, 7)
+
+ // quarterround(x, 1, 5, 9, 13)
+ x1 += x5
+ x13 ^= x1
+ x13 = bits.RotateLeft32(x13, 16)
+ x9 += x13
+ x5 ^= x9
+ x5 = bits.RotateLeft32(x5, 12)
+ x1 += x5
+ x13 ^= x1
+ x13 = bits.RotateLeft32(x13, 8)
+ x9 += x13
+ x5 ^= x9
+ x5 = bits.RotateLeft32(x5, 7)
+
+ // quarterround(x, 2, 6, 10, 14)
+ x2 += x6
+ x14 ^= x2
+ x14 = bits.RotateLeft32(x14, 16)
+ x10 += x14
+ x6 ^= x10
+ x6 = bits.RotateLeft32(x6, 12)
+ x2 += x6
+ x14 ^= x2
+ x14 = bits.RotateLeft32(x14, 8)
+ x10 += x14
+ x6 ^= x10
+ x6 = bits.RotateLeft32(x6, 7)
+
+ // quarterround(x, 3, 7, 11, 15)
+ x3 += x7
+ x15 ^= x3
+ x15 = bits.RotateLeft32(x15, 16)
+ x11 += x15
+ x7 ^= x11
+ x7 = bits.RotateLeft32(x7, 12)
+ x3 += x7
+ x15 ^= x3
+ x15 = bits.RotateLeft32(x15, 8)
+ x11 += x15
+ x7 ^= x11
+ x7 = bits.RotateLeft32(x7, 7)
+
+ // quarterround(x, 0, 5, 10, 15)
+ x0 += x5
+ x15 ^= x0
+ x15 = bits.RotateLeft32(x15, 16)
+ x10 += x15
+ x5 ^= x10
+ x5 = bits.RotateLeft32(x5, 12)
+ x0 += x5
+ x15 ^= x0
+ x15 = bits.RotateLeft32(x15, 8)
+ x10 += x15
+ x5 ^= x10
+ x5 = bits.RotateLeft32(x5, 7)
+
+ // quarterround(x, 1, 6, 11, 12)
+ x1 += x6
+ x12 ^= x1
+ x12 = bits.RotateLeft32(x12, 16)
+ x11 += x12
+ x6 ^= x11
+ x6 = bits.RotateLeft32(x6, 12)
+ x1 += x6
+ x12 ^= x1
+ x12 = bits.RotateLeft32(x12, 8)
+ x11 += x12
+ x6 ^= x11
+ x6 = bits.RotateLeft32(x6, 7)
+
+ // quarterround(x, 2, 7, 8, 13)
+ x2 += x7
+ x13 ^= x2
+ x13 = bits.RotateLeft32(x13, 16)
+ x8 += x13
+ x7 ^= x8
+ x7 = bits.RotateLeft32(x7, 12)
+ x2 += x7
+ x13 ^= x2
+ x13 = bits.RotateLeft32(x13, 8)
+ x8 += x13
+ x7 ^= x8
+ x7 = bits.RotateLeft32(x7, 7)
+
+ // quarterround(x, 3, 4, 9, 14)
+ x3 += x4
+ x14 ^= x3
+ x14 = bits.RotateLeft32(x14, 16)
+ x9 += x14
+ x4 ^= x9
+ x4 = bits.RotateLeft32(x4, 12)
+ x3 += x4
+ x14 ^= x3
+ x14 = bits.RotateLeft32(x14, 8)
+ x9 += x14
+ x4 ^= x9
+ x4 = bits.RotateLeft32(x4, 7)
+ }
+
+ // On amd64 at least, this is a rather big boost.
+ if useUnsafe {
+ if in != nil {
+ inArr := (*[16]uint32)(unsafe.Pointer(&in[n*BlockSize]))
+ outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+ outArr[0] = inArr[0] ^ (x0 + sigma0)
+ outArr[1] = inArr[1] ^ (x1 + sigma1)
+ outArr[2] = inArr[2] ^ (x2 + sigma2)
+ outArr[3] = inArr[3] ^ (x3 + sigma3)
+ outArr[4] = inArr[4] ^ (x4 + x[4])
+ outArr[5] = inArr[5] ^ (x5 + x[5])
+ outArr[6] = inArr[6] ^ (x6 + x[6])
+ outArr[7] = inArr[7] ^ (x7 + x[7])
+ outArr[8] = inArr[8] ^ (x8 + x[8])
+ outArr[9] = inArr[9] ^ (x9 + x[9])
+ outArr[10] = inArr[10] ^ (x10 + x[10])
+ outArr[11] = inArr[11] ^ (x11 + x[11])
+ outArr[12] = inArr[12] ^ (x12 + x[12])
+ outArr[13] = inArr[13] ^ (x13 + x[13])
+ outArr[14] = inArr[14] ^ (x14 + x[14])
+ outArr[15] = inArr[15] ^ (x15 + x[15])
+ } else {
+ outArr := (*[16]uint32)(unsafe.Pointer(&out[n*BlockSize]))
+ outArr[0] = x0 + sigma0
+ outArr[1] = x1 + sigma1
+ outArr[2] = x2 + sigma2
+ outArr[3] = x3 + sigma3
+ outArr[4] = x4 + x[4]
+ outArr[5] = x5 + x[5]
+ outArr[6] = x6 + x[6]
+ outArr[7] = x7 + x[7]
+ outArr[8] = x8 + x[8]
+ outArr[9] = x9 + x[9]
+ outArr[10] = x10 + x[10]
+ outArr[11] = x11 + x[11]
+ outArr[12] = x12 + x[12]
+ outArr[13] = x13 + x[13]
+ outArr[14] = x14 + x[14]
+ outArr[15] = x15 + x[15]
+ }
+ } else {
+ // Slow path, either the architecture cares about alignment, or is not little endian.
+ x0 += sigma0
+ x1 += sigma1
+ x2 += sigma2
+ x3 += sigma3
+ x4 += x[4]
+ x5 += x[5]
+ x6 += x[6]
+ x7 += x[7]
+ x8 += x[8]
+ x9 += x[9]
+ x10 += x[10]
+ x11 += x[11]
+ x12 += x[12]
+ x13 += x[13]
+ x14 += x[14]
+ x15 += x[15]
+ if in != nil {
+ binary.LittleEndian.PutUint32(out[0:4], binary.LittleEndian.Uint32(in[0:4])^x0)
+ binary.LittleEndian.PutUint32(out[4:8], binary.LittleEndian.Uint32(in[4:8])^x1)
+ binary.LittleEndian.PutUint32(out[8:12], binary.LittleEndian.Uint32(in[8:12])^x2)
+ binary.LittleEndian.PutUint32(out[12:16], binary.LittleEndian.Uint32(in[12:16])^x3)
+ binary.LittleEndian.PutUint32(out[16:20], binary.LittleEndian.Uint32(in[16:20])^x4)
+ binary.LittleEndian.PutUint32(out[20:24], binary.LittleEndian.Uint32(in[20:24])^x5)
+ binary.LittleEndian.PutUint32(out[24:28], binary.LittleEndian.Uint32(in[24:28])^x6)
+ binary.LittleEndian.PutUint32(out[28:32], binary.LittleEndian.Uint32(in[28:32])^x7)
+ binary.LittleEndian.PutUint32(out[32:36], binary.LittleEndian.Uint32(in[32:36])^x8)
+ binary.LittleEndian.PutUint32(out[36:40], binary.LittleEndian.Uint32(in[36:40])^x9)
+ binary.LittleEndian.PutUint32(out[40:44], binary.LittleEndian.Uint32(in[40:44])^x10)
+ binary.LittleEndian.PutUint32(out[44:48], binary.LittleEndian.Uint32(in[44:48])^x11)
+ binary.LittleEndian.PutUint32(out[48:52], binary.LittleEndian.Uint32(in[48:52])^x12)
+ binary.LittleEndian.PutUint32(out[52:56], binary.LittleEndian.Uint32(in[52:56])^x13)
+ binary.LittleEndian.PutUint32(out[56:60], binary.LittleEndian.Uint32(in[56:60])^x14)
+ binary.LittleEndian.PutUint32(out[60:64], binary.LittleEndian.Uint32(in[60:64])^x15)
+ in = in[BlockSize:]
+ } else {
+ binary.LittleEndian.PutUint32(out[0:4], x0)
+ binary.LittleEndian.PutUint32(out[4:8], x1)
+ binary.LittleEndian.PutUint32(out[8:12], x2)
+ binary.LittleEndian.PutUint32(out[12:16], x3)
+ binary.LittleEndian.PutUint32(out[16:20], x4)
+ binary.LittleEndian.PutUint32(out[20:24], x5)
+ binary.LittleEndian.PutUint32(out[24:28], x6)
+ binary.LittleEndian.PutUint32(out[28:32], x7)
+ binary.LittleEndian.PutUint32(out[32:36], x8)
+ binary.LittleEndian.PutUint32(out[36:40], x9)
+ binary.LittleEndian.PutUint32(out[40:44], x10)
+ binary.LittleEndian.PutUint32(out[44:48], x11)
+ binary.LittleEndian.PutUint32(out[48:52], x12)
+ binary.LittleEndian.PutUint32(out[52:56], x13)
+ binary.LittleEndian.PutUint32(out[56:60], x14)
+ binary.LittleEndian.PutUint32(out[60:64], x15)
+ }
+ out = out[BlockSize:]
+ }
+
+		// Stopping at 2^70 bytes per nonce is the user's responsibility.
+ ctr := uint64(x[13])<<32 | uint64(x[12])
+ ctr++
+ x[12] = uint32(ctr)
+ x[13] = uint32(ctr >> 32)
+ }
+}
+
+func hChaChaRef(x *[stateSize]uint32, out *[32]byte) {
+ x0, x1, x2, x3 := sigma0, sigma1, sigma2, sigma3
+ x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15 := x[0], x[1], x[2], x[3], x[4], x[5], x[6], x[7], x[8], x[9], x[10], x[11]
+
+ for i := chachaRounds; i > 0; i -= 2 {
+ // quarterround(x, 0, 4, 8, 12)
+ x0 += x4
+ x12 ^= x0
+ x12 = bits.RotateLeft32(x12, 16)
+ x8 += x12
+ x4 ^= x8
+ x4 = bits.RotateLeft32(x4, 12)
+ x0 += x4
+ x12 ^= x0
+ x12 = bits.RotateLeft32(x12, 8)
+ x8 += x12
+ x4 ^= x8
+ x4 = bits.RotateLeft32(x4, 7)
+
+ // quarterround(x, 1, 5, 9, 13)
+ x1 += x5
+ x13 ^= x1
+ x13 = bits.RotateLeft32(x13, 16)
+ x9 += x13
+ x5 ^= x9
+ x5 = bits.RotateLeft32(x5, 12)
+ x1 += x5
+ x13 ^= x1
+ x13 = bits.RotateLeft32(x13, 8)
+ x9 += x13
+ x5 ^= x9
+ x5 = bits.RotateLeft32(x5, 7)
+
+ // quarterround(x, 2, 6, 10, 14)
+ x2 += x6
+ x14 ^= x2
+ x14 = bits.RotateLeft32(x14, 16)
+ x10 += x14
+ x6 ^= x10
+ x6 = bits.RotateLeft32(x6, 12)
+ x2 += x6
+ x14 ^= x2
+ x14 = bits.RotateLeft32(x14, 8)
+ x10 += x14
+ x6 ^= x10
+ x6 = bits.RotateLeft32(x6, 7)
+
+ // quarterround(x, 3, 7, 11, 15)
+ x3 += x7
+ x15 ^= x3
+ x15 = bits.RotateLeft32(x15, 16)
+ x11 += x15
+ x7 ^= x11
+ x7 = bits.RotateLeft32(x7, 12)
+ x3 += x7
+ x15 ^= x3
+ x15 = bits.RotateLeft32(x15, 8)
+ x11 += x15
+ x7 ^= x11
+ x7 = bits.RotateLeft32(x7, 7)
+
+ // quarterround(x, 0, 5, 10, 15)
+ x0 += x5
+ x15 ^= x0
+ x15 = bits.RotateLeft32(x15, 16)
+ x10 += x15
+ x5 ^= x10
+ x5 = bits.RotateLeft32(x5, 12)
+ x0 += x5
+ x15 ^= x0
+ x15 = bits.RotateLeft32(x15, 8)
+ x10 += x15
+ x5 ^= x10
+ x5 = bits.RotateLeft32(x5, 7)
+
+ // quarterround(x, 1, 6, 11, 12)
+ x1 += x6
+ x12 ^= x1
+ x12 = bits.RotateLeft32(x12, 16)
+ x11 += x12
+ x6 ^= x11
+ x6 = bits.RotateLeft32(x6, 12)
+ x1 += x6
+ x12 ^= x1
+ x12 = bits.RotateLeft32(x12, 8)
+ x11 += x12
+ x6 ^= x11
+ x6 = bits.RotateLeft32(x6, 7)
+
+ // quarterround(x, 2, 7, 8, 13)
+ x2 += x7
+ x13 ^= x2
+ x13 = bits.RotateLeft32(x13, 16)
+ x8 += x13
+ x7 ^= x8
+ x7 = bits.RotateLeft32(x7, 12)
+ x2 += x7
+ x13 ^= x2
+ x13 = bits.RotateLeft32(x13, 8)
+ x8 += x13
+ x7 ^= x8
+ x7 = bits.RotateLeft32(x7, 7)
+
+ // quarterround(x, 3, 4, 9, 14)
+ x3 += x4
+ x14 ^= x3
+ x14 = bits.RotateLeft32(x14, 16)
+ x9 += x14
+ x4 ^= x9
+ x4 = bits.RotateLeft32(x4, 12)
+ x3 += x4
+ x14 ^= x3
+ x14 = bits.RotateLeft32(x14, 8)
+ x9 += x14
+ x4 ^= x9
+ x4 = bits.RotateLeft32(x4, 7)
+ }
+
+ // HChaCha returns x0...x3 | x12...x15, which corresponds to the
+ // indexes of the ChaCha constant and the indexes of the IV.
+ if useUnsafe {
+ outArr := (*[16]uint32)(unsafe.Pointer(&out[0]))
+ outArr[0] = x0
+ outArr[1] = x1
+ outArr[2] = x2
+ outArr[3] = x3
+ outArr[4] = x12
+ outArr[5] = x13
+ outArr[6] = x14
+ outArr[7] = x15
+ } else {
+ binary.LittleEndian.PutUint32(out[0:4], x0)
+ binary.LittleEndian.PutUint32(out[4:8], x1)
+ binary.LittleEndian.PutUint32(out[8:12], x2)
+ binary.LittleEndian.PutUint32(out[12:16], x3)
+ binary.LittleEndian.PutUint32(out[16:20], x12)
+ binary.LittleEndian.PutUint32(out[20:24], x13)
+ binary.LittleEndian.PutUint32(out[24:28], x14)
+ binary.LittleEndian.PutUint32(out[28:32], x15)
+ }
+ return
+}