update kcp vendor

10 years ago · 1e709ceaba
28 changed files with 699 additions and 2308 deletions
--- a/cmd/gost/vendor/github.com/codahale/chacha20/LICENSE
+++ b/cmd/gost/vendor/github.com/codahale/chacha20/LICENSE
@@ -1,21 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2014 Coda Hale
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
--- a/cmd/gost/vendor/github.com/codahale/chacha20/README.md
+++ b/cmd/gost/vendor/github.com/codahale/chacha20/README.md
@@ -1,8 +0,0 @@
-chacha20
-========
-
-[![Build Status](https://travis-ci.org/codahale/chacha20.png?branch=master)](https://travis-ci.org/codahale/chacha20)
-
-A pure Go implementation of the ChaCha20 stream cipher.
-
-For documentation, check [godoc](http://godoc.org/github.com/codahale/chacha20).
--- a/cmd/gost/vendor/github.com/codahale/chacha20/chacha20.go
+++ b/cmd/gost/vendor/github.com/codahale/chacha20/chacha20.go
@@ -1,235 +0,0 @@
-// Package chacha20 provides a pure Go implementation of ChaCha20, a fast,
-// secure stream cipher.
-//
-// From Bernstein, Daniel J. "ChaCha, a variant of Salsa20." Workshop Record of
-// SASC. 2008. (http://cr.yp.to/chacha/chacha-20080128.pdf):
-//
-//	ChaCha8 is a 256-bit stream cipher based on the 8-round cipher Salsa20/8.
-//	The changes from Salsa20/8 to ChaCha8 are designed to improve diffusion per
-//	round, conjecturally increasing resistance to cryptanalysis, while
-//	preserving -- and often improving -- time per round. ChaCha12 and ChaCha20
-//	are analogous modifications of the 12-round and 20-round ciphers Salsa20/12
-//	and Salsa20/20. This paper presents the ChaCha family and explains the
-//	differences between Salsa20 and ChaCha.
-//
-// For more information, see http://cr.yp.to/chacha.html
-package chacha20
-
-import (
-	"crypto/cipher"
-	"encoding/binary"
-	"errors"
-	"unsafe"
-)
-
-const (
-	// KeySize is the length of ChaCha20 keys, in bytes.
-	KeySize = 32
-	// NonceSize is the length of ChaCha20 nonces, in bytes.
-	NonceSize = 8
-	// XNonceSize is the length of XChaCha20 nonces, in bytes.
-	XNonceSize = 24
-)
-
-var (
-	// ErrInvalidKey is returned when the provided key is not 256 bits long.
-	ErrInvalidKey = errors.New("invalid key length (must be 256 bits)")
-	// ErrInvalidNonce is returned when the provided nonce is not 64 bits long.
-	ErrInvalidNonce = errors.New("invalid nonce length (must be 64 bits)")
-	// ErrInvalidXNonce is returned when the provided nonce is not 192 bits
-	// long.
-	ErrInvalidXNonce = errors.New("invalid nonce length (must be 192 bits)")
-	// ErrInvalidRounds is returned when the provided rounds is not
-	// 8, 12, or 20.
-	ErrInvalidRounds = errors.New("invalid rounds number (must be 8, 12, or 20)")
-)
-
-// New creates and returns a new cipher.Stream. The key argument must be 256
-// bits long, and the nonce argument must be 64 bits long. The nonce must be
-// randomly generated or used only once. This Stream instance must not be used
-// to encrypt more than 2^70 bytes (~1 zettabyte).
-func New(key []byte, nonce []byte) (cipher.Stream, error) {
-	return NewWithRounds(key, nonce, 20)
-}
-
-// NewWithRounds creates and returns a new cipher.Stream just like New but
-// the rounds number of 8, 12, or 20 can be specified.
-func NewWithRounds(key []byte, nonce []byte, rounds uint8) (cipher.Stream, error) {
-	if len(key) != KeySize {
-		return nil, ErrInvalidKey
-	}
-
-	if len(nonce) != NonceSize {
-		return nil, ErrInvalidNonce
-	}
-
-	if (rounds != 8) && (rounds != 12) && (rounds != 20) {
-		return nil, ErrInvalidRounds
-	}
-
-	s := new(stream)
-	s.init(key, nonce, rounds)
-	s.advance()
-
-	return s, nil
-}
-
-// NewXChaCha creates and returns a new cipher.Stream. The key argument must be
-// 256 bits long, and the nonce argument must be 192 bits long. The nonce must
-// be randomly generated or only used once. This Stream instance must not be
-// used to encrypt more than 2^70 bytes (~1 zettabyte).
-func NewXChaCha(key []byte, nonce []byte) (cipher.Stream, error) {
-	return NewXChaChaWithRounds(key, nonce, 20)
-}
-
-// NewXChaChaWithRounds creates and returns a new cipher.Stream just like
-// NewXChaCha but the rounds number of 8, 12, or 20 can be specified.
-func NewXChaChaWithRounds(key []byte, nonce []byte, rounds uint8) (cipher.Stream, error) {
-	if len(key) != KeySize {
-		return nil, ErrInvalidKey
-	}
-
-	if len(nonce) != XNonceSize {
-		return nil, ErrInvalidXNonce
-	}
-
-	if (rounds != 8) && (rounds != 12) && (rounds != 20) {
-		return nil, ErrInvalidRounds
-	}
-
-	s := new(stream)
-	s.init(key, nonce, rounds)
-
-	// Call HChaCha to derive the subkey using the key and the first 16 bytes
-	// of the nonce, and re-initialize the state using the subkey and the
-	// remaining nonce.
-	blockArr := (*[stateSize]uint32)(unsafe.Pointer(&s.block))
-	core(&s.state, blockArr, s.rounds, true)
-	copy(s.state[4:8], blockArr[0:4])
-	copy(s.state[8:12], blockArr[12:16])
-	s.state[12] = 0
-	s.state[13] = 0
-	s.state[14] = binary.LittleEndian.Uint32(nonce[16:])
-	s.state[15] = binary.LittleEndian.Uint32(nonce[20:])
-
-	s.advance()
-
-	return s, nil
-}
-
-type stream struct {
-	state  [stateSize]uint32 // the state as an array of 16 32-bit words
-	block  [blockSize]byte   // the keystream as an array of 64 bytes
-	offset int               // the offset of used bytes in block
-	rounds uint8
-}
-
-func (s *stream) XORKeyStream(dst, src []byte) {
-	// Stride over the input in 64-byte blocks, minus the amount of keystream
-	// previously used. This will produce best results when processing blocks
-	// of a size evenly divisible by 64.
-	i := 0
-	max := len(src)
-	for i < max {
-		gap := blockSize - s.offset
-
-		limit := i + gap
-		if limit > max {
-			limit = max
-		}
-
-		o := s.offset
-		for j := i; j < limit; j++ {
-			dst[j] = src[j] ^ s.block[o]
-			o++
-		}
-
-		i += gap
-		s.offset = o
-
-		if o == blockSize {
-			s.advance()
-		}
-	}
-}
-
-func (s *stream) init(key []byte, nonce []byte, rounds uint8) {
-	// the magic constants for 256-bit keys
-	s.state[0] = 0x61707865
-	s.state[1] = 0x3320646e
-	s.state[2] = 0x79622d32
-	s.state[3] = 0x6b206574
-
-	s.state[4] = binary.LittleEndian.Uint32(key[0:])
-	s.state[5] = binary.LittleEndian.Uint32(key[4:])
-	s.state[6] = binary.LittleEndian.Uint32(key[8:])
-	s.state[7] = binary.LittleEndian.Uint32(key[12:])
-	s.state[8] = binary.LittleEndian.Uint32(key[16:])
-	s.state[9] = binary.LittleEndian.Uint32(key[20:])
-	s.state[10] = binary.LittleEndian.Uint32(key[24:])
-	s.state[11] = binary.LittleEndian.Uint32(key[28:])
-
-	switch len(nonce) {
-	case NonceSize:
-		// ChaCha20 uses 8 byte nonces.
-		s.state[12] = 0
-		s.state[13] = 0
-		s.state[14] = binary.LittleEndian.Uint32(nonce[0:])
-		s.state[15] = binary.LittleEndian.Uint32(nonce[4:])
-	case XNonceSize:
-		// XChaCha20 derives the subkey via HChaCha initialized
-		// with the first 16 bytes of the nonce.
-		s.state[12] = binary.LittleEndian.Uint32(nonce[0:])
-		s.state[13] = binary.LittleEndian.Uint32(nonce[4:])
-		s.state[14] = binary.LittleEndian.Uint32(nonce[8:])
-		s.state[15] = binary.LittleEndian.Uint32(nonce[12:])
-	default:
-		// Never happens, both ctors validate the nonce length.
-		panic("invalid nonce size")
-	}
-
-	s.rounds = rounds
-}
-
-// BUG(codahale): Totally untested on big-endian CPUs. Would very much
-// appreciate someone with an ARM device giving this a swing.
-
-// advances the keystream
-func (s *stream) advance() {
-	core(&s.state, (*[stateSize]uint32)(unsafe.Pointer(&s.block)), s.rounds, false)
-
-	if bigEndian {
-		j := blockSize - 1
-		for i := 0; i < blockSize/2; i++ {
-			s.block[j], s.block[i] = s.block[i], s.block[j]
-			j--
-		}
-	}
-
-	s.offset = 0
-	i := s.state[12] + 1
-	s.state[12] = i
-	if i == 0 {
-		s.state[13]++
-	}
-}
-
-const (
-	wordSize  = 4                    // the size of ChaCha20's words
-	stateSize = 16                   // the size of ChaCha20's state, in words
-	blockSize = stateSize * wordSize // the size of ChaCha20's block, in bytes
-)
-
-var (
-	bigEndian bool // whether or not we're running on a bigEndian CPU
-)
-
-// Do some up-front bookkeeping on what sort of CPU we're using. ChaCha20 treats
-// its state as a little-endian byte array when it comes to generating the
-// keystream, which allows for a zero-copy approach to the core transform. On
-// big-endian architectures, we have to take a hit to reverse the bytes.
-func init() {
-	x := uint32(0x04030201)
-	y := [4]byte{0x1, 0x2, 0x3, 0x4}
-	bigEndian = *(*[4]byte)(unsafe.Pointer(&x)) != y
-}
--- a/cmd/gost/vendor/github.com/codahale/chacha20/core_ref.go
+++ b/cmd/gost/vendor/github.com/codahale/chacha20/core_ref.go
@@ -1,166 +0,0 @@
-// The ChaCha20 core transform.
-// An unrolled and inlined implementation in pure Go.
-
-package chacha20
-
-func core(input, output *[stateSize]uint32, rounds uint8, hchacha bool) {
-	var (
-		x00 = input[0]
-		x01 = input[1]
-		x02 = input[2]
-		x03 = input[3]
-		x04 = input[4]
-		x05 = input[5]
-		x06 = input[6]
-		x07 = input[7]
-		x08 = input[8]
-		x09 = input[9]
-		x10 = input[10]
-		x11 = input[11]
-		x12 = input[12]
-		x13 = input[13]
-		x14 = input[14]
-		x15 = input[15]
-	)
-
-	var x uint32
-
-	// Unrolling all 20 rounds kills performance on modern Intel processors
-	// (Tested on a i5 Haswell, likely applies to Sandy Bridge+), due to uop
-	// cache thrashing.  The straight forward 2 rounds per loop implementation
-	// of this has double the performance of the fully unrolled version.
-	for i := uint8(0); i < rounds; i += 2 {
-		x00 += x04
-		x = x12 ^ x00
-		x12 = (x << 16) | (x >> 16)
-		x08 += x12
-		x = x04 ^ x08
-		x04 = (x << 12) | (x >> 20)
-		x00 += x04
-		x = x12 ^ x00
-		x12 = (x << 8) | (x >> 24)
-		x08 += x12
-		x = x04 ^ x08
-		x04 = (x << 7) | (x >> 25)
-		x01 += x05
-		x = x13 ^ x01
-		x13 = (x << 16) | (x >> 16)
-		x09 += x13
-		x = x05 ^ x09
-		x05 = (x << 12) | (x >> 20)
-		x01 += x05
-		x = x13 ^ x01
-		x13 = (x << 8) | (x >> 24)
-		x09 += x13
-		x = x05 ^ x09
-		x05 = (x << 7) | (x >> 25)
-		x02 += x06
-		x = x14 ^ x02
-		x14 = (x << 16) | (x >> 16)
-		x10 += x14
-		x = x06 ^ x10
-		x06 = (x << 12) | (x >> 20)
-		x02 += x06
-		x = x14 ^ x02
-		x14 = (x << 8) | (x >> 24)
-		x10 += x14
-		x = x06 ^ x10
-		x06 = (x << 7) | (x >> 25)
-		x03 += x07
-		x = x15 ^ x03
-		x15 = (x << 16) | (x >> 16)
-		x11 += x15
-		x = x07 ^ x11
-		x07 = (x << 12) | (x >> 20)
-		x03 += x07
-		x = x15 ^ x03
-		x15 = (x << 8) | (x >> 24)
-		x11 += x15
-		x = x07 ^ x11
-		x07 = (x << 7) | (x >> 25)
-		x00 += x05
-		x = x15 ^ x00
-		x15 = (x << 16) | (x >> 16)
-		x10 += x15
-		x = x05 ^ x10
-		x05 = (x << 12) | (x >> 20)
-		x00 += x05
-		x = x15 ^ x00
-		x15 = (x << 8) | (x >> 24)
-		x10 += x15
-		x = x05 ^ x10
-		x05 = (x << 7) | (x >> 25)
-		x01 += x06
-		x = x12 ^ x01
-		x12 = (x << 16) | (x >> 16)
-		x11 += x12
-		x = x06 ^ x11
-		x06 = (x << 12) | (x >> 20)
-		x01 += x06
-		x = x12 ^ x01
-		x12 = (x << 8) | (x >> 24)
-		x11 += x12
-		x = x06 ^ x11
-		x06 = (x << 7) | (x >> 25)
-		x02 += x07
-		x = x13 ^ x02
-		x13 = (x << 16) | (x >> 16)
-		x08 += x13
-		x = x07 ^ x08
-		x07 = (x << 12) | (x >> 20)
-		x02 += x07
-		x = x13 ^ x02
-		x13 = (x << 8) | (x >> 24)
-		x08 += x13
-		x = x07 ^ x08
-		x07 = (x << 7) | (x >> 25)
-		x03 += x04
-		x = x14 ^ x03
-		x14 = (x << 16) | (x >> 16)
-		x09 += x14
-		x = x04 ^ x09
-		x04 = (x << 12) | (x >> 20)
-		x03 += x04
-		x = x14 ^ x03
-		x14 = (x << 8) | (x >> 24)
-		x09 += x14
-		x = x04 ^ x09
-		x04 = (x << 7) | (x >> 25)
-	}
-
-	if !hchacha {
-		output[0] = x00 + input[0]
-		output[1] = x01 + input[1]
-		output[2] = x02 + input[2]
-		output[3] = x03 + input[3]
-		output[4] = x04 + input[4]
-		output[5] = x05 + input[5]
-		output[6] = x06 + input[6]
-		output[7] = x07 + input[7]
-		output[8] = x08 + input[8]
-		output[9] = x09 + input[9]
-		output[10] = x10 + input[10]
-		output[11] = x11 + input[11]
-		output[12] = x12 + input[12]
-		output[13] = x13 + input[13]
-		output[14] = x14 + input[14]
-		output[15] = x15 + input[15]
-	} else {
-		output[0] = x00
-		output[1] = x01
-		output[2] = x02
-		output[3] = x03
-		output[4] = x04
-		output[5] = x05
-		output[6] = x06
-		output[7] = x07
-		output[8] = x08
-		output[9] = x09
-		output[10] = x10
-		output[11] = x11
-		output[12] = x12
-		output[13] = x13
-		output[14] = x14
-		output[15] = x15
-	}
-}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/LICENSE
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/LICENSE
@@ -1,28 +0,0 @@
-Copyright (c) 2012 The Go Authors. All rights reserved.
-Copyright (c) 2015 Klaus Post
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-   * Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-   * Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-   * Neither the name of Google Inc. nor the names of its
-contributors may be used to endorse or promote products derived from
-this software without specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/cmd/gost/vendor/github.com/klauspost/crc32/README.md
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/README.md
@@ -1,87 +0,0 @@
-# crc32
-CRC32 hash with x64 optimizations
-
-This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup.
-
-[![Build Status](https://travis-ci.org/klauspost/crc32.svg?branch=master)](https://travis-ci.org/klauspost/crc32)
-
-# usage
-
-Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer.
-
-Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go.
-
-# changes
-* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match.
-* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable.
-
-
-# performance
-
-For *Go 1.7* performance is equivalent to the standard library. So if you use this package for Go 1.7 you can switch back.
-
-
-For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction:
-```
-benchmark            old ns/op     new ns/op     delta
-BenchmarkCrc32KB     99955         10258         -89.74%
-
-benchmark            old MB/s     new MB/s     speedup
-BenchmarkCrc32KB     327.83       3194.20      9.74x
-```
-
-For other tables and "CLMUL"  capable machines the performance is the same as the standard library.
-
-Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled.
-
-```
-Std:   Standard Go 1.5 library
-Crc:   Indicates IEEE type CRC.
-40B:   Size of each slice encoded.
-NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine).
-Castagnoli: Castagnoli CRC type.
-
-BenchmarkStdCrc40B-4            10000000               158 ns/op         252.88 MB/s
-BenchmarkCrc40BNoAsm-4          20000000               105 ns/op         377.38 MB/s (slice8)
-BenchmarkCrc40B-4               20000000               105 ns/op         378.77 MB/s (slice8)
-
-BenchmarkStdCrc1KB-4              500000              3604 ns/op         284.10 MB/s
-BenchmarkCrc1KBNoAsm-4           1000000              1463 ns/op         699.79 MB/s (slice8)
-BenchmarkCrc1KB-4                3000000               396 ns/op        2583.69 MB/s (asm)
-
-BenchmarkStdCrc8KB-4              200000             11417 ns/op         717.48 MB/s (slice8)
-BenchmarkCrc8KBNoAsm-4            200000             11317 ns/op         723.85 MB/s (slice8)
-BenchmarkCrc8KB-4                 500000              2919 ns/op        2805.73 MB/s (asm)
-
-BenchmarkStdCrc32KB-4              30000             45749 ns/op         716.24 MB/s (slice8)
-BenchmarkCrc32KBNoAsm-4            30000             45109 ns/op         726.42 MB/s (slice8)
-BenchmarkCrc32KB-4                100000             11497 ns/op        2850.09 MB/s (asm)
-
-BenchmarkStdNoAsmCastagnol40B-4 10000000               161 ns/op         246.94 MB/s
-BenchmarkStdCastagnoli40B-4     50000000              28.4 ns/op        1410.69 MB/s (asm)
-BenchmarkCastagnoli40BNoAsm-4   20000000               100 ns/op         398.01 MB/s (slice8)
-BenchmarkCastagnoli40B-4        50000000              28.2 ns/op        1419.54 MB/s (asm)
-
-BenchmarkStdNoAsmCastagnoli1KB-4  500000              3622 ns/op        282.67 MB/s
-BenchmarkStdCastagnoli1KB-4     10000000               144 ns/op        7099.78 MB/s (asm)
-BenchmarkCastagnoli1KBNoAsm-4    1000000              1475 ns/op         694.14 MB/s (slice8)
-BenchmarkCastagnoli1KB-4        10000000               146 ns/op        6993.35 MB/s (asm)
-
-BenchmarkStdNoAsmCastagnoli8KB-4  50000              28781 ns/op         284.63 MB/s
-BenchmarkStdCastagnoli8KB-4      1000000              1029 ns/op        7957.89 MB/s (asm)
-BenchmarkCastagnoli8KBNoAsm-4     200000             11410 ns/op         717.94 MB/s (slice8)
-BenchmarkCastagnoli8KB-4         1000000              1000 ns/op        8188.71 MB/s (asm)
-
-BenchmarkStdNoAsmCastagnoli32KB-4  10000            115426 ns/op         283.89 MB/s
-BenchmarkStdCastagnoli32KB-4      300000              4065 ns/op        8059.13 MB/s (asm)
-BenchmarkCastagnoli32KBNoAsm-4     30000             45171 ns/op         725.41 MB/s (slice8)
-BenchmarkCastagnoli32KB-4         500000              4077 ns/op        8035.89 MB/s (asm)
-```
-
-The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library.
-
-However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7.
-
-# license
-
-Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions.
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32.go
@@ -1,207 +0,0 @@
-// Copyright 2009 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
-// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
-// information.
-//
-// Polynomials are represented in LSB-first form also known as reversed representation.
-//
-// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
-// for information.
-package crc32
-
-import (
-	"hash"
-	"sync"
-)
-
-// The size of a CRC-32 checksum in bytes.
-const Size = 4
-
-// Predefined polynomials.
-const (
-	// IEEE is by far and away the most common CRC-32 polynomial.
-	// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
-	IEEE = 0xedb88320
-
-	// Castagnoli's polynomial, used in iSCSI.
-	// Has better error detection characteristics than IEEE.
-	// http://dx.doi.org/10.1109/26.231911
-	Castagnoli = 0x82f63b78
-
-	// Koopman's polynomial.
-	// Also has better error detection characteristics than IEEE.
-	// http://dx.doi.org/10.1109/DSN.2002.1028931
-	Koopman = 0xeb31d82e
-)
-
-// Table is a 256-word table representing the polynomial for efficient processing.
-type Table [256]uint32
-
-// This file makes use of functions implemented in architecture-specific files.
-// The interface that they implement is as follows:
-//
-//    // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE
-//    // algorithm is available.
-//    archAvailableIEEE() bool
-//
-//    // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm.
-//    // It can only be called if archAvailableIEEE() returns true.
-//    archInitIEEE()
-//
-//    // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if
-//    // archInitIEEE() was previously called.
-//    archUpdateIEEE(crc uint32, p []byte) uint32
-//
-//    // archAvailableCastagnoli reports whether an architecture-specific
-//    // CRC32-C algorithm is available.
-//    archAvailableCastagnoli() bool
-//
-//    // archInitCastagnoli initializes the architecture-specific CRC32-C
-//    // algorithm. It can only be called if archAvailableCastagnoli() returns
-//    // true.
-//    archInitCastagnoli()
-//
-//    // archUpdateCastagnoli updates the given CRC32-C. It can only be called
-//    // if archInitCastagnoli() was previously called.
-//    archUpdateCastagnoli(crc uint32, p []byte) uint32
-
-// castagnoliTable points to a lazily initialized Table for the Castagnoli
-// polynomial. MakeTable will always return this value when asked to make a
-// Castagnoli table so we can compare against it to find when the caller is
-// using this polynomial.
-var castagnoliTable *Table
-var castagnoliTable8 *slicing8Table
-var castagnoliArchImpl bool
-var updateCastagnoli func(crc uint32, p []byte) uint32
-var castagnoliOnce sync.Once
-
-func castagnoliInit() {
-	castagnoliTable = simpleMakeTable(Castagnoli)
-	castagnoliArchImpl = archAvailableCastagnoli()
-
-	if castagnoliArchImpl {
-		archInitCastagnoli()
-		updateCastagnoli = archUpdateCastagnoli
-	} else {
-		// Initialize the slicing-by-8 table.
-		castagnoliTable8 = slicingMakeTable(Castagnoli)
-		updateCastagnoli = func(crc uint32, p []byte) uint32 {
-			return slicingUpdate(crc, castagnoliTable8, p)
-		}
-	}
-}
-
-// IEEETable is the table for the IEEE polynomial.
-var IEEETable = simpleMakeTable(IEEE)
-
-// ieeeTable8 is the slicing8Table for IEEE
-var ieeeTable8 *slicing8Table
-var ieeeArchImpl bool
-var updateIEEE func(crc uint32, p []byte) uint32
-var ieeeOnce sync.Once
-
-func ieeeInit() {
-	ieeeArchImpl = archAvailableIEEE()
-
-	if ieeeArchImpl {
-		archInitIEEE()
-		updateIEEE = archUpdateIEEE
-	} else {
-		// Initialize the slicing-by-8 table.
-		ieeeTable8 = slicingMakeTable(IEEE)
-		updateIEEE = func(crc uint32, p []byte) uint32 {
-			return slicingUpdate(crc, ieeeTable8, p)
-		}
-	}
-}
-
-// MakeTable returns a Table constructed from the specified polynomial.
-// The contents of this Table must not be modified.
-func MakeTable(poly uint32) *Table {
-	switch poly {
-	case IEEE:
-		ieeeOnce.Do(ieeeInit)
-		return IEEETable
-	case Castagnoli:
-		castagnoliOnce.Do(castagnoliInit)
-		return castagnoliTable
-	}
-	return simpleMakeTable(poly)
-}
-
-// digest represents the partial evaluation of a checksum.
-type digest struct {
-	crc uint32
-	tab *Table
-}
-
-// New creates a new hash.Hash32 computing the CRC-32 checksum
-// using the polynomial represented by the Table.
-// Its Sum method will lay the value out in big-endian byte order.
-func New(tab *Table) hash.Hash32 {
-	if tab == IEEETable {
-		ieeeOnce.Do(ieeeInit)
-	}
-	return &digest{0, tab}
-}
-
-// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
-// using the IEEE polynomial.
-// Its Sum method will lay the value out in big-endian byte order.
-func NewIEEE() hash.Hash32 { return New(IEEETable) }
-
-func (d *digest) Size() int { return Size }
-
-func (d *digest) BlockSize() int { return 1 }
-
-func (d *digest) Reset() { d.crc = 0 }
-
-// Update returns the result of adding the bytes in p to the crc.
-func Update(crc uint32, tab *Table, p []byte) uint32 {
-	switch tab {
-	case castagnoliTable:
-		return updateCastagnoli(crc, p)
-	case IEEETable:
-		// Unfortunately, because IEEETable is exported, IEEE may be used without a
-		// call to MakeTable. We have to make sure it gets initialized in that case.
-		ieeeOnce.Do(ieeeInit)
-		return updateIEEE(crc, p)
-	default:
-		return simpleUpdate(crc, tab, p)
-	}
-}
-
-func (d *digest) Write(p []byte) (n int, err error) {
-	switch d.tab {
-	case castagnoliTable:
-		d.crc = updateCastagnoli(d.crc, p)
-	case IEEETable:
-		// We only create digest objects through New() which takes care of
-		// initialization in this case.
-		d.crc = updateIEEE(d.crc, p)
-	default:
-		d.crc = simpleUpdate(d.crc, d.tab, p)
-	}
-	return len(p), nil
-}
-
-func (d *digest) Sum32() uint32 { return d.crc }
-
-func (d *digest) Sum(in []byte) []byte {
-	s := d.Sum32()
-	return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s))
-}
-
-// Checksum returns the CRC-32 checksum of data
-// using the polynomial represented by the Table.
-func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) }
-
-// ChecksumIEEE returns the CRC-32 checksum of data
-// using the IEEE polynomial.
-func ChecksumIEEE(data []byte) uint32 {
-	ieeeOnce.Do(ieeeInit)
-	return updateIEEE(0, data)
-}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64.go
@@ -1,230 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine,!gccgo
-
-// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
-// description of the interface that each architecture-specific file
-// implements.
-
-package crc32
-
-import "unsafe"
-
-// This file contains the code to call the SSE 4.2 version of the Castagnoli
-// and IEEE CRC.
-
-// haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use
-// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
-func haveSSE41() bool
-func haveSSE42() bool
-func haveCLMUL() bool
-
-// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
-// instruction.
-//go:noescape
-func castagnoliSSE42(crc uint32, p []byte) uint32
-
-// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
-// instruction.
-//go:noescape
-func castagnoliSSE42Triple(
-	crcA, crcB, crcC uint32,
-	a, b, c []byte,
-	rounds uint32,
-) (retA uint32, retB uint32, retC uint32)
-
-// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
-// instruction as well as SSE 4.1.
-//go:noescape
-func ieeeCLMUL(crc uint32, p []byte) uint32
-
-var sse42 = haveSSE42()
-var useFastIEEE = haveCLMUL() && haveSSE41()
-
-const castagnoliK1 = 168
-const castagnoliK2 = 1344
-
-type sse42Table [4]Table
-
-var castagnoliSSE42TableK1 *sse42Table
-var castagnoliSSE42TableK2 *sse42Table
-
-func archAvailableCastagnoli() bool {
-	return sse42
-}
-
-func archInitCastagnoli() {
-	if !sse42 {
-		panic("arch-specific Castagnoli not available")
-	}
-	castagnoliSSE42TableK1 = new(sse42Table)
-	castagnoliSSE42TableK2 = new(sse42Table)
-	// See description in updateCastagnoli.
-	//    t[0][i] = CRC(i000, O)
-	//    t[1][i] = CRC(0i00, O)
-	//    t[2][i] = CRC(00i0, O)
-	//    t[3][i] = CRC(000i, O)
-	// where O is a sequence of K zeros.
-	var tmp [castagnoliK2]byte
-	for b := 0; b < 4; b++ {
-		for i := 0; i < 256; i++ {
-			val := uint32(i) << uint32(b*8)
-			castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1])
-			castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:])
-		}
-	}
-}
-
-// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
-// table given) with the given initial crc value. This corresponds to
-// CRC(crc, O) in the description in updateCastagnoli.
-func castagnoliShift(table *sse42Table, crc uint32) uint32 {
-	return table[3][crc>>24] ^
-		table[2][(crc>>16)&0xFF] ^
-		table[1][(crc>>8)&0xFF] ^
-		table[0][crc&0xFF]
-}
-
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
-	if !sse42 {
-		panic("not available")
-	}
-
-	// This method is inspired from the algorithm in Intel's white paper:
-	//    "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
-	// The same strategy of splitting the buffer in three is used but the
-	// combining calculation is different; the complete derivation is explained
-	// below.
-	//
-	// -- The basic idea --
-	//
-	// The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
-	// time. In recent Intel architectures the instruction takes 3 cycles;
-	// however the processor can pipeline up to three instructions if they
-	// don't depend on each other.
-	//
-	// Roughly this means that we can process three buffers in about the same
-	// time we can process one buffer.
-	//
-	// The idea is then to split the buffer in three, CRC the three pieces
-	// separately and then combine the results.
-	//
-	// Combining the results requires precomputed tables, so we must choose a
-	// fixed buffer length to optimize. The longer the length, the faster; but
-	// only buffers longer than this length will use the optimization. We choose
-	// two cutoffs and compute tables for both:
-	//  - one around 512: 168*3=504
-	//  - one around 4KB: 1344*3=4032
-	//
-	// -- The nitty gritty --
-	//
-	// Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
-	// initial non-inverted CRC I). This function has the following properties:
-	//   (a) CRC(I, AB) = CRC(CRC(I, A), B)
-	//   (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
-	//
-	// Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
-	// K bytes each, where K is a fixed constant. Let O be the sequence of K zero
-	// bytes.
-	//
-	// CRC(I, ABC) = CRC(I, ABO xor C)
-	//             = CRC(I, ABO) xor CRC(0, C)
-	//             = CRC(CRC(I, AB), O) xor CRC(0, C)
-	//             = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
-	//             = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
-	//             = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
-	//
-	// The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
-	// and CRC(0, C) efficiently.  We just need to find a way to quickly compute
-	// CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
-	// values; since we can't have a 32-bit table, we break it up into four
-	// 8-bit tables:
-	//
-	//    CRC(uvwx, O) = CRC(u000, O) xor
-	//                   CRC(0v00, O) xor
-	//                   CRC(00w0, O) xor
-	//                   CRC(000x, O)
-	//
-	// We can compute tables corresponding to the four terms for all 8-bit
-	// values.
-
-	crc = ^crc
-
-	// If a buffer is long enough to use the optimization, process the first few
-	// bytes to align the buffer to an 8 byte boundary (if necessary).
-	if len(p) >= castagnoliK1*3 {
-		delta := int(uintptr(unsafe.Pointer(&p[0])) & 7)
-		if delta != 0 {
-			delta = 8 - delta
-			crc = castagnoliSSE42(crc, p[:delta])
-			p = p[delta:]
-		}
-	}
-
-	// Process 3*K2 at a time.
-	for len(p) >= castagnoliK2*3 {
-		// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
-		crcA, crcB, crcC := castagnoliSSE42Triple(
-			crc, 0, 0,
-			p, p[castagnoliK2:], p[castagnoliK2*2:],
-			castagnoliK2/24)
-
-		// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
-		crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB
-		// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
-		crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC
-		p = p[castagnoliK2*3:]
-	}
-
-	// Process 3*K1 at a time.
-	for len(p) >= castagnoliK1*3 {
-		// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
-		crcA, crcB, crcC := castagnoliSSE42Triple(
-			crc, 0, 0,
-			p, p[castagnoliK1:], p[castagnoliK1*2:],
-			castagnoliK1/24)
-
-		// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
-		crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB
-		// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
-		crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC
-		p = p[castagnoliK1*3:]
-	}
-
-	// Use the simple implementation for what's left.
-	crc = castagnoliSSE42(crc, p)
-	return ^crc
-}
-
-func archAvailableIEEE() bool {
-	return useFastIEEE
-}
-
-var archIeeeTable8 *slicing8Table
-
-func archInitIEEE() {
-	if !useFastIEEE {
-		panic("not available")
-	}
-	// We still use slicing-by-8 for small buffers.
-	archIeeeTable8 = slicingMakeTable(IEEE)
-}
-
-func archUpdateIEEE(crc uint32, p []byte) uint32 {
-	if !useFastIEEE {
-		panic("not available")
-	}
-
-	if len(p) >= 64 {
-		left := len(p) & 15
-		do := len(p) - left
-		crc = ^ieeeCLMUL(^crc, p[:do])
-		p = p[do:]
-	}
-	if len(p) == 0 {
-		return crc
-	}
-	return slicingUpdate(crc, archIeeeTable8, p)
-}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64.s
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64.s
@ -1,319 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build gc
-
-#define NOSPLIT 4
-#define RODATA 8
-
-// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
-//
-// func castagnoliSSE42(crc uint32, p []byte) uint32
-TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
-	MOVL crc+0(FP), AX    // CRC value
-	MOVQ p+8(FP), SI      // data pointer
-	MOVQ p_len+16(FP), CX // len(p)
-
-	// If there are fewer than 8 bytes to process, skip alignment.
-	CMPQ CX, $8
-	JL   less_than_8
-
-	MOVQ SI, BX
-	ANDQ $7, BX
-	JZ   aligned
-
-	// Process the first few bytes to 8-byte align the input.
-
-	// BX = 8 - BX. We need to process this many bytes to align.
-	SUBQ $1, BX
-	XORQ $7, BX
-
-	BTQ $0, BX
-	JNC align_2
-
-	CRC32B (SI), AX
-	DECQ   CX
-	INCQ   SI
-
-align_2:
-	BTQ $1, BX
-	JNC align_4
-
-	// CRC32W (SI), AX
-	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
-
-	SUBQ $2, CX
-	ADDQ $2, SI
-
-align_4:
-	BTQ $2, BX
-	JNC aligned
-
-	// CRC32L (SI), AX
-	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
-
-	SUBQ $4, CX
-	ADDQ $4, SI
-
-aligned:
-	// The input is now 8-byte aligned and we can process 8-byte chunks.
-	CMPQ CX, $8
-	JL   less_than_8
-
-	CRC32Q (SI), AX
-	ADDQ   $8, SI
-	SUBQ   $8, CX
-	JMP    aligned
-
-less_than_8:
-	// We may have some bytes left over; process 4 bytes, then 2, then 1.
-	BTQ $2, CX
-	JNC less_than_4
-
-	// CRC32L (SI), AX
-	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
-	ADDQ $4, SI
-
-less_than_4:
-	BTQ $1, CX
-	JNC less_than_2
-
-	// CRC32W (SI), AX
-	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
-	ADDQ $2, SI
-
-less_than_2:
-	BTQ $0, CX
-	JNC done
-
-	CRC32B (SI), AX
-
-done:
-	MOVL AX, ret+32(FP)
-	RET
-
-// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
-// bytes from each buffer.
-//
-// func castagnoliSSE42Triple(
-//     crcA, crcB, crcC uint32,
-//     a, b, c []byte,
-//     rounds uint32,
-// ) (retA uint32, retB uint32, retC uint32)
-TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
-	MOVL crcA+0(FP), AX
-	MOVL crcB+4(FP), CX
-	MOVL crcC+8(FP), DX
-
-	MOVQ a+16(FP), R8  // data pointer
-	MOVQ b+40(FP), R9  // data pointer
-	MOVQ c+64(FP), R10 // data pointer
-
-	MOVL rounds+88(FP), R11
-
-loop:
-	CRC32Q (R8), AX
-	CRC32Q (R9), CX
-	CRC32Q (R10), DX
-
-	CRC32Q 8(R8), AX
-	CRC32Q 8(R9), CX
-	CRC32Q 8(R10), DX
-
-	CRC32Q 16(R8), AX
-	CRC32Q 16(R9), CX
-	CRC32Q 16(R10), DX
-
-	ADDQ $24, R8
-	ADDQ $24, R9
-	ADDQ $24, R10
-
-	DECQ R11
-	JNZ  loop
-
-	MOVL AX, retA+96(FP)
-	MOVL CX, retB+100(FP)
-	MOVL DX, retC+104(FP)
-	RET
-
-// func haveSSE42() bool
-TEXT ·haveSSE42(SB), NOSPLIT, $0
-	XORQ AX, AX
-	INCL AX
-	CPUID
-	SHRQ $20, CX
-	ANDQ $1, CX
-	MOVB CX, ret+0(FP)
-	RET
-
-// func haveCLMUL() bool
-TEXT ·haveCLMUL(SB), NOSPLIT, $0
-	XORQ AX, AX
-	INCL AX
-	CPUID
-	SHRQ $1, CX
-	ANDQ $1, CX
-	MOVB CX, ret+0(FP)
-	RET
-
-// func haveSSE41() bool
-TEXT ·haveSSE41(SB), NOSPLIT, $0
-	XORQ AX, AX
-	INCL AX
-	CPUID
-	SHRQ $19, CX
-	ANDQ $1, CX
-	MOVB CX, ret+0(FP)
-	RET
-
-// CRC32 polynomial data
-//
-// These constants are lifted from the
-// Linux kernel, since they avoid the costly
-// PSHUFB 16 byte reversal proposed in the
-// original Intel paper.
-DATA r2r1kp<>+0(SB)/8, $0x154442bd4
-DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
-DATA r4r3kp<>+0(SB)/8, $0x1751997d0
-DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
-DATA rupolykp<>+0(SB)/8, $0x1db710641
-DATA rupolykp<>+8(SB)/8, $0x1f7011641
-DATA r5kp<>+0(SB)/8, $0x163cd6124
-
-GLOBL r2r1kp<>(SB), RODATA, $16
-GLOBL r4r3kp<>(SB), RODATA, $16
-GLOBL rupolykp<>(SB), RODATA, $16
-GLOBL r5kp<>(SB), RODATA, $8
-
-// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
-// len(p) must be at least 64, and must be a multiple of 16.
-
-// func ieeeCLMUL(crc uint32, p []byte) uint32
-TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
-	MOVL crc+0(FP), X0    // Initial CRC value
-	MOVQ p+8(FP), SI      // data pointer
-	MOVQ p_len+16(FP), CX // len(p)
-
-	MOVOU (SI), X1
-	MOVOU 16(SI), X2
-	MOVOU 32(SI), X3
-	MOVOU 48(SI), X4
-	PXOR  X0, X1
-	ADDQ  $64, SI    // buf+=64
-	SUBQ  $64, CX    // len-=64
-	CMPQ  CX, $64    // Less than 64 bytes left
-	JB    remain64
-
-	MOVOA r2r1kp<>+0(SB), X0
-
-loopback64:
-	MOVOA X1, X5
-	MOVOA X2, X6
-	MOVOA X3, X7
-	MOVOA X4, X8
-
-	PCLMULQDQ $0, X0, X1
-	PCLMULQDQ $0, X0, X2
-	PCLMULQDQ $0, X0, X3
-	PCLMULQDQ $0, X0, X4
-
-	// Load next early
-	MOVOU (SI), X11
-	MOVOU 16(SI), X12
-	MOVOU 32(SI), X13
-	MOVOU 48(SI), X14
-
-	PCLMULQDQ $0x11, X0, X5
-	PCLMULQDQ $0x11, X0, X6
-	PCLMULQDQ $0x11, X0, X7
-	PCLMULQDQ $0x11, X0, X8
-
-	PXOR X5, X1
-	PXOR X6, X2
-	PXOR X7, X3
-	PXOR X8, X4
-
-	PXOR X11, X1
-	PXOR X12, X2
-	PXOR X13, X3
-	PXOR X14, X4
-
-	ADDQ $0x40, DI
-	ADDQ $64, SI    // buf+=64
-	SUBQ $64, CX    // len-=64
-	CMPQ CX, $64    // Less than 64 bytes left?
-	JGE  loopback64
-
-	// Fold result into a single register (X1)
-remain64:
-	MOVOA r4r3kp<>+0(SB), X0
-
-	MOVOA     X1, X5
-	PCLMULQDQ $0, X0, X1
-	PCLMULQDQ $0x11, X0, X5
-	PXOR      X5, X1
-	PXOR      X2, X1
-
-	MOVOA     X1, X5
-	PCLMULQDQ $0, X0, X1
-	PCLMULQDQ $0x11, X0, X5
-	PXOR      X5, X1
-	PXOR      X3, X1
-
-	MOVOA     X1, X5
-	PCLMULQDQ $0, X0, X1
-	PCLMULQDQ $0x11, X0, X5
-	PXOR      X5, X1
-	PXOR      X4, X1
-
-	// If there are fewer than 16 bytes left we are done
-	CMPQ CX, $16
-	JB   finish
-
-	// Encode 16 bytes
-remain16:
-	MOVOU     (SI), X10
-	MOVOA     X1, X5
-	PCLMULQDQ $0, X0, X1
-	PCLMULQDQ $0x11, X0, X5
-	PXOR      X5, X1
-	PXOR      X10, X1
-	SUBQ      $16, CX
-	ADDQ      $16, SI
-	CMPQ      CX, $16
-	JGE       remain16
-
-finish:
-	// Fold final result into 32 bits and return it
-	PCMPEQB   X3, X3
-	PCLMULQDQ $1, X1, X0
-	PSRLDQ    $8, X1
-	PXOR      X0, X1
-
-	MOVOA X1, X2
-	MOVQ  r5kp<>+0(SB), X0
-
-	// Creates 32 bit mask. Note that we don't care about upper half.
-	PSRLQ $32, X3
-
-	PSRLDQ    $4, X2
-	PAND      X3, X1
-	PCLMULQDQ $0, X0, X1
-	PXOR      X2, X1
-
-	MOVOA rupolykp<>+0(SB), X0
-
-	MOVOA     X1, X2
-	PAND      X3, X1
-	PCLMULQDQ $0x10, X0, X1
-	PAND      X3, X1
-	PCLMULQDQ $0, X0, X1
-	PXOR      X2, X1
-
-	// PEXTRD   $1, X1, AX  (SSE 4.1)
-	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
-	BYTE $0x16; BYTE $0xc8; BYTE $0x01
-	MOVL AX, ret+32(FP)
-
-	RET
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64p32.go
@ -1,43 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !appengine,!gccgo
-
-package crc32
-
-// This file contains the code to call the SSE 4.2 version of the Castagnoli
-// CRC.
-
-// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
-// support.
-func haveSSE42() bool
-
-// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
-// instruction.
-//go:noescape
-func castagnoliSSE42(crc uint32, p []byte) uint32
-
-var sse42 = haveSSE42()
-
-func archAvailableCastagnoli() bool {
-	return sse42
-}
-
-func archInitCastagnoli() {
-	if !sse42 {
-		panic("not available")
-	}
-	// No initialization necessary.
-}
-
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
-	if !sse42 {
-		panic("not available")
-	}
-	return castagnoliSSE42(crc, p)
-}
-
-func archAvailableIEEE() bool                    { return false }
-func archInitIEEE()                              { panic("not available") }
-func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_amd64p32.s
@ -1,67 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build gc
-
-#define NOSPLIT 4
-#define RODATA 8
-
-// func castagnoliSSE42(crc uint32, p []byte) uint32
-TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
-	MOVL crc+0(FP), AX   // CRC value
-	MOVL p+4(FP), SI     // data pointer
-	MOVL p_len+8(FP), CX // len(p)
-
-	NOTL AX
-
-	// If there are fewer than 8 bytes to process, we do it byte-by-byte.
-	CMPQ CX, $8
-	JL   cleanup
-
-	// Process individual bytes until the input is 8-byte aligned.
-startup:
-	MOVQ SI, BX
-	ANDQ $7, BX
-	JZ   aligned
-
-	CRC32B (SI), AX
-	DECQ   CX
-	INCQ   SI
-	JMP    startup
-
-aligned:
-	// The input is now 8-byte aligned and we can process 8-byte chunks.
-	CMPQ CX, $8
-	JL   cleanup
-
-	CRC32Q (SI), AX
-	ADDQ   $8, SI
-	SUBQ   $8, CX
-	JMP    aligned
-
-cleanup:
-	// We may have some bytes left over that we process one at a time.
-	CMPQ CX, $0
-	JE   done
-
-	CRC32B (SI), AX
-	INCQ   SI
-	DECQ   CX
-	JMP    cleanup
-
-done:
-	NOTL AX
-	MOVL AX, ret+16(FP)
-	RET
-
-// func haveSSE42() bool
-TEXT ·haveSSE42(SB), NOSPLIT, $0
-	XORQ AX, AX
-	INCL AX
-	CPUID
-	SHRQ $20, CX
-	ANDQ $1, CX
-	MOVB CX, ret+0(FP)
-	RET
-
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_generic.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_generic.go
@ -1,89 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file contains CRC32 algorithms that are not specific to any architecture
-// and don't use hardware acceleration.
-//
-// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table.
-//
-// The slicing-by-8 algorithm is a faster implementation that uses a bigger
-// table (8*256*4 bytes).
-
-package crc32
-
-// simpleMakeTable allocates and constructs a Table for the specified
-// polynomial. The table is suitable for use with the simple algorithm
-// (simpleUpdate).
-func simpleMakeTable(poly uint32) *Table {
-	t := new(Table)
-	simplePopulateTable(poly, t)
-	return t
-}
-
-// simplePopulateTable constructs a Table for the specified polynomial, suitable
-// for use with simpleUpdate.
-func simplePopulateTable(poly uint32, t *Table) {
-	for i := 0; i < 256; i++ {
-		crc := uint32(i)
-		for j := 0; j < 8; j++ {
-			if crc&1 == 1 {
-				crc = (crc >> 1) ^ poly
-			} else {
-				crc >>= 1
-			}
-		}
-		t[i] = crc
-	}
-}
-
-// simpleUpdate uses the simple algorithm to update the CRC, given a table that
-// was previously computed using simpleMakeTable.
-func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 {
-	crc = ^crc
-	for _, v := range p {
-		crc = tab[byte(crc)^v] ^ (crc >> 8)
-	}
-	return ^crc
-}
-
-// Use slicing-by-8 when payload >= this value.
-const slicing8Cutoff = 16
-
-// slicing8Table is an array of 8 Tables, used by the slicing-by-8 algorithm.
-type slicing8Table [8]Table
-
-// slicingMakeTable constructs a slicing8Table for the specified polynomial. The
-// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate).
-func slicingMakeTable(poly uint32) *slicing8Table {
-	t := new(slicing8Table)
-	simplePopulateTable(poly, &t[0])
-	for i := 0; i < 256; i++ {
-		crc := t[0][i]
-		for j := 1; j < 8; j++ {
-			crc = t[0][crc&0xFF] ^ (crc >> 8)
-			t[j][i] = crc
-		}
-	}
-	return t
-}
-
-// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a
-// table that was previously computed using slicingMakeTable.
-func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 {
-	if len(p) >= slicing8Cutoff {
-		crc = ^crc
-		for len(p) > 8 {
-			crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
-			crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^
-				tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^
-				tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF]
-			p = p[8:]
-		}
-		crc = ^crc
-	}
-	if len(p) == 0 {
-		return crc
-	}
-	return simpleUpdate(crc, &tab[0], p)
-}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_otherarch.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_otherarch.go
@ -1,15 +0,0 @@
-// Copyright 2011 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!amd64p32,!s390x
-
-package crc32
-
-func archAvailableIEEE() bool                    { return false }
-func archInitIEEE()                              { panic("not available") }
-func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
-
-func archAvailableCastagnoli() bool                    { return false }
-func archInitCastagnoli()                              { panic("not available") }
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") }
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_s390x.go
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_s390x.go
@ -1,91 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x
-
-package crc32
-
-const (
-	vxMinLen    = 64
-	vxAlignMask = 15 // align to 16 bytes
-)
-
-// hasVectorFacility reports whether the machine has the z/Architecture
-// vector facility installed and enabled.
-func hasVectorFacility() bool
-
-var hasVX = hasVectorFacility()
-
-// vectorizedCastagnoli implements CRC32 using vector instructions.
-// It is defined in crc32_s390x.s.
-//go:noescape
-func vectorizedCastagnoli(crc uint32, p []byte) uint32
-
-// vectorizedIEEE implements CRC32 using vector instructions.
-// It is defined in crc32_s390x.s.
-//go:noescape
-func vectorizedIEEE(crc uint32, p []byte) uint32
-
-func archAvailableCastagnoli() bool {
-	return hasVX
-}
-
-var archCastagnoliTable8 *slicing8Table
-
-func archInitCastagnoli() {
-	if !hasVX {
-		panic("not available")
-	}
-	// We still use slicing-by-8 for small buffers.
-	archCastagnoliTable8 = slicingMakeTable(Castagnoli)
-}
-
-// archUpdateCastagnoli calculates the checksum of p using
-// vectorizedCastagnoli.
-func archUpdateCastagnoli(crc uint32, p []byte) uint32 {
-	if !hasVX {
-		panic("not available")
-	}
-	// Use vectorized function if data length is above threshold.
-	if len(p) >= vxMinLen {
-		aligned := len(p) & ^vxAlignMask
-		crc = vectorizedCastagnoli(crc, p[:aligned])
-		p = p[aligned:]
-	}
-	if len(p) == 0 {
-		return crc
-	}
-	return slicingUpdate(crc, archCastagnoliTable8, p)
-}
-
-func archAvailableIEEE() bool {
-	return hasVX
-}
-
-var archIeeeTable8 *slicing8Table
-
-func archInitIEEE() {
-	if !hasVX {
-		panic("not available")
-	}
-	// We still use slicing-by-8 for small buffers.
-	archIeeeTable8 = slicingMakeTable(IEEE)
-}
-
-// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
-func archUpdateIEEE(crc uint32, p []byte) uint32 {
-	if !hasVX {
-		panic("not available")
-	}
-	// Use vectorized function if data length is above threshold.
-	if len(p) >= vxMinLen {
-		aligned := len(p) & ^vxAlignMask
-		crc = vectorizedIEEE(crc, p[:aligned])
-		p = p[aligned:]
-	}
-	if len(p) == 0 {
-		return crc
-	}
-	return slicingUpdate(crc, archIeeeTable8, p)
-}
--- a/cmd/gost/vendor/github.com/klauspost/crc32/crc32_s390x.s
+++ b/cmd/gost/vendor/github.com/klauspost/crc32/crc32_s390x.s
@ -1,249 +0,0 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build s390x
-
-#include "textflag.h"
-
-// Vector register range containing CRC-32 constants
-
-#define CONST_PERM_LE2BE        V9
-#define CONST_R2R1              V10
-#define CONST_R4R3              V11
-#define CONST_R5                V12
-#define CONST_RU_POLY           V13
-#define CONST_CRC_POLY          V14
-
-// The CRC-32 constant block contains reduction constants to fold and
-// process particular chunks of the input data stream in parallel.
-//
-// Note that the constant definitions below are extended in order to compute
-// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction.
-// The rightmost doubleword can be 0 to prevent contribution to the result or
-// can be multiplied by 1 to perform an XOR without the need for a separate
-// VECTOR EXCLUSIVE OR instruction.
-//
-// The polynomials used are bit-reflected:
-//
-//            IEEE: P'(x) = 0x0edb88320
-//      Castagnoli: P'(x) = 0x082f63b78
-
-// IEEE polynomial constants
-DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
-DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
-DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596 // R2
-DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4 // R1
-DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e // R4
-DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0 // R3
-DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124 // R5
-DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641 // u'
-DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
-DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641 // P'(x) << 1
-
-GLOBL ·crcleconskp(SB), RODATA, $144
-
-// Castagnoli polynomial constants
-DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask
-DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
-DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8 // R2
-DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02 // R1
-DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6 // R4
-DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe // R3
-DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8 // R5
-DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1 // u'
-DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
-DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1
-
-GLOBL ·crccleconskp(SB), RODATA, $144
-
-// func hasVectorFacility() bool
-TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
-	MOVD  $x-24(SP), R1
-	XC    $24, 0(R1), 0(R1) // clear the storage
-	MOVD  $2, R0            // R0 is the number of double words stored -1
-	WORD  $0xB2B01000       // STFLE 0(R1)
-	XOR   R0, R0            // reset the value of R0
-	MOVBZ z-8(SP), R1
-	AND   $0x40, R1
-	BEQ   novector
-
-vectorinstalled:
-	// check if the vector instruction has been enabled
-	VLEIB  $0, $0xF, V16
-	VLGVB  $0, V16, R1
-	CMPBNE R1, $0xF, novector
-	MOVB   $1, ret+0(FP)      // have vx
-	RET
-
-novector:
-	MOVB $0, ret+0(FP) // no vx
-	RET
-
-// The CRC-32 function(s) use these calling conventions:
-//
-// Parameters:
-//
-//      R2:    Initial CRC value, typically ~0; and final CRC (return) value.
-//      R3:    Input buffer pointer, performance might be improved if the
-//             buffer is on a doubleword boundary.
-//      R4:    Length of the buffer, must be 64 bytes or greater.
-//
-// Register usage:
-//
-//      R5:     CRC-32 constant pool base pointer.
-//      V0:     Initial CRC value and intermediate constants and results.
-//      V1..V4: Data for CRC computation.
-//      V5..V8: Next data chunks that are fetched from the input buffer.
-//
-//      V9..V14: CRC-32 constants.
-
-// func vectorizedIEEE(crc uint32, p []byte) uint32
-TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
-	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
-	MOVD  p+8(FP), R3      // data pointer
-	MOVD  p_len+16(FP), R4 // len(p)
-
-	MOVD $·crcleconskp(SB), R5
-	BR   vectorizedBody<>(SB)
-
-// func vectorizedCastagnoli(crc uint32, p []byte) uint32
-TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
-	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
-	MOVD  p+8(FP), R3      // data pointer
-	MOVD  p_len+16(FP), R4 // len(p)
-
-	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
-	MOVD $·crccleconskp(SB), R5
-	BR   vectorizedBody<>(SB)
-
-TEXT vectorizedBody<>(SB), NOSPLIT, $0
-	XOR $0xffffffff, R2                         // NOTW R2
-	VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY
-
-	// Load the initial CRC value into the rightmost word of V0
-	VZERO V0
-	VLVGF $3, R2, V0
-
-	// Crash if the input size is less than 64-bytes.
-	CMP R4, $64
-	BLT crash
-
-	// Load a 64-byte data chunk and XOR with CRC
-	VLM 0(R3), V1, V4 // 64-bytes into V1..V4
-
-	// Reflect the data if the CRC operation is in the bit-reflected domain
-	VPERM V1, V1, CONST_PERM_LE2BE, V1
-	VPERM V2, V2, CONST_PERM_LE2BE, V2
-	VPERM V3, V3, CONST_PERM_LE2BE, V3
-	VPERM V4, V4, CONST_PERM_LE2BE, V4
-
-	VX  V0, V1, V1 // V1 ^= CRC
-	ADD $64, R3    // BUF = BUF + 64
-	ADD $(-64), R4
-
-	// Check remaining buffer size and jump to proper folding method
-	CMP R4, $64
-	BLT less_than_64bytes
-
-fold_64bytes_loop:
-	// Load the next 64-byte data chunk into V5 to V8
-	VLM   0(R3), V5, V8
-	VPERM V5, V5, CONST_PERM_LE2BE, V5
-	VPERM V6, V6, CONST_PERM_LE2BE, V6
-	VPERM V7, V7, CONST_PERM_LE2BE, V7
-	VPERM V8, V8, CONST_PERM_LE2BE, V8
-
-	// Perform a GF(2) multiplication of the doublewords in V1 with
-	// the reduction constants in V0.  The intermediate result is
-	// then folded (accumulated) with the next data chunk in V5 and
-	// stored in V1.  Repeat this step for the register contents
-	// in V2, V3, and V4 respectively.
-
-	VGFMAG CONST_R2R1, V1, V5, V1
-	VGFMAG CONST_R2R1, V2, V6, V2
-	VGFMAG CONST_R2R1, V3, V7, V3
-	VGFMAG CONST_R2R1, V4, V8, V4
-
-	// Adjust buffer pointer and length for next loop
-	ADD $64, R3    // BUF = BUF + 64
-	ADD $(-64), R4 // LEN = LEN - 64
-
-	CMP R4, $64
-	BGE fold_64bytes_loop
-
-less_than_64bytes:
-	// Fold V1 to V4 into a single 128-bit value in V1
-	VGFMAG CONST_R4R3, V1, V2, V1
-	VGFMAG CONST_R4R3, V1, V3, V1
-	VGFMAG CONST_R4R3, V1, V4, V1
-
-	// Check whether to continue with 64-bit folding
-	CMP R4, $16
-	BLT final_fold
-
-fold_16bytes_loop:
-	VL    0(R3), V2                    // Load next data chunk
-	VPERM V2, V2, CONST_PERM_LE2BE, V2
-
-	VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk
-
-	// Adjust buffer pointer and size for folding next data chunk
-	ADD $16, R3
-	ADD $-16, R4
-
-	// Process remaining data chunks
-	CMP R4, $16
-	BGE fold_16bytes_loop
-
-final_fold:
-	VLEIB $7, $0x40, V9
-	VSRLB V9, CONST_R4R3, V0
-	VLEIG $0, $1, V0
-
-	VGFMG V0, V1, V1
-
-	VLEIB  $7, $0x20, V9        // Shift by words
-	VSRLB  V9, V1, V2           // Store remaining bits in V2
-	VUPLLF V1, V1               // Split rightmost doubleword
-	VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2
-
-	// The input values to the Barrett reduction are the degree-63 polynomial
-	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
-	// constant u.  The Barrett reduction result is the CRC value of R(x) mod
-	// P(x).
-	//
-	// The Barrett reduction algorithm is defined as:
-	//
-	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
-	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
-	//    3. C(x)  = R(x) XOR T2(x) mod x^32
-	//
-	// Note: To compensate for the division by x^32, use the vector unpack
-	// instruction to move the leftmost word into the leftmost doubleword
-	// of the vector register.  The rightmost doubleword is multiplied
-	// with zero to not contribute to the intermediate results.
-
-	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
-	VUPLLF V1, V2
-	VGFMG  CONST_RU_POLY, V2, V2
-
-	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
-	// T1(x) in V2 and XOR the intermediate result, T2(x), with the value in
-	// V1. The final result is in the rightmost word of V2.
-
-	VUPLLF V2, V2
-	VGFMAG CONST_CRC_POLY, V2, V1, V2
-
-done:
-	VLGVF $2, V2, R2
-	XOR   $0xffffffff, R2  // NOTW R2
-	MOVWZ R2, ret + 32(FP)
-	RET
-
-crash:
-	MOVD $0, (R0) // input size is less than 64-bytes
--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/README.md
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/README.md
@ -1,4 +1,5 @@
-# kcp-go
+<img src="kcp-go.png" alt="kcp-go" height="50px" />
+

 [![GoDoc][1]][2] [![Powered][9]][10] [![MIT licensed][11]][12] [![Build Status][3]][4] [![Go Report Card][5]][6] [![Coverage Status][7]][8]

@ -19,12 +20,12 @@

 ## Introduction

-kcp-go is a full-featured ***reliable-UDP*** library for golang. It provides ***reliable, ordered, and error-checked*** delivery of a stream of octets between applications running on hosts communicating over an IP network.
+kcp-go is a full-featured ***Reliable-UDP*** library for golang. It provides ***reliable, ordered, and error-checked*** delivery of a stream of octets between applications running on hosts communicating over an IP network.

 ## Features

-1. Optimized for ***Real-Time Strategy Game***.
-1. Compatible with [skywind3000's](https://github.com/skywind3000) C version with modifications.
+1. Optimized for ***Online Games, Audio/Video Streaming***.
+1. Compatible with [skywind3000's](https://github.com/skywind3000) C version with optimizations.
 1. ***Cache friendly*** and ***Memory optimized*** design in golang.
 1. Compatible with [net.Conn](https://golang.org/pkg/net/#Conn) and [net.Listener](https://golang.org/pkg/net/#Listener).
 1. [FEC(Forward Error Correction)](https://en.wikipedia.org/wiki/Forward_error_correction) Support with [Reed-Solomon Codes](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction)
@ -40,7 +41,7 @@ For complete documentation, see the associated [Godoc](https://godoc.org/github.

 ## Specification

-# <img src="frame.png" alt="Frame Format" height="160px" /> 
+<img src="frame.png" alt="Frame Format" height="109px" />

 ## Usage

@ -75,14 +76,14 @@ PASS
 ok  	github.com/xtaci/kcp-go	0.600s
 ```

+## Who is using this?
+
+1. https://github.com/xtaci/kcptun
+2. https://github.com/getlantern/lantern
+3. https://github.com/smallnest/rpcx
+
 ## Links

-1. https://github.com/xtaci/libkcp -- Official client library for iOS/Android(C++11)
+1. https://github.com/xtaci/libkcp -- FEC enhanced KCP session library for iOS/Android in C++
 2. https://github.com/skywind3000/kcp -- A Fast and Reliable ARQ Protocol
 3. https://github.com/klauspost/reedsolomon -- Reed-Solomon Erasure Coding in Go
-
-## Donation
-
-![donate](donate.png)          
-
-All donations on this project will be used to support the development of [gonet/2](http://gonet2.github.io/).
--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/crypt.go
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/crypt.go
@ -20,7 +20,9 @@ var (
 	saltxor       = `sH3CIVoF#rWLtJo6`
 )

-// BlockCrypt defines encryption/decryption methods for a given byte slice
+// BlockCrypt defines encryption/decryption methods for a given byte slice.
+// Note for implementers: the data to be encrypted carries a built-in nonce
+// in its first 16 bytes.
 type BlockCrypt interface {
 	// Encrypt encrypts the whole block in src into dst.
 	// Dst and src may point at the same memory.
@ -31,40 +33,35 @@ type BlockCrypt interface {
 	Decrypt(dst, src []byte)
 }

-// Salsa20BlockCrypt implements BlockCrypt
-type Salsa20BlockCrypt struct {
+type salsa20BlockCrypt struct {
 	key [32]byte
 }

-// NewSalsa20BlockCrypt initates BlockCrypt by the given key
+// NewSalsa20BlockCrypt returns a BlockCrypt backed by Salsa20; see https://en.wikipedia.org/wiki/Salsa20
 func NewSalsa20BlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(Salsa20BlockCrypt)
+	c := new(salsa20BlockCrypt)
 	copy(c.key[:], key)
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *Salsa20BlockCrypt) Encrypt(dst, src []byte) {
+func (c *salsa20BlockCrypt) Encrypt(dst, src []byte) {
 	salsa20.XORKeyStream(dst[8:], src[8:], src[:8], &c.key)
 	copy(dst[:8], src[:8])
 }
-
-// Decrypt implements Decrypt interface
-func (c *Salsa20BlockCrypt) Decrypt(dst, src []byte) {
+func (c *salsa20BlockCrypt) Decrypt(dst, src []byte) {
 	salsa20.XORKeyStream(dst[8:], src[8:], src[:8], &c.key)
 	copy(dst[:8], src[:8])
 }

-// TwofishBlockCrypt implements BlockCrypt
-type TwofishBlockCrypt struct {
+type twofishBlockCrypt struct {
 	encbuf []byte
 	decbuf []byte
 	block  cipher.Block
 }

-// NewTwofishBlockCrypt initates BlockCrypt by the given key
+// NewTwofishBlockCrypt returns a BlockCrypt backed by Twofish; see https://en.wikipedia.org/wiki/Twofish
 func NewTwofishBlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(TwofishBlockCrypt)
+	c := new(twofishBlockCrypt)
 	block, err := twofish.NewCipher(key)
 	if err != nil {
 		return nil, err
@ -75,22 +72,18 @@ func NewTwofishBlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *TwofishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-
-// Decrypt implements Decrypt interface
-func (c *TwofishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *twofishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *twofishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }

-// TripleDESBlockCrypt implements BlockCrypt
-type TripleDESBlockCrypt struct {
+type tripleDESBlockCrypt struct {
 	encbuf []byte
 	decbuf []byte
 	block  cipher.Block
 }

-// NewTripleDESBlockCrypt initates BlockCrypt by the given key
+// NewTripleDESBlockCrypt returns a BlockCrypt backed by Triple DES; see https://en.wikipedia.org/wiki/Triple_DES
 func NewTripleDESBlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(TripleDESBlockCrypt)
+	c := new(tripleDESBlockCrypt)
 	block, err := des.NewTripleDESCipher(key)
 	if err != nil {
 		return nil, err
@ -101,22 +94,18 @@ func NewTripleDESBlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *TripleDESBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-
-// Decrypt implements Decrypt interface
-func (c *TripleDESBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *tripleDESBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *tripleDESBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }

-// Cast5BlockCrypt implements BlockCrypt
-type Cast5BlockCrypt struct {
+type cast5BlockCrypt struct {
 	encbuf []byte
 	decbuf []byte
 	block  cipher.Block
 }

-// NewCast5BlockCrypt initates BlockCrypt by the given key
+// NewCast5BlockCrypt returns a BlockCrypt backed by CAST5; see https://en.wikipedia.org/wiki/CAST-128
 func NewCast5BlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(Cast5BlockCrypt)
+	c := new(cast5BlockCrypt)
 	block, err := cast5.NewCipher(key)
 	if err != nil {
 		return nil, err
@ -127,22 +116,18 @@ func NewCast5BlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *Cast5BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *cast5BlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *cast5BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }

-// Decrypt implements Decrypt interface
-func (c *Cast5BlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
-
-// BlowfishBlockCrypt implements BlockCrypt
-type BlowfishBlockCrypt struct {
+type blowfishBlockCrypt struct {
 	encbuf []byte
 	decbuf []byte
 	block  cipher.Block
 }

-// NewBlowfishBlockCrypt initates BlockCrypt by the given key
+// NewBlowfishBlockCrypt returns a BlockCrypt backed by Blowfish; see https://en.wikipedia.org/wiki/Blowfish_(cipher)
 func NewBlowfishBlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(BlowfishBlockCrypt)
+	c := new(blowfishBlockCrypt)
 	block, err := blowfish.NewCipher(key)
 	if err != nil {
 		return nil, err
@ -153,22 +138,18 @@ func NewBlowfishBlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *BlowfishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-
-// Decrypt implements Decrypt interface
-func (c *BlowfishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *blowfishBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *blowfishBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }

-// AESBlockCrypt implements BlockCrypt
-type AESBlockCrypt struct {
+type aesBlockCrypt struct {
 	encbuf []byte
 	decbuf []byte
 	block  cipher.Block
 }

-// NewAESBlockCrypt initates BlockCrypt by the given key
+// NewAESBlockCrypt https://en.wikipedia.org/wiki/Advanced_Encryption_Standard
 func NewAESBlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(AESBlockCrypt)
+	c := new(aesBlockCrypt)
 	block, err := aes.NewCipher(key)
 	if err != nil {
 		return nil, err
@ -179,22 +160,18 @@ func NewAESBlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *AESBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *aesBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *aesBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }

-// Decrypt implements Decrypt interface
-func (c *AESBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
-
-// TEABlockCrypt implements BlockCrypt
-type TEABlockCrypt struct {
+type teaBlockCrypt struct {
 	encbuf []byte
 	decbuf []byte
 	block  cipher.Block
 }

-// NewTEABlockCrypt initate BlockCrypt by the given key
+// NewTEABlockCrypt https://en.wikipedia.org/wiki/Tiny_Encryption_Algorithm
 func NewTEABlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(TEABlockCrypt)
+	c := new(teaBlockCrypt)
 	block, err := tea.NewCipherWithRounds(key, 16)
 	if err != nil {
 		return nil, err
@ -205,22 +182,18 @@ func NewTEABlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *TEABlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-
-// Decrypt implements Decrypt interface
-func (c *TEABlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *teaBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *teaBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }

-// XTEABlockCrypt implements BlockCrypt
-type XTEABlockCrypt struct {
+type xteaBlockCrypt struct {
 	encbuf []byte
 	decbuf []byte
 	block  cipher.Block
 }

-// NewXTEABlockCrypt initate BlockCrypt by the given key
+// NewXTEABlockCrypt https://en.wikipedia.org/wiki/XTEA
 func NewXTEABlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(XTEABlockCrypt)
+	c := new(xteaBlockCrypt)
 	block, err := xtea.NewCipher(key)
 	if err != nil {
 		return nil, err
@ -231,43 +204,32 @@ func NewXTEABlockCrypt(key []byte) (BlockCrypt, error) {
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *XTEABlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
-
-// Decrypt implements Decrypt interface
-func (c *XTEABlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }
+func (c *xteaBlockCrypt) Encrypt(dst, src []byte) { encrypt(c.block, dst, src, c.encbuf) }
+func (c *xteaBlockCrypt) Decrypt(dst, src []byte) { decrypt(c.block, dst, src, c.decbuf) }

-// SimpleXORBlockCrypt implements BlockCrypt
-type SimpleXORBlockCrypt struct {
+type simpleXORBlockCrypt struct {
 	xortbl []byte
 }

-// NewSimpleXORBlockCrypt initate BlockCrypt by the given key
+// NewSimpleXORBlockCrypt simple xor with key expanding
 func NewSimpleXORBlockCrypt(key []byte) (BlockCrypt, error) {
-	c := new(SimpleXORBlockCrypt)
+	c := new(simpleXORBlockCrypt)
 	c.xortbl = pbkdf2.Key(key, []byte(saltxor), 32, mtuLimit, sha1.New)
 	return c, nil
 }

-// Encrypt implements Encrypt interface
-func (c *SimpleXORBlockCrypt) Encrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
+func (c *simpleXORBlockCrypt) Encrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
+func (c *simpleXORBlockCrypt) Decrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }

-// Decrypt implements Decrypt interface
-func (c *SimpleXORBlockCrypt) Decrypt(dst, src []byte) { xorBytes(dst, src, c.xortbl) }
+type noneBlockCrypt struct{}

-// NoneBlockCrypt simple returns the plaintext
-type NoneBlockCrypt struct{}
-
-// NewNoneBlockCrypt initate by the given key
+// NewNoneBlockCrypt does nothing but copying
 func NewNoneBlockCrypt(key []byte) (BlockCrypt, error) {
-	return new(NoneBlockCrypt), nil
+	return new(noneBlockCrypt), nil
 }

-// Encrypt implements Encrypt interface
-func (c *NoneBlockCrypt) Encrypt(dst, src []byte) { copy(dst, src) }
-
-// Decrypt implements Decrypt interface
-func (c *NoneBlockCrypt) Decrypt(dst, src []byte) { copy(dst, src) }
+func (c *noneBlockCrypt) Encrypt(dst, src []byte) { copy(dst, src) }
+func (c *noneBlockCrypt) Decrypt(dst, src []byte) { copy(dst, src) }

 // packet encryption with local CFB mode
 func encrypt(block cipher.Block, dst, src, buf []byte) {
--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/fec.go
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/fec.go
@ -2,7 +2,7 @@ package kcp

 import (
 	"encoding/binary"
-	"sync"
+	"sync/atomic"

 	"github.com/klauspost/reedsolomon"
 )
@ -26,10 +26,10 @@ type (
 		next         uint32 // next seqid
 		enc          reedsolomon.Encoder
 		shards       [][]byte
+		shards2      [][]byte // for calcECC
 		shardsflag   []bool
 		paws         uint32 // Protect Against Wrapped Sequence numbers
 		lastCheck    uint32
-		xmitBuf      sync.Pool
 	}

 	fecPacket struct {
@ -60,11 +60,8 @@ func newFEC(rxlimit, dataShards, parityShards int) *FEC {
 	}
 	fec.enc = enc
 	fec.shards = make([][]byte, fec.shardSize)
+	fec.shards2 = make([][]byte, fec.shardSize)
 	fec.shardsflag = make([]bool, fec.shardSize)
-	fec.xmitBuf.New = func() interface{} {
-		return make([]byte, mtuLimit)
-	}
-
 	return fec
 }

@ -75,9 +72,8 @@ func (fec *FEC) decode(data []byte) fecPacket {
 	pkt.flag = binary.LittleEndian.Uint16(data[4:])
 	pkt.ts = currentMs()
 	// allocate memory & copy
-	buf := fec.xmitBuf.Get().([]byte)
-	n := copy(buf, data[6:])
-	xorBytes(buf[n:], buf[n:], buf[n:])
+	buf := xmitBuf.Get().([]byte)[:len(data)-6]
+	copy(buf, data[6:])
 	pkt.data = buf
 	return pkt
 }
@ -107,7 +103,7 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
 			if now-fec.rx[k].ts < fecExpire {
 				rx = append(rx, fec.rx[k])
 			} else {
-				fec.xmitBuf.Put(fec.rx[k].data)
+				xmitBuf.Put(fec.rx[k].data)
 			}
 		}
 		fec.rx = rx
@ -119,7 +115,7 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
 	insertIdx := 0
 	for i := n; i >= 0; i-- {
 		if pkt.seqid == fec.rx[i].seqid { // de-duplicate
-			fec.xmitBuf.Put(pkt.data)
+			xmitBuf.Put(pkt.data)
 			return nil
 		} else if pkt.seqid > fec.rx[i].seqid { // insertion
 			insertIdx = i + 1
@ -184,7 +180,7 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {

 		if numDataShard == fec.dataShards { // no lost
 			for i := first; i < first+numshard; i++ { // free
-				fec.xmitBuf.Put(fec.rx[i].data)
+				xmitBuf.Put(fec.rx[i].data)
 			}
 			copy(fec.rx[first:], fec.rx[first+numshard:])
 			for i := 0; i < numshard; i++ { // dereference
@ -194,7 +190,9 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
 		} else if numshard >= fec.dataShards { // recoverable
 			for k := range shards {
 				if shards[k] != nil {
+					dlen := len(shards[k])
 					shards[k] = shards[k][:maxlen]
+					xorBytes(shards[k][dlen:], shards[k][dlen:], shards[k][dlen:])
 				}
 			}
 			if err := fec.enc.Reconstruct(shards); err == nil {
@ -206,7 +204,7 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {
 			}

 			for i := first; i < first+numshard; i++ { // free
-				fec.xmitBuf.Put(fec.rx[i].data)
+				xmitBuf.Put(fec.rx[i].data)
 			}
 			copy(fec.rx[first:], fec.rx[first+numshard:])
 			for i := 0; i < numshard; i++ { // dereference
@ -218,7 +216,10 @@ func (fec *FEC) input(pkt fecPacket) (recovered [][]byte) {

 	// keep rxlimit
 	if len(fec.rx) > fec.rxlimit {
-		fec.xmitBuf.Put(fec.rx[0].data) // free
+		if fec.rx[0].flag == typeData { // record unrecoverable data
+			atomic.AddUint64(&DefaultSnmp.FECShortShards, 1)
+		}
+		xmitBuf.Put(fec.rx[0].data) // free
 		fec.rx[0].data = nil
 		fec.rx = fec.rx[1:]
 	}
@ -229,7 +230,7 @@ func (fec *FEC) calcECC(data [][]byte, offset, maxlen int) (ecc [][]byte) {
 	if len(data) != fec.shardSize {
 		return nil
 	}
-	shards := make([][]byte, fec.shardSize)
+	shards := fec.shards2
 	for k := range shards {
 		shards[k] = data[k][offset:maxlen]
 	}
--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/frame.png
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/frame.png
--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/kcp-go.png
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/kcp-go.png
--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/kcp.go
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/kcp.go
@ -2,7 +2,6 @@
 package kcp

 import (
-	"container/heap"
 	"encoding/binary"
 	"sync/atomic"
 )
@ -123,13 +122,6 @@ func (seg *Segment) encode(ptr []byte) []byte {
 	return ptr
 }

-// NewSegment creates a KCP segment
-func NewSegment(size int) *Segment {
-	seg := new(Segment)
-	seg.data = make([]byte, size)
-	return seg
-}
-
 // KCP defines a single KCP connection
 type KCP struct {
 	conv, mtu, mss, state                  uint32
@ -137,7 +129,7 @@ type KCP struct {
 	ssthresh                               uint32
 	rx_rttval, rx_srtt, rx_rto, rx_minrto  uint32
 	snd_wnd, rcv_wnd, rmt_wnd, cwnd, probe uint32
-	current, interval, ts_flush, xmit      uint32
+	interval, ts_flush, xmit               uint32
 	nodelay, updated                       uint32
 	ts_probe, probe_wait                   uint32
 	dead_link, incr                        uint32
@ -150,33 +142,17 @@ type KCP struct {
 	snd_buf   []Segment
 	rcv_buf   []Segment

-	acklist ACKList
+	acklist []ackItem

 	buffer []byte
 	output Output
 }

-// ACK packet to return
-type ACK struct {
+type ackItem struct {
 	sn uint32
 	ts uint32
 }

-// ACKList is heapified
-type ACKList []ACK
-
-func (l ACKList) Len() int            { return len(l) }
-func (l ACKList) Less(i, j int) bool  { return l[i].sn < l[j].sn }
-func (l ACKList) Swap(i, j int)       { l[i], l[j] = l[j], l[i] }
-func (l *ACKList) Push(x interface{}) { *l = append(*l, x.(ACK)) }
-func (l *ACKList) Pop() interface{} {
-	old := *l
-	n := len(old)
-	x := old[n-1]
-	*l = old[0 : n-1]
-	return x
-}
-
 // NewKCP create a new kcp control object, 'conv' must equal in two endpoint
 // from the same connection.
 func NewKCP(conv uint32, output Output) *KCP {
@ -198,6 +174,18 @@ func NewKCP(conv uint32, output Output) *KCP {
 	return kcp
 }

+// newSegment creates a KCP segment
+func (kcp *KCP) newSegment(size int) *Segment {
+	seg := new(Segment)
+	seg.data = xmitBuf.Get().([]byte)[:size]
+	return seg
+}
+
+// delSegment recycles a KCP segment
+func (kcp *KCP) delSegment(seg *Segment) {
+	xmitBuf.Put(seg.data)
+}
+
 // PeekSize checks the size of next message in the recv queue
 func (kcp *KCP) PeekSize() (length int) {
 	if len(kcp.rcv_queue) == 0 {
@ -251,7 +239,7 @@ func (kcp *KCP) Recv(buffer []byte) (n int) {
 		buffer = buffer[len(seg.data):]
 		n += len(seg.data)
 		count++
-		seg.data = nil
+		kcp.delSegment(seg)
 		if seg.frg == 0 {
 			break
 		}
@ -263,14 +251,13 @@ func (kcp *KCP) Recv(buffer []byte) (n int) {
 	for k := range kcp.rcv_buf {
 		seg := &kcp.rcv_buf[k]
 		if seg.sn == kcp.rcv_nxt && len(kcp.rcv_queue) < int(kcp.rcv_wnd) {
-			kcp.rcv_queue = append(kcp.rcv_queue, *seg)
 			kcp.rcv_nxt++
 			count++
-			seg.data = nil
 		} else {
 			break
 		}
 	}
+	kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[:count]...)
 	kcp.rcv_buf = kcp.rcv_buf[count:]

 	// fast recover
@ -300,11 +287,12 @@ func (kcp *KCP) Send(buffer []byte) int {
 				if len(buffer) < capacity {
 					extend = len(buffer)
 				}
-				seg := NewSegment(len(old.data) + extend)
+				seg := kcp.newSegment(len(old.data) + extend)
 				seg.frg = 0
 				copy(seg.data, old.data)
 				copy(seg.data[len(old.data):], buffer)
 				buffer = buffer[extend:]
+				kcp.delSegment(old)
 				kcp.snd_queue[n-1] = *seg
 			}
 		}
@ -335,7 +323,7 @@ func (kcp *KCP) Send(buffer []byte) int {
 		} else {
 			size = len(buffer)
 		}
-		seg := NewSegment(size)
+		seg := kcp.newSegment(size)
 		copy(seg.data, buffer[:size])
 		if kcp.stream == 0 { // message mode
 			seg.frg = uint32(count - i - 1)
@ -348,8 +336,8 @@ func (kcp *KCP) Send(buffer []byte) int {
 	return 0
 }

-// https://tools.ietf.org/html/rfc6298
 func (kcp *KCP) update_ack(rtt int32) {
+	// https://tools.ietf.org/html/rfc6298
 	var rto uint32
 	if kcp.rx_srtt == 0 {
 		kcp.rx_srtt = uint32(rtt)
@ -365,7 +353,7 @@ func (kcp *KCP) update_ack(rtt int32) {
 			kcp.rx_srtt = 1
 		}
 	}
-	rto = kcp.rx_srtt + _imax_(1, 4*kcp.rx_rttval)
+	rto = kcp.rx_srtt + _imax_(kcp.interval, 4*kcp.rx_rttval)
 	kcp.rx_rto = _ibound_(kcp.rx_minrto, rto, IKCP_RTO_MAX)
 }

@ -386,6 +374,7 @@ func (kcp *KCP) parse_ack(sn uint32) {
 	for k := range kcp.snd_buf {
 		seg := &kcp.snd_buf[k]
 		if sn == seg.sn {
+			kcp.delSegment(seg)
 			copy(kcp.snd_buf[k:], kcp.snd_buf[k+1:])
 			kcp.snd_buf[len(kcp.snd_buf)-1] = Segment{}
 			kcp.snd_buf = kcp.snd_buf[:len(kcp.snd_buf)-1]
@ -417,8 +406,8 @@ func (kcp *KCP) parse_una(una uint32) {
 	for k := range kcp.snd_buf {
 		seg := &kcp.snd_buf[k]
 		if _itimediff(una, seg.sn) > 0 {
+			kcp.delSegment(seg)
 			count++
-			seg.data = nil
 		} else {
 			break
 		}
@ -428,14 +417,14 @@ func (kcp *KCP) parse_una(una uint32) {

 // ack append
 func (kcp *KCP) ack_push(sn, ts uint32) {
-	heap.Push(&kcp.acklist, ACK{sn, ts})
+	kcp.acklist = append(kcp.acklist, ackItem{sn, ts})
 }

 func (kcp *KCP) parse_data(newseg *Segment) {
 	sn := newseg.sn
 	if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) >= 0 ||
 		_itimediff(sn, kcp.rcv_nxt) < 0 {
-		atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
+		kcp.delSegment(newseg)
 		return
 	}

@ -463,6 +452,8 @@ func (kcp *KCP) parse_data(newseg *Segment) {
 			copy(kcp.rcv_buf[insert_idx+1:], kcp.rcv_buf[insert_idx:])
 			kcp.rcv_buf[insert_idx] = *newseg
 		}
+	} else {
+		kcp.delSegment(newseg)
 	}

 	// move available data from rcv_buf -> rcv_queue
@ -470,14 +461,13 @@ func (kcp *KCP) parse_data(newseg *Segment) {
 	for k := range kcp.rcv_buf {
 		seg := &kcp.rcv_buf[k]
 		if seg.sn == kcp.rcv_nxt && len(kcp.rcv_queue) < int(kcp.rcv_wnd) {
-			kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[k])
 			kcp.rcv_nxt++
 			count++
-			seg.data = nil
 		} else {
 			break
 		}
 	}
+	kcp.rcv_queue = append(kcp.rcv_queue, kcp.rcv_buf[:count]...)
 	kcp.rcv_buf = kcp.rcv_buf[count:]
 }

@ -489,7 +479,9 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
 	}

 	var maxack uint32
+	var recentack uint32
 	var flag int
+
 	for {
 		var ts, sn, length, una, conv uint32
 		var wnd uint16
@ -525,9 +517,6 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
 		kcp.shrink_buf()

 		if cmd == IKCP_CMD_ACK {
-			if update_ack && _itimediff(kcp.current, ts) >= 0 {
-				kcp.update_ack(_itimediff(kcp.current, ts))
-			}
 			kcp.parse_ack(sn)
 			kcp.shrink_buf()
 			if flag == 0 {
@ -536,11 +525,12 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
 			} else if _itimediff(sn, maxack) > 0 {
 				maxack = sn
 			}
+			recentack = ts
 		} else if cmd == IKCP_CMD_PUSH {
 			if _itimediff(sn, kcp.rcv_nxt+kcp.rcv_wnd) < 0 {
 				kcp.ack_push(sn, ts)
 				if _itimediff(sn, kcp.rcv_nxt) >= 0 {
-					seg := NewSegment(int(length))
+					seg := kcp.newSegment(int(length))
 					seg.conv = conv
 					seg.cmd = uint32(cmd)
 					seg.frg = uint32(frg)
@ -550,7 +540,11 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
 					seg.una = una
 					copy(seg.data, data[:length])
 					kcp.parse_data(seg)
+				} else {
+					atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
 				}
+			} else {
+				atomic.AddUint64(&DefaultSnmp.RepeatSegs, 1)
 			}
 		} else if cmd == IKCP_CMD_WASK {
 			// ready to send back IKCP_CMD_WINS in Ikcp_flush
@ -565,8 +559,12 @@ func (kcp *KCP) Input(data []byte, update_ack bool) int {
 		data = data[length:]
 	}

+	current := currentMs()
 	if flag != 0 && update_ack {
 		kcp.parse_fastack(maxack)
+		if _itimediff(current, recentack) >= 0 {
+			kcp.update_ack(_itimediff(current, recentack))
+		}
 	}

 	if _itimediff(kcp.snd_una, una) > 0 {
@ -603,14 +601,10 @@ func (kcp *KCP) wnd_unused() int32 {

 // flush pending data
 func (kcp *KCP) flush() {
-	current := kcp.current
 	buffer := kcp.buffer
 	change := 0
 	lost := false

-	if kcp.updated == 0 {
-		return
-	}
 	var seg Segment
 	seg.conv = kcp.conv
 	seg.cmd = IKCP_CMD_ACK
@ -619,25 +613,28 @@ func (kcp *KCP) flush() {

 	// flush acknowledges
 	ptr := buffer
-	for kcp.acklist.Len() > 0 {
+	for i, ack := range kcp.acklist {
 		size := len(buffer) - len(ptr)
 		if size+IKCP_OVERHEAD > int(kcp.mtu) {
 			kcp.output(buffer, size)
 			ptr = buffer
 		}
-		ack := heap.Pop(&kcp.acklist).(ACK)
-		seg.sn, seg.ts = ack.sn, ack.ts
-		ptr = seg.encode(ptr)
+		// filter jitters caused by bufferbloat
+		if ack.sn >= kcp.rcv_nxt || len(kcp.acklist)-1 == i {
+			seg.sn, seg.ts = ack.sn, ack.ts
+			ptr = seg.encode(ptr)
+		}
 	}
 	kcp.acklist = nil

+	current := currentMs()
 	// probe window size (if remote window size equals zero)
 	if kcp.rmt_wnd == 0 {
 		if kcp.probe_wait == 0 {
 			kcp.probe_wait = IKCP_PROBE_INIT
-			kcp.ts_probe = kcp.current + kcp.probe_wait
+			kcp.ts_probe = current + kcp.probe_wait
 		} else {
-			if _itimediff(kcp.current, kcp.ts_probe) >= 0 {
+			if _itimediff(current, kcp.ts_probe) >= 0 {
 				if kcp.probe_wait < IKCP_PROBE_INIT {
 					kcp.probe_wait = IKCP_PROBE_INIT
 				}
@ -645,7 +642,7 @@ func (kcp *KCP) flush() {
 				if kcp.probe_wait > IKCP_PROBE_LIMIT {
 					kcp.probe_wait = IKCP_PROBE_LIMIT
 				}
-				kcp.ts_probe = kcp.current + kcp.probe_wait
+				kcp.ts_probe = current + kcp.probe_wait
 				kcp.probe |= IKCP_ASK_SEND
 			}
 		}
@ -684,6 +681,7 @@ func (kcp *KCP) flush() {
 		cwnd = _imin_(kcp.cwnd, cwnd)
 	}

+	// sliding window, controlled by snd_nxt && snd_una+cwnd
 	count := 0
 	for k := range kcp.snd_queue {
 		if _itimediff(kcp.snd_nxt, kcp.snd_una+cwnd) >= 0 {
@ -696,10 +694,8 @@ func (kcp *KCP) flush() {
 		newseg.ts = current
 		newseg.sn = kcp.snd_nxt
 		newseg.una = kcp.rcv_nxt
-		newseg.resendts = current
+		newseg.resendts = newseg.ts
 		newseg.rto = kcp.rx_rto
-		newseg.fastack = 0
-		newseg.xmit = 0
 		kcp.snd_buf = append(kcp.snd_buf, newseg)
 		kcp.snd_nxt++
 		count++
@ -707,27 +703,29 @@ func (kcp *KCP) flush() {
 	}
 	kcp.snd_queue = kcp.snd_queue[count:]

+	// flag pending data
+	hasPending := false
+	if count > 0 {
+		hasPending = true
+	}
+
 	// calculate resent
 	resent := uint32(kcp.fastresend)
 	if kcp.fastresend <= 0 {
 		resent = 0xffffffff
 	}
-	rtomin := (kcp.rx_rto >> 3)
-	if kcp.nodelay != 0 {
-		rtomin = 0
-	}

 	// flush data segments
-	nque := len(kcp.snd_queue)
 	var lostSegs, fastRetransSegs, earlyRetransSegs uint64
 	for k := range kcp.snd_buf {
+		current := currentMs()
 		segment := &kcp.snd_buf[k]
 		needsend := false
 		if segment.xmit == 0 {
 			needsend = true
 			segment.xmit++
 			segment.rto = kcp.rx_rto
-			segment.resendts = current + segment.rto + rtomin
+			segment.resendts = current + segment.rto
 		} else if _itimediff(current, segment.resendts) >= 0 {
 			needsend = true
 			segment.xmit++
@ -740,21 +738,26 @@ func (kcp *KCP) flush() {
 			segment.resendts = current + segment.rto
 			lost = true
 			lostSegs++
-		} else if segment.fastack >= resent {
-			needsend = true
-			segment.xmit++
-			segment.fastack = 0
-			segment.resendts = current + segment.rto
-			change++
-			fastRetransSegs++
-		} else if segment.fastack > 0 && nque == 0 {
-			// early retransmit
-			needsend = true
-			segment.xmit++
-			segment.fastack = 0
-			segment.resendts = current + segment.rto
-			change++
-			earlyRetransSegs++
+		} else if segment.fastack >= resent { // fast retransmit
+			lastsend := segment.resendts - segment.rto
+			if _itimediff(current, lastsend) >= int32(kcp.rx_rto/4) {
+				needsend = true
+				segment.xmit++
+				segment.fastack = 0
+				segment.resendts = current + segment.rto
+				change++
+				fastRetransSegs++
+			}
+		} else if segment.fastack > 0 && !hasPending { // early retransmit
+			lastsend := segment.resendts - segment.rto
+			if _itimediff(current, lastsend) >= int32(kcp.rx_rto/4) {
+				needsend = true
+				segment.xmit++
+				segment.fastack = 0
+				segment.resendts = current + segment.rto
+				change++
+				earlyRetransSegs++
+			}
 		}

 		if needsend {
@ -822,27 +825,26 @@ func (kcp *KCP) flush() {
 // Update updates state (call it repeatedly, every 10ms-100ms), or you can ask
 // ikcp_check when to call it again (without ikcp_input/_send calling).
 // The current timestamp in millisec is obtained internally via currentMs().
-func (kcp *KCP) Update(current uint32) {
+func (kcp *KCP) Update() {
 	var slap int32

-	kcp.current = current
-
+	current := currentMs()
 	if kcp.updated == 0 {
 		kcp.updated = 1
-		kcp.ts_flush = kcp.current
+		kcp.ts_flush = current
 	}

-	slap = _itimediff(kcp.current, kcp.ts_flush)
+	slap = _itimediff(current, kcp.ts_flush)

 	if slap >= 10000 || slap < -10000 {
-		kcp.ts_flush = kcp.current
+		kcp.ts_flush = current
 		slap = 0
 	}

 	if slap >= 0 {
 		kcp.ts_flush += kcp.interval
-		if _itimediff(kcp.current, kcp.ts_flush) >= 0 {
-			kcp.ts_flush = kcp.current + kcp.interval
+		if _itimediff(current, kcp.ts_flush) >= 0 {
+			kcp.ts_flush = current + kcp.interval
 		}
 		kcp.flush()
 	}
@ -855,7 +857,8 @@ func (kcp *KCP) Update(current uint32) {
 // Important to reduce unnecessary ikcp_update invoking. use it to
 // schedule ikcp_update (eg. implementing an epoll-like mechanism,
 // or optimize ikcp_update when handling massive kcp connections)
-func (kcp *KCP) Check(current uint32) uint32 {
+func (kcp *KCP) Check() uint32 {
+	current := currentMs()
 	ts_flush := kcp.ts_flush
 	tm_flush := int32(0x7fffffff)
 	tm_packet := int32(0x7fffffff)
--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/sess.go
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/sess.go
@ -3,6 +3,7 @@ package kcp
 import (
 	"crypto/rand"
 	"encoding/binary"
+	"hash/crc32"
 	"io"
 	"net"
 	"sync"
@ -10,20 +11,9 @@ import (
 	"time"

 	"github.com/pkg/errors"
-
-	"github.com/klauspost/crc32"
-
 	"golang.org/x/net/ipv4"
 )

-// Option defines extra options
-type Option interface{}
-
-// OptionWithConvId defines conversation id
-type OptionWithConvId struct {
-	Id uint32
-}
-
 type errTimeout struct {
 	error
 }
@ -38,11 +28,26 @@ const (
 	crcSize                  = 4   // 4bytes packet checksum
 	cryptHeaderSize          = nonceSize + crcSize
 	mtuLimit                 = 2048
-	txQueueLimit             = 8192
-	rxFecLimit               = 8192
-	defaultKeepAliveInterval = 10 * time.Second
+	rxQueueLimit             = 8192
+	rxFECMulti               = 3 // FEC keeps rxFECMulti* (dataShard+parityShard) ordered packets in memory
+	defaultKeepAliveInterval = 10
+)
+
+const (
+	errBrokenPipe       = "broken pipe"
+	errInvalidOperation = "invalid operation"
+)
+
+var (
+	xmitBuf sync.Pool
 )

+func init() {
+	xmitBuf.New = func() interface{} {
+		return make([]byte, mtuLimit)
+	}
+}
+
 type (
 	// UDPSession defines a KCP session implemented by UDP
 	UDPSession struct {
@ -58,14 +63,13 @@ type (
 		die               chan struct{}
 		chReadEvent       chan struct{}
 		chWriteEvent      chan struct{}
-		chTicker          chan time.Time
 		chUDPOutput       chan []byte
 		headerSize        int
 		ackNoDelay        bool
 		isClosed          bool
-		keepAliveInterval time.Duration
-		xmitBuf           sync.Pool
+		keepAliveInterval int32
 		mu                sync.Mutex
+		updateInterval    int32
 	}

 	setReadBuffer interface {
@ -80,8 +84,7 @@ type (
 // newUDPSession create a new udp session for client or server
 func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn net.PacketConn, remote net.Addr, block BlockCrypt) *UDPSession {
 	sess := new(UDPSession)
-	sess.chTicker = make(chan time.Time, 1)
-	sess.chUDPOutput = make(chan []byte, txQueueLimit)
+	sess.chUDPOutput = make(chan []byte)
 	sess.die = make(chan struct{})
 	sess.chReadEvent = make(chan struct{}, 1)
 	sess.chWriteEvent = make(chan struct{}, 1)
@ -90,10 +93,7 @@ func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn
 	sess.keepAliveInterval = defaultKeepAliveInterval
 	sess.l = l
 	sess.block = block
-	sess.fec = newFEC(rxFecLimit, dataShards, parityShards)
-	sess.xmitBuf.New = func() interface{} {
-		return make([]byte, mtuLimit)
-	}
+	sess.fec = newFEC(rxFECMulti*(dataShards+parityShards), dataShards, parityShards)
 	// calculate header size
 	if sess.block != nil {
 		sess.headerSize += cryptHeaderSize
@ -104,7 +104,7 @@ func newUDPSession(conv uint32, dataShards, parityShards int, l *Listener, conn

 	sess.kcp = NewKCP(conv, func(buf []byte, size int) {
 		if size >= IKCP_OVERHEAD {
-			ext := sess.xmitBuf.Get().([]byte)[:sess.headerSize+size]
+			ext := xmitBuf.Get().([]byte)[:sess.headerSize+size]
 			copy(ext[sess.headerSize:], buf)
 			select {
 			case sess.chUDPOutput <- ext:
@ -145,7 +145,7 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {

 		if s.isClosed {
 			s.mu.Unlock()
-			return 0, errors.New("broken pipe")
+			return 0, errors.New(errBrokenPipe)
 		}

 		if !s.rd.IsZero() {
@ -169,19 +169,25 @@ func (s *UDPSession) Read(b []byte) (n int, err error) {
 			return n, nil
 		}

-		var timeout <-chan time.Time
+		var timeout *time.Timer
+		var c <-chan time.Time
 		if !s.rd.IsZero() {
 			delay := s.rd.Sub(time.Now())
-			timeout = time.After(delay)
+			timeout = time.NewTimer(delay)
+			c = timeout.C
 		}
 		s.mu.Unlock()

 		// wait for read event or timeout
 		select {
 		case <-s.chReadEvent:
-		case <-timeout:
+		case <-c:
 		case <-s.die:
 		}
+
+		if timeout != nil {
+			timeout.Stop()
+		}
 	}
 }

@ -191,7 +197,7 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
 		s.mu.Lock()
 		if s.isClosed {
 			s.mu.Unlock()
-			return 0, errors.New("broken pipe")
+			return 0, errors.New(errBrokenPipe)
 		}

 		if !s.wd.IsZero() {
@ -201,7 +207,7 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
 			}
 		}

-		if s.kcp.WaitSnd() < 2*int(s.kcp.snd_wnd) {
+		if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
 			n = len(b)
 			max := s.kcp.mss << 8
 			for {
@ -213,26 +219,31 @@ func (s *UDPSession) Write(b []byte) (n int, err error) {
 					b = b[max:]
 				}
 			}
-			s.kcp.current = currentMs()
 			s.kcp.flush()
 			s.mu.Unlock()
 			atomic.AddUint64(&DefaultSnmp.BytesSent, uint64(n))
 			return n, nil
 		}

-		var timeout <-chan time.Time
+		var timeout *time.Timer
+		var c <-chan time.Time
 		if !s.wd.IsZero() {
 			delay := s.wd.Sub(time.Now())
-			timeout = time.After(delay)
+			timeout = time.NewTimer(delay)
+			c = timeout.C
 		}
 		s.mu.Unlock()

 		// wait for write event or timeout
 		select {
 		case <-s.chWriteEvent:
-		case <-timeout:
+		case <-c:
 		case <-s.die:
 		}
+
+		if timeout != nil {
+			timeout.Stop()
+		}
 	}
 }

@ -241,7 +252,7 @@ func (s *UDPSession) Close() error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	if s.isClosed {
-		return errors.New("broken pipe")
+		return errors.New(errBrokenPipe)
 	}
 	close(s.die)
 	s.isClosed = true
@ -321,6 +332,7 @@ func (s *UDPSession) SetNoDelay(nodelay, interval, resend, nc int) {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	s.kcp.NoDelay(nodelay, interval, resend, nc)
+	atomic.StoreInt32(&s.updateInterval, int32(interval))
 }

 // SetDSCP sets the 6bit DSCP field of IP header, no effect if it's accepted from Listener
@ -328,11 +340,13 @@ func (s *UDPSession) SetDSCP(dscp int) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 	if s.l == nil {
-		if nc, ok := s.conn.(net.Conn); ok {
+		if nc, ok := s.conn.(*ConnectedUDPConn); ok {
+			return ipv4.NewConn(nc.Conn).SetTOS(dscp << 2)
+		} else if nc, ok := s.conn.(net.Conn); ok {
 			return ipv4.NewConn(nc).SetTOS(dscp << 2)
 		}
 	}
-	return nil
+	return errors.New(errInvalidOperation)
 }

 // SetReadBuffer sets the socket read buffer, no effect if it's accepted from Listener
@ -344,7 +358,7 @@ func (s *UDPSession) SetReadBuffer(bytes int) error {
 			return nc.SetReadBuffer(bytes)
 		}
 	}
-	return nil
+	return errors.New(errInvalidOperation)
 }

 // SetWriteBuffer sets the socket write buffer, no effect if it's accepted from Listener
@ -356,24 +370,12 @@ func (s *UDPSession) SetWriteBuffer(bytes int) error {
 			return nc.SetWriteBuffer(bytes)
 		}
 	}
-	return nil
+	return errors.New(errInvalidOperation)
 }

 // SetKeepAlive changes per-connection NAT keepalive interval; 0 to disable, default to 10s
 func (s *UDPSession) SetKeepAlive(interval int) {
-	s.mu.Lock()
-	defer s.mu.Unlock()
-	s.keepAliveInterval = time.Duration(interval) * time.Second
-}
-
-// writeTo wraps write method for client & listener
-func (s *UDPSession) writeTo(b []byte, addr net.Addr) (int, error) {
-	if s.l == nil {
-		if nc, ok := s.conn.(io.Writer); ok {
-			return nc.Write(b)
-		}
-	}
-	return s.conn.WriteTo(b, addr)
+	atomic.StoreInt32(&s.keepAliveInterval, int32(interval))
 }

 func (s *UDPSession) outputTask() {
@ -385,13 +387,15 @@ func (s *UDPSession) outputTask() {
 	szOffset := fecOffset + fecHeaderSize

 	// fec data group
+	var cacheLine []byte
 	var fecGroup [][]byte
 	var fecCnt int
 	var fecMaxSize int
 	if s.fec != nil {
+		cacheLine = make([]byte, s.fec.shardSize*mtuLimit)
 		fecGroup = make([][]byte, s.fec.shardSize)
 		for k := range fecGroup {
-			fecGroup[k] = make([]byte, mtuLimit)
+			fecGroup[k] = cacheLine[k*mtuLimit : (k+1)*mtuLimit]
 		}
 	}

@ -402,23 +406,31 @@ func (s *UDPSession) outputTask() {

 	for {
 		select {
+		// receive from a synchronous channel
+		// buffered channel must be avoided, because of "bufferbloat"
 		case ext := <-s.chUDPOutput:
 			var ecc [][]byte
 			if s.fec != nil {
 				s.fec.markData(ext[fecOffset:])
-				// explicit size
+				// explicit size, including 2bytes size itself.
 				binary.LittleEndian.PutUint16(ext[szOffset:], uint16(len(ext[szOffset:])))

 				// copy data to fec group
-				xorBytes(fecGroup[fecCnt], fecGroup[fecCnt], fecGroup[fecCnt])
+				sz := len(ext)
+				fecGroup[fecCnt] = fecGroup[fecCnt][:sz]
 				copy(fecGroup[fecCnt], ext)
 				fecCnt++
-				if len(ext) > fecMaxSize {
-					fecMaxSize = len(ext)
+				if sz > fecMaxSize {
+					fecMaxSize = sz
 				}

 				//  calculate Reed-Solomon Erasure Code
 				if fecCnt == s.fec.dataShards {
+					for i := 0; i < s.fec.dataShards; i++ {
+						shard := fecGroup[i]
+						slen := len(shard)
+						xorBytes(shard[slen:fecMaxSize], shard[slen:fecMaxSize], shard[slen:fecMaxSize])
+					}
 					ecc = s.fec.calcECC(fecGroup, szOffset, fecMaxSize)
 					for k := range ecc {
 						s.fec.markFEC(ecc[k][fecOffset:])
@ -445,38 +457,36 @@ func (s *UDPSession) outputTask() {
 				}
 			}

-			//if rand.Intn(100) < 80 {
-			if n, err := s.writeTo(ext, s.remote); err == nil {
-				atomic.AddUint64(&DefaultSnmp.OutSegs, 1)
-				atomic.AddUint64(&DefaultSnmp.OutBytes, uint64(n))
+			nbytes := 0
+			nsegs := 0
+			// if mrand.Intn(100) < 50 {
+			if n, err := s.conn.WriteTo(ext, s.remote); err == nil {
+				nbytes += n
+				nsegs++
 			}
-			//}
+			// }

 			if ecc != nil {
 				for k := range ecc {
-					if n, err := s.writeTo(ecc[k], s.remote); err == nil {
-						atomic.AddUint64(&DefaultSnmp.OutSegs, 1)
-						atomic.AddUint64(&DefaultSnmp.OutBytes, uint64(n))
+					if n, err := s.conn.WriteTo(ecc[k], s.remote); err == nil {
+						nbytes += n
+						nsegs++
 					}
 				}
 			}
-			xorBytes(ext, ext, ext)
-			s.xmitBuf.Put(ext)
+			atomic.AddUint64(&DefaultSnmp.OutSegs, uint64(nsegs))
+			atomic.AddUint64(&DefaultSnmp.OutBytes, uint64(nbytes))
+			xmitBuf.Put(ext)
 		case <-ticker.C: // NAT keep-alive
-			if len(s.chUDPOutput) == 0 {
-				s.mu.Lock()
-				interval := s.keepAliveInterval
-				s.mu.Unlock()
-				if interval > 0 && time.Now().After(lastPing.Add(interval)) {
-					buf := make([]byte, 2)
-					io.ReadFull(rand.Reader, buf)
-					rnd := int(binary.LittleEndian.Uint16(buf))
-					sz := rnd%(IKCP_MTU_DEF-s.headerSize-IKCP_OVERHEAD) + s.headerSize + IKCP_OVERHEAD
-					ping := make([]byte, sz)
-					io.ReadFull(rand.Reader, ping)
-					s.writeTo(ping, s.remote)
-					lastPing = time.Now()
-				}
+			interval := time.Duration(atomic.LoadInt32(&s.keepAliveInterval)) * time.Second
+			if interval > 0 && time.Now().After(lastPing.Add(interval)) {
+				var rnd uint16
+				binary.Read(rand.Reader, binary.LittleEndian, &rnd)
+				sz := int(rnd)%(IKCP_MTU_DEF-s.headerSize-IKCP_OVERHEAD) + s.headerSize + IKCP_OVERHEAD
+				ping := make([]byte, sz) // randomized ping packet
+				io.ReadFull(rand.Reader, ping)
+				s.conn.WriteTo(ping, s.remote)
+				lastPing = time.Now()
 			}
 		case <-s.die:
 			return
@ -486,25 +496,18 @@ func (s *UDPSession) outputTask() {

 // kcp update, input loop
 func (s *UDPSession) updateTask() {
-	var tc <-chan time.Time
-	if s.l == nil { // client
-		ticker := time.NewTicker(10 * time.Millisecond)
-		tc = ticker.C
-		defer ticker.Stop()
-	} else {
-		tc = s.chTicker
-	}
+	tc := time.After(time.Duration(atomic.LoadInt32(&s.updateInterval)) * time.Millisecond)

 	for {
 		select {
 		case <-tc:
 			s.mu.Lock()
-			current := currentMs()
-			s.kcp.Update(current)
-			if s.kcp.WaitSnd() < 2*int(s.kcp.snd_wnd) {
+			s.kcp.flush()
+			if s.kcp.WaitSnd() < int(s.kcp.snd_wnd) {
 				s.notifyWriteEvent()
 			}
 			s.mu.Unlock()
+			tc = time.After(time.Duration(atomic.LoadInt32(&s.updateInterval)) * time.Millisecond)
 		case <-s.die:
 			if s.l != nil { // has listener
 				select {
@ -537,58 +540,84 @@ func (s *UDPSession) notifyWriteEvent() {
 }

 func (s *UDPSession) kcpInput(data []byte) {
-	current := currentMs()
+	var kcpInErrors, fecErrs, fecRecovered, fecSegs uint64
+
 	if s.fec != nil {
 		f := s.fec.decode(data)
+		s.mu.Lock()
+		if f.flag == typeData {
+			if ret := s.kcp.Input(data[fecHeaderSizePlus2:], true); ret != 0 {
+				kcpInErrors++
+			}
+		}
+
 		if f.flag == typeData || f.flag == typeFEC {
 			if f.flag == typeFEC {
-				atomic.AddUint64(&DefaultSnmp.FECSegs, 1)
+				fecSegs++
 			}

 			if recovers := s.fec.input(f); recovers != nil {
-				s.mu.Lock()
-				s.kcp.current = current
-				for k := range recovers {
-					sz := binary.LittleEndian.Uint16(recovers[k])
-					if int(sz) <= len(recovers[k]) && sz >= 2 {
-						s.kcp.Input(recovers[k][2:sz], false)
+				for _, r := range recovers {
+					if len(r) >= 2 { // must be at least 2 bytes to hold the size field
+						sz := binary.LittleEndian.Uint16(r)
+						if int(sz) <= len(r) && sz >= 2 {
+							if ret := s.kcp.Input(r[2:sz], false); ret == 0 {
+								fecRecovered++
+							} else {
+								kcpInErrors++
+							}
+						} else {
+							fecErrs++
+						}
 					} else {
-						atomic.AddUint64(&DefaultSnmp.FECErrs, 1)
+						fecErrs++
 					}
 				}
-				s.mu.Unlock()
-				atomic.AddUint64(&DefaultSnmp.FECRecovered, uint64(len(recovers)))
 			}
 		}
-		if f.flag == typeData {
-			s.mu.Lock()
-			s.kcp.current = current
-			s.kcp.Input(data[fecHeaderSizePlus2:], true)
-			s.mu.Unlock()
+
+		// notify reader
+		if n := s.kcp.PeekSize(); n > 0 {
+			s.notifyReadEvent()
+		}
+		if s.ackNoDelay {
+			s.kcp.flush()
 		}
+		s.mu.Unlock()
 	} else {
 		s.mu.Lock()
-		s.kcp.current = current
-		s.kcp.Input(data, true)
+		if ret := s.kcp.Input(data, true); ret != 0 {
+			kcpInErrors++
+		}
+		// notify reader
+		if n := s.kcp.PeekSize(); n > 0 {
+			s.notifyReadEvent()
+		}
+		if s.ackNoDelay {
+			s.kcp.flush()
+		}
 		s.mu.Unlock()
 	}

-	// notify reader
-	s.mu.Lock()
-	if n := s.kcp.PeekSize(); n > 0 {
-		s.notifyReadEvent()
+	atomic.AddUint64(&DefaultSnmp.InSegs, 1)
+	atomic.AddUint64(&DefaultSnmp.InBytes, uint64(len(data)))
+	if fecSegs > 0 {
+		atomic.AddUint64(&DefaultSnmp.FECSegs, fecSegs)
 	}
-	if s.ackNoDelay {
-		s.kcp.current = current
-		s.kcp.flush()
+	if kcpInErrors > 0 {
+		atomic.AddUint64(&DefaultSnmp.KCPInErrors, kcpInErrors)
+	}
+	if fecErrs > 0 {
+		atomic.AddUint64(&DefaultSnmp.FECErrs, fecErrs)
+	}
+	if fecRecovered > 0 {
+		atomic.AddUint64(&DefaultSnmp.FECRecovered, fecRecovered)
 	}
-	s.mu.Unlock()
-	atomic.AddUint64(&DefaultSnmp.InSegs, 1)
 }

 func (s *UDPSession) receiver(ch chan []byte) {
 	for {
-		data := s.xmitBuf.Get().([]byte)[:mtuLimit]
+		data := xmitBuf.Get().([]byte)[:mtuLimit]
 		if n, _, err := s.conn.ReadFrom(data); err == nil && n >= s.headerSize+IKCP_OVERHEAD {
 			select {
 			case ch <- data[:n]:
@ -604,7 +633,7 @@ func (s *UDPSession) receiver(ch chan []byte) {

 // read loop for client session
 func (s *UDPSession) readLoop() {
-	chPacket := make(chan []byte, txQueueLimit)
+	chPacket := make(chan []byte, rxQueueLimit)
 	go s.receiver(chPacket)

 	for {
@ -629,8 +658,7 @@ func (s *UDPSession) readLoop() {
 			if dataValid {
 				s.kcpInput(data)
 			}
-			xorBytes(raw, raw, raw)
-			s.xmitBuf.Put(raw)
+			xmitBuf.Put(raw)
 		case <-s.die:
 			return
 		}
@ -662,10 +690,8 @@ type (

 // monitor incoming data for all connections of server
 func (l *Listener) monitor() {
-	chPacket := make(chan packet, txQueueLimit)
+	chPacket := make(chan packet, rxQueueLimit)
 	go l.receiver(chPacket)
-	ticker := time.NewTicker(10 * time.Millisecond)
-	defer ticker.Stop()
 	for {
 		select {
 		case p := <-chPacket:
@ -715,20 +741,11 @@ func (l *Listener) monitor() {
 				}
 			}

-			xorBytes(raw, raw, raw)
 			l.rxbuf.Put(raw)
 		case deadlink := <-l.chDeadlinks:
 			delete(l.sessions, deadlink.String())
 		case <-l.die:
 			return
-		case <-ticker.C:
-			now := time.Now()
-			for _, s := range l.sessions {
-				select {
-				case s.chTicker <- now:
-				default:
-				}
-			}
 		}
 	}
 }
@ -751,7 +768,7 @@ func (l *Listener) SetReadBuffer(bytes int) error {
 	if nc, ok := l.conn.(setReadBuffer); ok {
 		return nc.SetReadBuffer(bytes)
 	}
-	return nil
+	return errors.New(errInvalidOperation)
 }

 // SetWriteBuffer sets the socket write buffer for the Listener
@ -759,7 +776,7 @@ func (l *Listener) SetWriteBuffer(bytes int) error {
 	if nc, ok := l.conn.(setWriteBuffer); ok {
 		return nc.SetWriteBuffer(bytes)
 	}
-	return nil
+	return errors.New(errInvalidOperation)
 }

 // SetDSCP sets the 6bit DSCP field of IP header
@ -767,7 +784,7 @@ func (l *Listener) SetDSCP(dscp int) error {
 	if nc, ok := l.conn.(net.Conn); ok {
 		return ipv4.NewConn(nc).SetTOS(dscp << 2)
 	}
-	return nil
+	return errors.New(errInvalidOperation)
 }

 // Accept implements the Accept method in the Listener interface; it waits for the next call and returns a generic Conn.
@ -788,7 +805,7 @@ func (l *Listener) AcceptKCP() (*UDPSession, error) {
 	case c := <-l.chAccepts:
 		return c, nil
 	case <-l.die:
-		return nil, errors.New("listener stopped")
+		return nil, errors.New(errBrokenPipe)
 	}
 }

@ -823,7 +840,7 @@ func (l *Listener) Addr() net.Addr {
 }

 // Listen listens for incoming KCP packets addressed to the local address laddr on the network "udp",
-func Listen(laddr string) (*Listener, error) {
+func Listen(laddr string) (net.Listener, error) {
 	return ListenWithOptions(laddr, nil, 0, 0)
 }

@ -839,6 +856,11 @@ func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards
 		return nil, errors.Wrap(err, "net.ListenUDP")
 	}

+	return ServeConn(block, dataShards, parityShards, conn)
+}
+
+// ServeConn serves KCP protocol for a single packet connection.
+func ServeConn(block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*Listener, error) {
 	l := new(Listener)
 	l.conn = conn
 	l.sessions = make(map[string]*UDPSession)
@ -848,7 +870,7 @@ func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards
 	l.dataShards = dataShards
 	l.parityShards = parityShards
 	l.block = block
-	l.fec = newFEC(rxFecLimit, dataShards, parityShards)
+	l.fec = newFEC(rxFECMulti*(dataShards+parityShards), dataShards, parityShards)
 	l.rxbuf.New = func() interface{} {
 		return make([]byte, mtuLimit)
 	}
@ -866,12 +888,12 @@ func ListenWithOptions(laddr string, block BlockCrypt, dataShards, parityShards
 }

 // Dial connects to the remote address "raddr" on the network "udp"
-func Dial(raddr string) (*UDPSession, error) {
+func Dial(raddr string) (net.Conn, error) {
 	return DialWithOptions(raddr, nil, 0, 0)
 }

 // DialWithOptions connects to the remote address "raddr" on the network "udp" with packet encryption
-func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int, opts ...Option) (*UDPSession, error) {
+func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards int) (*UDPSession, error) {
 	udpaddr, err := net.ResolveUDPAddr("udp", raddr)
 	if err != nil {
 		return nil, errors.Wrap(err, "net.ResolveUDPAddr")
@ -882,20 +904,34 @@ func DialWithOptions(raddr string, block BlockCrypt, dataShards, parityShards in
 		return nil, errors.Wrap(err, "net.DialUDP")
 	}

-	buf := make([]byte, 4)
-	io.ReadFull(rand.Reader, buf)
-	convid := binary.LittleEndian.Uint32(buf)
-	for k := range opts {
-		switch opt := opts[k].(type) {
-		case OptionWithConvId:
-			convid = opt.Id
-		default:
-			return nil, errors.New("unrecognized option")
-		}
+	return NewConn(raddr, block, dataShards, parityShards, &ConnectedUDPConn{udpconn, udpconn})
+}
+
+// NewConn establishes a session and talks KCP protocol over a packet connection.
+func NewConn(raddr string, block BlockCrypt, dataShards, parityShards int, conn net.PacketConn) (*UDPSession, error) {
+	udpaddr, err := net.ResolveUDPAddr("udp", raddr)
+	if err != nil {
+		return nil, errors.Wrap(err, "net.ResolveUDPAddr")
 	}
-	return newUDPSession(convid, dataShards, parityShards, nil, udpconn, udpaddr, block), nil
+
+	var convid uint32
+	binary.Read(rand.Reader, binary.LittleEndian, &convid)
+	return newUDPSession(convid, dataShards, parityShards, nil, conn, udpaddr, block), nil
 }

 func currentMs() uint32 {
 	return uint32(time.Now().UnixNano() / int64(time.Millisecond))
 }
+
+// ConnectedUDPConn is a wrapper for net.UDPConn which converts WriteTo syscalls
+// to Write syscalls that are 4 times faster on some OSes. This should only be
+// used for connections that were produced by a net.Dial* call.
+type ConnectedUDPConn struct {
+	*net.UDPConn
+	Conn net.Conn // underlying connection if any
+}
+
+// WriteTo redirects all writes to the Write syscall, which is 4 times faster.
+func (c *ConnectedUDPConn) WriteTo(b []byte, addr net.Addr) (int, error) {
+	return c.Write(b)
+}
--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/snmp.go
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/snmp.go
@ -1,34 +1,95 @@
 package kcp

-import "sync/atomic"
+import (
+	"fmt"
+	"sync/atomic"
+)

 // Snmp defines network statistics indicator
 type Snmp struct {
-	BytesSent        uint64 // payload bytes sent
+	BytesSent        uint64 // raw bytes sent
 	BytesReceived    uint64
 	MaxConn          uint64
 	ActiveOpens      uint64
 	PassiveOpens     uint64
-	CurrEstab        uint64
-	InErrs           uint64
-	InCsumErrors     uint64 // checksum errors
+	CurrEstab        uint64 // count of connections for now
+	InErrs           uint64 // udp read errors
+	InCsumErrors     uint64 // checksum errors from CRC32
+	KCPInErrors      uint64 // packet input errors from kcp
 	InSegs           uint64
 	OutSegs          uint64
+	InBytes          uint64 // udp bytes received
 	OutBytes         uint64 // udp bytes sent
 	RetransSegs      uint64
 	FastRetransSegs  uint64
 	EarlyRetransSegs uint64
-	LostSegs         uint64
-	RepeatSegs       uint64
-	FECRecovered     uint64
-	FECErrs          uint64
-	FECSegs          uint64 // fec segments received
+	LostSegs         uint64 // number of segs inferred as lost
+	RepeatSegs       uint64 // number of segs duplicated
+	FECRecovered     uint64 // correct packets recovered from FEC
+	FECErrs          uint64 // incorrect packets recovered from FEC
+	FECSegs          uint64 // FEC segments received
+	FECShortShards   uint64 // number of data shards that's not enough for recovery
 }

 func newSnmp() *Snmp {
 	return new(Snmp)
 }

+func (s *Snmp) Header() []string {
+	return []string{
+		"BytesSent",
+		"BytesReceived",
+		"MaxConn",
+		"ActiveOpens",
+		"PassiveOpens",
+		"CurrEstab",
+		"InErrs",
+		"InCsumErrors",
+		"KCPInErrors",
+		"InSegs",
+		"OutSegs",
+		"InBytes",
+		"OutBytes",
+		"RetransSegs",
+		"FastRetransSegs",
+		"EarlyRetransSegs",
+		"LostSegs",
+		"RepeatSegs",
+		"FECSegs",
+		"FECErrs",
+		"FECRecovered",
+		"FECShortShards",
+	}
+}
+
+func (s *Snmp) ToSlice() []string {
+	snmp := s.Copy()
+	return []string{
+		fmt.Sprint(snmp.BytesSent),
+		fmt.Sprint(snmp.BytesReceived),
+		fmt.Sprint(snmp.MaxConn),
+		fmt.Sprint(snmp.ActiveOpens),
+		fmt.Sprint(snmp.PassiveOpens),
+		fmt.Sprint(snmp.CurrEstab),
+		fmt.Sprint(snmp.InErrs),
+		fmt.Sprint(snmp.InCsumErrors),
+		fmt.Sprint(snmp.KCPInErrors),
+		fmt.Sprint(snmp.InSegs),
+		fmt.Sprint(snmp.OutSegs),
+		fmt.Sprint(snmp.InBytes),
+		fmt.Sprint(snmp.OutBytes),
+		fmt.Sprint(snmp.RetransSegs),
+		fmt.Sprint(snmp.FastRetransSegs),
+		fmt.Sprint(snmp.EarlyRetransSegs),
+		fmt.Sprint(snmp.LostSegs),
+		fmt.Sprint(snmp.RepeatSegs),
+		fmt.Sprint(snmp.FECSegs),
+		fmt.Sprint(snmp.FECErrs),
+		fmt.Sprint(snmp.FECRecovered),
+		fmt.Sprint(snmp.FECShortShards),
+	}
+}
+
 // Copy make a copy of current snmp snapshot
 func (s *Snmp) Copy() *Snmp {
 	d := newSnmp()
@ -40,8 +101,10 @@ func (s *Snmp) Copy() *Snmp {
 	d.CurrEstab = atomic.LoadUint64(&s.CurrEstab)
 	d.InErrs = atomic.LoadUint64(&s.InErrs)
 	d.InCsumErrors = atomic.LoadUint64(&s.InCsumErrors)
+	d.KCPInErrors = atomic.LoadUint64(&s.KCPInErrors)
 	d.InSegs = atomic.LoadUint64(&s.InSegs)
 	d.OutSegs = atomic.LoadUint64(&s.OutSegs)
+	d.InBytes = atomic.LoadUint64(&s.InBytes)
 	d.OutBytes = atomic.LoadUint64(&s.OutBytes)
 	d.RetransSegs = atomic.LoadUint64(&s.RetransSegs)
 	d.FastRetransSegs = atomic.LoadUint64(&s.FastRetransSegs)
@ -51,9 +114,36 @@ func (s *Snmp) Copy() *Snmp {
 	d.FECSegs = atomic.LoadUint64(&s.FECSegs)
 	d.FECErrs = atomic.LoadUint64(&s.FECErrs)
 	d.FECRecovered = atomic.LoadUint64(&s.FECRecovered)
+	d.FECShortShards = atomic.LoadUint64(&s.FECShortShards)
 	return d
 }

+// Reset values to zero
+func (s *Snmp) Reset() {
+	atomic.StoreUint64(&s.BytesSent, 0)
+	atomic.StoreUint64(&s.BytesReceived, 0)
+	atomic.StoreUint64(&s.MaxConn, 0)
+	atomic.StoreUint64(&s.ActiveOpens, 0)
+	atomic.StoreUint64(&s.PassiveOpens, 0)
+	atomic.StoreUint64(&s.CurrEstab, 0)
+	atomic.StoreUint64(&s.InErrs, 0)
+	atomic.StoreUint64(&s.InCsumErrors, 0)
+	atomic.StoreUint64(&s.KCPInErrors, 0)
+	atomic.StoreUint64(&s.InSegs, 0)
+	atomic.StoreUint64(&s.OutSegs, 0)
+	atomic.StoreUint64(&s.InBytes, 0)
+	atomic.StoreUint64(&s.OutBytes, 0)
+	atomic.StoreUint64(&s.RetransSegs, 0)
+	atomic.StoreUint64(&s.FastRetransSegs, 0)
+	atomic.StoreUint64(&s.EarlyRetransSegs, 0)
+	atomic.StoreUint64(&s.LostSegs, 0)
+	atomic.StoreUint64(&s.RepeatSegs, 0)
+	atomic.StoreUint64(&s.FECSegs, 0)
+	atomic.StoreUint64(&s.FECErrs, 0)
+	atomic.StoreUint64(&s.FECRecovered, 0)
+	atomic.StoreUint64(&s.FECShortShards, 0)
+}
+
 // DefaultSnmp is the global KCP connection statistics collector
 var DefaultSnmp *Snmp

--- a/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/xor.go
+++ b/cmd/gost/vendor/gopkg.in/xtaci/kcp-go.v2/xor.go
@ -44,15 +44,18 @@ func safeXORBytes(dst, a, b []byte) int {
 	}

 	for i := ex; i < n; i += 8 {
-		dst[i] = a[i] ^ b[i]
-		dst[i+1] = a[i+1] ^ b[i+1]
-		dst[i+2] = a[i+2] ^ b[i+2]
-		dst[i+3] = a[i+3] ^ b[i+3]
+		_dst := dst[i : i+8]
+		_a := a[i : i+8]
+		_b := b[i : i+8]
+		_dst[0] = _a[0] ^ _b[0]
+		_dst[1] = _a[1] ^ _b[1]
+		_dst[2] = _a[2] ^ _b[2]
+		_dst[3] = _a[3] ^ _b[3]

-		dst[i+4] = a[i+4] ^ b[i+4]
-		dst[i+5] = a[i+5] ^ b[i+5]
-		dst[i+6] = a[i+6] ^ b[i+6]
-		dst[i+7] = a[i+7] ^ b[i+7]
+		_dst[4] = _a[4] ^ _b[4]
+		_dst[5] = _a[5] ^ _b[5]
+		_dst[6] = _a[6] ^ _b[6]
+		_dst[7] = _a[7] ^ _b[7]
 	}
 	return n
 }
@ -85,14 +88,17 @@ func fastXORWords(dst, a, b []byte) {
 	}

 	for i := ex; i < n; i += 8 {
-		dw[i] = aw[i] ^ bw[i]
-		dw[i+1] = aw[i+1] ^ bw[i+1]
-		dw[i+2] = aw[i+2] ^ bw[i+2]
-		dw[i+3] = aw[i+3] ^ bw[i+3]
-		dw[i+4] = aw[i+4] ^ bw[i+4]
-		dw[i+5] = aw[i+5] ^ bw[i+5]
-		dw[i+6] = aw[i+6] ^ bw[i+6]
-		dw[i+7] = aw[i+7] ^ bw[i+7]
+		_dw := dw[i : i+8]
+		_aw := aw[i : i+8]
+		_bw := bw[i : i+8]
+		_dw[0] = _aw[0] ^ _bw[0]
+		_dw[1] = _aw[1] ^ _bw[1]
+		_dw[2] = _aw[2] ^ _bw[2]
+		_dw[3] = _aw[3] ^ _bw[3]
+		_dw[4] = _aw[4] ^ _bw[4]
+		_dw[5] = _aw[5] ^ _bw[5]
+		_dw[6] = _aw[6] ^ _bw[6]
+		_dw[7] = _aw[7] ^ _bw[7]
 	}
 }

--- a/cmd/gost/vendor/gopkg.in/xtaci/smux.v1/README.md
+++ b/cmd/gost/vendor/gopkg.in/xtaci/smux.v1/README.md
@ -62,7 +62,7 @@ func client() {
        panic(err)
    }

-    // Stream implements net.Conn
+    // Stream implements io.ReadWriteCloser
    stream.Write([]byte("ping"))
 }

@ -94,4 +94,4 @@ func server() {

 ## Status

-Beta
+Stable
--- a/cmd/gost/vendor/gopkg.in/xtaci/smux.v1/session.go
+++ b/cmd/gost/vendor/gopkg.in/xtaci/smux.v1/session.go
@ -16,10 +16,19 @@ const (

 const (
 	errBrokenPipe      = "broken pipe"
-	errConnReset       = "connection reset by peer"
 	errInvalidProtocol = "invalid protocol version"
 )

+type writeRequest struct {
+	frame  Frame
+	result chan writeResult
+}
+
+type writeResult struct {
+	n   int
+	err error
+}
+
 // Session defines a multiplexed connection for streams
 type Session struct {
 	conn      io.ReadWriteCloser
@ -38,7 +47,12 @@ type Session struct {
 	dieLock   sync.Mutex
 	chAccepts chan *Stream

+	xmitPool  sync.Pool
 	dataReady int32 // flag data has arrived
+
+	deadline atomic.Value
+
+	writes chan writeRequest
 }

 func newSession(config *Config, conn io.ReadWriteCloser, client bool) *Session {
@ -50,12 +64,18 @@ func newSession(config *Config, conn io.ReadWriteCloser, client bool) *Session {
 	s.chAccepts = make(chan *Stream, defaultAcceptBacklog)
 	s.bucket = int32(config.MaxReceiveBuffer)
 	s.bucketCond = sync.NewCond(&sync.Mutex{})
+	s.xmitPool.New = func() interface{} {
+		return make([]byte, (1<<16)+headerSize)
+	}
+	s.writes = make(chan writeRequest)
+
 	if client {
 		s.nextStreamID = 1
 	} else {
 		s.nextStreamID = 2
 	}
 	go s.recvLoop()
+	go s.sendLoop()
 	go s.keepalive()
 	return s
 }
@ -82,9 +102,17 @@ func (s *Session) OpenStream() (*Stream, error) {
 // AcceptStream is used to block until the next available stream
 // is ready to be accepted.
 func (s *Session) AcceptStream() (*Stream, error) {
+	var deadline <-chan time.Time
+	if d, ok := s.deadline.Load().(time.Time); ok && !d.IsZero() {
+		timer := time.NewTimer(d.Sub(time.Now()))
+		defer timer.Stop()
+		deadline = timer.C
+	}
 	select {
 	case stream := <-s.chAccepts:
 		return stream, nil
+	case <-deadline:
+		return nil, errTimeout
 	case <-s.die:
 		return nil, errors.New(errBrokenPipe)
 	}
@ -93,13 +121,14 @@ func (s *Session) AcceptStream() (*Stream, error) {
 // Close is used to close the session and all streams.
 func (s *Session) Close() (err error) {
 	s.dieLock.Lock()
-	defer s.dieLock.Unlock()

 	select {
 	case <-s.die:
+		s.dieLock.Unlock()
 		return errors.New(errBrokenPipe)
 	default:
 		close(s.die)
+		s.dieLock.Unlock()
 		s.streamLock.Lock()
 		for k := range s.streams {
 			s.streams[k].sessionClose()
@ -130,6 +159,13 @@ func (s *Session) NumStreams() int {
 	return len(s.streams)
 }

+// SetDeadline sets a deadline used by Accept* calls.
+// A zero time value disables the deadline.
+func (s *Session) SetDeadline(t time.Time) error {
+	s.deadline.Store(t)
+	return nil
+}
+
 // notify the session that a stream has closed
 func (s *Session) streamClosed(sid uint32) {
 	s.streamLock.Lock()
@ -144,9 +180,12 @@ func (s *Session) streamClosed(sid uint32) {

 // returnTokens is called by stream to return token after read
 func (s *Session) returnTokens(n int) {
-	if atomic.AddInt32(&s.bucket, int32(n)) > 0 {
+	oldvalue := atomic.LoadInt32(&s.bucket)
+	newvalue := atomic.AddInt32(&s.bucket, int32(n))
+	if oldvalue <= 0 && newvalue > 0 {
 		s.bucketCond.Signal()
 	}
+
 }

 // session read a frame from underlying connection
@ -250,26 +289,56 @@ func (s *Session) keepalive() {
 	}
 }

+func (s *Session) sendLoop() {
+	for {
+		select {
+		case <-s.die:
+			return
+		case request, ok := <-s.writes:
+			if !ok {
+				continue
+			}
+			buf := s.xmitPool.Get().([]byte)
+			buf[0] = request.frame.ver
+			buf[1] = request.frame.cmd
+			binary.LittleEndian.PutUint16(buf[2:], uint16(len(request.frame.data)))
+			binary.LittleEndian.PutUint32(buf[4:], request.frame.sid)
+			copy(buf[headerSize:], request.frame.data)
+
+			s.writeLock.Lock()
+			n, err := s.conn.Write(buf[:headerSize+len(request.frame.data)])
+			s.writeLock.Unlock()
+			s.xmitPool.Put(buf)
+
+			n -= headerSize
+			if n < 0 {
+				n = 0
+			}
+
+			result := writeResult{
+				n:   n,
+				err: err,
+			}
+
+			request.result <- result
+			close(request.result)
+		}
+	}
+}
+
 // writeFrame writes the frame to the underlying connection
 // and returns the number of bytes written if successful
 func (s *Session) writeFrame(f Frame) (n int, err error) {
-	buf := make([]byte, headerSize+len(f.data))
-	buf[0] = f.ver
-	buf[1] = f.cmd
-	binary.LittleEndian.PutUint16(buf[2:], uint16(len(f.data)))
-	binary.LittleEndian.PutUint32(buf[4:], f.sid)
-	copy(buf[headerSize:], f.data)
-
-	s.writeLock.Lock()
-	n, err = s.conn.Write(buf)
-	s.writeLock.Unlock()
-	return n, err
-}
+	req := writeRequest{
+		frame:  f,
+		result: make(chan writeResult, 1),
+	}
+	select {
+	case <-s.die:
+		return 0, errors.New(errBrokenPipe)
+	case s.writes <- req:
+	}

-// writeBinary writes the byte slice to the underlying connection
-func (s *Session) writeBinary(bts []byte) (n int, err error) {
-	s.writeLock.Lock()
-	n, err = s.conn.Write(bts)
-	s.writeLock.Unlock()
-	return n, err
+	result := <-req.result
+	return result.n, result.err
 }
--- a/cmd/gost/vendor/gopkg.in/xtaci/smux.v1/stream.go
+++ b/cmd/gost/vendor/gopkg.in/xtaci/smux.v1/stream.go
@ -2,24 +2,28 @@ package smux

 import (
 	"bytes"
-	"encoding/binary"
+	"io"
+	"net"
 	"sync"
 	"sync/atomic"
+	"time"

 	"github.com/pkg/errors"
 )

 // Stream implements io.ReadWriteCloser
 type Stream struct {
-	id          uint32
-	rstflag     int32
-	sess        *Session
-	buffer      bytes.Buffer
-	bufferLock  sync.Mutex
-	frameSize   int
-	chReadEvent chan struct{} // notify a read event
-	die         chan struct{} // flag the stream has closed
-	dieLock     sync.Mutex
+	id            uint32
+	rstflag       int32
+	sess          *Session
+	buffer        bytes.Buffer
+	bufferLock    sync.Mutex
+	frameSize     int
+	chReadEvent   chan struct{} // notify a read event
+	die           chan struct{} // flag the stream has closed
+	dieLock       sync.Mutex
+	readDeadline  atomic.Value
+	writeDeadline atomic.Value
 }

 // newStream initiates a Stream struct
@ -35,10 +39,19 @@ func newStream(id uint32, frameSize int, sess *Session) *Stream {

 // Read implements io.ReadWriteCloser
 func (s *Stream) Read(b []byte) (n int, err error) {
+	var deadline <-chan time.Time
+	if d, ok := s.readDeadline.Load().(time.Time); ok && !d.IsZero() {
+		timer := time.NewTimer(d.Sub(time.Now()))
+		defer timer.Stop()
+		deadline = timer.C
+	}
+
 READ:
 	select {
 	case <-s.die:
 		return 0, errors.New(errBrokenPipe)
+	case <-deadline:
+		return n, errTimeout
 	default:
 	}

@ -51,12 +64,14 @@ READ:
 		return n, nil
 	} else if atomic.LoadInt32(&s.rstflag) == 1 {
 		_ = s.Close()
-		return 0, errors.New(errConnReset)
+		return 0, io.EOF
 	}

 	select {
 	case <-s.chReadEvent:
 		goto READ
+	case <-deadline:
+		return n, errTimeout
 	case <-s.die:
 		return 0, errors.New(errBrokenPipe)
 	}
@ -64,6 +79,13 @@ READ:

 // Write implements io.ReadWriteCloser
 func (s *Stream) Write(b []byte) (n int, err error) {
+	var deadline <-chan time.Time
+	if d, ok := s.writeDeadline.Load().(time.Time); ok && !d.IsZero() {
+		timer := time.NewTimer(d.Sub(time.Now()))
+		defer timer.Stop()
+		deadline = timer.C
+	}
+
 	select {
 	case <-s.die:
 		return 0, errors.New(errBrokenPipe)
@ -71,42 +93,82 @@ func (s *Stream) Write(b []byte) (n int, err error) {
 	}

 	frames := s.split(b, cmdPSH, s.id)
-	// preallocate buffer
-	buffer := make([]byte, len(frames)*headerSize+len(b))
-	bts := buffer
-
-	// combine frames into a large blob
+	sent := 0
 	for k := range frames {
-		bts[0] = version
-		bts[1] = frames[k].cmd
-		binary.LittleEndian.PutUint16(bts[2:], uint16(len(frames[k].data)))
-		binary.LittleEndian.PutUint32(bts[4:], frames[k].sid)
-		copy(bts[headerSize:], frames[k].data)
-		bts = bts[len(frames[k].data)+headerSize:]
-	}
+		req := writeRequest{
+			frame:  frames[k],
+			result: make(chan writeResult, 1),
+		}
+
+		select {
+		case s.sess.writes <- req:
+		case <-s.die:
+			return sent, errors.New(errBrokenPipe)
+		case <-deadline:
+			return sent, errTimeout
+		}

-	if _, err = s.sess.writeBinary(buffer); err != nil {
-		return 0, err
+		select {
+		case result := <-req.result:
+			sent += result.n
+			if result.err != nil {
+				return sent, result.err
+			}
+		case <-s.die:
+			return sent, errors.New(errBrokenPipe)
+		case <-deadline:
+			return sent, errTimeout
+		}
 	}
-	return len(b), nil
+	return sent, nil
 }

 // Close implements io.ReadWriteCloser
 func (s *Stream) Close() error {
 	s.dieLock.Lock()
-	defer s.dieLock.Unlock()

 	select {
 	case <-s.die:
+		s.dieLock.Unlock()
 		return errors.New(errBrokenPipe)
 	default:
 		close(s.die)
+		s.dieLock.Unlock()
 		s.sess.streamClosed(s.id)
 		_, err := s.sess.writeFrame(newFrame(cmdRST, s.id))
 		return err
 	}
 }

+// SetReadDeadline sets the read deadline as defined by
+// net.Conn.SetReadDeadline.
+// A zero time value disables the deadline.
+func (s *Stream) SetReadDeadline(t time.Time) error {
+	s.readDeadline.Store(t)
+	return nil
+}
+
+// SetWriteDeadline sets the write deadline as defined by
+// net.Conn.SetWriteDeadline.
+// A zero time value disables the deadline.
+func (s *Stream) SetWriteDeadline(t time.Time) error {
+	s.writeDeadline.Store(t)
+	return nil
+}
+
+// SetDeadline sets both read and write deadlines as defined by
+// net.Conn.SetDeadline.
+// A zero time value disables the deadlines.
+func (s *Stream) SetDeadline(t time.Time) error {
+	if err := s.SetReadDeadline(t); err != nil {
+		return err
+	}
+	if err := s.SetWriteDeadline(t); err != nil {
+		return err
+	}
+	return nil
+}
+
 // session closes the stream
 func (s *Stream) sessionClose() {
 	s.dieLock.Lock()
@ -119,6 +181,26 @@ func (s *Stream) sessionClose() {
 	}
 }

+// LocalAddr satisfies net.Conn interface
+func (s *Stream) LocalAddr() net.Addr {
+	if ts, ok := s.sess.conn.(interface {
+		LocalAddr() net.Addr
+	}); ok {
+		return ts.LocalAddr()
+	}
+	return nil
+}
+
+// RemoteAddr satisfies net.Conn interface
+func (s *Stream) RemoteAddr() net.Addr {
+	if ts, ok := s.sess.conn.(interface {
+		RemoteAddr() net.Addr
+	}); ok {
+		return ts.RemoteAddr()
+	}
+	return nil
+}
+
 // pushBytes a slice into buffer
 func (s *Stream) pushBytes(p []byte) {
 	s.bufferLock.Lock()
@ -164,3 +246,11 @@ func (s *Stream) notifyReadEvent() {
 func (s *Stream) markRST() {
 	atomic.StoreInt32(&s.rstflag, 1)
 }
+
+var errTimeout error = &timeoutError{}
+
+type timeoutError struct{}
+
+func (e *timeoutError) Error() string   { return "i/o timeout" }
+func (e *timeoutError) Timeout() bool   { return true }
+func (e *timeoutError) Temporary() bool { return true }
--- a/cmd/gost/vendor/vendor.json
+++ b/cmd/gost/vendor/vendor.json
@ -8,12 +8,6 @@
 			"revision": "c91e78db502ff629614837aacb7aa4efa61c651a",
 			"revisionTime": "2016-04-30T09:49:23Z"
 		},
-		{
-			"checksumSHA1": "QPs3L3mjPoi+a9GJCjW8HhyJczM=",
-			"path": "github.com/codahale/chacha20",
-			"revision": "ec07b4f69a3f70b1dd2a8ad77230deb1ba5d6953",
-			"revisionTime": "2015-11-07T02:50:05Z"
-		},
 		{
 			"checksumSHA1": "aIhLeVAIrsjs63CwqmU3+GU8yT4=",
 			"path": "github.com/ginuerzh/gosocks4",
@ -68,12 +62,6 @@
 			"revision": "09cded8978dc9e80714c4d85b0322337b0a1e5e0",
 			"revisionTime": "2016-03-02T07:53:16Z"
 		},
-		{
-			"checksumSHA1": "BM6ZlNJmtKy3GBoWwg2X55gnZ4A=",
-			"path": "github.com/klauspost/crc32",
-			"revision": "cb6bfca970f6908083f26f39a79009d608efd5cd",
-			"revisionTime": "2016-10-16T15:41:25Z"
-		},
 		{
 			"checksumSHA1": "dwSGkUfh3A2h0VkXndzBX/27hVc=",
 			"path": "github.com/klauspost/reedsolomon",
@ -291,16 +279,16 @@
 			"revisionTime": "2016-12-15T22:53:35Z"
 		},
 		{
-			"checksumSHA1": "nkIlj9QTxHQ78Vb+VgjhXZ4rZ3E=",
+			"checksumSHA1": "SbBORpjEg3VfPfdSrW82pa3f9Io=",
 			"path": "gopkg.in/xtaci/kcp-go.v2",
-			"revision": "6610d527ea5c4890cf593796ff8ff1f10486bb68",
-			"revisionTime": "2016-09-08T14:44:41Z"
+			"revision": "6da5044c742f24f05b00db9214b57b2ac943c9ab",
+			"revisionTime": "2017-01-20T08:43:10Z"
 		},
 		{
-			"checksumSHA1": "aIqXwA82JxLOXcgmuVSgcRqdJvU=",
+			"checksumSHA1": "EutBuLS2elfcDCMifXNMGj9farQ=",
 			"path": "gopkg.in/xtaci/smux.v1",
-			"revision": "9f2b528a60917e6446273926f4c676cac759d2b0",
-			"revisionTime": "2016-09-22T10:26:45Z"
+			"revision": "427dd804ce9fb0a9e7b27a628f68a124fb0d67a6",
+			"revisionTime": "2016-11-29T15:03:00Z"
 		}
 	],
 	"rootPath": "github.com/ginuerzh/gost/cmd/gost"