mirror of https://github.com/ginuerzh/gost
28 changed files with 699 additions and 2308 deletions
@ -1,21 +0,0 @@ |
|||
The MIT License (MIT) |
|||
|
|||
Copyright (c) 2014 Coda Hale |
|||
|
|||
Permission is hereby granted, free of charge, to any person obtaining a copy |
|||
of this software and associated documentation files (the "Software"), to deal |
|||
in the Software without restriction, including without limitation the rights |
|||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
|||
copies of the Software, and to permit persons to whom the Software is |
|||
furnished to do so, subject to the following conditions: |
|||
|
|||
The above copyright notice and this permission notice shall be included in |
|||
all copies or substantial portions of the Software. |
|||
|
|||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
|||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
|||
THE SOFTWARE. |
|||
@ -1,8 +0,0 @@ |
|||
chacha20 |
|||
======== |
|||
|
|||
[Build Status](https://travis-ci.org/codahale/chacha20) |
|||
|
|||
A pure Go implementation of the ChaCha20 stream cipher. |
|||
|
|||
For documentation, check [godoc](http://godoc.org/github.com/codahale/chacha20). |
|||
@ -1,235 +0,0 @@ |
|||
// Package chacha20 provides a pure Go implementation of ChaCha20, a fast,
|
|||
// secure stream cipher.
|
|||
//
|
|||
// From Bernstein, Daniel J. "ChaCha, a variant of Salsa20." Workshop Record of
|
|||
// SASC. 2008. (http://cr.yp.to/chacha/chacha-20080128.pdf):
|
|||
//
|
|||
// ChaCha8 is a 256-bit stream cipher based on the 8-round cipher Salsa20/8.
|
|||
// The changes from Salsa20/8 to ChaCha8 are designed to improve diffusion per
|
|||
// round, conjecturally increasing resistance to cryptanalysis, while
|
|||
// preserving -- and often improving -- time per round. ChaCha12 and ChaCha20
|
|||
// are analogous modifications of the 12-round and 20-round ciphers Salsa20/12
|
|||
// and Salsa20/20. This paper presents the ChaCha family and explains the
|
|||
// differences between Salsa20 and ChaCha.
|
|||
//
|
|||
// For more information, see http://cr.yp.to/chacha.html
|
|||
package chacha20 |
|||
|
|||
import ( |
|||
"crypto/cipher" |
|||
"encoding/binary" |
|||
"errors" |
|||
"unsafe" |
|||
) |
|||
|
|||
const ( |
|||
// KeySize is the length of ChaCha20 keys, in bytes.
|
|||
KeySize = 32 |
|||
// NonceSize is the length of ChaCha20 nonces, in bytes.
|
|||
NonceSize = 8 |
|||
// XNonceSize is the length of XChaCha20 nonces, in bytes.
|
|||
XNonceSize = 24 |
|||
) |
|||
|
|||
var ( |
|||
// ErrInvalidKey is returned when the provided key is not 256 bits long.
|
|||
ErrInvalidKey = errors.New("invalid key length (must be 256 bits)") |
|||
// ErrInvalidNonce is returned when the provided nonce is not 64 bits long.
|
|||
ErrInvalidNonce = errors.New("invalid nonce length (must be 64 bits)") |
|||
// ErrInvalidXNonce is returned when the provided nonce is not 192 bits
|
|||
// long.
|
|||
ErrInvalidXNonce = errors.New("invalid nonce length (must be 192 bits)") |
|||
// ErrInvalidRounds is returned when the provided rounds is not
|
|||
// 8, 12, or 20.
|
|||
ErrInvalidRounds = errors.New("invalid rounds number (must be 8, 12, or 20)") |
|||
) |
|||
|
|||
// New creates and returns a new cipher.Stream. The key argument must be 256
|
|||
// bits long, and the nonce argument must be 64 bits long. The nonce must be
|
|||
// randomly generated or used only once. This Stream instance must not be used
|
|||
// to encrypt more than 2^70 bytes (~1 zettabyte).
|
|||
func New(key []byte, nonce []byte) (cipher.Stream, error) { |
|||
return NewWithRounds(key, nonce, 20) |
|||
} |
|||
|
|||
// NewWithRounds creates and returns a new cipher.Stream just like New but
|
|||
// the rounds number of 8, 12, or 20 can be specified.
|
|||
func NewWithRounds(key []byte, nonce []byte, rounds uint8) (cipher.Stream, error) { |
|||
if len(key) != KeySize { |
|||
return nil, ErrInvalidKey |
|||
} |
|||
|
|||
if len(nonce) != NonceSize { |
|||
return nil, ErrInvalidNonce |
|||
} |
|||
|
|||
if (rounds != 8) && (rounds != 12) && (rounds != 20) { |
|||
return nil, ErrInvalidRounds |
|||
} |
|||
|
|||
s := new(stream) |
|||
s.init(key, nonce, rounds) |
|||
s.advance() |
|||
|
|||
return s, nil |
|||
} |
|||
|
|||
// NewXChaCha creates and returns a new cipher.Stream. The key argument must be
|
|||
// 256 bits long, and the nonce argument must be 192 bits long. The nonce must
|
|||
// be randomly generated or only used once. This Stream instance must not be
|
|||
// used to encrypt more than 2^70 bytes (~1 zetta byte).
|
|||
func NewXChaCha(key []byte, nonce []byte) (cipher.Stream, error) { |
|||
return NewXChaChaWithRounds(key, nonce, 20) |
|||
} |
|||
|
|||
// NewXChaChaWithRounds creates and returns a new cipher.Stream just like
|
|||
// NewXChaCha but the rounds number of 8, 12, or 20 can be specified.
|
|||
func NewXChaChaWithRounds(key []byte, nonce []byte, rounds uint8) (cipher.Stream, error) { |
|||
if len(key) != KeySize { |
|||
return nil, ErrInvalidKey |
|||
} |
|||
|
|||
if len(nonce) != XNonceSize { |
|||
return nil, ErrInvalidXNonce |
|||
} |
|||
|
|||
if (rounds != 8) && (rounds != 12) && (rounds != 20) { |
|||
return nil, ErrInvalidRounds |
|||
} |
|||
|
|||
s := new(stream) |
|||
s.init(key, nonce, rounds) |
|||
|
|||
// Call HChaCha to derive the subkey using the key and the first 16 bytes
|
|||
// of the nonce, and re-initialize the state using the subkey and the
|
|||
// remaining nonce.
|
|||
blockArr := (*[stateSize]uint32)(unsafe.Pointer(&s.block)) |
|||
core(&s.state, blockArr, s.rounds, true) |
|||
copy(s.state[4:8], blockArr[0:4]) |
|||
copy(s.state[8:12], blockArr[12:16]) |
|||
s.state[12] = 0 |
|||
s.state[13] = 0 |
|||
s.state[14] = binary.LittleEndian.Uint32(nonce[16:]) |
|||
s.state[15] = binary.LittleEndian.Uint32(nonce[20:]) |
|||
|
|||
s.advance() |
|||
|
|||
return s, nil |
|||
} |
|||
|
|||
// stream is the cipher state handed out by the constructors in this file as a
// cipher.Stream. It couples the 16-word ChaCha state with one buffered block
// of keystream.
//
// NOTE: block is reinterpreted as *[stateSize]uint32 through unsafe.Pointer
// (in advance and NewXChaChaWithRounds), so it has to stay 4-byte aligned.
// Keeping it directly after state guarantees that; do not reorder the fields.
type stream struct { |
|||
state [stateSize]uint32 // the state as an array of 16 32-bit words
|
|||
block [blockSize]byte // the keystream as an array of 64 bytes
|
|||
offset int // the offset of used bytes in block
|
|||
// rounds is the round count passed to core on every advance; it is set by init.
rounds uint8 |
|||
} |
|||
|
|||
func (s *stream) XORKeyStream(dst, src []byte) { |
|||
// Stride over the input in 64-byte blocks, minus the amount of keystream
|
|||
// previously used. This will produce best results when processing blocks
|
|||
// of a size evenly divisible by 64.
|
|||
i := 0 |
|||
max := len(src) |
|||
for i < max { |
|||
gap := blockSize - s.offset |
|||
|
|||
limit := i + gap |
|||
if limit > max { |
|||
limit = max |
|||
} |
|||
|
|||
o := s.offset |
|||
for j := i; j < limit; j++ { |
|||
dst[j] = src[j] ^ s.block[o] |
|||
o++ |
|||
} |
|||
|
|||
i += gap |
|||
s.offset = o |
|||
|
|||
if o == blockSize { |
|||
s.advance() |
|||
} |
|||
} |
|||
} |
|||
|
|||
func (s *stream) init(key []byte, nonce []byte, rounds uint8) { |
|||
// the magic constants for 256-bit keys
|
|||
s.state[0] = 0x61707865 |
|||
s.state[1] = 0x3320646e |
|||
s.state[2] = 0x79622d32 |
|||
s.state[3] = 0x6b206574 |
|||
|
|||
s.state[4] = binary.LittleEndian.Uint32(key[0:]) |
|||
s.state[5] = binary.LittleEndian.Uint32(key[4:]) |
|||
s.state[6] = binary.LittleEndian.Uint32(key[8:]) |
|||
s.state[7] = binary.LittleEndian.Uint32(key[12:]) |
|||
s.state[8] = binary.LittleEndian.Uint32(key[16:]) |
|||
s.state[9] = binary.LittleEndian.Uint32(key[20:]) |
|||
s.state[10] = binary.LittleEndian.Uint32(key[24:]) |
|||
s.state[11] = binary.LittleEndian.Uint32(key[28:]) |
|||
|
|||
switch len(nonce) { |
|||
case NonceSize: |
|||
// ChaCha20 uses 8 byte nonces.
|
|||
s.state[12] = 0 |
|||
s.state[13] = 0 |
|||
s.state[14] = binary.LittleEndian.Uint32(nonce[0:]) |
|||
s.state[15] = binary.LittleEndian.Uint32(nonce[4:]) |
|||
case XNonceSize: |
|||
// XChaCha20 derives the subkey via HChaCha initialized
|
|||
// with the first 16 bytes of the nonce.
|
|||
s.state[12] = binary.LittleEndian.Uint32(nonce[0:]) |
|||
s.state[13] = binary.LittleEndian.Uint32(nonce[4:]) |
|||
s.state[14] = binary.LittleEndian.Uint32(nonce[8:]) |
|||
s.state[15] = binary.LittleEndian.Uint32(nonce[12:]) |
|||
default: |
|||
// Never happens, both ctors validate the nonce length.
|
|||
panic("invalid nonce size") |
|||
} |
|||
|
|||
s.rounds = rounds |
|||
} |
|||
|
|||
// BUG(codahale): Totally untested on big-endian CPUs. Would very much
|
|||
// appreciate someone with an ARM device giving this a swing.
|
|||
|
|||
// advances the keystream
|
|||
func (s *stream) advance() { |
|||
core(&s.state, (*[stateSize]uint32)(unsafe.Pointer(&s.block)), s.rounds, false) |
|||
|
|||
if bigEndian { |
|||
j := blockSize - 1 |
|||
for i := 0; i < blockSize/2; i++ { |
|||
s.block[j], s.block[i] = s.block[i], s.block[j] |
|||
j-- |
|||
} |
|||
} |
|||
|
|||
s.offset = 0 |
|||
i := s.state[12] + 1 |
|||
s.state[12] = i |
|||
if i == 0 { |
|||
s.state[13]++ |
|||
} |
|||
} |
|||
|
|||
const ( |
|||
wordSize = 4 // the size of ChaCha20's words
|
|||
stateSize = 16 // the size of ChaCha20's state, in words
|
|||
blockSize = stateSize * wordSize // the size of ChaCha20's block, in bytes
|
|||
) |
|||
|
|||
// bigEndian reports whether the host CPU stores multi-byte integers
// most-significant byte first. It is set once by init.
var bigEndian bool

// init probes the host byte order. The keystream is defined as a
// little-endian byte sequence, and advance lets core write 32-bit words
// directly into the byte buffer; that is only correct as-is on little-endian
// CPUs, so advance consults bigEndian to decide whether to fix up the bytes.
func init() {
	probe := uint32(0x04030201)
	// View the word as the raw bytes it occupies in memory; a little-endian
	// host lays them out lowest byte first, i.e. 1, 2, 3, 4.
	inMemory := *(*[4]byte)(unsafe.Pointer(&probe))
	bigEndian = inMemory != [4]byte{0x1, 0x2, 0x3, 0x4}
}
|||
@ -1,166 +0,0 @@ |
|||
// The ChaCha20 core transform.
|
|||
// An unrolled and inlined implementation in pure Go.
|
|||
|
|||
package chacha20 |
|||
|
|||
// core runs the ChaCha permutation over the 16 words of input and stores the
// result in output. input is only read.
//
// Each loop iteration performs two rounds: four quarter-rounds over the
// columns of the 4x4 word matrix, then four over its diagonals. The loop
// counter steps by two, so rounds is expected to be even; the constructors in
// this package only pass 8, 12, or 20.
//
// When hchacha is false the original input words are added to the permuted
// words before they are stored. When it is true the permuted words are stored
// unchanged; NewXChaChaWithRounds uses that mode to derive its subkey.
func core(input, output *[stateSize]uint32, rounds uint8, hchacha bool) { |
|||
// Work on local copies so input stays intact for the final addition.
var ( |
|||
x00 = input[0] |
|||
x01 = input[1] |
|||
x02 = input[2] |
|||
x03 = input[3] |
|||
x04 = input[4] |
|||
x05 = input[5] |
|||
x06 = input[6] |
|||
x07 = input[7] |
|||
x08 = input[8] |
|||
x09 = input[9] |
|||
x10 = input[10] |
|||
x11 = input[11] |
|||
x12 = input[12] |
|||
x13 = input[13] |
|||
x14 = input[14] |
|||
x15 = input[15] |
|||
) |
|||
|
|||
// x is scratch space for the XOR result that is rotated in the next statement.
var x uint32 |
|||
|
|||
// Unrolling all 20 rounds kills performance on modern Intel processors
|
|||
// (Tested on a i5 Haswell, likely applies to Sandy Bridge+), due to uop
|
|||
// cache thrashing. The straight forward 2 rounds per loop implementation
|
|||
// of this has double the performance of the fully unrolled version.
|
|||
for i := uint8(0); i < rounds; i += 2 { |
|||
// Column round: quarter-rounds on words (0,4,8,12), (1,5,9,13),
// (2,6,10,14), (3,7,11,15). Each quarter-round is add, xor, rotate-left
// with rotation amounts 16, 12, 8, 7.
x00 += x04 |
|||
x = x12 ^ x00 |
|||
x12 = (x << 16) | (x >> 16) |
|||
x08 += x12 |
|||
x = x04 ^ x08 |
|||
x04 = (x << 12) | (x >> 20) |
|||
x00 += x04 |
|||
x = x12 ^ x00 |
|||
x12 = (x << 8) | (x >> 24) |
|||
x08 += x12 |
|||
x = x04 ^ x08 |
|||
x04 = (x << 7) | (x >> 25) |
|||
x01 += x05 |
|||
x = x13 ^ x01 |
|||
x13 = (x << 16) | (x >> 16) |
|||
x09 += x13 |
|||
x = x05 ^ x09 |
|||
x05 = (x << 12) | (x >> 20) |
|||
x01 += x05 |
|||
x = x13 ^ x01 |
|||
x13 = (x << 8) | (x >> 24) |
|||
x09 += x13 |
|||
x = x05 ^ x09 |
|||
x05 = (x << 7) | (x >> 25) |
|||
x02 += x06 |
|||
x = x14 ^ x02 |
|||
x14 = (x << 16) | (x >> 16) |
|||
x10 += x14 |
|||
x = x06 ^ x10 |
|||
x06 = (x << 12) | (x >> 20) |
|||
x02 += x06 |
|||
x = x14 ^ x02 |
|||
x14 = (x << 8) | (x >> 24) |
|||
x10 += x14 |
|||
x = x06 ^ x10 |
|||
x06 = (x << 7) | (x >> 25) |
|||
x03 += x07 |
|||
x = x15 ^ x03 |
|||
x15 = (x << 16) | (x >> 16) |
|||
x11 += x15 |
|||
x = x07 ^ x11 |
|||
x07 = (x << 12) | (x >> 20) |
|||
x03 += x07 |
|||
x = x15 ^ x03 |
|||
x15 = (x << 8) | (x >> 24) |
|||
x11 += x15 |
|||
x = x07 ^ x11 |
|||
x07 = (x << 7) | (x >> 25) |
|||
// Diagonal round: quarter-rounds on words (0,5,10,15), (1,6,11,12),
// (2,7,8,13), (3,4,9,14).
x00 += x05 |
|||
x = x15 ^ x00 |
|||
x15 = (x << 16) | (x >> 16) |
|||
x10 += x15 |
|||
x = x05 ^ x10 |
|||
x05 = (x << 12) | (x >> 20) |
|||
x00 += x05 |
|||
x = x15 ^ x00 |
|||
x15 = (x << 8) | (x >> 24) |
|||
x10 += x15 |
|||
x = x05 ^ x10 |
|||
x05 = (x << 7) | (x >> 25) |
|||
x01 += x06 |
|||
x = x12 ^ x01 |
|||
x12 = (x << 16) | (x >> 16) |
|||
x11 += x12 |
|||
x = x06 ^ x11 |
|||
x06 = (x << 12) | (x >> 20) |
|||
x01 += x06 |
|||
x = x12 ^ x01 |
|||
x12 = (x << 8) | (x >> 24) |
|||
x11 += x12 |
|||
x = x06 ^ x11 |
|||
x06 = (x << 7) | (x >> 25) |
|||
x02 += x07 |
|||
x = x13 ^ x02 |
|||
x13 = (x << 16) | (x >> 16) |
|||
x08 += x13 |
|||
x = x07 ^ x08 |
|||
x07 = (x << 12) | (x >> 20) |
|||
x02 += x07 |
|||
x = x13 ^ x02 |
|||
x13 = (x << 8) | (x >> 24) |
|||
x08 += x13 |
|||
x = x07 ^ x08 |
|||
x07 = (x << 7) | (x >> 25) |
|||
x03 += x04 |
|||
x = x14 ^ x03 |
|||
x14 = (x << 16) | (x >> 16) |
|||
x09 += x14 |
|||
x = x04 ^ x09 |
|||
x04 = (x << 12) | (x >> 20) |
|||
x03 += x04 |
|||
x = x14 ^ x03 |
|||
x14 = (x << 8) | (x >> 24) |
|||
x09 += x14 |
|||
x = x04 ^ x09 |
|||
x04 = (x << 7) | (x >> 25) |
|||
} |
|||
|
|||
// Regular block function: feed the input forward into the permuted words.
// HChaCha mode (else branch) stores the permuted words as they are.
if !hchacha { |
|||
output[0] = x00 + input[0] |
|||
output[1] = x01 + input[1] |
|||
output[2] = x02 + input[2] |
|||
output[3] = x03 + input[3] |
|||
output[4] = x04 + input[4] |
|||
output[5] = x05 + input[5] |
|||
output[6] = x06 + input[6] |
|||
output[7] = x07 + input[7] |
|||
output[8] = x08 + input[8] |
|||
output[9] = x09 + input[9] |
|||
output[10] = x10 + input[10] |
|||
output[11] = x11 + input[11] |
|||
output[12] = x12 + input[12] |
|||
output[13] = x13 + input[13] |
|||
output[14] = x14 + input[14] |
|||
output[15] = x15 + input[15] |
|||
} else { |
|||
output[0] = x00 |
|||
output[1] = x01 |
|||
output[2] = x02 |
|||
output[3] = x03 |
|||
output[4] = x04 |
|||
output[5] = x05 |
|||
output[6] = x06 |
|||
output[7] = x07 |
|||
output[8] = x08 |
|||
output[9] = x09 |
|||
output[10] = x10 |
|||
output[11] = x11 |
|||
output[12] = x12 |
|||
output[13] = x13 |
|||
output[14] = x14 |
|||
output[15] = x15 |
|||
} |
|||
} |
|||
@ -1,28 +0,0 @@ |
|||
Copyright (c) 2012 The Go Authors. All rights reserved. |
|||
Copyright (c) 2015 Klaus Post |
|||
|
|||
Redistribution and use in source and binary forms, with or without |
|||
modification, are permitted provided that the following conditions are |
|||
met: |
|||
|
|||
* Redistributions of source code must retain the above copyright |
|||
notice, this list of conditions and the following disclaimer. |
|||
* Redistributions in binary form must reproduce the above |
|||
copyright notice, this list of conditions and the following disclaimer |
|||
in the documentation and/or other materials provided with the |
|||
distribution. |
|||
* Neither the name of Google Inc. nor the names of its |
|||
contributors may be used to endorse or promote products derived from |
|||
this software without specific prior written permission. |
|||
|
|||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
|||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
|||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
|||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
|||
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|||
@ -1,87 +0,0 @@ |
|||
# crc32 |
|||
CRC32 hash with x64 optimizations |
|||
|
|||
This package is a drop-in replacement for the standard library `hash/crc32` package, that features SSE 4.2 optimizations on x64 platforms, for a 10x speedup. |
|||
|
|||
[Build Status](https://travis-ci.org/klauspost/crc32) |
|||
|
|||
# usage |
|||
|
|||
Install using `go get github.com/klauspost/crc32`. This library is based on Go 1.5 code and requires Go 1.3 or newer. |
|||
|
|||
Replace `import "hash/crc32"` with `import "github.com/klauspost/crc32"` and you are good to go. |
|||
|
|||
# changes |
|||
* Oct 20, 2016: Changes have been merged to upstream Go. Package updated to match. |
|||
* Dec 4, 2015: Uses the "slice-by-8" trick more extensively, which gives a 1.5 to 2.5x speedup if assembler is unavailable. |
|||
|
|||
|
|||
# performance |
|||
|
|||
For *Go 1.7* performance is equivalent to the standard library. So if you use this package for Go 1.7 you can switch back. |
|||
|
|||
|
|||
For IEEE tables (the most common), there is approximately a factor 10 speedup with "CLMUL" (Carryless multiplication) instruction: |
|||
``` |
|||
benchmark old ns/op new ns/op delta |
|||
BenchmarkCrc32KB 99955 10258 -89.74% |
|||
|
|||
benchmark old MB/s new MB/s speedup |
|||
BenchmarkCrc32KB 327.83 3194.20 9.74x |
|||
``` |
|||
|
|||
For other tables and "CLMUL" capable machines the performance is the same as the standard library. |
|||
|
|||
Here are some detailed benchmarks, comparing to go 1.5 standard library with and without assembler enabled. |
|||
|
|||
``` |
|||
Std: Standard Go 1.5 library |
|||
Crc: Indicates IEEE type CRC. |
|||
40B: Size of each slice encoded. |
|||
NoAsm: Assembler was disabled (ie. not an AMD64 or SSE 4.2+ capable machine). |
|||
Castagnoli: Castagnoli CRC type. |
|||
|
|||
BenchmarkStdCrc40B-4 10000000 158 ns/op 252.88 MB/s |
|||
BenchmarkCrc40BNoAsm-4 20000000 105 ns/op 377.38 MB/s (slice8) |
|||
BenchmarkCrc40B-4 20000000 105 ns/op 378.77 MB/s (slice8) |
|||
|
|||
BenchmarkStdCrc1KB-4 500000 3604 ns/op 284.10 MB/s |
|||
BenchmarkCrc1KBNoAsm-4 1000000 1463 ns/op 699.79 MB/s (slice8) |
|||
BenchmarkCrc1KB-4 3000000 396 ns/op 2583.69 MB/s (asm) |
|||
|
|||
BenchmarkStdCrc8KB-4 200000 11417 ns/op 717.48 MB/s (slice8) |
|||
BenchmarkCrc8KBNoAsm-4 200000 11317 ns/op 723.85 MB/s (slice8) |
|||
BenchmarkCrc8KB-4 500000 2919 ns/op 2805.73 MB/s (asm) |
|||
|
|||
BenchmarkStdCrc32KB-4 30000 45749 ns/op 716.24 MB/s (slice8) |
|||
BenchmarkCrc32KBNoAsm-4 30000 45109 ns/op 726.42 MB/s (slice8) |
|||
BenchmarkCrc32KB-4 100000 11497 ns/op 2850.09 MB/s (asm) |
|||
|
|||
BenchmarkStdNoAsmCastagnol40B-4 10000000 161 ns/op 246.94 MB/s |
|||
BenchmarkStdCastagnoli40B-4 50000000 28.4 ns/op 1410.69 MB/s (asm) |
|||
BenchmarkCastagnoli40BNoAsm-4 20000000 100 ns/op 398.01 MB/s (slice8) |
|||
BenchmarkCastagnoli40B-4 50000000 28.2 ns/op 1419.54 MB/s (asm) |
|||
|
|||
BenchmarkStdNoAsmCastagnoli1KB-4 500000 3622 ns/op 282.67 MB/s |
|||
BenchmarkStdCastagnoli1KB-4 10000000 144 ns/op 7099.78 MB/s (asm) |
|||
BenchmarkCastagnoli1KBNoAsm-4 1000000 1475 ns/op 694.14 MB/s (slice8) |
|||
BenchmarkCastagnoli1KB-4 10000000 146 ns/op 6993.35 MB/s (asm) |
|||
|
|||
BenchmarkStdNoAsmCastagnoli8KB-4 50000 28781 ns/op 284.63 MB/s |
|||
BenchmarkStdCastagnoli8KB-4 1000000 1029 ns/op 7957.89 MB/s (asm) |
|||
BenchmarkCastagnoli8KBNoAsm-4 200000 11410 ns/op 717.94 MB/s (slice8) |
|||
BenchmarkCastagnoli8KB-4 1000000 1000 ns/op 8188.71 MB/s (asm) |
|||
|
|||
BenchmarkStdNoAsmCastagnoli32KB-4 10000 115426 ns/op 283.89 MB/s |
|||
BenchmarkStdCastagnoli32KB-4 300000 4065 ns/op 8059.13 MB/s (asm) |
|||
BenchmarkCastagnoli32KBNoAsm-4 30000 45171 ns/op 725.41 MB/s (slice8) |
|||
BenchmarkCastagnoli32KB-4 500000 4077 ns/op 8035.89 MB/s (asm) |
|||
``` |
|||
|
|||
The IEEE assembler optimizations have been submitted and will be part of the Go 1.6 standard library. |
|||
|
|||
However, the improved use of slice-by-8 has not, but will probably be submitted for Go 1.7. |
|||
|
|||
# license |
|||
|
|||
Standard Go license. Changes are Copyright (c) 2015 Klaus Post under same conditions. |
|||
@ -1,207 +0,0 @@ |
|||
// Copyright 2009 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// Package crc32 implements the 32-bit cyclic redundancy check, or CRC-32,
|
|||
// checksum. See http://en.wikipedia.org/wiki/Cyclic_redundancy_check for
|
|||
// information.
|
|||
//
|
|||
// Polynomials are represented in LSB-first form also known as reversed representation.
|
|||
//
|
|||
// See http://en.wikipedia.org/wiki/Mathematics_of_cyclic_redundancy_checks#Reversed_representations_and_reciprocal_polynomials
|
|||
// for information.
|
|||
package crc32 |
|||
|
|||
import ( |
|||
"hash" |
|||
"sync" |
|||
) |
|||
|
|||
// The size of a CRC-32 checksum in bytes.
|
|||
const Size = 4 |
|||
|
|||
// Predefined polynomials.
|
|||
const ( |
|||
// IEEE is by far and away the most common CRC-32 polynomial.
|
|||
// Used by ethernet (IEEE 802.3), v.42, fddi, gzip, zip, png, ...
|
|||
IEEE = 0xedb88320 |
|||
|
|||
// Castagnoli's polynomial, used in iSCSI.
|
|||
// Has better error detection characteristics than IEEE.
|
|||
// http://dx.doi.org/10.1109/26.231911
|
|||
Castagnoli = 0x82f63b78 |
|||
|
|||
// Koopman's polynomial.
|
|||
// Also has better error detection characteristics than IEEE.
|
|||
// http://dx.doi.org/10.1109/DSN.2002.1028931
|
|||
Koopman = 0xeb31d82e |
|||
) |
|||
|
|||
// Table is a 256-word table representing the polynomial for efficient processing.
|
|||
type Table [256]uint32 |
|||
|
|||
// This file makes use of functions implemented in architecture-specific files.
|
|||
// The interface that they implement is as follows:
|
|||
//
|
|||
// // archAvailableIEEE reports whether an architecture-specific CRC32-IEEE
|
|||
// // algorithm is available.
|
|||
// archAvailableIEEE() bool
|
|||
//
|
|||
// // archInitIEEE initializes the architecture-specific CRC32-IEEE algorithm.
|
|||
// // It can only be called if archAvailableIEEE() returns true.
|
|||
// archInitIEEE()
|
|||
//
|
|||
// // archUpdateIEEE updates the given CRC32-IEEE. It can only be called if
|
|||
// // archInitIEEE() was previously called.
|
|||
// archUpdateIEEE(crc uint32, p []byte) uint32
|
|||
//
|
|||
// // archAvailableCastagnoli reports whether an architecture-specific
|
|||
// // CRC32-C algorithm is available.
|
|||
// archAvailableCastagnoli() bool
|
|||
//
|
|||
// // archInitCastagnoli initializes the architecture-specific CRC32-C
|
|||
// // algorithm. It can only be called if archAvailableCastagnoli() returns
|
|||
// // true.
|
|||
// archInitCastagnoli()
|
|||
//
|
|||
// // archUpdateCastagnoli updates the given CRC32-C. It can only be called
|
|||
// // if archInitCastagnoli() was previously called.
|
|||
// archUpdateCastagnoli(crc uint32, p []byte) uint32
|
|||
|
|||
// castagnoliTable points to a lazily initialized Table for the Castagnoli
|
|||
// polynomial. MakeTable will always return this value when asked to make a
|
|||
// Castagnoli table so we can compare against it to find when the caller is
|
|||
// using this polynomial.
|
|||
var castagnoliTable *Table |
|||
var castagnoliTable8 *slicing8Table |
|||
var castagnoliArchImpl bool |
|||
var updateCastagnoli func(crc uint32, p []byte) uint32 |
|||
var castagnoliOnce sync.Once |
|||
|
|||
func castagnoliInit() { |
|||
castagnoliTable = simpleMakeTable(Castagnoli) |
|||
castagnoliArchImpl = archAvailableCastagnoli() |
|||
|
|||
if castagnoliArchImpl { |
|||
archInitCastagnoli() |
|||
updateCastagnoli = archUpdateCastagnoli |
|||
} else { |
|||
// Initialize the slicing-by-8 table.
|
|||
castagnoliTable8 = slicingMakeTable(Castagnoli) |
|||
updateCastagnoli = func(crc uint32, p []byte) uint32 { |
|||
return slicingUpdate(crc, castagnoliTable8, p) |
|||
} |
|||
} |
|||
} |
|||
|
|||
// IEEETable is the table for the IEEE polynomial.
|
|||
var IEEETable = simpleMakeTable(IEEE) |
|||
|
|||
// ieeeTable8 is the slicing8Table for IEEE
|
|||
var ieeeTable8 *slicing8Table |
|||
var ieeeArchImpl bool |
|||
var updateIEEE func(crc uint32, p []byte) uint32 |
|||
var ieeeOnce sync.Once |
|||
|
|||
func ieeeInit() { |
|||
ieeeArchImpl = archAvailableIEEE() |
|||
|
|||
if ieeeArchImpl { |
|||
archInitIEEE() |
|||
updateIEEE = archUpdateIEEE |
|||
} else { |
|||
// Initialize the slicing-by-8 table.
|
|||
ieeeTable8 = slicingMakeTable(IEEE) |
|||
updateIEEE = func(crc uint32, p []byte) uint32 { |
|||
return slicingUpdate(crc, ieeeTable8, p) |
|||
} |
|||
} |
|||
} |
|||
|
|||
// MakeTable returns a Table constructed from the specified polynomial.
|
|||
// The contents of this Table must not be modified.
|
|||
func MakeTable(poly uint32) *Table { |
|||
switch poly { |
|||
case IEEE: |
|||
ieeeOnce.Do(ieeeInit) |
|||
return IEEETable |
|||
case Castagnoli: |
|||
castagnoliOnce.Do(castagnoliInit) |
|||
return castagnoliTable |
|||
} |
|||
return simpleMakeTable(poly) |
|||
} |
|||
|
|||
// digest represents the partial evaluation of a checksum.
|
|||
// NOTE: New builds this struct with an unkeyed literal, so the field order
// (crc, then tab) must not change.
type digest struct { |
|||
// crc is the running checksum: Write updates it, Reset sets it to 0 and
// Sum32 returns it unchanged.
crc uint32 |
|||
// tab selects the polynomial; Write compares it against castagnoliTable and
// IEEETable to choose the update routine.
tab *Table |
|||
} |
|||
|
|||
// New creates a new hash.Hash32 computing the CRC-32 checksum
|
|||
// using the polynomial represented by the Table.
|
|||
// Its Sum method will lay the value out in big-endian byte order.
|
|||
func New(tab *Table) hash.Hash32 { |
|||
if tab == IEEETable { |
|||
ieeeOnce.Do(ieeeInit) |
|||
} |
|||
return &digest{0, tab} |
|||
} |
|||
|
|||
// NewIEEE creates a new hash.Hash32 computing the CRC-32 checksum
|
|||
// using the IEEE polynomial.
|
|||
// Its Sum method will lay the value out in big-endian byte order.
|
|||
func NewIEEE() hash.Hash32 { return New(IEEETable) } |
|||
|
|||
func (d *digest) Size() int { return Size } |
|||
|
|||
func (d *digest) BlockSize() int { return 1 } |
|||
|
|||
func (d *digest) Reset() { d.crc = 0 } |
|||
|
|||
// Update returns the result of adding the bytes in p to the crc.
|
|||
func Update(crc uint32, tab *Table, p []byte) uint32 { |
|||
switch tab { |
|||
case castagnoliTable: |
|||
return updateCastagnoli(crc, p) |
|||
case IEEETable: |
|||
// Unfortunately, because IEEETable is exported, IEEE may be used without a
|
|||
// call to MakeTable. We have to make sure it gets initialized in that case.
|
|||
ieeeOnce.Do(ieeeInit) |
|||
return updateIEEE(crc, p) |
|||
default: |
|||
return simpleUpdate(crc, tab, p) |
|||
} |
|||
} |
|||
|
|||
func (d *digest) Write(p []byte) (n int, err error) { |
|||
switch d.tab { |
|||
case castagnoliTable: |
|||
d.crc = updateCastagnoli(d.crc, p) |
|||
case IEEETable: |
|||
// We only create digest objects through New() which takes care of
|
|||
// initialization in this case.
|
|||
d.crc = updateIEEE(d.crc, p) |
|||
default: |
|||
d.crc = simpleUpdate(d.crc, d.tab, p) |
|||
} |
|||
return len(p), nil |
|||
} |
|||
|
|||
func (d *digest) Sum32() uint32 { return d.crc } |
|||
|
|||
func (d *digest) Sum(in []byte) []byte { |
|||
s := d.Sum32() |
|||
return append(in, byte(s>>24), byte(s>>16), byte(s>>8), byte(s)) |
|||
} |
|||
|
|||
// Checksum returns the CRC-32 checksum of data
|
|||
// using the polynomial represented by the Table.
|
|||
func Checksum(data []byte, tab *Table) uint32 { return Update(0, tab, data) } |
|||
|
|||
// ChecksumIEEE returns the CRC-32 checksum of data
|
|||
// using the IEEE polynomial.
|
|||
func ChecksumIEEE(data []byte) uint32 { |
|||
ieeeOnce.Do(ieeeInit) |
|||
return updateIEEE(0, data) |
|||
} |
|||
@ -1,230 +0,0 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// +build !appengine,!gccgo
|
|||
|
|||
// AMD64-specific hardware-assisted CRC32 algorithms. See crc32.go for a
|
|||
// description of the interface that each architecture-specific file
|
|||
// implements.
|
|||
|
|||
package crc32 |
|||
|
|||
import "unsafe" |
|||
|
|||
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
|||
// and IEEE CRC.
|
|||
|
|||
// haveSSE41/haveSSE42/haveCLMUL are defined in crc32_amd64.s and use
|
|||
// CPUID to test for SSE 4.1, 4.2 and CLMUL support.
|
|||
func haveSSE41() bool |
|||
func haveSSE42() bool |
|||
func haveCLMUL() bool |
|||
|
|||
// castagnoliSSE42 is defined in crc32_amd64.s and uses the SSE4.2 CRC32
|
|||
// instruction.
|
|||
//go:noescape
|
|||
func castagnoliSSE42(crc uint32, p []byte) uint32 |
|||
|
|||
// castagnoliSSE42Triple is defined in crc32_amd64.s and uses the SSE4.2 CRC32
|
|||
// instruction.
|
|||
//go:noescape
|
|||
func castagnoliSSE42Triple( |
|||
crcA, crcB, crcC uint32, |
|||
a, b, c []byte, |
|||
rounds uint32, |
|||
) (retA uint32, retB uint32, retC uint32) |
|||
|
|||
// ieeeCLMUL is defined in crc32_amd64.s and uses the PCLMULQDQ
|
|||
// instruction as well as SSE 4.1.
|
|||
//go:noescape
|
|||
func ieeeCLMUL(crc uint32, p []byte) uint32 |
|||
|
|||
var sse42 = haveSSE42() |
|||
var useFastIEEE = haveCLMUL() && haveSSE41() |
|||
|
|||
const castagnoliK1 = 168 |
|||
const castagnoliK2 = 1344 |
|||
|
|||
type sse42Table [4]Table |
|||
|
|||
var castagnoliSSE42TableK1 *sse42Table |
|||
var castagnoliSSE42TableK2 *sse42Table |
|||
|
|||
func archAvailableCastagnoli() bool { |
|||
return sse42 |
|||
} |
|||
|
|||
func archInitCastagnoli() { |
|||
if !sse42 { |
|||
panic("arch-specific Castagnoli not available") |
|||
} |
|||
castagnoliSSE42TableK1 = new(sse42Table) |
|||
castagnoliSSE42TableK2 = new(sse42Table) |
|||
// See description in updateCastagnoli.
|
|||
// t[0][i] = CRC(i000, O)
|
|||
// t[1][i] = CRC(0i00, O)
|
|||
// t[2][i] = CRC(00i0, O)
|
|||
// t[3][i] = CRC(000i, O)
|
|||
// where O is a sequence of K zeros.
|
|||
var tmp [castagnoliK2]byte |
|||
for b := 0; b < 4; b++ { |
|||
for i := 0; i < 256; i++ { |
|||
val := uint32(i) << uint32(b*8) |
|||
castagnoliSSE42TableK1[b][i] = castagnoliSSE42(val, tmp[:castagnoliK1]) |
|||
castagnoliSSE42TableK2[b][i] = castagnoliSSE42(val, tmp[:]) |
|||
} |
|||
} |
|||
} |
|||
|
|||
// castagnoliShift computes the CRC32-C of K1 or K2 zeroes (depending on the
|
|||
// table given) with the given initial crc value. This corresponds to
|
|||
// CRC(crc, O) in the description in updateCastagnoli.
|
|||
func castagnoliShift(table *sse42Table, crc uint32) uint32 { |
|||
return table[3][crc>>24] ^ |
|||
table[2][(crc>>16)&0xFF] ^ |
|||
table[1][(crc>>8)&0xFF] ^ |
|||
table[0][crc&0xFF] |
|||
} |
|||
|
|||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { |
|||
if !sse42 { |
|||
panic("not available") |
|||
} |
|||
|
|||
// This method is inspired from the algorithm in Intel's white paper:
|
|||
// "Fast CRC Computation for iSCSI Polynomial Using CRC32 Instruction"
|
|||
// The same strategy of splitting the buffer in three is used but the
|
|||
// combining calculation is different; the complete derivation is explained
|
|||
// below.
|
|||
//
|
|||
// -- The basic idea --
|
|||
//
|
|||
// The CRC32 instruction (available in SSE4.2) can process 8 bytes at a
|
|||
// time. In recent Intel architectures the instruction takes 3 cycles;
|
|||
// however the processor can pipeline up to three instructions if they
|
|||
// don't depend on each other.
|
|||
//
|
|||
// Roughly this means that we can process three buffers in about the same
|
|||
// time we can process one buffer.
|
|||
//
|
|||
// The idea is then to split the buffer in three, CRC the three pieces
|
|||
// separately and then combine the results.
|
|||
//
|
|||
// Combining the results requires precomputed tables, so we must choose a
|
|||
// fixed buffer length to optimize. The longer the length, the faster; but
|
|||
// only buffers longer than this length will use the optimization. We choose
|
|||
// two cutoffs and compute tables for both:
|
|||
// - one around 512: 168*3=504
|
|||
// - one around 4KB: 1344*3=4032
|
|||
//
|
|||
// -- The nitty gritty --
|
|||
//
|
|||
// Let CRC(I, X) be the non-inverted CRC32-C of the sequence X (with
|
|||
// initial non-inverted CRC I). This function has the following properties:
|
|||
// (a) CRC(I, AB) = CRC(CRC(I, A), B)
|
|||
// (b) CRC(I, A xor B) = CRC(I, A) xor CRC(0, B)
|
|||
//
|
|||
// Say we want to compute CRC(I, ABC) where A, B, C are three sequences of
|
|||
// K bytes each, where K is a fixed constant. Let O be the sequence of K zero
|
|||
// bytes.
|
|||
//
|
|||
// CRC(I, ABC) = CRC(I, ABO xor C)
|
|||
// = CRC(I, ABO) xor CRC(0, C)
|
|||
// = CRC(CRC(I, AB), O) xor CRC(0, C)
|
|||
// = CRC(CRC(I, AO xor B), O) xor CRC(0, C)
|
|||
// = CRC(CRC(I, AO) xor CRC(0, B), O) xor CRC(0, C)
|
|||
// = CRC(CRC(CRC(I, A), O) xor CRC(0, B), O) xor CRC(0, C)
|
|||
//
|
|||
// The castagnoliSSE42Triple function can compute CRC(I, A), CRC(0, B),
|
|||
// and CRC(0, C) efficiently. We just need to find a way to quickly compute
|
|||
// CRC(uvwx, O) given a 4-byte initial value uvwx. We can precompute these
|
|||
// values; since we can't have a 32-bit table, we break it up into four
|
|||
// 8-bit tables:
|
|||
//
|
|||
// CRC(uvwx, O) = CRC(u000, O) xor
|
|||
// CRC(0v00, O) xor
|
|||
// CRC(00w0, O) xor
|
|||
// CRC(000x, O)
|
|||
//
|
|||
// We can compute tables corresponding to the four terms for all 8-bit
|
|||
// values.
|
|||
|
|||
crc = ^crc |
|||
|
|||
// If a buffer is long enough to use the optimization, process the first few
|
|||
// bytes to align the buffer to an 8 byte boundary (if necessary).
|
|||
if len(p) >= castagnoliK1*3 { |
|||
delta := int(uintptr(unsafe.Pointer(&p[0])) & 7) |
|||
if delta != 0 { |
|||
delta = 8 - delta |
|||
crc = castagnoliSSE42(crc, p[:delta]) |
|||
p = p[delta:] |
|||
} |
|||
} |
|||
|
|||
// Process 3*K2 at a time.
|
|||
for len(p) >= castagnoliK2*3 { |
|||
// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
|
|||
crcA, crcB, crcC := castagnoliSSE42Triple( |
|||
crc, 0, 0, |
|||
p, p[castagnoliK2:], p[castagnoliK2*2:], |
|||
castagnoliK2/24) |
|||
|
|||
// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
|
|||
crcAB := castagnoliShift(castagnoliSSE42TableK2, crcA) ^ crcB |
|||
// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
|
|||
crc = castagnoliShift(castagnoliSSE42TableK2, crcAB) ^ crcC |
|||
p = p[castagnoliK2*3:] |
|||
} |
|||
|
|||
// Process 3*K1 at a time.
|
|||
for len(p) >= castagnoliK1*3 { |
|||
// Compute CRC(I, A), CRC(0, B), and CRC(0, C).
|
|||
crcA, crcB, crcC := castagnoliSSE42Triple( |
|||
crc, 0, 0, |
|||
p, p[castagnoliK1:], p[castagnoliK1*2:], |
|||
castagnoliK1/24) |
|||
|
|||
// CRC(I, AB) = CRC(CRC(I, A), O) xor CRC(0, B)
|
|||
crcAB := castagnoliShift(castagnoliSSE42TableK1, crcA) ^ crcB |
|||
// CRC(I, ABC) = CRC(CRC(I, AB), O) xor CRC(0, C)
|
|||
crc = castagnoliShift(castagnoliSSE42TableK1, crcAB) ^ crcC |
|||
p = p[castagnoliK1*3:] |
|||
} |
|||
|
|||
// Use the simple implementation for what's left.
|
|||
crc = castagnoliSSE42(crc, p) |
|||
return ^crc |
|||
} |
|||
|
|||
func archAvailableIEEE() bool { |
|||
return useFastIEEE |
|||
} |
|||
|
|||
var archIeeeTable8 *slicing8Table |
|||
|
|||
func archInitIEEE() { |
|||
if !useFastIEEE { |
|||
panic("not available") |
|||
} |
|||
// We still use slicing-by-8 for small buffers.
|
|||
archIeeeTable8 = slicingMakeTable(IEEE) |
|||
} |
|||
|
|||
func archUpdateIEEE(crc uint32, p []byte) uint32 { |
|||
if !useFastIEEE { |
|||
panic("not available") |
|||
} |
|||
|
|||
if len(p) >= 64 { |
|||
left := len(p) & 15 |
|||
do := len(p) - left |
|||
crc = ^ieeeCLMUL(^crc, p[:do]) |
|||
p = p[do:] |
|||
} |
|||
if len(p) == 0 { |
|||
return crc |
|||
} |
|||
return slicingUpdate(crc, archIeeeTable8, p) |
|||
} |
|||
@ -1,319 +0,0 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved. |
|||
// Use of this source code is governed by a BSD-style |
|||
// license that can be found in the LICENSE file. |
|||
|
|||
// +build gc |
|||
|
|||
#define NOSPLIT 4 |
|||
#define RODATA 8 |
|||
|
|||
// castagnoliSSE42 updates the (non-inverted) crc with the given buffer.
//
// The routine first consumes 1, 2 and/or 4 bytes as needed to bring the data
// pointer to an 8-byte boundary, then processes 8 bytes per CRC32Q, and
// finally handles a tail of 4, 2 and/or 1 bytes.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
	MOVL crc+0(FP), AX     // CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// If there are fewer than 8 bytes to process, skip alignment.
	CMPQ CX, $8
	JL less_than_8

	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	// Process the first few bytes to 8-byte align the input.

	// BX = 8 - BX. We need to process this many bytes to align.
	// (BX is in 1..7 here, so (BX-1) XOR 7 == 8 - BX.)
	SUBQ $1, BX
	XORQ $7, BX

	// Bit 0 of BX set: consume one byte.
	BTQ $0, BX
	JNC align_2

	CRC32B (SI), AX
	DECQ CX
	INCQ SI

align_2:
	// Bit 1 of BX set: consume two bytes.
	BTQ $1, BX
	JNC align_4

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $2, CX
	ADDQ $2, SI

align_4:
	// Bit 2 of BX set: consume four bytes.
	BTQ $2, BX
	JNC aligned

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06

	SUBQ $4, CX
	ADDQ $4, SI

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL less_than_8

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

less_than_8:
	// We may have some bytes left over; process 4 bytes, then 2, then 1.
	// Only bits 0..2 of CX are examined, so CX itself is not updated.
	BTQ $2, CX
	JNC less_than_4

	// CRC32L (SI), AX
	BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $4, SI

less_than_4:
	BTQ $1, CX
	JNC less_than_2

	// CRC32W (SI), AX
	BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06
	ADDQ $2, SI

less_than_2:
	BTQ $0, CX
	JNC done

	CRC32B (SI), AX

done:
	MOVL AX, ret+32(FP)
	RET
|||
|
|||
// castagnoliSSE42Triple updates three (non-inverted) crcs with (24*rounds)
// bytes from each buffer.
//
// The three CRC32Q chains are independent of each other and are interleaved.
// The loop body runs before the counter is tested, so rounds must be at
// least 1.
//
// func castagnoliSSE42Triple(
//     crcA, crcB, crcC uint32,
//     a, b, c []byte,
//     rounds uint32,
// ) (retA uint32, retB uint32, retC uint32)
TEXT ·castagnoliSSE42Triple(SB), NOSPLIT, $0
	MOVL crcA+0(FP), AX
	MOVL crcB+4(FP), CX
	MOVL crcC+8(FP), DX

	MOVQ a+16(FP), R8   // data pointer
	MOVQ b+40(FP), R9   // data pointer
	MOVQ c+64(FP), R10  // data pointer

	MOVL rounds+88(FP), R11

loop:
	// 24 bytes from each buffer per iteration, 8 bytes at a time.
	CRC32Q (R8), AX
	CRC32Q (R9), CX
	CRC32Q (R10), DX

	CRC32Q 8(R8), AX
	CRC32Q 8(R9), CX
	CRC32Q 8(R10), DX

	CRC32Q 16(R8), AX
	CRC32Q 16(R9), CX
	CRC32Q 16(R10), DX

	ADDQ $24, R8
	ADDQ $24, R9
	ADDQ $24, R10

	DECQ R11
	JNZ loop

	MOVL AX, retA+96(FP)
	MOVL CX, retB+100(FP)
	MOVL DX, retC+104(FP)
	RET
|||
|
|||
// haveSSE42 reports whether the CPU advertises SSE4.2: it executes CPUID with
// EAX=1 and returns bit 20 of ECX.
//
// func haveSSE42() bool
TEXT ·haveSSE42(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX       // EAX = 1
	CPUID
	SHRQ $20, CX  // move ECX bit 20 to bit 0
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
|||
|
|||
// haveCLMUL reports whether the CPU advertises PCLMULQDQ: it executes CPUID
// with EAX=1 and returns bit 1 of ECX.
//
// func haveCLMUL() bool
TEXT ·haveCLMUL(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX      // EAX = 1
	CPUID
	SHRQ $1, CX  // move ECX bit 1 to bit 0
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
|||
|
|||
// haveSSE41 reports whether the CPU advertises SSE4.1: it executes CPUID with
// EAX=1 and returns bit 19 of ECX.
//
// func haveSSE41() bool
TEXT ·haveSSE41(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX       // EAX = 1
	CPUID
	SHRQ $19, CX  // move ECX bit 19 to bit 0
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
|||
|
|||
// CRC32 polynomial data
//
// These constants are lifted from the
// Linux kernel, since they avoid the costly
// PSHUFB 16 byte reversal proposed in the
// original Intel paper.
//
// r2r1kp, r4r3kp and rupolykp each hold two 64-bit values (16 bytes);
// r5kp holds a single 64-bit value.
DATA r2r1kp<>+0(SB)/8, $0x154442bd4
DATA r2r1kp<>+8(SB)/8, $0x1c6e41596
DATA r4r3kp<>+0(SB)/8, $0x1751997d0
DATA r4r3kp<>+8(SB)/8, $0x0ccaa009e
DATA rupolykp<>+0(SB)/8, $0x1db710641
DATA rupolykp<>+8(SB)/8, $0x1f7011641
DATA r5kp<>+0(SB)/8, $0x163cd6124

GLOBL r2r1kp<>(SB), RODATA, $16
GLOBL r4r3kp<>(SB), RODATA, $16
GLOBL rupolykp<>(SB), RODATA, $16
GLOBL r5kp<>(SB), RODATA, $8
|||
|
|||
// Based on http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
// len(p) must be at least 64, and must be a multiple of 16.
//
// ieeeCLMUL folds the buffer 64 bytes at a time into four 128-bit
// accumulators (X1..X4), folds those into X1, folds any remaining 16-byte
// blocks, and finally reduces X1 to a 32-bit value.

// func ieeeCLMUL(crc uint32, p []byte) uint32
TEXT ·ieeeCLMUL(SB), NOSPLIT, $0
	MOVL crc+0(FP), X0     // Initial CRC value
	MOVQ p+8(FP), SI       // data pointer
	MOVQ p_len+16(FP), CX  // len(p)

	// Load the first 64 bytes and XOR the initial CRC into the first block.
	MOVOU (SI), X1
	MOVOU 16(SI), X2
	MOVOU 32(SI), X3
	MOVOU 48(SI), X4
	PXOR X0, X1
	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Less than 64 bytes left
	JB remain64

	MOVOA r2r1kp<>+0(SB), X0

loopback64:
	MOVOA X1, X5
	MOVOA X2, X6
	MOVOA X3, X7
	MOVOA X4, X8

	// Low quadwords of each accumulator times the low constant.
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0, X0, X2
	PCLMULQDQ $0, X0, X3
	PCLMULQDQ $0, X0, X4

	// Load next early
	MOVOU (SI), X11
	MOVOU 16(SI), X12
	MOVOU 32(SI), X13
	MOVOU 48(SI), X14

	// High quadwords of each accumulator times the high constant.
	PCLMULQDQ $0x11, X0, X5
	PCLMULQDQ $0x11, X0, X6
	PCLMULQDQ $0x11, X0, X7
	PCLMULQDQ $0x11, X0, X8

	PXOR X5, X1
	PXOR X6, X2
	PXOR X7, X3
	PXOR X8, X4

	// Fold in the next 64 bytes of input.
	PXOR X11, X1
	PXOR X12, X2
	PXOR X13, X3
	PXOR X14, X4

	ADDQ $64, SI  // buf+=64
	SUBQ $64, CX  // len-=64
	CMPQ CX, $64  // Less than 64 bytes left?
	JGE loopback64

	// Fold result into a single register (X1)
remain64:
	MOVOA r4r3kp<>+0(SB), X0

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X2, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X3, X1

	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X4, X1

	// If there is less than 16 bytes left we are done
	CMPQ CX, $16
	JB finish

	// Encode 16 bytes
remain16:
	MOVOU (SI), X10
	MOVOA X1, X5
	PCLMULQDQ $0, X0, X1
	PCLMULQDQ $0x11, X0, X5
	PXOR X5, X1
	PXOR X10, X1
	SUBQ $16, CX
	ADDQ $16, SI
	CMPQ CX, $16
	JGE remain16

finish:
	// Fold final result into 32 bits and return it
	PCMPEQB X3, X3  // X3 = all ones; turned into a mask below
	PCLMULQDQ $1, X1, X0
	PSRLDQ $8, X1
	PXOR X0, X1

	MOVOA X1, X2
	MOVQ r5kp<>+0(SB), X0

	// Creates 32 bit mask. Note that we don't care about upper half.
	PSRLQ $32, X3

	PSRLDQ $4, X2
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	MOVOA rupolykp<>+0(SB), X0

	MOVOA X1, X2
	PAND X3, X1
	PCLMULQDQ $0x10, X0, X1
	PAND X3, X1
	PCLMULQDQ $0, X0, X1
	PXOR X2, X1

	// PEXTRD $1, X1, AX (SSE 4.1)
	BYTE $0x66; BYTE $0x0f; BYTE $0x3a
	BYTE $0x16; BYTE $0xc8; BYTE $0x01
	MOVL AX, ret+32(FP)

	RET
|||
@ -1,43 +0,0 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// +build !appengine,!gccgo
|
|||
|
|||
package crc32 |
|||
|
|||
// This file contains the code to call the SSE 4.2 version of the Castagnoli
|
|||
// CRC.
|
|||
|
|||
// haveSSE42 is defined in crc32_amd64p32.s and uses CPUID to test for SSE 4.2
// support.
func haveSSE42() bool

// castagnoliSSE42 is defined in crc32_amd64p32.s and uses the SSE4.2 CRC32
// instruction.
//go:noescape
func castagnoliSSE42(crc uint32, p []byte) uint32

// sse42 caches the result of haveSSE42, evaluated once at package
// initialization.
var sse42 = haveSSE42()
|||
|
|||
func archAvailableCastagnoli() bool { |
|||
return sse42 |
|||
} |
|||
|
|||
func archInitCastagnoli() { |
|||
if !sse42 { |
|||
panic("not available") |
|||
} |
|||
// No initialization necessary.
|
|||
} |
|||
|
|||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { |
|||
if !sse42 { |
|||
panic("not available") |
|||
} |
|||
return castagnoliSSE42(crc, p) |
|||
} |
|||
|
|||
// There is no accelerated IEEE implementation in this file:
// archAvailableIEEE always reports false and the other two entry points
// panic if called.
func archAvailableIEEE() bool                    { return false }
func archInitIEEE()                              { panic("not available") }
func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }
|||
@ -1,67 +0,0 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved. |
|||
// Use of this source code is governed by a BSD-style |
|||
// license that can be found in the LICENSE file. |
|||
|
|||
// +build gc |
|||
|
|||
#define NOSPLIT 4 |
|||
#define RODATA 8 |
|||
|
|||
// castagnoliSSE42 updates crc with the bytes of p using the CRC32
// instruction. The CRC is complemented on entry and again before it is
// returned.
//
// func castagnoliSSE42(crc uint32, p []byte) uint32
TEXT ·castagnoliSSE42(SB), NOSPLIT, $0
	MOVL crc+0(FP), AX    // CRC value
	MOVL p+4(FP), SI      // data pointer
	MOVL p_len+8(FP), CX  // len(p)

	NOTL AX

	// If there's less than 8 bytes to process, we do it byte-by-byte.
	CMPQ CX, $8
	JL cleanup

	// Process individual bytes until the input is 8-byte aligned.
startup:
	MOVQ SI, BX
	ANDQ $7, BX
	JZ aligned

	CRC32B (SI), AX
	DECQ CX
	INCQ SI
	JMP startup

aligned:
	// The input is now 8-byte aligned and we can process 8-byte chunks.
	CMPQ CX, $8
	JL cleanup

	CRC32Q (SI), AX
	ADDQ $8, SI
	SUBQ $8, CX
	JMP aligned

cleanup:
	// We may have some bytes left over that we process one at a time.
	CMPQ CX, $0
	JE done

	CRC32B (SI), AX
	INCQ SI
	DECQ CX
	JMP cleanup

done:
	NOTL AX
	MOVL AX, ret+16(FP)
	RET
|||
|
|||
// haveSSE42 reports whether the CPU advertises SSE4.2: it executes CPUID with
// EAX=1 and returns bit 20 of ECX.
//
// func haveSSE42() bool
TEXT ·haveSSE42(SB), NOSPLIT, $0
	XORQ AX, AX
	INCL AX       // EAX = 1
	CPUID
	SHRQ $20, CX  // move ECX bit 20 to bit 0
	ANDQ $1, CX
	MOVB CX, ret+0(FP)
	RET
|||
|
|||
@ -1,89 +0,0 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// This file contains CRC32 algorithms that are not specific to any architecture
|
|||
// and don't use hardware acceleration.
|
|||
//
|
|||
// The simple (and slow) CRC32 implementation only uses a 256*4 bytes table.
|
|||
//
|
|||
// The slicing-by-8 algorithm is a faster implementation that uses a bigger
|
|||
// table (8*256*4 bytes).
|
|||
|
|||
package crc32 |
|||
|
|||
// simpleMakeTable allocates and constructs a Table for the specified
|
|||
// polynomial. The table is suitable for use with the simple algorithm
|
|||
// (simpleUpdate).
|
|||
func simpleMakeTable(poly uint32) *Table { |
|||
t := new(Table) |
|||
simplePopulateTable(poly, t) |
|||
return t |
|||
} |
|||
|
|||
// simplePopulateTable constructs a Table for the specified polynomial, suitable
|
|||
// for use with simpleUpdate.
|
|||
func simplePopulateTable(poly uint32, t *Table) { |
|||
for i := 0; i < 256; i++ { |
|||
crc := uint32(i) |
|||
for j := 0; j < 8; j++ { |
|||
if crc&1 == 1 { |
|||
crc = (crc >> 1) ^ poly |
|||
} else { |
|||
crc >>= 1 |
|||
} |
|||
} |
|||
t[i] = crc |
|||
} |
|||
} |
|||
|
|||
// simpleUpdate uses the simple algorithm to update the CRC, given a table that
|
|||
// was previously computed using simpleMakeTable.
|
|||
func simpleUpdate(crc uint32, tab *Table, p []byte) uint32 { |
|||
crc = ^crc |
|||
for _, v := range p { |
|||
crc = tab[byte(crc)^v] ^ (crc >> 8) |
|||
} |
|||
return ^crc |
|||
} |
|||
|
|||
// Use slicing-by-8 when payload >= this value.
|
|||
const slicing8Cutoff = 16 |
|||
|
|||
// slicing8Table is array of 8 Tables, used by the slicing-by-8 algorithm.
|
|||
type slicing8Table [8]Table |
|||
|
|||
// slicingMakeTable constructs a slicing8Table for the specified polynomial. The
|
|||
// table is suitable for use with the slicing-by-8 algorithm (slicingUpdate).
|
|||
func slicingMakeTable(poly uint32) *slicing8Table { |
|||
t := new(slicing8Table) |
|||
simplePopulateTable(poly, &t[0]) |
|||
for i := 0; i < 256; i++ { |
|||
crc := t[0][i] |
|||
for j := 1; j < 8; j++ { |
|||
crc = t[0][crc&0xFF] ^ (crc >> 8) |
|||
t[j][i] = crc |
|||
} |
|||
} |
|||
return t |
|||
} |
|||
|
|||
// slicingUpdate uses the slicing-by-8 algorithm to update the CRC, given a
|
|||
// table that was previously computed using slicingMakeTable.
|
|||
func slicingUpdate(crc uint32, tab *slicing8Table, p []byte) uint32 { |
|||
if len(p) >= slicing8Cutoff { |
|||
crc = ^crc |
|||
for len(p) > 8 { |
|||
crc ^= uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24 |
|||
crc = tab[0][p[7]] ^ tab[1][p[6]] ^ tab[2][p[5]] ^ tab[3][p[4]] ^ |
|||
tab[4][crc>>24] ^ tab[5][(crc>>16)&0xFF] ^ |
|||
tab[6][(crc>>8)&0xFF] ^ tab[7][crc&0xFF] |
|||
p = p[8:] |
|||
} |
|||
crc = ^crc |
|||
} |
|||
if len(p) == 0 { |
|||
return crc |
|||
} |
|||
return simpleUpdate(crc, &tab[0], p) |
|||
} |
|||
@ -1,15 +0,0 @@ |
|||
// Copyright 2011 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// +build !amd64,!amd64p32,!s390x
|
|||
|
|||
package crc32 |
|||
|
|||
// No accelerated implementations are provided under this file's build
// constraints: the availability checks always report false and the init and
// update entry points panic if called.

func archAvailableIEEE() bool                    { return false }
func archInitIEEE()                              { panic("not available") }
func archUpdateIEEE(crc uint32, p []byte) uint32 { panic("not available") }

func archAvailableCastagnoli() bool                    { return false }
func archInitCastagnoli()                              { panic("not available") }
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { panic("not available") }
|||
@ -1,91 +0,0 @@ |
|||
// Copyright 2016 The Go Authors. All rights reserved.
|
|||
// Use of this source code is governed by a BSD-style
|
|||
// license that can be found in the LICENSE file.
|
|||
|
|||
// +build s390x
|
|||
|
|||
package crc32 |
|||
|
|||
const (
	// vxMinLen is the minimum buffer length, in bytes, at which the
	// vectorized routines are used.
	vxMinLen = 64
	// vxAlignMask is cleared from the length so that the vectorized routines
	// only see a multiple of 16 bytes.
	vxAlignMask = 15 // align to 16 bytes
)
|||
|
|||
// hasVectorFacility reports whether the machine has the z/Architecture
// vector facility installed and enabled.
func hasVectorFacility() bool

// hasVX caches the result of hasVectorFacility, evaluated once at package
// initialization.
var hasVX = hasVectorFacility()

// vectorizedCastagnoli implements CRC32 using vector instructions.
// It is defined in crc32_s390x.s.
//go:noescape
func vectorizedCastagnoli(crc uint32, p []byte) uint32

// vectorizedIEEE implements CRC32 using vector instructions.
// It is defined in crc32_s390x.s.
//go:noescape
func vectorizedIEEE(crc uint32, p []byte) uint32
|||
|
|||
func archAvailableCastagnoli() bool { |
|||
return hasVX |
|||
} |
|||
|
|||
var archCastagnoliTable8 *slicing8Table |
|||
|
|||
func archInitCastagnoli() { |
|||
if !hasVX { |
|||
panic("not available") |
|||
} |
|||
// We still use slicing-by-8 for small buffers.
|
|||
archCastagnoliTable8 = slicingMakeTable(Castagnoli) |
|||
} |
|||
|
|||
// archUpdateCastagnoli calculates the checksum of p using
|
|||
// vectorizedCastagnoli.
|
|||
func archUpdateCastagnoli(crc uint32, p []byte) uint32 { |
|||
if !hasVX { |
|||
panic("not available") |
|||
} |
|||
// Use vectorized function if data length is above threshold.
|
|||
if len(p) >= vxMinLen { |
|||
aligned := len(p) & ^vxAlignMask |
|||
crc = vectorizedCastagnoli(crc, p[:aligned]) |
|||
p = p[aligned:] |
|||
} |
|||
if len(p) == 0 { |
|||
return crc |
|||
} |
|||
return slicingUpdate(crc, archCastagnoliTable8, p) |
|||
} |
|||
|
|||
func archAvailableIEEE() bool { |
|||
return hasVX |
|||
} |
|||
|
|||
var archIeeeTable8 *slicing8Table |
|||
|
|||
func archInitIEEE() { |
|||
if !hasVX { |
|||
panic("not available") |
|||
} |
|||
// We still use slicing-by-8 for small buffers.
|
|||
archIeeeTable8 = slicingMakeTable(IEEE) |
|||
} |
|||
|
|||
// archUpdateIEEE calculates the checksum of p using vectorizedIEEE.
|
|||
func archUpdateIEEE(crc uint32, p []byte) uint32 { |
|||
if !hasVX { |
|||
panic("not available") |
|||
} |
|||
// Use vectorized function if data length is above threshold.
|
|||
if len(p) >= vxMinLen { |
|||
aligned := len(p) & ^vxAlignMask |
|||
crc = vectorizedIEEE(crc, p[:aligned]) |
|||
p = p[aligned:] |
|||
} |
|||
if len(p) == 0 { |
|||
return crc |
|||
} |
|||
return slicingUpdate(crc, archIeeeTable8, p) |
|||
} |
|||
@ -1,249 +0,0 @@ |
|||
// Copyright 2016 The Go Authors. All rights reserved. |
|||
// Use of this source code is governed by a BSD-style |
|||
// license that can be found in the LICENSE file. |
|||
|
|||
// +build s390x |
|||
|
|||
#include "textflag.h" |
|||
|
|||
// Vector register range containing CRC-32 constants |
|||
|
|||
#define CONST_PERM_LE2BE V9 |
|||
#define CONST_R2R1 V10 |
|||
#define CONST_R4R3 V11 |
|||
#define CONST_R5 V12 |
|||
#define CONST_RU_POLY V13 |
|||
#define CONST_CRC_POLY V14 |
|||
|
|||
// The CRC-32 constant block contains reduction constants to fold and |
|||
// process particular chunks of the input data stream in parallel. |
|||
// |
|||
// Note that the constant definitions below are extended in order to compute |
|||
// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. |
|||
// The rightmost doubleword can be 0 to prevent contribution to the result or |
|||
// can be multiplied by 1 to perform an XOR without the need for a separate |
|||
// VECTOR EXCLUSIVE OR instruction. |
|||
// |
|||
// The polynomials used are bit-reflected: |
|||
// |
|||
// IEEE: P'(x) = 0x0edb88320 |
|||
// Castagnoli: P'(x) = 0x082f63b78 |
|||
|
|||
// IEEE polynomial constants
//
// The first 96 bytes are loaded as six 16-byte vectors by vectorizedBody.
DATA ·crcleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908   // LE-to-BE mask
DATA ·crcleconskp+8(SB)/8, $0x0706050403020100
DATA ·crcleconskp+16(SB)/8, $0x00000001c6e41596  // R2
DATA ·crcleconskp+24(SB)/8, $0x0000000154442bd4  // R1
DATA ·crcleconskp+32(SB)/8, $0x00000000ccaa009e  // R4
DATA ·crcleconskp+40(SB)/8, $0x00000001751997d0  // R3
DATA ·crcleconskp+48(SB)/8, $0x0000000000000000
DATA ·crcleconskp+56(SB)/8, $0x0000000163cd6124  // R5
DATA ·crcleconskp+64(SB)/8, $0x0000000000000000
DATA ·crcleconskp+72(SB)/8, $0x00000001F7011641  // u'
DATA ·crcleconskp+80(SB)/8, $0x0000000000000000
DATA ·crcleconskp+88(SB)/8, $0x00000001DB710641  // P'(x) << 1

GLOBL ·crcleconskp(SB), RODATA, $144
|||
|
|||
// Castagnoli polynomial constants
//
// The first 96 bytes are loaded as six 16-byte vectors by vectorizedBody.
DATA ·crccleconskp+0(SB)/8, $0x0F0E0D0C0B0A0908   // LE-to-BE mask
DATA ·crccleconskp+8(SB)/8, $0x0706050403020100
DATA ·crccleconskp+16(SB)/8, $0x000000009e4addf8  // R2
DATA ·crccleconskp+24(SB)/8, $0x00000000740eef02  // R1
DATA ·crccleconskp+32(SB)/8, $0x000000014cd00bd6  // R4
DATA ·crccleconskp+40(SB)/8, $0x00000000f20c0dfe  // R3
DATA ·crccleconskp+48(SB)/8, $0x0000000000000000
DATA ·crccleconskp+56(SB)/8, $0x00000000dd45aab8  // R5
DATA ·crccleconskp+64(SB)/8, $0x0000000000000000
DATA ·crccleconskp+72(SB)/8, $0x00000000dea713f1  // u'
DATA ·crccleconskp+80(SB)/8, $0x0000000000000000
DATA ·crccleconskp+88(SB)/8, $0x0000000105ec76f0  // P'(x) << 1

GLOBL ·crccleconskp(SB), RODATA, $144
|||
|
|||
// hasVectorFacility stores the facility list with STFLE into a zeroed 24-byte
// stack buffer, tests the 0x40 bit of the first byte of the third doubleword,
// and then executes a vector instruction to confirm the facility is enabled.
// NOTE(review): the tested bit is presumably facility bit 129 (vector
// facility) — confirm against the z/Architecture Principles of Operation.
//
// func hasVectorFacility() bool
TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1
	MOVD $x-24(SP), R1
	XC $24, 0(R1), 0(R1)  // clear the storage
	MOVD $2, R0           // R0 is the number of double words stored -1
	WORD $0xB2B01000      // STFLE 0(R1)
	XOR R0, R0            // reset the value of R0
	MOVBZ z-8(SP), R1
	AND $0x40, R1
	BEQ novector

vectorinstalled:
	// check if the vector instruction has been enabled
	VLEIB $0, $0xF, V16
	VLGVB $0, V16, R1
	CMPBNE R1, $0xF, novector
	MOVB $1, ret+0(FP)  // have vx
	RET

novector:
	MOVB $0, ret+0(FP)  // no vx
	RET
|||
|
|||
// The CRC-32 function(s) use these calling conventions: |
|||
// |
|||
// Parameters: |
|||
// |
|||
// R2: Initial CRC value, typically ~0; and final CRC (return) value. |
|||
// R3: Input buffer pointer, performance might be improved if the |
|||
// buffer is on a doubleword boundary. |
|||
// R4: Length of the buffer, must be 64 bytes or greater. |
|||
// |
|||
// Register usage: |
|||
// |
|||
// R5: CRC-32 constant pool base pointer. |
|||
// V0: Initial CRC value and intermediate constants and results. |
|||
// V1..V4: Data for CRC computation. |
|||
// V5..V8: Next data chunks that are fetched from the input buffer. |
|||
// |
|||
// V9..V14: CRC-32 constants. |
|||
|
|||
// vectorizedIEEE loads its arguments into R2..R4, points R5 at the IEEE
// constant pool and branches to the shared vectorizedBody, which writes the
// result.
//
// func vectorizedIEEE(crc uint32, p []byte) uint32
TEXT ·vectorizedIEEE(SB), NOSPLIT, $0
	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
	MOVD p+8(FP), R3       // data pointer
	MOVD p_len+16(FP), R4  // len(p)

	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
	MOVD $·crcleconskp(SB), R5
	BR vectorizedBody<>(SB)
|||
|
|||
// vectorizedCastagnoli loads its arguments into R2..R4, points R5 at the
// Castagnoli constant pool and branches to the shared vectorizedBody, which
// writes the result.
//
// func vectorizedCastagnoli(crc uint32, p []byte) uint32
TEXT ·vectorizedCastagnoli(SB), NOSPLIT, $0
	MOVWZ crc+0(FP), R2    // R2 stores the CRC value
	MOVD p+8(FP), R3       // data pointer
	MOVD p_len+16(FP), R4  // len(p)

	// R5: crc-32 constant pool base pointer, constant is used to reduce crc
	MOVD $·crccleconskp(SB), R5
	BR vectorizedBody<>(SB)
|||
|
|||
// vectorizedBody is the shared tail of vectorizedIEEE and
// vectorizedCastagnoli. On entry R2 holds the CRC, R3 the data pointer, R4
// the length and R5 the constant pool base. The CRC is complemented on entry
// and again before it is stored to the caller's result slot. A length below
// 64 bytes deliberately faults.
TEXT vectorizedBody<>(SB), NOSPLIT, $0
	XOR $0xffffffff, R2  // NOTW R2
	VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY

	// Load the initial CRC value into the rightmost word of V0
	VZERO V0
	VLVGF $3, R2, V0

	// Crash if the input size is less than 64-bytes.
	CMP R4, $64
	BLT crash

	// Load a 64-byte data chunk and XOR with CRC
	VLM 0(R3), V1, V4  // 64-bytes into V1..V4

	// Reflect the data if the CRC operation is in the bit-reflected domain
	VPERM V1, V1, CONST_PERM_LE2BE, V1
	VPERM V2, V2, CONST_PERM_LE2BE, V2
	VPERM V3, V3, CONST_PERM_LE2BE, V3
	VPERM V4, V4, CONST_PERM_LE2BE, V4

	VX V0, V1, V1  // V1 ^= CRC
	ADD $64, R3    // BUF = BUF + 64
	ADD $(-64), R4

	// Check remaining buffer size and jump to proper folding method
	CMP R4, $64
	BLT less_than_64bytes

fold_64bytes_loop:
	// Load the next 64-byte data chunk into V5 to V8
	VLM 0(R3), V5, V8
	VPERM V5, V5, CONST_PERM_LE2BE, V5
	VPERM V6, V6, CONST_PERM_LE2BE, V6
	VPERM V7, V7, CONST_PERM_LE2BE, V7
	VPERM V8, V8, CONST_PERM_LE2BE, V8

	// Perform a GF(2) multiplication of the doublewords in V1 with
	// the reduction constants in CONST_R2R1. The intermediate result is
	// then folded (accumulated) with the next data chunk in V5 and
	// stored in V1. Repeat this step for the register contents
	// in V2, V3, and V4 respectively.

	VGFMAG CONST_R2R1, V1, V5, V1
	VGFMAG CONST_R2R1, V2, V6, V2
	VGFMAG CONST_R2R1, V3, V7, V3
	VGFMAG CONST_R2R1, V4, V8, V4

	// Adjust buffer pointer and length for next loop
	ADD $64, R3     // BUF = BUF + 64
	ADD $(-64), R4  // LEN = LEN - 64

	CMP R4, $64
	BGE fold_64bytes_loop

less_than_64bytes:
	// Fold V1 to V4 into a single 128-bit value in V1
	VGFMAG CONST_R4R3, V1, V2, V1
	VGFMAG CONST_R4R3, V1, V3, V1
	VGFMAG CONST_R4R3, V1, V4, V1

	// Check whether to continue with 64-bit folding
	CMP R4, $16
	BLT final_fold

fold_16bytes_loop:
	VL 0(R3), V2  // Load next data chunk
	VPERM V2, V2, CONST_PERM_LE2BE, V2

	VGFMAG CONST_R4R3, V1, V2, V1  // Fold next data chunk

	// Adjust buffer pointer and size for folding next data chunk
	ADD $16, R3
	ADD $-16, R4

	// Process remaining data chunks
	CMP R4, $16
	BGE fold_16bytes_loop

final_fold:
	VLEIB $7, $0x40, V9
	VSRLB V9, CONST_R4R3, V0
	VLEIG $0, $1, V0

	VGFMG V0, V1, V1

	VLEIB $7, $0x20, V9          // Shift by words
	VSRLB V9, V1, V2             // Store remaining bits in V2
	VUPLLF V1, V1                // Split rightmost doubleword
	VGFMAG CONST_R5, V1, V2, V1  // V1 = (V1 * R5) XOR V2

	// The input values to the Barrett reduction are the degree-63 polynomial
	// in V1 (R(x)), degree-32 generator polynomial, and the reduction
	// constant u. The Barrett reduction result is the CRC value of R(x) mod
	// P(x).
	//
	// The Barrett reduction algorithm is defined as:
	//
	//    1. T1(x) = floor( R(x) / x^32 ) GF2MUL u
	//    2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x)
	//    3. C(x)  = R(x) XOR T2(x) mod x^32
	//
	// Note: To compensate the division by x^32, use the vector unpack
	// instruction to move the leftmost word into the leftmost doubleword
	// of the vector register. The rightmost doubleword is multiplied
	// with zero to not contribute to the intermediate results.

	// T1(x) = floor( R(x) / x^32 ) GF2MUL u
	VUPLLF V1, V2
	VGFMG CONST_RU_POLY, V2, V2

	// Compute the GF(2) product of the CRC polynomial in CONST_CRC_POLY with
	// T1(x) in V2 and XOR the intermediate result, T2(x), with the value in
	// V1. The final result is in the rightmost word of V2.

	VUPLLF V2, V2
	VGFMAG CONST_CRC_POLY, V2, V1, V2

done:
	VLGVF $2, V2, R2
	XOR $0xffffffff, R2  // NOTW R2
	MOVWZ R2, ret + 32(FP)
	RET

crash:
	MOVD $0, (R0)  // input size is less than 64-bytes
|||
|
Before Width: | Height: | Size: 8.0 KiB After Width: | Height: | Size: 35 KiB |
|
After Width: | Height: | Size: 8.9 KiB |
Loading…
Reference in new issue