Implement hardware accelerated CRC32 and SHA1, using them if possible (#3348)

* Implement hardware-accelerated CRC32 and SHA1, and use them if possible.
The generic CRC32 function is also replaced with zlib's, as it is much more performant than our implementation.
A full hash of a ~731 MB disc took only ~369 ms with this, and the generic CRC32 isn't far behind at ~659 ms.
SHA1 should perform 4x faster if the user's CPU supports the SHA instructions.

Co-authored-by: YoshiRulz <OSSYoshiRulz@gmail.com>
Co-authored-by: Morilli <35152647+Morilli@users.noreply.github.com>
CasualPokePlayer 2022-08-09 23:30:17 -07:00 committed by GitHub
parent 0236a820ec
commit 32e8afcedc
18 changed files with 2467 additions and 27 deletions
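
To put the timing figures above in context, they amount to a simple throughput measurement over the native CRC entry point. A minimal benchmark sketch (not part of this commit; the 64 MiB buffer size, POSIX timer, and linking against libbizhash are assumptions):

```c
// Hypothetical micro-benchmark: time whichever CRC32 implementation
// BizCalcCrcFunc() selects for this CPU over a large pseudo-random buffer.
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

typedef uint32_t (*crc32_func)(uint32_t crc, const uint8_t *buf, uint32_t len);

// Exported by libbizhash (see bizinterface.c below).
extern crc32_func BizCalcCrcFunc(void);

int main(void) {
    const uint32_t len = 64u * 1024 * 1024;
    uint8_t *buf = malloc(len);
    if (!buf) return 1;
    for (uint32_t i = 0; i < len; i++)
        buf[i] = (uint8_t)((i * 2654435761u) >> 24); // arbitrary filler data

    crc32_func crc32 = BizCalcCrcFunc();
    struct timespec t0, t1;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    // The library functions take the raw running CRC; the standard CRC-32
    // pre/post-conditioning (init 0xFFFFFFFF, final inversion) is applied here.
    uint32_t crc = ~crc32(0xFFFFFFFFu, buf, len);
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double ms = (t1.tv_sec - t0.tv_sec) * 1e3 + (t1.tv_nsec - t0.tv_nsec) / 1e6;
    printf("crc32 = %08x over %u bytes in %.1f ms\n", crc, len, ms);
    free(buf);
    return 0;
}
```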

Assets/dll/libbizhash.dll (new binary file, not shown)

Assets/dll/libbizhash.so (new binary file, not shown)

@@ -0,0 +1,17 @@
CC = gcc
CFLAGS = -Wall -Wextra -O3 -flto -fvisibility=internal -fPIC -Icommon
LFLAGS = -s -shared
SRCS = $(wildcard common/*.c) $(wildcard crc32/*.c) $(wildcard sha1/*.c) bizinterface.c
ifeq ($(OS),Windows_NT)
EXT = dll
else
EXT = so
endif
all: libbizhash
libbizhash: $(SRCS)
$(CC) $(CFLAGS) $(SRCS) -o ../../Assets/dll/libbizhash.$(EXT) $(LFLAGS)

@@ -0,0 +1,22 @@
LibBizHash is the unmanaged side for BizHawk's hashing.
CRC32 code is taken from [zlib-ng](https://github.com/zlib-ng/zlib-ng) with massive slashing of code and various tweaks. This code is licensed under the zlib license.
SHA1 code is taken from [SHA-Intrinsics](https://github.com/noloader/SHA-Intrinsics) with some tweaks. This code is in the public domain.
To build, just run `make` in this directory. Note that gcc 10 or later is required (due to missing intrinsics in older gcc versions).
zlib-ng's license:
```
(C) 1995-2013 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.
Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
```

@@ -0,0 +1,54 @@
#include <stdbool.h>
#include "common.h"
__attribute__((visibility("default")))
crc32_func BizCalcCrcFunc(void) {
x86_check_features();
return (x86_cpu_has_pclmulqdq && x86_cpu_has_sse41) ? &crc32_pclmulqdq : &crc32_braid;
}
__attribute__((visibility("default")))
bool BizSupportsShaInstructions(void) {
x86_check_features();
return x86_cpu_has_sha && x86_cpu_has_sse41;
}
__attribute__((visibility("default")))
void BizCalcSha1(uint32_t state[5], const uint8_t data[], uint32_t length) {
x86_check_features();
uint64_t bit_length = length * 8ULL;
// hash most of the data, leaving at most 63 bytes left
sha1_sha(state, data, length);
data += length & ~0x3F;
length &= 0x3F;
// copy all remaining data to a buffer
uint8_t block[64] = {0};
memcpy(block, data, length);
// pad data with '1' bit
block[length++] = 0x80;
// the last 8 bytes in the last block contain the data length;
// if the current block is too full hash it and start a new one (here the old one is cleared and re-used)
if (__builtin_expect(length > 56, false)) {
sha1_sha(state, block, 64);
memset(block, 0, 56);
}
// fill the last 8 bytes in the last block with the data length in bits (big endian)
for (int i = 0; i != 8; i++) {
block[63 - i] = bit_length >> i * 8;
}
// hash the last block
sha1_sha(state, block, 64);
// byteswap state (to big endian format)
state[0] = __builtin_bswap32(state[0]);
state[1] = __builtin_bswap32(state[1]);
state[2] = __builtin_bswap32(state[2]);
state[3] = __builtin_bswap32(state[3]);
state[4] = __builtin_bswap32(state[4]);
}
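
A quick way to exercise BizCalcSha1 directly is the standard FIPS 180 test vector SHA1("abc") = a9993e364706816aba3e25717850c26c9cd0d89d. The sketch below (not part of the commit) mirrors how the managed FastSHA1 wrapper uses the function, and only runs on CPUs with the SHA extensions:

```c
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Exported by libbizhash (declared above).
extern bool BizSupportsShaInstructions(void);
extern void BizCalcSha1(uint32_t state[5], const uint8_t data[], uint32_t length);

int main(void) {
    if (!BizSupportsShaInstructions()) {
        puts("SHA instructions not available on this CPU");
        return 1;
    }
    // Standard SHA-1 initial state; the library does not set this itself.
    uint32_t state[5] = { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
    const uint8_t msg[3] = { 'a', 'b', 'c' };
    BizCalcSha1(state, msg, 3);
    // BizCalcSha1 byte-swaps the state words at the end, so reading the state
    // as raw bytes yields the digest in the usual order.
    uint8_t digest[20];
    memcpy(digest, state, sizeof digest);
    for (int i = 0; i < 20; i++) printf("%02x", digest[i]);
    putchar('\n'); // expect a9993e364706816aba3e25717850c26c9cd0d89d
    return 0;
}
```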

@@ -0,0 +1,23 @@
/* cpu_features.h -- CPU architecture feature check
* Copyright (C) 2017 Hans Kristian Rosbach
* For conditions of distribution and use, see copyright notice in README.md
*/
#ifndef COMMON_H_
#define COMMON_H_
#include <stdint.h>
#include <string.h>
#include "x86_features.h"
/* CRC32 */
typedef uint32_t (*crc32_func)(uint32_t crc32, const uint8_t *buf, uint32_t len);
extern uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, uint32_t len);
extern uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, uint32_t len);
/* SHA1 */
void sha1_sha(uint32_t state[5], const uint8_t data[], uint32_t length);
#endif

@@ -0,0 +1,63 @@
/* x86_features.c - x86 feature check
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Author:
* Jim Kukunas
*
* For conditions of distribution and use, see copyright notice in README.md
*/
#include <cpuid.h>
#include <string.h>
int x86_cpu_has_avx2;
int x86_cpu_has_avx512;
int x86_cpu_has_avx512vnni;
int x86_cpu_has_sse2;
int x86_cpu_has_ssse3;
int x86_cpu_has_sse41;
int x86_cpu_has_sse42;
int x86_cpu_has_pclmulqdq;
int x86_cpu_has_vpclmulqdq;
int x86_cpu_has_tzcnt;
int x86_cpu_has_sha;
void x86_check_features(void) {
static int features_checked = 0;
if (features_checked)
return;
unsigned eax, ebx, ecx, edx;
unsigned maxbasic;
__cpuid(0, maxbasic, ebx, ecx, edx);
__cpuid(1 /*CPU_PROCINFO_AND_FEATUREBITS*/, eax, ebx, ecx, edx);
x86_cpu_has_sse2 = edx & 0x4000000;
x86_cpu_has_ssse3 = ecx & 0x200;
x86_cpu_has_sse41 = ecx & 0x80000;
x86_cpu_has_sse42 = ecx & 0x100000;
x86_cpu_has_pclmulqdq = ecx & 0x2;
if (maxbasic >= 7) {
__cpuid_count(7, 0, eax, ebx, ecx, edx);
// check BMI1 bit
// Reference: https://software.intel.com/sites/default/files/article/405250/how-to-detect-new-instruction-support-in-the-4th-generation-intel-core-processor-family.pdf
x86_cpu_has_tzcnt = ebx & 0x8;
// check AVX2 bit
x86_cpu_has_avx2 = ebx & 0x20;
x86_cpu_has_avx512 = ebx & 0x00010000;
x86_cpu_has_avx512vnni = ecx & 0x800;
x86_cpu_has_vpclmulqdq = ecx & 0x400;
// check SHA bit
x86_cpu_has_sha = ebx & 0x20000000;
} else {
x86_cpu_has_tzcnt = 0;
x86_cpu_has_avx2 = 0;
x86_cpu_has_avx512 = 0;
x86_cpu_has_avx512vnni = 0;
x86_cpu_has_vpclmulqdq = 0;
x86_cpu_has_sha = 0;
}
}
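
For reference, the SHA flag read above (0x20000000) is bit 29 of EBX from CPUID leaf 7, sub-leaf 0. A standalone check using GCC's __get_cpuid_count helper (a sketch, not used by the library) looks like this:

```c
#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned eax, ebx, ecx, edx;
    int has_sha = 0;
    // __get_cpuid_count returns 0 if the requested leaf is not supported.
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
        has_sha = (ebx >> 29) & 1;
    printf("SHA extensions: %s\n", has_sha ? "yes" : "no");
    return 0;
}
```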

@@ -0,0 +1,23 @@
/* x86_features.h -- check for CPU features
* Copyright (C) 2013 Intel Corporation Jim Kukunas
* For conditions of distribution and use, see copyright notice in README.md
*/
#ifndef X86_FEATURES_H_
#define X86_FEATURES_H_
extern int x86_cpu_has_avx2;
extern int x86_cpu_has_avx512;
extern int x86_cpu_has_avx512vnni;
extern int x86_cpu_has_sse2;
extern int x86_cpu_has_ssse3;
extern int x86_cpu_has_sse41;
extern int x86_cpu_has_sse42;
extern int x86_cpu_has_pclmulqdq;
extern int x86_cpu_has_vpclmulqdq;
extern int x86_cpu_has_tzcnt;
extern int x86_cpu_has_sha;
void x86_check_features(void);
#endif /* X86_FEATURES_H_ */

@@ -0,0 +1,111 @@
/* crc32_braid.c -- compute the CRC-32 of a data stream
* Copyright (C) 1995-2022 Mark Adler
* For conditions of distribution and use, see copyright notice in README.md
*
* This interleaved implementation of a CRC makes use of pipelined multiple
* arithmetic-logic units, commonly found in modern CPU cores. It is due to
* Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in upstream.
*/
#include "common.h"
#include "crc32_braid_tbl.h"
#define DO1 c = crc_table[(c ^ *buf++) & 0xff] ^ (c >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
static uint32_t crc_word(uint64_t data) {
unsigned k;
for (k = 0; k < sizeof(uint64_t); k++)
data = (data >> 8) ^ crc_table[data & 0xff];
return (uint32_t)data;
}
__attribute__((visibility("hidden")))
uint32_t crc32_braid(uint32_t crc, const uint8_t *buf, uint32_t len) {
register uint32_t c;
c = crc;
/* If provided enough bytes, do a braided CRC calculation. */
if (len >= 5 * sizeof(uint64_t) + sizeof(uint64_t) - 1) {
uint64_t blks;
uint64_t const *words;
unsigned k;
/* Compute the CRC up to a uint64_t boundary. */
while (len && ((uint64_t)buf & (sizeof(uint64_t) - 1)) != 0) {
len--;
DO1;
}
/* Compute the CRC on as many 5 uint64_t blocks as are available. */
blks = len / (5 * sizeof(uint64_t));
len -= blks * 5 * sizeof(uint64_t);
words = (uint64_t const *)buf;
uint64_t crc0, word0, comb;
uint64_t crc1, word1;
uint64_t crc2, word2;
uint64_t crc3, word3;
uint64_t crc4, word4;
/* Initialize the CRC for each braid. */
crc0 = c;
crc1 = 0;
crc2 = 0;
crc3 = 0;
crc4 = 0;
/* Process the first blks-1 blocks, computing the CRCs on each braid independently. */
while (--blks) {
/* Load the word for each braid into registers. */
word0 = crc0 ^ words[0];
word1 = crc1 ^ words[1];
word2 = crc2 ^ words[2];
word3 = crc3 ^ words[3];
word4 = crc4 ^ words[4];
words += 5;
/* Compute and update the CRC for each word. The loop should get unrolled. */
crc0 = crc_braid_table[0][word0 & 0xff];
crc1 = crc_braid_table[0][word1 & 0xff];
crc2 = crc_braid_table[0][word2 & 0xff];
crc3 = crc_braid_table[0][word3 & 0xff];
crc4 = crc_braid_table[0][word4 & 0xff];
for (k = 1; k < sizeof(uint64_t); k++) {
crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
}
}
/* Process the last block, combining the CRCs of the 5 braids at the same time. */
comb = crc_word(crc0 ^ words[0]);
comb = crc_word(crc1 ^ words[1] ^ comb);
comb = crc_word(crc2 ^ words[2] ^ comb);
comb = crc_word(crc3 ^ words[3] ^ comb);
comb = crc_word(crc4 ^ words[4] ^ comb);
words += 5;
c = comb;
/* Update the pointer to the remaining bytes to process. */
buf = (const uint8_t *)words;
}
/* Complete the computation of the CRC on any remaining bytes. */
while (len >= 8) {
len -= 8;
DO8;
}
while (len) {
len--;
DO1;
}
return c;
}
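
As a reference point for the braid code above, the classic byte-at-a-time CRC-32 it generalizes is shown below (a self-contained sketch that builds its own table rather than using crc32_braid_tbl.h); the well-known check value for the ASCII string "123456789" is 0xCBF43926:

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32_simple(uint32_t crc, const uint8_t *buf, uint32_t len) {
    static uint32_t table[256];
    if (!table[1]) { // build the table for the reflected polynomial once
        for (uint32_t i = 0; i < 256; i++) {
            uint32_t c = i;
            for (int k = 0; k < 8; k++)
                c = (c & 1) ? (c >> 1) ^ 0xEDB88320u : c >> 1;
            table[i] = c;
        }
    }
    while (len--) // same per-byte step as the DO1 macro above
        crc = table[(crc ^ *buf++) & 0xFF] ^ (crc >> 8);
    return crc;
}

int main(void) {
    const char *s = "123456789";
    // Pre/post-conditioning is done by the managed wrapper in this commit;
    // here it is applied explicitly to get the conventional CRC-32 value.
    uint32_t crc = ~crc32_simple(0xFFFFFFFFu, (const uint8_t *)s, (uint32_t)strlen(s));
    printf("%08x\n", crc); // expect cbf43926
    return 0;
}
```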

File diff suppressed because it is too large.

@@ -0,0 +1,16 @@
/* crc32_fold.h -- crc32 folding interface
* Copyright (C) 2021 Nathan Moinvaziri
* For conditions of distribution and use, see copyright notice in README.md
*/
#ifndef CRC32_FOLD_H_
#define CRC32_FOLD_H_
#define CRC32_FOLD_BUFFER_SIZE (16 * 4)
/* sizeof(__m128i) * (4 folds) */
typedef struct crc32_fold_s {
uint8_t fold[CRC32_FOLD_BUFFER_SIZE];
uint32_t value;
} crc32_fold;
#endif

@@ -0,0 +1,449 @@
/*
* Compute the CRC32 using a parallelized folding approach with the PCLMULQDQ
* instruction.
*
* A white paper describing this algorithm can be found at:
* doc/crc-pclmulqdq.pdf
*
* Copyright (C) 2013 Intel Corporation. All rights reserved.
* Copyright (C) 2016 Marian Beermann (support for initial value)
* Authors:
* Wajdi Feghali <wajdi.k.feghali@intel.com>
* Jim Guilford <james.guilford@intel.com>
* Vinodh Gopal <vinodh.gopal@intel.com>
* Erdinc Ozturk <erdinc.ozturk@intel.com>
* Jim Kukunas <james.t.kukunas@linux.intel.com>
*
* For conditions of distribution and use, see copyright notice in README.md
*/
#include <immintrin.h>
#include <wmmintrin.h>
#include <smmintrin.h> // _mm_extract_epi32
#include <assert.h>
#include "common.h"
#include "crc32_fold.h"
extern uint64_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, uint64_t len, __m128i init_crc,
int32_t first);
extern uint64_t fold_16_vpclmulqdq_copy(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, uint8_t *dst, const uint8_t *src, uint64_t len);
__attribute__((target("sse4.1", "pclmul")))
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc3, ps_res;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res = _mm_xor_ps(ps_crc0, ps_crc3);
*xmm_crc0 = *xmm_crc1;
*xmm_crc1 = *xmm_crc2;
*xmm_crc2 = x_tmp3;
*xmm_crc3 = _mm_castps_si128(ps_res);
}
__attribute__((target("sse4.1", "pclmul")))
static void fold_2(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
__m128i x_tmp3, x_tmp2;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res31, ps_res20;
x_tmp3 = *xmm_crc3;
x_tmp2 = *xmm_crc2;
*xmm_crc3 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res31 = _mm_xor_ps(ps_crc3, ps_crc1);
*xmm_crc2 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res20 = _mm_xor_ps(ps_crc0, ps_crc2);
*xmm_crc0 = x_tmp2;
*xmm_crc1 = x_tmp3;
*xmm_crc2 = _mm_castps_si128(ps_res20);
*xmm_crc3 = _mm_castps_si128(ps_res31);
}
__attribute__((target("sse4.1", "pclmul")))
static void fold_3(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
__m128i x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3, ps_res32, ps_res21, ps_res10;
x_tmp3 = *xmm_crc3;
*xmm_crc3 = *xmm_crc2;
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_res32 = _mm_xor_ps(ps_crc2, ps_crc3);
*xmm_crc2 = *xmm_crc1;
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_res21 = _mm_xor_ps(ps_crc1, ps_crc2);
*xmm_crc1 = *xmm_crc0;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_res10 = _mm_xor_ps(ps_crc0, ps_crc1);
*xmm_crc0 = x_tmp3;
*xmm_crc1 = _mm_castps_si128(ps_res10);
*xmm_crc2 = _mm_castps_si128(ps_res21);
*xmm_crc3 = _mm_castps_si128(ps_res32);
}
__attribute__((target("sse4.1", "pclmul")))
static void fold_4(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
__m128i x_tmp0, x_tmp1, x_tmp2, x_tmp3;
__m128 ps_crc0, ps_crc1, ps_crc2, ps_crc3;
__m128 ps_t0, ps_t1, ps_t2, ps_t3;
__m128 ps_res0, ps_res1, ps_res2, ps_res3;
x_tmp0 = *xmm_crc0;
x_tmp1 = *xmm_crc1;
x_tmp2 = *xmm_crc2;
x_tmp3 = *xmm_crc3;
*xmm_crc0 = _mm_clmulepi64_si128(*xmm_crc0, xmm_fold4, 0x01);
x_tmp0 = _mm_clmulepi64_si128(x_tmp0, xmm_fold4, 0x10);
ps_crc0 = _mm_castsi128_ps(*xmm_crc0);
ps_t0 = _mm_castsi128_ps(x_tmp0);
ps_res0 = _mm_xor_ps(ps_crc0, ps_t0);
*xmm_crc1 = _mm_clmulepi64_si128(*xmm_crc1, xmm_fold4, 0x01);
x_tmp1 = _mm_clmulepi64_si128(x_tmp1, xmm_fold4, 0x10);
ps_crc1 = _mm_castsi128_ps(*xmm_crc1);
ps_t1 = _mm_castsi128_ps(x_tmp1);
ps_res1 = _mm_xor_ps(ps_crc1, ps_t1);
*xmm_crc2 = _mm_clmulepi64_si128(*xmm_crc2, xmm_fold4, 0x01);
x_tmp2 = _mm_clmulepi64_si128(x_tmp2, xmm_fold4, 0x10);
ps_crc2 = _mm_castsi128_ps(*xmm_crc2);
ps_t2 = _mm_castsi128_ps(x_tmp2);
ps_res2 = _mm_xor_ps(ps_crc2, ps_t2);
*xmm_crc3 = _mm_clmulepi64_si128(*xmm_crc3, xmm_fold4, 0x01);
x_tmp3 = _mm_clmulepi64_si128(x_tmp3, xmm_fold4, 0x10);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
ps_t3 = _mm_castsi128_ps(x_tmp3);
ps_res3 = _mm_xor_ps(ps_crc3, ps_t3);
*xmm_crc0 = _mm_castps_si128(ps_res0);
*xmm_crc1 = _mm_castps_si128(ps_res1);
*xmm_crc2 = _mm_castps_si128(ps_res2);
*xmm_crc3 = _mm_castps_si128(ps_res3);
}
static const unsigned _Alignas(32) pshufb_shf_table[60] = {
0x84838281, 0x88878685, 0x8c8b8a89, 0x008f8e8d, /* shl 15 (16 - 1)/shr1 */
0x85848382, 0x89888786, 0x8d8c8b8a, 0x01008f8e, /* shl 14 (16 - 3)/shr2 */
0x86858483, 0x8a898887, 0x8e8d8c8b, 0x0201008f, /* shl 13 (16 - 4)/shr3 */
0x87868584, 0x8b8a8988, 0x8f8e8d8c, 0x03020100, /* shl 12 (16 - 4)/shr4 */
0x88878685, 0x8c8b8a89, 0x008f8e8d, 0x04030201, /* shl 11 (16 - 5)/shr5 */
0x89888786, 0x8d8c8b8a, 0x01008f8e, 0x05040302, /* shl 10 (16 - 6)/shr6 */
0x8a898887, 0x8e8d8c8b, 0x0201008f, 0x06050403, /* shl 9 (16 - 7)/shr7 */
0x8b8a8988, 0x8f8e8d8c, 0x03020100, 0x07060504, /* shl 8 (16 - 8)/shr8 */
0x8c8b8a89, 0x008f8e8d, 0x04030201, 0x08070605, /* shl 7 (16 - 9)/shr9 */
0x8d8c8b8a, 0x01008f8e, 0x05040302, 0x09080706, /* shl 6 (16 -10)/shr10*/
0x8e8d8c8b, 0x0201008f, 0x06050403, 0x0a090807, /* shl 5 (16 -11)/shr11*/
0x8f8e8d8c, 0x03020100, 0x07060504, 0x0b0a0908, /* shl 4 (16 -12)/shr12*/
0x008f8e8d, 0x04030201, 0x08070605, 0x0c0b0a09, /* shl 3 (16 -13)/shr13*/
0x01008f8e, 0x05040302, 0x09080706, 0x0d0c0b0a, /* shl 2 (16 -14)/shr14*/
0x0201008f, 0x06050403, 0x0a090807, 0x0e0d0c0b /* shl 1 (16 -15)/shr15*/
};
__attribute__((target("sse4.1", "pclmul")))
static void partial_fold(const size_t len, __m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2,
__m128i *xmm_crc3, __m128i *xmm_crc_part) {
const __m128i xmm_fold4 = _mm_set_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
const __m128i xmm_mask3 = _mm_set1_epi32((int32_t)0x80808080);
__m128i xmm_shl, xmm_shr, xmm_tmp1, xmm_tmp2, xmm_tmp3;
__m128i xmm_a0_0, xmm_a0_1;
__m128 ps_crc3, psa0_0, psa0_1, ps_res;
xmm_shl = _mm_load_si128((__m128i *)pshufb_shf_table + (len - 1));
xmm_shr = xmm_shl;
xmm_shr = _mm_xor_si128(xmm_shr, xmm_mask3);
xmm_a0_0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shl);
*xmm_crc0 = _mm_shuffle_epi8(*xmm_crc0, xmm_shr);
xmm_tmp1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shl);
*xmm_crc0 = _mm_or_si128(*xmm_crc0, xmm_tmp1);
*xmm_crc1 = _mm_shuffle_epi8(*xmm_crc1, xmm_shr);
xmm_tmp2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shl);
*xmm_crc1 = _mm_or_si128(*xmm_crc1, xmm_tmp2);
*xmm_crc2 = _mm_shuffle_epi8(*xmm_crc2, xmm_shr);
xmm_tmp3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shl);
*xmm_crc2 = _mm_or_si128(*xmm_crc2, xmm_tmp3);
*xmm_crc3 = _mm_shuffle_epi8(*xmm_crc3, xmm_shr);
*xmm_crc_part = _mm_shuffle_epi8(*xmm_crc_part, xmm_shl);
*xmm_crc3 = _mm_or_si128(*xmm_crc3, *xmm_crc_part);
xmm_a0_1 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x10);
xmm_a0_0 = _mm_clmulepi64_si128(xmm_a0_0, xmm_fold4, 0x01);
ps_crc3 = _mm_castsi128_ps(*xmm_crc3);
psa0_0 = _mm_castsi128_ps(xmm_a0_0);
psa0_1 = _mm_castsi128_ps(xmm_a0_1);
ps_res = _mm_xor_ps(ps_crc3, psa0_0);
ps_res = _mm_xor_ps(ps_res, psa0_1);
*xmm_crc3 = _mm_castps_si128(ps_res);
}
__attribute__((target("sse4.1", "pclmul")))
static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
*fold0 = _mm_load_si128(fold + 0);
*fold1 = _mm_load_si128(fold + 1);
*fold2 = _mm_load_si128(fold + 2);
*fold3 = _mm_load_si128(fold + 3);
}
__attribute__((target("sse4.1", "pclmul")))
static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1, __m128i fold2, __m128i fold3) {
_mm_storeu_si128(fold + 0, fold0);
_mm_storeu_si128(fold + 1, fold1);
_mm_storeu_si128(fold + 2, fold2);
_mm_storeu_si128(fold + 3, fold3);
}
__attribute__((target("sse4.1", "pclmul")))
static uint32_t crc32_fold_pclmulqdq_reset(crc32_fold *crc) {
__m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
__m128i xmm_zero = _mm_setzero_si128();
crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_zero, xmm_zero, xmm_zero);
return 0;
}
#define ONCE(op) if (first) { first = 0; op; }
#define XOR_INITIAL(where) ONCE(where = _mm_xor_si128(where, xmm_initial))
__attribute__((target("sse4.1", "pclmul")))
void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, uint64_t len, uint32_t init_crc) {
size_t algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i xmm_crc_part = _mm_setzero_si128();
__m128i xmm_initial = _mm_cvtsi32_si128(init_crc);
int32_t first = init_crc != 0;
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
* bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
* by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
if (len < 16) {
goto partial;
}
algn_diff = ((uintptr_t)16 - ((uintptr_t)src & 0xF)) & 0xF;
if (algn_diff) {
xmm_crc_part = _mm_loadu_si128((__m128i *)src);
XOR_INITIAL(xmm_crc_part);
if (algn_diff < 4 && init_crc != 0) {
xmm_t0 = xmm_crc_part;
xmm_crc_part = _mm_loadu_si128((__m128i*)src + 1);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
src += 16;
len -= 16;
}
partial_fold(algn_diff, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
src += algn_diff;
len -= algn_diff;
}
if (x86_cpu_has_vpclmulqdq && x86_cpu_has_avx512 && (len >= 256)) {
uint64_t n = fold_16_vpclmulqdq(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, src, len, xmm_initial, first);
first = 0;
len -= n;
src += n;
}
while (len >= 64) {
len -= 64;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
xmm_t3 = _mm_load_si128((__m128i *)src + 3);
src += 64;
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
XOR_INITIAL(xmm_t0);
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t3);
}
// len = num bytes left - 64
if (len >= 48) {
len -= 48;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
xmm_t2 = _mm_load_si128((__m128i *)src + 2);
src += 48;
XOR_INITIAL(xmm_t0);
fold_3(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t0);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
} else if (len >= 32) {
len -= 32;
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
src += 32;
XOR_INITIAL(xmm_t0);
fold_2(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t1);
} else if (len >= 16) {
len -= 16;
xmm_t0 = _mm_load_si128((__m128i *)src);
src += 16;
XOR_INITIAL(xmm_t0);
fold_1(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t0);
}
partial:
if (len) {
memcpy(&xmm_crc_part, src, (size_t)len);
partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
}
crc32_fold_save((__m128i *)crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3);
}
static const unsigned _Alignas(16) crc_k[] = {
0xccaa009e, 0x00000000, /* rk1 */
0x751997d0, 0x00000001, /* rk2 */
0xccaa009e, 0x00000000, /* rk5 */
0x63cd6124, 0x00000001, /* rk6 */
0xf7011640, 0x00000001, /* rk7 */
0xdb710640, 0x00000001 /* rk8 */
};
static const unsigned _Alignas(16) crc_mask[4] = {
0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000
};
static const unsigned _Alignas(16) crc_mask2[4] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
__attribute__((target("sse4.1", "pclmul")))
static uint32_t crc32_fold_pclmulqdq_final(crc32_fold *crc) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
__m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
crc32_fold_load((__m128i *)crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
// k1
crc_fold = _mm_load_si128((__m128i *)crc_k);
x_tmp0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x10);
xmm_crc0 = _mm_clmulepi64_si128(xmm_crc0, crc_fold, 0x01);
xmm_crc1 = _mm_xor_si128(xmm_crc1, x_tmp0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_crc0);
x_tmp1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x10);
xmm_crc1 = _mm_clmulepi64_si128(xmm_crc1, crc_fold, 0x01);
xmm_crc2 = _mm_xor_si128(xmm_crc2, x_tmp1);
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_crc1);
x_tmp2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x10);
xmm_crc2 = _mm_clmulepi64_si128(xmm_crc2, crc_fold, 0x01);
xmm_crc3 = _mm_xor_si128(xmm_crc3, x_tmp2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
// k5
crc_fold = _mm_load_si128((__m128i *)crc_k + 1);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc0 = _mm_srli_si128(xmm_crc0, 8);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc0 = xmm_crc3;
xmm_crc3 = _mm_slli_si128(xmm_crc3, 4);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc0);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask2);
// k7
xmm_crc1 = xmm_crc3;
xmm_crc2 = xmm_crc3;
crc_fold = _mm_load_si128((__m128i *)crc_k + 2);
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_and_si128(xmm_crc3, xmm_mask);
xmm_crc2 = xmm_crc3;
xmm_crc3 = _mm_clmulepi64_si128(xmm_crc3, crc_fold, 0x10);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc2);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_crc1);
crc->value = (uint32_t)_mm_extract_epi32(xmm_crc3, 2);
return crc->value;
}
__attribute__((target("sse4.1", "pclmul"))) __attribute__((visibility("hidden")))
uint32_t crc32_pclmulqdq(uint32_t crc32, const uint8_t *buf, uint32_t len) {
// For lens < 64, crc32_braid method is faster.
if (len < 64)
return crc32_braid(crc32, buf, len);
crc32_fold _Alignas(16) crc_state;
crc32_fold_pclmulqdq_reset(&crc_state);
crc32_fold_pclmulqdq(&crc_state, buf, len, crc32 ^ 0xFFFFFFFFU);
return crc32_fold_pclmulqdq_final(&crc_state);
}
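
Since BizCalcCrcFunc returns either implementation interchangeably, the two should agree for any input. A small consistency-check sketch (not part of the commit; it must be compiled together with these sources, as the symbols are hidden in the shared library):

```c
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include "common.h"

int main(void) {
    x86_check_features();
    if (!(x86_cpu_has_pclmulqdq && x86_cpu_has_sse41))
        return 0; // nothing to compare on this CPU

    uint8_t buf[4096];
    for (size_t i = 0; i < sizeof buf; i++)
        buf[i] = (uint8_t)rand();

    // Sweep lengths so the < 64 byte fallback, the 64-byte folding loop,
    // and the partial tail handling all get hit.
    for (uint32_t len = 0; len <= sizeof buf; len += 7) {
        uint32_t a = crc32_braid(0xFFFFFFFFu, buf, len);
        uint32_t b = crc32_pclmulqdq(0xFFFFFFFFu, buf, len);
        assert(a == b);
    }
    return 0;
}
```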

@@ -0,0 +1,99 @@
/* crc32_fold_vpclmulqdq.c -- VPCMULQDQ-based CRC32 folding implementation.
* Copyright Wangyang Guo (wangyang.guo@intel.com)
* For conditions of distribution and use, see copyright notice in README.md
*/
#include <immintrin.h>
#include <stdint.h>
#define ONCE(op) if (first) { first = 0; op; }
#define XOR_INITIAL(where) ONCE(where = _mm512_xor_si512(where, zmm_initial))
__attribute__((target("sse4.1", "pclmul", "avx512f", "vpclmulqdq")))
uint64_t fold_16_vpclmulqdq(__m128i *xmm_crc0, __m128i *xmm_crc1,
__m128i *xmm_crc2, __m128i *xmm_crc3, const uint8_t *src, uint64_t len,
__m128i init_crc, int32_t first) {
__m512i zmm_initial = _mm512_zextsi128_si512(init_crc);
__m512i zmm_t0, zmm_t1, zmm_t2, zmm_t3;
__m512i zmm_crc0, zmm_crc1, zmm_crc2, zmm_crc3;
__m512i z0, z1, z2, z3;
uint64_t len_tmp = len;
const __m512i zmm_fold4 = _mm512_set4_epi32(0x00000001, 0x54442bd4, 0x00000001, 0xc6e41596);
const __m512i zmm_fold16 = _mm512_set4_epi32(0x00000001, 0x1542778a, 0x00000001, 0x322d1430);
// zmm register init
zmm_crc0 = _mm512_setzero_si512();
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
XOR_INITIAL(zmm_t0);
zmm_crc1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_crc2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_crc3 = _mm512_loadu_si512((__m512i *)src + 3);
// already have intermediate CRC in xmm registers
// fold4 with 4 xmm_crc to get zmm_crc0
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc0, 0);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc1, 1);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc2, 2);
zmm_crc0 = _mm512_inserti32x4(zmm_crc0, *xmm_crc3, 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
len -= 256;
src += 256;
// fold-16 loops
while (len >= 256) {
zmm_t0 = _mm512_loadu_si512((__m512i *)src);
zmm_t1 = _mm512_loadu_si512((__m512i *)src + 1);
zmm_t2 = _mm512_loadu_si512((__m512i *)src + 2);
zmm_t3 = _mm512_loadu_si512((__m512i *)src + 3);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x01);
z1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x01);
z2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x01);
z3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold16, 0x10);
zmm_crc1 = _mm512_clmulepi64_epi128(zmm_crc1, zmm_fold16, 0x10);
zmm_crc2 = _mm512_clmulepi64_epi128(zmm_crc2, zmm_fold16, 0x10);
zmm_crc3 = _mm512_clmulepi64_epi128(zmm_crc3, zmm_fold16, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc1 = _mm512_xor_si512(z1, zmm_crc1);
zmm_crc2 = _mm512_xor_si512(z2, zmm_crc2);
zmm_crc3 = _mm512_xor_si512(z3, zmm_crc3);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_t0);
zmm_crc1 = _mm512_xor_si512(zmm_crc1, zmm_t1);
zmm_crc2 = _mm512_xor_si512(zmm_crc2, zmm_t2);
zmm_crc3 = _mm512_xor_si512(zmm_crc3, zmm_t3);
len -= 256;
src += 256;
}
// zmm_crc[0,1,2,3] -> zmm_crc0
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc1);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc2);
z0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x01);
zmm_crc0 = _mm512_clmulepi64_epi128(zmm_crc0, zmm_fold4, 0x10);
zmm_crc0 = _mm512_xor_si512(z0, zmm_crc0);
zmm_crc0 = _mm512_xor_si512(zmm_crc0, zmm_crc3);
// zmm_crc0 -> xmm_crc[0, 1, 2, 3]
*xmm_crc0 = _mm512_extracti32x4_epi32(zmm_crc0, 0);
*xmm_crc1 = _mm512_extracti32x4_epi32(zmm_crc0, 1);
*xmm_crc2 = _mm512_extracti32x4_epi32(zmm_crc0, 2);
*xmm_crc3 = _mm512_extracti32x4_epi32(zmm_crc0, 3);
return (len_tmp - len); // return n bytes processed
}

@@ -0,0 +1,193 @@
/* Intel SHA extensions using C intrinsics */
/* Written and place in public domain by Jeffrey Walton */
/* Based on code from Intel, and by Sean Gulley for */
/* the miTLS project. */
#include <immintrin.h>
#include "common.h"
__attribute__((target("sha", "sse4.1")))
void sha1_sha(uint32_t state[5], const uint8_t data[], uint32_t length) {
__m128i ABCD, ABCD_SAVE, E0, E0_SAVE, E1;
__m128i MSG0, MSG1, MSG2, MSG3;
const __m128i MASK = _mm_set_epi64x(0x0001020304050607ULL, 0x08090a0b0c0d0e0fULL);
/* Load initial values */
ABCD = _mm_loadu_si128((const __m128i*) state);
E0 = _mm_set_epi32(state[4], 0, 0, 0);
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
while (length >= 64) {
/* Save current state */
ABCD_SAVE = ABCD;
E0_SAVE = E0;
/* Rounds 0-3 */
MSG0 = _mm_loadu_si128((const __m128i*)(data + 0));
MSG0 = _mm_shuffle_epi8(MSG0, MASK);
E0 = _mm_add_epi32(E0, MSG0);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
/* Rounds 4-7 */
MSG1 = _mm_loadu_si128((const __m128i*)(data + 16));
MSG1 = _mm_shuffle_epi8(MSG1, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
/* Rounds 8-11 */
MSG2 = _mm_loadu_si128((const __m128i*)(data + 32));
MSG2 = _mm_shuffle_epi8(MSG2, MASK);
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 12-15 */
MSG3 = _mm_loadu_si128((const __m128i*)(data + 48));
MSG3 = _mm_shuffle_epi8(MSG3, MASK);
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 0);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 16-19 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 0);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 20-23 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 24-27 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 28-31 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 32-35 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 1);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 36-39 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 1);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 40-43 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 44-47 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 48-51 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 52-55 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 2);
MSG0 = _mm_sha1msg1_epu32(MSG0, MSG1);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 56-59 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 2);
MSG1 = _mm_sha1msg1_epu32(MSG1, MSG2);
MSG0 = _mm_xor_si128(MSG0, MSG2);
/* Rounds 60-63 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
MSG0 = _mm_sha1msg2_epu32(MSG0, MSG3);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG2 = _mm_sha1msg1_epu32(MSG2, MSG3);
MSG1 = _mm_xor_si128(MSG1, MSG3);
/* Rounds 64-67 */
E0 = _mm_sha1nexte_epu32(E0, MSG0);
E1 = ABCD;
MSG1 = _mm_sha1msg2_epu32(MSG1, MSG0);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
MSG3 = _mm_sha1msg1_epu32(MSG3, MSG0);
MSG2 = _mm_xor_si128(MSG2, MSG0);
/* Rounds 68-71 */
E1 = _mm_sha1nexte_epu32(E1, MSG1);
E0 = ABCD;
MSG2 = _mm_sha1msg2_epu32(MSG2, MSG1);
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
MSG3 = _mm_xor_si128(MSG3, MSG1);
/* Rounds 72-75 */
E0 = _mm_sha1nexte_epu32(E0, MSG2);
E1 = ABCD;
MSG3 = _mm_sha1msg2_epu32(MSG3, MSG2);
ABCD = _mm_sha1rnds4_epu32(ABCD, E0, 3);
/* Rounds 76-79 */
E1 = _mm_sha1nexte_epu32(E1, MSG3);
E0 = ABCD;
ABCD = _mm_sha1rnds4_epu32(ABCD, E1, 3);
/* Combine state */
E0 = _mm_sha1nexte_epu32(E0, E0_SAVE);
ABCD = _mm_add_epi32(ABCD, ABCD_SAVE);
data += 64;
length -= 64;
}
/* Save state */
ABCD = _mm_shuffle_epi32(ABCD, 0x1B);
_mm_storeu_si128((__m128i*) state, ABCD);
state[4] = _mm_extract_epi32(E0, 3);
}

@@ -1,4 +1,5 @@
using System;
using System.Runtime.InteropServices;
namespace BizHawk.Common
{
@@ -8,25 +9,18 @@ namespace BizHawk.Common
/// <remarks>coefficients of the polynomial, in the format Wikipedia calls "reversed"</remarks>
public const uint POLYNOMIAL_CONST = 0xEDB88320U;
private static readonly uint[] COMBINER_INIT_STATE;
/// <summary>
/// Delegate to unmanaged code that actually does the calculation.
/// This may be hardware accelerated, if the CPU supports such.
/// </summary>
private static readonly LibBizHash.CalcCRC _calcCRC;
private static readonly uint[] CRC32Table;
private static readonly uint[] COMBINER_INIT_STATE;
static CRC32()
{
// for Add (CRC32 computation):
CRC32Table = new uint[256];
for (var i = 0U; i < 256U; i++)
{
var crc = i;
for (var j = 0; j < 8; j++)
{
var xor = (crc & 1U) == 1U;
crc >>= 1;
if (xor) crc ^= POLYNOMIAL_CONST;
}
CRC32Table[i] = crc;
}
_calcCRC = Marshal.GetDelegateForFunctionPointer<LibBizHash.CalcCRC>(LibBizHash.BizCalcCrcFunc());
// for Incorporate:
var combinerState = (COMBINER_INIT_STATE = new uint[64]).AsSpan();
@@ -80,17 +74,11 @@ namespace BizHawk.Common
/// <summary>The negated output (the typical result of the CRC calculation)</summary>
public uint Result => ~_current;
public void Add(byte datum)
public unsafe void Add(ReadOnlySpan<byte> data)
{
_current = CRC32Table[(_current ^ datum) & 0xFF] ^ (_current >> 8);
}
public void Add(ReadOnlySpan<byte> data)
{
foreach (var b in data)
fixed (byte* d = &data.GetPinnableReference())
{
// Add(b); // I assume this would be slower
_current = CRC32Table[(_current ^ b) & 0xFF] ^ (_current >> 8);
_current = _calcCRC(_current, (IntPtr) d, data.Length);
}
}

@@ -0,0 +1,22 @@
using System;
using System.Runtime.InteropServices;
namespace BizHawk.Common
{
public static class LibBizHash
{
private const CallingConvention cc = CallingConvention.Cdecl;
[UnmanagedFunctionPointer(cc)]
public delegate uint CalcCRC(uint current, IntPtr buffer, int len);
[DllImport("libbizhash", CallingConvention = cc)]
public static extern IntPtr BizCalcCrcFunc();
[DllImport("libbizhash", CallingConvention = cc)]
public static extern bool BizSupportsShaInstructions();
[DllImport("libbizhash", CallingConvention = cc)]
public static extern void BizCalcSha1(IntPtr state, byte[] data, int len);
}
}

@@ -1,11 +1,46 @@
using System;
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Security.Cryptography;
using BizHawk.Common.BufferExtensions;
namespace BizHawk.Common
{
public interface ISHA1
{
byte[] ComputeHash(byte[] buffer);
}
public sealed class NETSHA1 : ISHA1
{
private readonly SHA1 _sha1Impl;
public NETSHA1()
{
_sha1Impl = SHA1.Create();
Debug.Assert(_sha1Impl.CanReuseTransform && _sha1Impl.HashSize is SHA1Checksum.EXPECTED_LENGTH);
}
public byte[] ComputeHash(byte[] buffer)
=> _sha1Impl.ComputeHash(buffer);
}
public sealed class FastSHA1 : ISHA1
{
public unsafe byte[] ComputeHash(byte[] buffer)
{
// Set SHA1 start state
var state = stackalloc uint[] { 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476, 0xC3D2E1F0 };
// This will use dedicated SHA instructions, which perform 4x faster than a generic implementation
LibBizHash.BizCalcSha1((IntPtr)state, buffer, buffer.Length);
// The copy seems wasteful, but pinning the state down actually has a bigger performance impact
var ret = new byte[20];
Marshal.Copy((IntPtr)state, ret, 0, 20);
return ret;
}
}
/// <summary>uses <see cref="SHA1"/> implementation from BCL</summary>
/// <seealso cref="CRC32Checksum"/>
/// <seealso cref="MD5Checksum"/>
@@ -35,16 +70,17 @@ namespace BizHawk.Common
return impl.GetHashAndReset();
}
#else
private static SHA1? _sha1Impl;
private static ISHA1? _sha1Impl;
private static SHA1 SHA1Impl
private static ISHA1 SHA1Impl
{
get
{
if (_sha1Impl == null)
{
_sha1Impl = SHA1.Create();
Debug.Assert(_sha1Impl.CanReuseTransform && _sha1Impl.HashSize is EXPECTED_LENGTH);
_sha1Impl = LibBizHash.BizSupportsShaInstructions()
? new FastSHA1()
: new NETSHA1();
}
return _sha1Impl;
}

@@ -0,0 +1,57 @@
using System;
using System.Linq;
using System.Text;
using BizHawk.Common;
using Microsoft.VisualStudio.TestTools.UnitTesting;
namespace BizHawk.Tests.Common.checksums
{
[TestClass]
public sealed class SHA1Tests
{
[TestMethod]
public void TestSHA1Empty()
{
byte[] data = Array.Empty<byte>(); // empty data
byte[] expectedSha = { 0xda, 0x39, 0xa3, 0xee, 0x5e, 0x6b, 0x4b, 0x0d, 0x32, 0x55, 0xbf, 0xef, 0x95, 0x60, 0x18, 0x90, 0xaf, 0xd8, 0x07, 0x09 };
Assert.IsTrue(expectedSha.SequenceEqual(SHA1Checksum.Compute(data)));
}
[TestMethod]
public void TestSHA1Simple()
{
byte[] data = { (byte)'h', (byte)'a', (byte)'s', (byte)'h' }; // random short data
byte[] expectedSha = { 0x23, 0x46, 0xad, 0x27, 0xd7, 0x56, 0x8b, 0xa9, 0x89, 0x6f, 0x1b, 0x7d, 0xa6, 0xb5, 0x99, 0x12, 0x51, 0xde, 0xbd, 0xf2 };
Assert.IsTrue(expectedSha.SequenceEqual(SHA1Checksum.Compute(data)));
Assert.IsTrue(expectedSha.SequenceEqual(SHA1Checksum.ComputeConcat(Array.Empty<byte>(), data)));
Assert.IsTrue(expectedSha.SequenceEqual(SHA1Checksum.ComputeConcat(data, Array.Empty<byte>())));
data = new[] { (byte)'h', (byte)'a' };
byte[] data2 = { (byte)'s', (byte)'h' };
Assert.IsTrue(expectedSha.SequenceEqual(SHA1Checksum.ComputeConcat(data, data2)));
}
[TestMethod]
public void TestSHA1LessSimple()
{
const string testString = "The quick brown fox jumps over the lazy dog.";
byte[] data = Encoding.ASCII.GetBytes(testString);
byte[] expectedSha1 = { 0x40, 0x8d, 0x94, 0x38, 0x42, 0x16, 0xf8, 0x90, 0xff, 0x7a, 0x0c, 0x35, 0x28, 0xe8, 0xbe, 0xd1, 0xe0, 0xb0, 0x16, 0x21 };
Assert.IsTrue(expectedSha1.SequenceEqual(SHA1Checksum.Compute(data)));
data = new byte[65];
Encoding.ASCII.GetBytes(testString).CopyTo(data, 0);
byte[] expectedSha2 = { 0x65, 0x87, 0x84, 0xE2, 0x68, 0xBF, 0xB1, 0x67, 0x94, 0x7B, 0xB7, 0xF3, 0xFB, 0x76, 0x69, 0x62, 0x79, 0x3E, 0x8C, 0x46 };
Assert.IsTrue(expectedSha2.SequenceEqual(SHA1Checksum.Compute(new Span<byte>(data, 0, 64))));
byte[] expectedSha3 = { 0x34, 0xF3, 0xA2, 0x57, 0xBD, 0x12, 0x5E, 0x6E, 0x0E, 0x28, 0xD0, 0xE5, 0xDA, 0xBE, 0x22, 0x28, 0x97, 0xFA, 0x69, 0x55 };
Assert.IsTrue(expectedSha3.SequenceEqual(SHA1Checksum.Compute(data)));
}
}
}