Update xxHash to use Hardware CRC32C if available

This commit is contained in:
Luke Usher 2018-02-14 23:28:53 +00:00
parent ff514d82ee
commit a2f727f0a1
5 changed files with 389 additions and 1 deletions

View File

@ -195,6 +195,7 @@
<ClInclude Include="..\..\src\Common\Win32\XBAudio.h" />
<ClInclude Include="..\..\src\Common\XADPCM.h" />
<ClInclude Include="..\..\src\Common\XbePrinter.h" />
<ClInclude Include="..\..\src\CxbxKrnl\crc32c.h" />
<ClInclude Include="..\..\src\CxbxKrnl\EmuD3D8Logging.h" />
<ClInclude Include="..\..\import\stb\stb_image.h" />
<ClInclude Include="..\..\src\Common\EmuEEPROM.h" />
@ -391,6 +392,7 @@
<ClCompile Include="..\..\src\Common\CxbxDebugger.cpp" />
<ClCompile Include="..\..\src\Common\Win32\XBAudio.cpp" />
<ClCompile Include="..\..\src\Common\XbePrinter.cpp" />
<ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp" />
<ClCompile Include="..\..\src\CxbxKrnl\EmuD3D8Logging.cpp" />
<ClCompile Include="..\..\src\Common\EmuEEPROM.cpp" />
<ClCompile Include="..\..\src\Common\Logging.cpp" />

View File

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\..\src\Cxbx\DlgControllerConfig.cpp">
@ -238,6 +238,9 @@
<ClCompile Include="..\..\src\devices\MCPXDevice.cpp">
<Filter>Hardware</Filter>
</ClCompile>
<ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp">
<Filter>Shared</Filter>
</ClCompile>
<ClCompile Include="..\..\src\devices\video\swizzle.cpp">
<Filter>Hardware</Filter>
</ClCompile>
@ -474,6 +477,9 @@
<ClInclude Include="..\..\src\devices\MCPXDevice.h">
<Filter>Hardware</Filter>
</ClInclude>
<ClInclude Include="..\..\src\CxbxKrnl\crc32c.h">
<Filter>Shared</Filter>
</ClInclude>
<ClInclude Include="..\..\src\CxbxKrnl\EmuKrnlKi.h">
<Filter>Kernel</Filter>
</ClInclude>

334
src/CxbxKrnl/crc32c.cpp Normal file
View File

@ -0,0 +1,334 @@
/*
Copyright (c) 2013 - 2014, 2016 Mark Adler, Robert Vazan, Max Vysokikh
This software is provided 'as-is', without any express or implied
warranty. In no event will the author be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
*/
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS
#endif
#include "crc32c.h"
#include <intrin.h>
#define NOMINMAX
#include <algorithm>
#define POLY 0x82f63b78
#define LONG_SHIFT 8192
#define SHORT_SHIFT 256
typedef const uint8_t *buffer;
static uint32_t table[16][256];
static uint32_t long_shifts[4][256];
static uint32_t short_shifts[4][256];
static bool _tableInitialized;
void calculate_table();
/* Table-driven software version as a fall-back. This is about 15 times slower
than using the hardware instructions. This assumes little-endian integers,
as is the case on Intel processors that the assembler code here is for. */
extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crci, buffer input, size_t length)
{
buffer next = input;
#ifdef _M_X64
uint64_t crc;
#else
uint32_t crc;
#endif
crc = crci ^ 0xffffffff;
#ifdef _M_X64
while (length && ((uintptr_t)next & 7) != 0)
{
crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
--length;
}
while (length >= 16)
{
crc ^= *(uint64_t *)next;
uint64_t high = *(uint64_t *)(next + 8);
crc = table[15][crc & 0xff]
^ table[14][(crc >> 8) & 0xff]
^ table[13][(crc >> 16) & 0xff]
^ table[12][(crc >> 24) & 0xff]
^ table[11][(crc >> 32) & 0xff]
^ table[10][(crc >> 40) & 0xff]
^ table[9][(crc >> 48) & 0xff]
^ table[8][crc >> 56]
^ table[7][high & 0xff]
^ table[6][(high >> 8) & 0xff]
^ table[5][(high >> 16) & 0xff]
^ table[4][(high >> 24) & 0xff]
^ table[3][(high >> 32) & 0xff]
^ table[2][(high >> 40) & 0xff]
^ table[1][(high >> 48) & 0xff]
^ table[0][high >> 56];
next += 16;
length -= 16;
}
#else
while (length && ((uintptr_t)next & 3) != 0)
{
crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
--length;
}
while (length >= 12)
{
crc ^= *(uint32_t *)next;
uint32_t high = *(uint32_t *)(next + 4);
uint32_t high2 = *(uint32_t *)(next + 8);
crc = table[11][crc & 0xff]
^ table[10][(crc >> 8) & 0xff]
^ table[9][(crc >> 16) & 0xff]
^ table[8][crc >> 24]
^ table[7][high & 0xff]
^ table[6][(high >> 8) & 0xff]
^ table[5][(high >> 16) & 0xff]
^ table[4][high >> 24]
^ table[3][high2 & 0xff]
^ table[2][(high2 >> 8) & 0xff]
^ table[1][(high2 >> 16) & 0xff]
^ table[0][high2 >> 24];
next += 12;
length -= 12;
}
#endif
while (length)
{
crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
--length;
}
return (uint32_t)crc ^ 0xffffffff;
}
/* Apply the zeros operator table to crc. */
static inline uint32_t shift_crc(uint32_t shift_table[][256], uint32_t crc)
{
return shift_table[0][crc & 0xff]
^ shift_table[1][(crc >> 8) & 0xff]
^ shift_table[2][(crc >> 16) & 0xff]
^ shift_table[3][crc >> 24];
}
/* Compute CRC-32C using the Intel hardware instruction. */
extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, buffer buf, size_t len)
{
buffer next = buf;
buffer end;
#ifdef _M_X64
uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
#else
uint32_t crc0, crc1, crc2;
#endif
/* pre-process the crc */
crc0 = crc ^ 0xffffffff;
/* compute the crc for up to seven leading bytes to bring the data pointer
to an eight-byte boundary */
while (len && ((uintptr_t)next & 7) != 0)
{
crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *next);
++next;
--len;
}
#ifdef _M_X64
/* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
throughput of one crc per cycle, but a latency of three cycles */
while (len >= 3 * LONG_SHIFT)
{
crc1 = 0;
crc2 = 0;
end = next + LONG_SHIFT;
do
{
crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t *>(next + LONG_SHIFT));
crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t *>(next + 2 * LONG_SHIFT));
next += 8;
} while (next < end);
crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
next += 2 * LONG_SHIFT;
len -= 3 * LONG_SHIFT;
}
/* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
than a LONG_SHIFT*3 block */
while (len >= 3 * SHORT_SHIFT)
{
crc1 = 0;
crc2 = 0;
end = next + SHORT_SHIFT;
do
{
crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t *>(next + SHORT_SHIFT));
crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t *>(next + 2 * SHORT_SHIFT));
next += 8;
} while (next < end);
crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
next += 2 * SHORT_SHIFT;
len -= 3 * SHORT_SHIFT;
}
/* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
block */
end = next + (len - (len & 7));
while (next < end)
{
crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
next += 8;
}
#else
/* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
throughput of one crc per cycle, but a latency of three cycles */
while (len >= 3 * LONG_SHIFT)
{
crc1 = 0;
crc2 = 0;
end = next + LONG_SHIFT;
do
{
crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
crc1 = _mm_crc32_u32(crc1, *reinterpret_cast<const uint32_t *>(next + LONG_SHIFT));
crc2 = _mm_crc32_u32(crc2, *reinterpret_cast<const uint32_t *>(next + 2 * LONG_SHIFT));
next += 4;
} while (next < end);
crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
next += 2 * LONG_SHIFT;
len -= 3 * LONG_SHIFT;
}
/* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
than a LONG_SHIFT*3 block */
while (len >= 3 * SHORT_SHIFT)
{
crc1 = 0;
crc2 = 0;
end = next + SHORT_SHIFT;
do
{
crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
crc1 = _mm_crc32_u32(crc1, *reinterpret_cast<const uint32_t *>(next + SHORT_SHIFT));
crc2 = _mm_crc32_u32(crc2, *reinterpret_cast<const uint32_t *>(next + 2 * SHORT_SHIFT));
next += 4;
} while (next < end);
crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
next += 2 * SHORT_SHIFT;
len -= 3 * SHORT_SHIFT;
}
/* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
block */
end = next + (len - (len & 7));
while (next < end)
{
crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
next += 4;
}
#endif
len &= 7;
/* compute the crc for up to seven trailing bytes */
while (len)
{
crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *next);
++next;
--len;
}
/* return a post-processed crc */
return static_cast<uint32_t>(crc0) ^ 0xffffffff;
}
extern "C" CRC32C_API int crc32c_hw_available()
{
int info[4];
__cpuid(info, 1);
return (info[2] & (1 << 20)) != 0;
}
void calculate_table()
{
for(int i = 0; i < 256; i++)
{
uint32_t res = (uint32_t)i;
for(int t = 0; t < 16; t++) {
for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
table[t][i] = res;
}
}
_tableInitialized = true;
}
void calculate_table_hw()
{
for(int i = 0; i < 256; i++)
{
uint32_t res = (uint32_t)i;
for (int k = 0; k < 8 * (SHORT_SHIFT - 4); k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
for(int t = 0; t < 4; t++) {
for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
short_shifts[3 - t][i] = res;
}
for (int k = 0; k < 8 * (LONG_SHIFT - 4 - SHORT_SHIFT); k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
for(int t = 0; t < 4; t++) {
for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
long_shifts[3 - t][i] = res;
}
}
}
uint32_t (*append_func)(uint32_t, buffer, size_t);
void __crc32_init()
{
if (append_func == NULL)
{
// somebody can call sw version directly, so, precalculate table for this version
calculate_table();
if (crc32c_hw_available()) {
calculate_table_hw();
append_func = crc32c_append_hw;
} else {
append_func = crc32c_append_sw;
}
}
}
extern "C" CRC32C_API uint32_t crc32c_append(uint32_t crc, buffer input, size_t length)
{
return append_func(crc, input, length);
}

35
src/CxbxKrnl/crc32c.h Normal file
View File

@ -0,0 +1,35 @@
#ifndef CRC32C_H
#define CRC32C_H
#define CRC32C_API
#include <stdint.h>
/*
Computes CRC-32C (Castagnoli) checksum. Uses Intel's CRC32 instruction if it is available.
Otherwise it uses a very fast software fallback.
*/
extern "C" CRC32C_API uint32_t crc32c_append(
uint32_t crc, // Initial CRC value. Typically it's 0.
// You can supply non-trivial initial value here.
// Initial value can be used to chain CRC from multiple buffers.
const uint8_t *input, // Data to be put through the CRC algorithm.
size_t length); // Length of the data in the input buffer.
/*
Software fallback version of CRC-32C (Castagnoli) checksum.
*/
extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crc, const uint8_t *input, size_t length);
/*
Hardware version of CRC-32C (Castagnoli) checksum. Will fail, if CPU does not support related instructions. Use a crc32c_append version instead of.
*/
extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, const uint8_t *input, size_t length);
/*
Checks is hardware version of CRC-32C is available.
*/
extern "C" CRC32C_API int crc32c_hw_available();
#endif

View File

@ -5,6 +5,7 @@
//
#pragma once
#include <stdint.h> // for uint32_t and uint64_t
#include "crc32c.h"
/// XXHash (32 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/
/** How to use:
uint32_t myseed = 0;
@ -120,6 +121,16 @@ public:
@return 32 bit XXHash **/
static uint32_t hash(const void* input, uint64_t length, uint32_t seed)
{
// Some modern CPUs support hardware accellerated CRC32
// This is significantly faster than xxHash, in some cases, by more than double
// So now we check for this capability and use it if it exists.
// This significantly reduces the impact of hashing on CPUs supporting SSE4.2
// but also keeps xxHash present as a fast fallback, for those who don't support it
static bool bHardwareCrc32 = crc32c_hw_available(); // Cache the result in a static variable to avoid _cpuid every call
if (bHardwareCrc32) {
return crc32c_append_hw(seed, (uint8_t*)input, (size_t)length);
}
XXHash32 hasher(seed);
hasher.add(input, length);
return hasher.hash();