Update xxHash to use Hardware CRC32C if available
This commit is contained in:
parent
ff514d82ee
commit
a2f727f0a1
|
@ -195,6 +195,7 @@
|
|||
<ClInclude Include="..\..\src\Common\Win32\XBAudio.h" />
|
||||
<ClInclude Include="..\..\src\Common\XADPCM.h" />
|
||||
<ClInclude Include="..\..\src\Common\XbePrinter.h" />
|
||||
<ClInclude Include="..\..\src\CxbxKrnl\crc32c.h" />
|
||||
<ClInclude Include="..\..\src\CxbxKrnl\EmuD3D8Logging.h" />
|
||||
<ClInclude Include="..\..\import\stb\stb_image.h" />
|
||||
<ClInclude Include="..\..\src\Common\EmuEEPROM.h" />
|
||||
|
@ -391,6 +392,7 @@
|
|||
<ClCompile Include="..\..\src\Common\CxbxDebugger.cpp" />
|
||||
<ClCompile Include="..\..\src\Common\Win32\XBAudio.cpp" />
|
||||
<ClCompile Include="..\..\src\Common\XbePrinter.cpp" />
|
||||
<ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp" />
|
||||
<ClCompile Include="..\..\src\CxbxKrnl\EmuD3D8Logging.cpp" />
|
||||
<ClCompile Include="..\..\src\Common\EmuEEPROM.cpp" />
|
||||
<ClCompile Include="..\..\src\Common\Logging.cpp" />
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
|
||||
<ItemGroup>
|
||||
<ClCompile Include="..\..\src\Cxbx\DlgControllerConfig.cpp">
|
||||
|
@ -238,6 +238,9 @@
|
|||
<ClCompile Include="..\..\src\devices\MCPXDevice.cpp">
|
||||
<Filter>Hardware</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp">
|
||||
<Filter>Shared</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="..\..\src\devices\video\swizzle.cpp">
|
||||
<Filter>Hardware</Filter>
|
||||
</ClCompile>
|
||||
|
@ -474,6 +477,9 @@
|
|||
<ClInclude Include="..\..\src\devices\MCPXDevice.h">
|
||||
<Filter>Hardware</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\src\CxbxKrnl\crc32c.h">
|
||||
<Filter>Shared</Filter>
|
||||
</ClInclude>
|
||||
<ClInclude Include="..\..\src\CxbxKrnl\EmuKrnlKi.h">
|
||||
<Filter>Kernel</Filter>
|
||||
</ClInclude>
|
||||
|
|
|
@ -0,0 +1,334 @@
|
|||
/*
|
||||
Copyright (c) 2013 - 2014, 2016 Mark Adler, Robert Vazan, Max Vysokikh
|
||||
|
||||
This software is provided 'as-is', without any express or implied
|
||||
warranty. In no event will the author be held liable for any damages
|
||||
arising from the use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose,
|
||||
including commercial applications, and to alter it and redistribute it
|
||||
freely, subject to the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not
|
||||
claim that you wrote the original software. If you use this software
|
||||
in a product, an acknowledgment in the product documentation would be
|
||||
appreciated but is not required.
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
*/
|
||||
|
||||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS
|
||||
#endif
|
||||
|
||||
#include "crc32c.h"
|
||||
#include <intrin.h>
|
||||
|
||||
#define NOMINMAX
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#define POLY 0x82f63b78
|
||||
#define LONG_SHIFT 8192
|
||||
#define SHORT_SHIFT 256
|
||||
|
||||
/* Pointer to read-only input bytes; this is the data parameter type of the
   crc32c_append* functions in this file. */
typedef const uint8_t *buffer;
|
||||
|
||||
/* Sixteen 256-entry lookup tables for the software path: written by
   calculate_table() and read by crc32c_append_sw(). */
static uint32_t table[16][256];
|
||||
|
||||
/* Per-byte tables handed to shift_crc() after each LONG_SHIFT-sized block in
   crc32c_append_hw(); written by calculate_table_hw(). */
static uint32_t long_shifts[4][256];
|
||||
|
||||
/* Same as long_shifts, but used for SHORT_SHIFT-sized blocks. */
static uint32_t short_shifts[4][256];
|
||||
|
||||
/* Set to true at the end of calculate_table(). */
static bool _tableInitialized;
|
||||
|
||||
/* Forward declaration; the definition appears further down in this file. */
void calculate_table();
|
||||
|
||||
/* Table-driven software version as a fall-back. This is about 15 times slower
|
||||
than using the hardware instructions. This assumes little-endian integers,
|
||||
as is the case on Intel processors that the assembler code here is for. */
|
||||
/*
  Software (table-driven) CRC-32C.

  crci   - CRC of the data processed so far (0 for a fresh computation).
  input  - bytes to fold into the CRC.
  length - number of bytes at input.

  Returns the updated CRC. Multi-byte loads assume little-endian integers.
*/
extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crci, buffer input, size_t length)
{
    /* This entry point is exported, so it can be reached before
       __crc32_init() has run. An unfilled (all-zero) table would silently
       yield a wrong checksum, so build the table on first use. */
    if (!_tableInitialized)
    {
        calculate_table();
    }

    buffer next = input;
#ifdef _M_X64
    uint64_t crc;
#else
    uint32_t crc;
#endif

    /* pre-process the crc */
    crc = crci ^ 0xffffffff;
#ifdef _M_X64
    /* byte-at-a-time until the pointer is 8-byte aligned */
    while (length && ((uintptr_t)next & 7) != 0)
    {
        crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        --length;
    }
    /* 16 bytes per iteration, one table per input byte */
    while (length >= 16)
    {
        crc ^= *(uint64_t *)next;
        uint64_t high = *(uint64_t *)(next + 8);
        crc = table[15][crc & 0xff]
            ^ table[14][(crc >> 8) & 0xff]
            ^ table[13][(crc >> 16) & 0xff]
            ^ table[12][(crc >> 24) & 0xff]
            ^ table[11][(crc >> 32) & 0xff]
            ^ table[10][(crc >> 40) & 0xff]
            ^ table[9][(crc >> 48) & 0xff]
            ^ table[8][crc >> 56]
            ^ table[7][high & 0xff]
            ^ table[6][(high >> 8) & 0xff]
            ^ table[5][(high >> 16) & 0xff]
            ^ table[4][(high >> 24) & 0xff]
            ^ table[3][(high >> 32) & 0xff]
            ^ table[2][(high >> 40) & 0xff]
            ^ table[1][(high >> 48) & 0xff]
            ^ table[0][high >> 56];
        next += 16;
        length -= 16;
    }
#else
    /* byte-at-a-time until the pointer is 4-byte aligned */
    while (length && ((uintptr_t)next & 3) != 0)
    {
        crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        --length;
    }
    /* 12 bytes per iteration, one table per input byte */
    while (length >= 12)
    {
        crc ^= *(uint32_t *)next;
        uint32_t high = *(uint32_t *)(next + 4);
        uint32_t high2 = *(uint32_t *)(next + 8);
        crc = table[11][crc & 0xff]
            ^ table[10][(crc >> 8) & 0xff]
            ^ table[9][(crc >> 16) & 0xff]
            ^ table[8][crc >> 24]
            ^ table[7][high & 0xff]
            ^ table[6][(high >> 8) & 0xff]
            ^ table[5][(high >> 16) & 0xff]
            ^ table[4][high >> 24]
            ^ table[3][high2 & 0xff]
            ^ table[2][(high2 >> 8) & 0xff]
            ^ table[1][(high2 >> 16) & 0xff]
            ^ table[0][high2 >> 24];
        next += 12;
        length -= 12;
    }
#endif
    /* trailing bytes that did not fill a whole chunk */
    while (length)
    {
        crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
        --length;
    }
    /* return a post-processed crc */
    return (uint32_t)crc ^ 0xffffffff;
}
|
||||
|
||||
/* Apply the zeros operator table to crc. */
|
||||
/* Look up each of the four bytes of crc (least significant first) in the
   matching row of shift_table and XOR the four entries together. */
static inline uint32_t shift_crc(uint32_t shift_table[][256], uint32_t crc)
{
    uint32_t shifted = 0;
    for (int lane = 0; lane < 4; ++lane)
    {
        shifted ^= shift_table[lane][(crc >> (8 * lane)) & 0xff];
    }
    return shifted;
}
|
||||
|
||||
/* Compute CRC-32C using the Intel hardware instruction. */
|
||||
/* Fills long_shifts / short_shifts; defined further down in this file. */
void calculate_table_hw();

/*
  CRC-32C using the CPU's CRC32 instruction.

  crc - CRC of the data processed so far (0 for a fresh computation).
  buf - bytes to fold into the CRC.
  len - number of bytes at buf.

  Returns the updated CRC. The caller must make sure the CPU supports the
  instruction (see crc32c_hw_available()).
*/
extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, buffer buf, size_t len)
{
    /* The three-way interleaved loops below combine their partial CRCs via
       shift_crc(), which needs long_shifts / short_shifts to be filled. This
       function is exported and can be called without __crc32_init() having
       run; with zeroed tables the first two thirds of every block would be
       dropped from the result. Build the tables once, on first call. */
    static const bool shift_tables_ready = (calculate_table_hw(), true);
    (void)shift_tables_ready;

    buffer next = buf;
    buffer end;
#ifdef _M_X64
    uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
#else
    uint32_t crc0, crc1, crc2;
#endif

    /* pre-process the crc */
    crc0 = crc ^ 0xffffffff;

    /* compute the crc for up to seven leading bytes to bring the data pointer
       to an eight-byte boundary */
    while (len && ((uintptr_t)next & 7) != 0)
    {
        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *next);
        ++next;
        --len;
    }

#ifdef _M_X64
    /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
       instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
       throughput of one crc per cycle, but a latency of three cycles */
    while (len >= 3 * LONG_SHIFT)
    {
        crc1 = 0;
        crc2 = 0;
        end = next + LONG_SHIFT;
        do
        {
            crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
            crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t *>(next + LONG_SHIFT));
            crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t *>(next + 2 * LONG_SHIFT));
            next += 8;
        } while (next < end);
        crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
        next += 2 * LONG_SHIFT;
        len -= 3 * LONG_SHIFT;
    }

    /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
       than a LONG_SHIFT*3 block */
    while (len >= 3 * SHORT_SHIFT)
    {
        crc1 = 0;
        crc2 = 0;
        end = next + SHORT_SHIFT;
        do
        {
            crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
            crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t *>(next + SHORT_SHIFT));
            crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t *>(next + 2 * SHORT_SHIFT));
            next += 8;
        } while (next < end);
        crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
        next += 2 * SHORT_SHIFT;
        len -= 3 * SHORT_SHIFT;
    }

    /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
       block */
    end = next + (len - (len & 7));
    while (next < end)
    {
        crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
        next += 8;
    }
#else
    /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
       instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
       throughput of one crc per cycle, but a latency of three cycles */
    while (len >= 3 * LONG_SHIFT)
    {
        crc1 = 0;
        crc2 = 0;
        end = next + LONG_SHIFT;
        do
        {
            crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
            crc1 = _mm_crc32_u32(crc1, *reinterpret_cast<const uint32_t *>(next + LONG_SHIFT));
            crc2 = _mm_crc32_u32(crc2, *reinterpret_cast<const uint32_t *>(next + 2 * LONG_SHIFT));
            next += 4;
        } while (next < end);
        crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
        next += 2 * LONG_SHIFT;
        len -= 3 * LONG_SHIFT;
    }

    /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
       than a LONG_SHIFT*3 block */
    while (len >= 3 * SHORT_SHIFT)
    {
        crc1 = 0;
        crc2 = 0;
        end = next + SHORT_SHIFT;
        do
        {
            crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
            crc1 = _mm_crc32_u32(crc1, *reinterpret_cast<const uint32_t *>(next + SHORT_SHIFT));
            crc2 = _mm_crc32_u32(crc2, *reinterpret_cast<const uint32_t *>(next + 2 * SHORT_SHIFT));
            next += 4;
        } while (next < end);
        crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
        crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
        next += 2 * SHORT_SHIFT;
        len -= 3 * SHORT_SHIFT;
    }

    /* compute the crc on the remaining data, four bytes at a time; the span
       covered is the largest multiple of eight bytes that fits in len */
    end = next + (len - (len & 7));
    while (next < end)
    {
        crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
        next += 4;
    }
#endif
    len &= 7;

    /* compute the crc for up to seven trailing bytes */
    while (len)
    {
        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *next);
        ++next;
        --len;
    }

    /* return a post-processed crc */
    return static_cast<uint32_t>(crc0) ^ 0xffffffff;
}
|
||||
|
||||
/* Returns 1 when CPUID reports the feature bit tested below, 0 otherwise. */
extern "C" CRC32C_API int crc32c_hw_available()
{
    /* CPUID leaf 1: element 2 of the result array is ECX, and bit 20 of ECX
       is the SSE4.2 flag (SSE4.2 provides the CRC32 instruction). */
    const int feature_mask = 1 << 20;
    int regs[4];

    __cpuid(regs, 1);
    return (regs[2] & feature_mask) ? 1 : 0;
}
|
||||
|
||||
void calculate_table()
|
||||
{
|
||||
for(int i = 0; i < 256; i++)
|
||||
{
|
||||
uint32_t res = (uint32_t)i;
|
||||
for(int t = 0; t < 16; t++) {
|
||||
for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
|
||||
table[t][i] = res;
|
||||
}
|
||||
}
|
||||
|
||||
_tableInitialized = true;
|
||||
}
|
||||
|
||||
void calculate_table_hw()
|
||||
{
|
||||
for(int i = 0; i < 256; i++)
|
||||
{
|
||||
uint32_t res = (uint32_t)i;
|
||||
for (int k = 0; k < 8 * (SHORT_SHIFT - 4); k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
|
||||
for(int t = 0; t < 4; t++) {
|
||||
for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
|
||||
short_shifts[3 - t][i] = res;
|
||||
}
|
||||
for (int k = 0; k < 8 * (LONG_SHIFT - 4 - SHORT_SHIFT); k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
|
||||
for(int t = 0; t < 4; t++) {
|
||||
for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
|
||||
long_shifts[3 - t][i] = res;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t (*append_func)(uint32_t, buffer, size_t);
|
||||
|
||||
void __crc32_init()
|
||||
{
|
||||
if (append_func == NULL)
|
||||
{
|
||||
// somebody can call sw version directly, so, precalculate table for this version
|
||||
calculate_table();
|
||||
if (crc32c_hw_available()) {
|
||||
calculate_table_hw();
|
||||
append_func = crc32c_append_hw;
|
||||
} else {
|
||||
append_func = crc32c_append_sw;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
  Computes CRC-32C through whichever implementation __crc32_init() selected.

  crc    - CRC of the data processed so far (0 for a fresh computation).
  input  - bytes to fold into the CRC.
  length - number of bytes at input.

  Returns the updated CRC.
*/
extern "C" CRC32C_API uint32_t crc32c_append(uint32_t crc, buffer input, size_t length)
{
    /* append_func stays NULL until __crc32_init() has run; initialise on
       demand so a first call does not go through a null function pointer. */
    if (append_func == NULL)
    {
        __crc32_init();
    }
    return append_func(crc, input, length);
}
|
|
@ -0,0 +1,35 @@
|
|||
#ifndef CRC32C_H
|
||||
#define CRC32C_H
|
||||
|
||||
#define CRC32C_API
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/*
|
||||
Computes CRC-32C (Castagnoli) checksum. Uses Intel's CRC32 instruction if it is available.
|
||||
Otherwise it uses a very fast software fallback.
|
||||
*/
|
||||
extern "C" CRC32C_API uint32_t crc32c_append(
|
||||
uint32_t crc, // Initial CRC value. Typically it's 0.
|
||||
// You can supply non-trivial initial value here.
|
||||
// Initial value can be used to chain CRC from multiple buffers.
|
||||
const uint8_t *input, // Data to be put through the CRC algorithm.
|
||||
size_t length); // Length of the data in the input buffer.
|
||||
|
||||
|
||||
/*
|
||||
Software fallback version of CRC-32C (Castagnoli) checksum.
|
||||
*/
|
||||
extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crc, const uint8_t *input, size_t length);
|
||||
|
||||
/*
|
||||
Hardware version of CRC-32C (Castagnoli) checksum. Will fail if the CPU does not support the required instructions. Use crc32c_append instead unless hardware support is known to be present.
|
||||
*/
|
||||
extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, const uint8_t *input, size_t length);
|
||||
|
||||
/*
|
||||
Checks whether the hardware version of CRC-32C is available.
|
||||
*/
|
||||
extern "C" CRC32C_API int crc32c_hw_available();
|
||||
|
||||
#endif
|
|
@ -5,6 +5,7 @@
|
|||
//
|
||||
#pragma once
|
||||
#include <stdint.h> // for uint32_t and uint64_t
|
||||
#include "crc32c.h"
|
||||
/// XXHash (32 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/
|
||||
/** How to use:
|
||||
uint32_t myseed = 0;
|
||||
|
@ -120,6 +121,16 @@ public:
|
|||
@return 32 bit XXHash **/
|
||||
static uint32_t hash(const void* input, uint64_t length, uint32_t seed)
|
||||
{
|
||||
// Some modern CPUs support hardware accelerated CRC32
|
||||
// This is significantly faster than xxHash, in some cases, by more than double
|
||||
// So now we check for this capability and use it if it exists.
|
||||
// This significantly reduces the impact of hashing on CPUs supporting SSE4.2
|
||||
// but also keeps xxHash present as a fast fallback, for those who don't support it
|
||||
static bool bHardwareCrc32 = crc32c_hw_available(); // Cache the result in a static variable to avoid _cpuid every call
|
||||
if (bHardwareCrc32) {
|
||||
return crc32c_append_hw(seed, (uint8_t*)input, (size_t)length);
|
||||
}
|
||||
|
||||
XXHash32 hasher(seed);
|
||||
hasher.add(input, length);
|
||||
return hasher.hash();
|
||||
|
|
Loading…
Reference in New Issue