Update xxHash to use Hardware CRC32C if available

2018-02-14 23:28:53 +00:00 · 2018-02-14 23:28:53 +00:00 · a2f727f0a1
parent ff514d82ee
commit a2f727f0a1
5 changed files with 389 additions and 1 deletions
--- a/build/win32/Cxbx.vcxproj
+++ b/build/win32/Cxbx.vcxproj
@ -195,6 +195,7 @@
    <ClInclude Include="..\..\src\Common\Win32\XBAudio.h" />
    <ClInclude Include="..\..\src\Common\XADPCM.h" />
    <ClInclude Include="..\..\src\Common\XbePrinter.h" />
+    <ClInclude Include="..\..\src\CxbxKrnl\crc32c.h" />
    <ClInclude Include="..\..\src\CxbxKrnl\EmuD3D8Logging.h" />
    <ClInclude Include="..\..\import\stb\stb_image.h" />
    <ClInclude Include="..\..\src\Common\EmuEEPROM.h" />
@ -391,6 +392,7 @@
    <ClCompile Include="..\..\src\Common\CxbxDebugger.cpp" />
    <ClCompile Include="..\..\src\Common\Win32\XBAudio.cpp" />
    <ClCompile Include="..\..\src\Common\XbePrinter.cpp" />
+    <ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp" />
    <ClCompile Include="..\..\src\CxbxKrnl\EmuD3D8Logging.cpp" />
    <ClCompile Include="..\..\src\Common\EmuEEPROM.cpp" />
    <ClCompile Include="..\..\src\Common\Logging.cpp" />
--- a/build/win32/Cxbx.vcxproj.filters
+++ b/build/win32/Cxbx.vcxproj.filters
@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="utf-8"?>
+<?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <ClCompile Include="..\..\src\Cxbx\DlgControllerConfig.cpp">
@ -238,6 +238,9 @@
    <ClCompile Include="..\..\src\devices\MCPXDevice.cpp">
      <Filter>Hardware</Filter>
    </ClCompile>
+    <ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp">
+      <Filter>Shared</Filter>
+    </ClCompile>
    <ClCompile Include="..\..\src\devices\video\swizzle.cpp">
      <Filter>Hardware</Filter>
    </ClCompile>
@ -474,6 +477,9 @@
    <ClInclude Include="..\..\src\devices\MCPXDevice.h">
      <Filter>Hardware</Filter>
    </ClInclude>
+    <ClInclude Include="..\..\src\CxbxKrnl\crc32c.h">
+      <Filter>Shared</Filter>
+    </ClInclude>
    <ClInclude Include="..\..\src\CxbxKrnl\EmuKrnlKi.h">
      <Filter>Kernel</Filter>
    </ClInclude>
--- a/src/CxbxKrnl/crc32c.cpp
+++ b/src/CxbxKrnl/crc32c.cpp
@ -0,0 +1,334 @@
+/*
+  Copyright (c) 2013 - 2014, 2016 Mark Adler, Robert Vazan, Max Vysokikh
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the author be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+  claim that you wrote the original software. If you use this software
+  in a product, an acknowledgment in the product documentation would be
+  appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+  misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include "crc32c.h"
+#include <intrin.h>
+
+#define NOMINMAX
+
+#include <algorithm>
+
+/* Constant XORed into the remainder during the bit-by-bit, right-shifting
+   reduction in calculate_table() and calculate_table_hw(); the header names
+   the checksum CRC-32C (Castagnoli). */
+#define POLY 0x82f63b78
+/* Byte length of each of the three interleaved streams in the first (large
+   block) loop of crc32c_append_hw(). */
+#define LONG_SHIFT 8192
+/* Byte length of each of the three interleaved streams in the second (small
+   block) loop of crc32c_append_hw(). */
+#define SHORT_SHIFT 256
+
+/* Read-only byte pointer used for all input data in this file. */
+typedef const uint8_t *buffer;
+
+/* Lookup tables read by crc32c_append_sw(); filled by calculate_table(). */
+static uint32_t table[16][256];
+
+/* Per-byte operator tables applied by shift_crc() after a LONG_SHIFT-sized
+   stream; filled by calculate_table_hw(). */
+static uint32_t long_shifts[4][256];
+
+/* Per-byte operator tables applied by shift_crc() after a SHORT_SHIFT-sized
+   stream; filled by calculate_table_hw(). */
+static uint32_t short_shifts[4][256];
+
+/* Set to true once calculate_table() has filled `table`. */
+static bool _tableInitialized;
+
+/* Defined further down; declared here so earlier code can refer to it. */
+void calculate_table();
+
+/* Table-driven software version as a fall-back.  This is about 15 times slower
+   than using the hardware instructions.  This assumes little-endian integers,
+   as is the case on Intel processors that the assembler code here is for.
+
+   crci   - CRC of the data processed so far (0 for the first buffer).
+   input  - bytes to fold into the CRC.
+   length - number of bytes available at input.
+
+   Returns the updated CRC.  The value is inverted on entry and again on exit,
+   so results can be chained by passing a previous return value as crci.
+
+   The lookup table is built on the first call if it has not been built yet;
+   no synchronization is performed around that one-time step. */
+extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crci, buffer input, size_t length)
+{
+    /* This function is exported for direct use, so it cannot rely on
+       __crc32_init() having run: an all-zero table would silently produce
+       wrong checksums. */
+    if (!_tableInitialized)
+    {
+        calculate_table();
+    }
+
+    buffer next = input;
+#ifdef _M_X64
+    uint64_t crc;
+#else
+    uint32_t crc;
+#endif
+
+    crc = crci ^ 0xffffffff;
+#ifdef _M_X64
+    /* Byte-at-a-time until the pointer is 8-byte aligned. */
+    while (length && ((uintptr_t)next & 7) != 0)
+    {
+        crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+        --length;
+    }
+    /* Main loop: consume 16 bytes per iteration, one table per byte.  Only the
+       first 8 bytes are mixed with the running crc; table[15] handles the byte
+       that is furthest from the end of the 16-byte group. */
+    while (length >= 16)
+    {
+        crc ^= *(const uint64_t *)next;
+        uint64_t high = *(const uint64_t *)(next + 8);
+        crc = table[15][crc & 0xff]
+            ^ table[14][(crc >> 8) & 0xff]
+            ^ table[13][(crc >> 16) & 0xff]
+            ^ table[12][(crc >> 24) & 0xff]
+            ^ table[11][(crc >> 32) & 0xff]
+            ^ table[10][(crc >> 40) & 0xff]
+            ^ table[9][(crc >> 48) & 0xff]
+            ^ table[8][crc >> 56]
+            ^ table[7][high & 0xff]
+            ^ table[6][(high >> 8) & 0xff]
+            ^ table[5][(high >> 16) & 0xff]
+            ^ table[4][(high >> 24) & 0xff]
+            ^ table[3][(high >> 32) & 0xff]
+            ^ table[2][(high >> 40) & 0xff]
+            ^ table[1][(high >> 48) & 0xff]
+            ^ table[0][high >> 56];
+        next += 16;
+        length -= 16;
+    }
+#else
+    /* Byte-at-a-time until the pointer is 4-byte aligned. */
+    while (length && ((uintptr_t)next & 3) != 0)
+    {
+        crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+        --length;
+    }
+    /* Main loop: consume 12 bytes per iteration as three 32-bit words. */
+    while (length >= 12)
+    {
+        crc ^= *(const uint32_t *)next;
+        uint32_t high = *(const uint32_t *)(next + 4);
+        uint32_t high2 = *(const uint32_t *)(next + 8);
+        crc = table[11][crc & 0xff]
+            ^ table[10][(crc >> 8) & 0xff]
+            ^ table[9][(crc >> 16) & 0xff]
+            ^ table[8][crc >> 24]
+            ^ table[7][high & 0xff]
+            ^ table[6][(high >> 8) & 0xff]
+            ^ table[5][(high >> 16) & 0xff]
+            ^ table[4][high >> 24]
+            ^ table[3][high2 & 0xff]
+            ^ table[2][(high2 >> 8) & 0xff]
+            ^ table[1][(high2 >> 16) & 0xff]
+            ^ table[0][high2 >> 24];
+        next += 12;
+        length -= 12;
+    }
+#endif
+    /* Whatever is left is shorter than one main-loop group. */
+    while (length)
+    {
+        crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+        --length;
+    }
+    return (uint32_t)crc ^ 0xffffffff;
+}
+
+/* Run crc through a four-table zeros operator: every byte of crc picks one
+   entry out of its own 256-entry table (least significant byte uses table 0)
+   and the four entries are XORed together. */
+static inline uint32_t shift_crc(uint32_t shift_table[][256], uint32_t crc)
+{
+    const uint32_t byte0 = crc & 0xff;
+    const uint32_t byte1 = (crc >> 8) & 0xff;
+    const uint32_t byte2 = (crc >> 16) & 0xff;
+    const uint32_t byte3 = crc >> 24;
+
+    uint32_t shifted = shift_table[0][byte0];
+    shifted ^= shift_table[1][byte1];
+    shifted ^= shift_table[2][byte2];
+    shifted ^= shift_table[3][byte3];
+    return shifted;
+}
+
+/* Defined later in this file; declared here so the shift tables can be built
+   on demand below. */
+void calculate_table_hw();
+
+/* Compute CRC-32C using the Intel hardware instruction.
+
+   crc - CRC of the data processed so far (0 for the first buffer).
+   buf - bytes to fold into the CRC.
+   len - number of bytes available at buf.
+
+   Returns the updated CRC.  The value is inverted on entry and again on exit,
+   so results can be chained by passing a previous return value as crc.
+
+   The caller must make sure the CPU supports the instruction (see
+   crc32c_hw_available()).  The shift tables needed by the multi-stream loops
+   are built on first need; no synchronization is performed around that
+   one-time step. */
+extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, buffer buf, size_t len)
+{
+    buffer next = buf;
+    buffer end;
+#ifdef _M_X64
+    uint64_t crc0, crc1, crc2;      /* need to be 64 bits for crc32q */
+#else
+    uint32_t crc0, crc1, crc2;
+#endif
+
+    /* pre-process the crc */
+    crc0 = crc ^ 0xffffffff;
+
+    /* compute the crc for up to seven leading bytes to bring the data pointer
+       to an eight-byte boundary */
+    while (len && ((uintptr_t)next & 7) != 0)
+    {
+        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *next);
+        ++next;
+        --len;
+    }
+
+    /* The three-stream loops below recombine partial CRCs through
+       long_shifts/short_shifts.  This function can be reached without
+       __crc32_init() having run; with all-zero tables shift_crc() returns 0
+       and the earlier streams would be dropped from the result.  Build the
+       tables the first time a multi-stream block is about to be processed. */
+    static bool shift_tables_ready = false;
+    if (!shift_tables_ready && len >= 3 * SHORT_SHIFT)
+    {
+        calculate_table_hw();
+        shift_tables_ready = true;
+    }
+
+#ifdef _M_X64
+    /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
+       instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
+       Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
+       throughput of one crc per cycle, but a latency of three cycles */
+    while (len >= 3 * LONG_SHIFT)
+    {
+        crc1 = 0;
+        crc2 = 0;
+        end = next + LONG_SHIFT;
+        do
+        {
+            crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
+            crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t *>(next + LONG_SHIFT));
+            crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t *>(next + 2 * LONG_SHIFT));
+            next += 8;
+        } while (next < end);
+        /* advance crc0 past one stream's worth of data, then fold in the next stream */
+        crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
+        crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
+        next += 2 * LONG_SHIFT;
+        len -= 3 * LONG_SHIFT;
+    }
+
+    /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
+       than a LONG_SHIFT*3 block */
+    while (len >= 3 * SHORT_SHIFT)
+    {
+        crc1 = 0;
+        crc2 = 0;
+        end = next + SHORT_SHIFT;
+        do
+        {
+            crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
+            crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t *>(next + SHORT_SHIFT));
+            crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t *>(next + 2 * SHORT_SHIFT));
+            next += 8;
+        } while (next < end);
+        crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
+        crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
+        next += 2 * SHORT_SHIFT;
+        len -= 3 * SHORT_SHIFT;
+    }
+
+    /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
+    block */
+    end = next + (len - (len & 7));
+    while (next < end)
+    {
+        crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
+        next += 8;
+    }
+#else
+    /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
+    instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
+    Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
+    throughput of one crc per cycle, but a latency of three cycles */
+    while (len >= 3 * LONG_SHIFT)
+    {
+        crc1 = 0;
+        crc2 = 0;
+        end = next + LONG_SHIFT;
+        do
+        {
+            crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
+            crc1 = _mm_crc32_u32(crc1, *reinterpret_cast<const uint32_t *>(next + LONG_SHIFT));
+            crc2 = _mm_crc32_u32(crc2, *reinterpret_cast<const uint32_t *>(next + 2 * LONG_SHIFT));
+            next += 4;
+        } while (next < end);
+        /* advance crc0 past one stream's worth of data, then fold in the next stream */
+        crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
+        crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
+        next += 2 * LONG_SHIFT;
+        len -= 3 * LONG_SHIFT;
+    }
+
+    /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
+    than a LONG_SHIFT*3 block */
+    while (len >= 3 * SHORT_SHIFT)
+    {
+        crc1 = 0;
+        crc2 = 0;
+        end = next + SHORT_SHIFT;
+        do
+        {
+            crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
+            crc1 = _mm_crc32_u32(crc1, *reinterpret_cast<const uint32_t *>(next + SHORT_SHIFT));
+            crc2 = _mm_crc32_u32(crc2, *reinterpret_cast<const uint32_t *>(next + 2 * SHORT_SHIFT));
+            next += 4;
+        } while (next < end);
+        crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
+        crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
+        next += 2 * SHORT_SHIFT;
+        len -= 3 * SHORT_SHIFT;
+    }
+
+    /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
+    block, four bytes per instruction on this target */
+    end = next + (len - (len & 7));
+    while (next < end)
+    {
+        crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
+        next += 4;
+    }
+#endif
+    len &= 7;
+
+    /* compute the crc for up to seven trailing bytes */
+    while (len)
+    {
+        crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *next);
+        ++next;
+        --len;
+    }
+
+    /* return a post-processed crc */
+    return static_cast<uint32_t>(crc0) ^ 0xffffffff;
+}
+
+/* Report whether the CPU advertises the CRC32 instruction: returns 1 when
+   bit 20 of ECX from CPUID leaf 1 (the SSE4.2 feature flag) is set, else 0. */
+extern "C" CRC32C_API int crc32c_hw_available()
+{
+    const int sse42_mask = 1 << 20;
+    int regs[4];            /* EAX, EBX, ECX, EDX as filled in by __cpuid */
+
+    __cpuid(regs, 1);
+
+    const int ecx = regs[2];
+    return (ecx & sse42_mask) != 0;
+}
+
+/* Fill the 16 software lookup tables.  For every byte value the remainder is
+   advanced eight bits at a time; the value after the (slice+1)-th group of
+   eight bits is stored in table[slice].  Marks the table as initialized. */
+void calculate_table()
+{
+	for (int byte = 0; byte < 256; ++byte)
+	{
+		uint32_t remainder = (uint32_t)byte;
+		for (int slice = 0; slice < 16; ++slice)
+		{
+			for (int bit = 8; bit > 0; --bit)
+			{
+				const bool low_bit_set = (remainder & 1) == 1;
+				remainder >>= 1;
+				if (low_bit_set)
+				{
+					remainder ^= POLY;
+				}
+			}
+			table[slice][byte] = remainder;
+		}
+	}
+
+	_tableInitialized = true;
+}
+
+/* Fill short_shifts and long_shifts, the operator tables that shift_crc()
+   uses to recombine the interleaved streams in crc32c_append_hw().  For each
+   byte value the remainder is advanced bit by bit; snapshots are taken after
+   each of the last four bytes leading up to SHORT_SHIFT and then LONG_SHIFT
+   bytes, stored from index 3 down to index 0. */
+void calculate_table_hw()
+{
+	const int short_lead_bits = 8 * (SHORT_SHIFT - 4);
+	const int long_lead_bits = 8 * (LONG_SHIFT - 4 - SHORT_SHIFT);
+
+	for (int byte = 0; byte < 256; ++byte)
+	{
+		uint32_t remainder = (uint32_t)byte;
+
+		/* Advance to four bytes short of SHORT_SHIFT. */
+		for (int bit = 0; bit < short_lead_bits; ++bit)
+		{
+			const bool low_bit_set = (remainder & 1) == 1;
+			remainder >>= 1;
+			if (low_bit_set)
+			{
+				remainder ^= POLY;
+			}
+		}
+		/* One more byte per snapshot, filling slots 3, 2, 1, 0. */
+		for (int slot = 3; slot >= 0; --slot)
+		{
+			for (int bit = 0; bit < 8; ++bit)
+			{
+				const bool low_bit_set = (remainder & 1) == 1;
+				remainder >>= 1;
+				if (low_bit_set)
+				{
+					remainder ^= POLY;
+				}
+			}
+			short_shifts[slot][byte] = remainder;
+		}
+
+		/* Continue from SHORT_SHIFT to four bytes short of LONG_SHIFT. */
+		for (int bit = 0; bit < long_lead_bits; ++bit)
+		{
+			const bool low_bit_set = (remainder & 1) == 1;
+			remainder >>= 1;
+			if (low_bit_set)
+			{
+				remainder ^= POLY;
+			}
+		}
+		for (int slot = 3; slot >= 0; --slot)
+		{
+			for (int bit = 0; bit < 8; ++bit)
+			{
+				const bool low_bit_set = (remainder & 1) == 1;
+				remainder >>= 1;
+				if (low_bit_set)
+				{
+					remainder ^= POLY;
+				}
+			}
+			long_shifts[slot][byte] = remainder;
+		}
+	}
+}
+
+/* Implementation selected by __crc32_init(); NULL until it has been chosen. */
+uint32_t (*append_func)(uint32_t, buffer, size_t);
+
+/* Pick the CRC implementation once: builds the software table, then selects
+   the hardware routine (building its shift tables too) when the CPU reports
+   support, or the software routine otherwise.  Later calls do nothing. */
+void __crc32_init()
+{
+	if (append_func != NULL)
+	{
+		return;
+	}
+
+	// the sw version can be called directly, so its table is always prepared
+	calculate_table();
+
+	if (!crc32c_hw_available())
+	{
+		append_func = crc32c_append_sw;
+		return;
+	}
+
+	calculate_table_hw();
+	append_func = crc32c_append_hw;
+}
+
+/* Compute CRC-32C over `length` bytes at `input`, continuing from `crc`
+   (pass 0 for the first buffer), using whichever implementation
+   __crc32_init() selected.  Returns the updated CRC. */
+extern "C" CRC32C_API uint32_t crc32c_append(uint32_t crc, buffer input, size_t length)
+{
+	// append_func stays NULL until __crc32_init() has run; select the
+	// implementation here so the first call does not go through a null pointer.
+	if (append_func == NULL)
+	{
+		__crc32_init();
+	}
+	return append_func(crc, input, length);
+}
--- a/src/CxbxKrnl/crc32c.h
+++ b/src/CxbxKrnl/crc32c.h
@ -0,0 +1,35 @@
+#ifndef CRC32C_H
+#define CRC32C_H
+
+/* Decoration applied to every exported function; expands to nothing here. */
+#define CRC32C_API
+
+#include <stdint.h>
+
+/*
+    Computes the CRC-32C (Castagnoli) checksum. Uses Intel's CRC32 instruction if it is available;
+    otherwise it uses a table-driven software fallback.
+*/
+extern "C" CRC32C_API uint32_t crc32c_append(
+    uint32_t crc,               // Initial CRC value, typically 0.
+                                // Pass the result of a previous call here
+                                // to chain the CRC across multiple buffers.
+    const uint8_t *input,       // Data to be put through the CRC algorithm.
+    size_t length);             // Length in bytes of the data in the input buffer.
+
+
+/*
+	Software fallback version of the CRC-32C (Castagnoli) checksum.
+	Takes the same arguments as crc32c_append.
+*/
+extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crc, const uint8_t *input, size_t length);
+
+/*
+	Hardware version of the CRC-32C (Castagnoli) checksum. It fails if the CPU does not
+	support the required instructions; prefer crc32c_append, which checks for support.
+*/
+extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, const uint8_t *input, size_t length);
+
+/*
+	Checks whether the hardware version of CRC-32C is available.
+	Returns non-zero if it is, 0 otherwise.
+*/
+extern "C" CRC32C_API int crc32c_hw_available();
+
+#endif
--- a/src/CxbxKrnl/xxhash32.h
+++ b/src/CxbxKrnl/xxhash32.h
@ -5,6 +5,7 @@
 //
 #pragma once
 #include <stdint.h> // for uint32_t and uint64_t
+#include "crc32c.h"
 /// XXHash (32 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/
 /** How to use:
 uint32_t myseed = 0;
@ -120,6 +121,16 @@ public:
 	@return 32 bit XXHash **/
 	static uint32_t hash(const void* input, uint64_t length, uint32_t seed)
 	{
+		// Some modern CPUs support hardware-accelerated CRC32
+		// This is significantly faster than xxHash, in some cases, by more than double
+		// So now we check for this capability and use it if it exists.
+		// This significantly reduces the impact of hashing on CPUs supporting SSE4.2
+		// but also keeps xxHash present as a fast fallback, for those who don't support it
+		static bool bHardwareCrc32 = crc32c_hw_available();	// Cache the result in a static variable to avoid _cpuid every call
+		if (bHardwareCrc32) {
+			return crc32c_append_hw(seed, (uint8_t*)input, (size_t)length);
+		}
+	
 		XXHash32 hasher(seed);
 		hasher.add(input, length);
 		return hasher.hash();