diff --git a/build/win32/Cxbx.vcxproj b/build/win32/Cxbx.vcxproj
index 17eaefa50..145024108 100644
--- a/build/win32/Cxbx.vcxproj
+++ b/build/win32/Cxbx.vcxproj
@@ -195,6 +195,7 @@
+    <ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp" />
@@ -391,6 +392,7 @@
+    <ClInclude Include="..\..\src\CxbxKrnl\crc32c.h" />
diff --git a/build/win32/Cxbx.vcxproj.filters b/build/win32/Cxbx.vcxproj.filters
index 78b52d605..ccbc0df38 100644
--- a/build/win32/Cxbx.vcxproj.filters
+++ b/build/win32/Cxbx.vcxproj.filters
@@ -1,4 +1,4 @@
-
+
@@ -238,6 +238,9 @@
       <Filter>Hardware</Filter>
+    <ClCompile Include="..\..\src\CxbxKrnl\crc32c.cpp">
+      <Filter>Shared</Filter>
+    </ClCompile>
       <Filter>Hardware</Filter>
@@ -474,6 +477,9 @@
       <Filter>Hardware</Filter>
+    <ClInclude Include="..\..\src\CxbxKrnl\crc32c.h">
+      <Filter>Shared</Filter>
+    </ClInclude>
       <Filter>Kernel</Filter>
diff --git a/src/CxbxKrnl/crc32c.cpp b/src/CxbxKrnl/crc32c.cpp
new file mode 100644
index 000000000..7c72e2d7b
--- /dev/null
+++ b/src/CxbxKrnl/crc32c.cpp
@@ -0,0 +1,334 @@
+/*
+ Copyright (c) 2013 - 2014, 2016 Mark Adler, Robert Vazan, Max Vysokikh
+
+ This software is provided 'as-is', without any express or implied
+ warranty. In no event will the author be held liable for any damages
+ arising from the use of this software.
+
+ Permission is granted to anyone to use this software for any purpose,
+ including commercial applications, and to alter it and redistribute it
+ freely, subject to the following restrictions:
+
+ 1. The origin of this software must not be misrepresented; you must not
+ claim that you wrote the original software. If you use this software
+ in a product, an acknowledgment in the product documentation would be
+ appreciated but is not required.
+ 2. Altered source versions must be plainly marked as such, and must not be
+ misrepresented as being the original software.
+ 3. This notice may not be removed or altered from any source distribution.
+*/
+
+#ifndef _CRT_SECURE_NO_WARNINGS
+#define _CRT_SECURE_NO_WARNINGS
+#endif
+
+#include "crc32c.h"
+#include <stdint.h>
+
+#define NOMINMAX
+
+#include <intrin.h>	// for __cpuid and the _mm_crc32_* intrinsics on MSVC
+
+#define POLY 0x82f63b78
+#define LONG_SHIFT 8192
+#define SHORT_SHIFT 256
+
+typedef const uint8_t *buffer;
+
+static uint32_t table[16][256];
+
+static uint32_t long_shifts[4][256];
+
+static uint32_t short_shifts[4][256];
+
+static bool _tableInitialized;
+
+void calculate_table();
+
+/* Table-driven software version as a fall-back. This is about 15 times slower
+ than using the hardware instructions. This assumes little-endian integers,
+ as is the case on Intel processors that the assembler code here is for. */
+extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crci, buffer input, size_t length)
+{
+ buffer next = input;
+#ifdef _M_X64
+ uint64_t crc;
+#else
+ uint32_t crc;
+#endif
+
+ crc = crci ^ 0xffffffff;
+#ifdef _M_X64
+ while (length && ((uintptr_t)next & 7) != 0)
+ {
+ crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ --length;
+ }
+ while (length >= 16)
+ {
+ crc ^= *(uint64_t *)next;
+ uint64_t high = *(uint64_t *)(next + 8);
+ crc = table[15][crc & 0xff]
+ ^ table[14][(crc >> 8) & 0xff]
+ ^ table[13][(crc >> 16) & 0xff]
+ ^ table[12][(crc >> 24) & 0xff]
+ ^ table[11][(crc >> 32) & 0xff]
+ ^ table[10][(crc >> 40) & 0xff]
+ ^ table[9][(crc >> 48) & 0xff]
+ ^ table[8][crc >> 56]
+ ^ table[7][high & 0xff]
+ ^ table[6][(high >> 8) & 0xff]
+ ^ table[5][(high >> 16) & 0xff]
+ ^ table[4][(high >> 24) & 0xff]
+ ^ table[3][(high >> 32) & 0xff]
+ ^ table[2][(high >> 40) & 0xff]
+ ^ table[1][(high >> 48) & 0xff]
+ ^ table[0][high >> 56];
+ next += 16;
+ length -= 16;
+ }
+#else
+ while (length && ((uintptr_t)next & 3) != 0)
+ {
+ crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ --length;
+ }
+ while (length >= 12)
+ {
+ crc ^= *(uint32_t *)next;
+ uint32_t high = *(uint32_t *)(next + 4);
+ uint32_t high2 = *(uint32_t *)(next + 8);
+ crc = table[11][crc & 0xff]
+ ^ table[10][(crc >> 8) & 0xff]
+ ^ table[9][(crc >> 16) & 0xff]
+ ^ table[8][crc >> 24]
+ ^ table[7][high & 0xff]
+ ^ table[6][(high >> 8) & 0xff]
+ ^ table[5][(high >> 16) & 0xff]
+ ^ table[4][high >> 24]
+ ^ table[3][high2 & 0xff]
+ ^ table[2][(high2 >> 8) & 0xff]
+ ^ table[1][(high2 >> 16) & 0xff]
+ ^ table[0][high2 >> 24];
+ next += 12;
+ length -= 12;
+ }
+#endif
+ while (length)
+ {
+ crc = table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ --length;
+ }
+ return (uint32_t)crc ^ 0xffffffff;
+}
+
+/* Apply the zeros operator table to crc. */
+static inline uint32_t shift_crc(uint32_t shift_table[][256], uint32_t crc)
+{
+ return shift_table[0][crc & 0xff]
+ ^ shift_table[1][(crc >> 8) & 0xff]
+ ^ shift_table[2][(crc >> 16) & 0xff]
+ ^ shift_table[3][crc >> 24];
+}
+
+/* Compute CRC-32C using the Intel hardware instruction. */
+extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, buffer buf, size_t len)
+{
+ buffer next = buf;
+ buffer end;
+#ifdef _M_X64
+ uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
+#else
+ uint32_t crc0, crc1, crc2;
+#endif
+
+ /* pre-process the crc */
+ crc0 = crc ^ 0xffffffff;
+
+ /* compute the crc for up to seven leading bytes to bring the data pointer
+ to an eight-byte boundary */
+ while (len && ((uintptr_t)next & 7) != 0)
+ {
+		crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *next);
+ ++next;
+ --len;
+ }
+
+#ifdef _M_X64
+ /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
+ instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
+ Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
+ throughput of one crc per cycle, but a latency of three cycles */
+ while (len >= 3 * LONG_SHIFT)
+ {
+ crc1 = 0;
+ crc2 = 0;
+ end = next + LONG_SHIFT;
+ do
+ {
+			crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
+			crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t *>(next + LONG_SHIFT));
+			crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t *>(next + 2 * LONG_SHIFT));
+			next += 8;
+		} while (next < end);
+		crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
+		crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
+ next += 2 * LONG_SHIFT;
+ len -= 3 * LONG_SHIFT;
+ }
+
+ /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
+ than a LONG_SHIFT*3 block */
+ while (len >= 3 * SHORT_SHIFT)
+ {
+ crc1 = 0;
+ crc2 = 0;
+ end = next + SHORT_SHIFT;
+ do
+ {
+			crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
+			crc1 = _mm_crc32_u64(crc1, *reinterpret_cast<const uint64_t *>(next + SHORT_SHIFT));
+			crc2 = _mm_crc32_u64(crc2, *reinterpret_cast<const uint64_t *>(next + 2 * SHORT_SHIFT));
+			next += 8;
+		} while (next < end);
+		crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
+		crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
+ next += 2 * SHORT_SHIFT;
+ len -= 3 * SHORT_SHIFT;
+ }
+
+ /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
+ block */
+ end = next + (len - (len & 7));
+ while (next < end)
+ {
+		crc0 = _mm_crc32_u64(crc0, *reinterpret_cast<const uint64_t *>(next));
+ next += 8;
+ }
+#else
+ /* compute the crc on sets of LONG_SHIFT*3 bytes, executing three independent crc
+ instructions, each on LONG_SHIFT bytes -- this is optimized for the Nehalem,
+ Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
+ throughput of one crc per cycle, but a latency of three cycles */
+ while (len >= 3 * LONG_SHIFT)
+ {
+ crc1 = 0;
+ crc2 = 0;
+ end = next + LONG_SHIFT;
+ do
+ {
+			crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
+			crc1 = _mm_crc32_u32(crc1, *reinterpret_cast<const uint32_t *>(next + LONG_SHIFT));
+			crc2 = _mm_crc32_u32(crc2, *reinterpret_cast<const uint32_t *>(next + 2 * LONG_SHIFT));
+			next += 4;
+		} while (next < end);
+		crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
+		crc0 = shift_crc(long_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
+ next += 2 * LONG_SHIFT;
+ len -= 3 * LONG_SHIFT;
+ }
+
+ /* do the same thing, but now on SHORT_SHIFT*3 blocks for the remaining data less
+ than a LONG_SHIFT*3 block */
+ while (len >= 3 * SHORT_SHIFT)
+ {
+ crc1 = 0;
+ crc2 = 0;
+ end = next + SHORT_SHIFT;
+ do
+ {
+			crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
+			crc1 = _mm_crc32_u32(crc1, *reinterpret_cast<const uint32_t *>(next + SHORT_SHIFT));
+			crc2 = _mm_crc32_u32(crc2, *reinterpret_cast<const uint32_t *>(next + 2 * SHORT_SHIFT));
+			next += 4;
+		} while (next < end);
+		crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc1;
+		crc0 = shift_crc(short_shifts, static_cast<uint32_t>(crc0)) ^ crc2;
+ next += 2 * SHORT_SHIFT;
+ len -= 3 * SHORT_SHIFT;
+ }
+
+ /* compute the crc on the remaining eight-byte units less than a SHORT_SHIFT*3
+ block */
+ end = next + (len - (len & 7));
+ while (next < end)
+ {
+		crc0 = _mm_crc32_u32(crc0, *reinterpret_cast<const uint32_t *>(next));
+ next += 4;
+ }
+#endif
+ len &= 7;
+
+ /* compute the crc for up to seven trailing bytes */
+ while (len)
+ {
+		crc0 = _mm_crc32_u8(static_cast<uint32_t>(crc0), *next);
+ ++next;
+ --len;
+ }
+
+ /* return a post-processed crc */
+	return static_cast<uint32_t>(crc0) ^ 0xffffffff;
+}
+
+extern "C" CRC32C_API int crc32c_hw_available()
+{
+ int info[4];
+ __cpuid(info, 1);
+ return (info[2] & (1 << 20)) != 0;
+}
+
+void calculate_table()
+{
+ for(int i = 0; i < 256; i++)
+ {
+ uint32_t res = (uint32_t)i;
+ for(int t = 0; t < 16; t++) {
+ for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
+ table[t][i] = res;
+ }
+ }
+
+ _tableInitialized = true;
+}
+
+void calculate_table_hw()
+{
+ for(int i = 0; i < 256; i++)
+ {
+ uint32_t res = (uint32_t)i;
+ for (int k = 0; k < 8 * (SHORT_SHIFT - 4); k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
+ for(int t = 0; t < 4; t++) {
+ for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
+ short_shifts[3 - t][i] = res;
+ }
+ for (int k = 0; k < 8 * (LONG_SHIFT - 4 - SHORT_SHIFT); k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
+ for(int t = 0; t < 4; t++) {
+ for (int k = 0; k < 8; k++) res = (res & 1) == 1 ? POLY ^ (res >> 1) : (res >> 1);
+ long_shifts[3 - t][i] = res;
+ }
+ }
+}
+
+uint32_t (*append_func)(uint32_t, buffer, size_t);
+
+void __crc32_init()
+{
+ if (append_func == NULL)
+ {
+		// The software version can be called directly, so precalculate its table unconditionally
+ calculate_table();
+ if (crc32c_hw_available()) {
+ calculate_table_hw();
+ append_func = crc32c_append_hw;
+ } else {
+ append_func = crc32c_append_sw;
+ }
+ }
+}
+
+extern "C" CRC32C_API uint32_t crc32c_append(uint32_t crc, buffer input, size_t length)
+{
+ return append_func(crc, input, length);
+}
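
For reference, a minimal usage sketch of the API added above (not part of the patch; main, part1 and part2 are illustrative, and __crc32_init has to be declared by hand because crc32c.h does not expose it):

    #include <stdio.h>
    #include "crc32c.h"

    extern void __crc32_init(); // defined in crc32c.cpp; selects the hw or sw path and builds the tables

    int main()
    {
        __crc32_init(); // must run once beforehand, otherwise append_func is still NULL

        const uint8_t part1[] = { 'h', 'e', 'l', 'l', 'o' };
        const uint8_t part2[] = { 'w', 'o', 'r', 'l', 'd' };

        // Feeding the CRC of the first buffer in as the initial value of the
        // second gives the same result as hashing the concatenation in one call.
        uint32_t crc = crc32c_append(0, part1, sizeof(part1));
        crc = crc32c_append(crc, part2, sizeof(part2));
        printf("crc32c = %08x\n", crc);
        return 0;
    }
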
diff --git a/src/CxbxKrnl/crc32c.h b/src/CxbxKrnl/crc32c.h
new file mode 100644
index 000000000..07e824622
--- /dev/null
+++ b/src/CxbxKrnl/crc32c.h
@@ -0,0 +1,35 @@
+#ifndef CRC32C_H
+#define CRC32C_H
+
+#define CRC32C_API
+
+#include <stdint.h>
+
+/*
+ Computes CRC-32C (Castagnoli) checksum. Uses Intel's CRC32 instruction if it is available.
+ Otherwise it uses a very fast software fallback.
+*/
+extern "C" CRC32C_API uint32_t crc32c_append(
+	uint32_t crc,           // Initial CRC value. Typically 0.
+	                        // A non-trivial initial value can be supplied here;
+	                        // this allows chaining the CRC across multiple buffers.
+ const uint8_t *input, // Data to be put through the CRC algorithm.
+ size_t length); // Length of the data in the input buffer.
+
+
+/*
+ Software fallback version of CRC-32C (Castagnoli) checksum.
+*/
+extern "C" CRC32C_API uint32_t crc32c_append_sw(uint32_t crc, const uint8_t *input, size_t length);
+
+/*
+ Hardware version of the CRC-32C (Castagnoli) checksum. Will fail if the CPU
+ does not support the related instructions; use crc32c_append instead.
+*/
+extern "C" CRC32C_API uint32_t crc32c_append_hw(uint32_t crc, const uint8_t *input, size_t length);
+
+/*
+ Checks whether the hardware version of CRC-32C is available.
+*/
+extern "C" CRC32C_API int crc32c_hw_available();
+
+#endif
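
A quick equivalence check one might build against these files (illustrative only; self_check is a hypothetical helper, and it relies on __crc32_init() having run so that both the software table and the hardware shift tables are populated before crc32c_append_hw touches them):

    #include <assert.h>
    #include <string.h>
    #include "crc32c.h"

    extern void __crc32_init(); // defined in crc32c.cpp; builds both sets of tables

    void self_check()
    {
        __crc32_init();

        // A buffer larger than 3 * SHORT_SHIFT (768 bytes, per crc32c.cpp) makes
        // the hardware path exercise its shift tables, not just _mm_crc32_*.
        static uint8_t data[4096];
        memset(data, 0xA5, sizeof(data));

        if (crc32c_hw_available())
            assert(crc32c_append_sw(0, data, sizeof(data)) ==
                   crc32c_append_hw(0, data, sizeof(data)));
    }
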
diff --git a/src/CxbxKrnl/xxhash32.h b/src/CxbxKrnl/xxhash32.h
index b06dce068..b706efdae 100644
--- a/src/CxbxKrnl/xxhash32.h
+++ b/src/CxbxKrnl/xxhash32.h
@@ -5,6 +5,7 @@
//
#pragma once
 #include <stdint.h> // for uint32_t and uint64_t
+#include "crc32c.h"
/// XXHash (32 bit), based on Yann Collet's descriptions, see http://cyan4973.github.io/xxHash/
/** How to use:
uint32_t myseed = 0;
@@ -120,6 +121,16 @@ public:
@return 32 bit XXHash **/
static uint32_t hash(const void* input, uint64_t length, uint32_t seed)
{
+		// Some modern CPUs support hardware-accelerated CRC32.
+		// This is significantly faster than xxHash, in some cases by more than double,
+		// so we check for that capability and use it when present.
+		// This significantly reduces the impact of hashing on CPUs supporting SSE4.2,
+		// while keeping xxHash as a fast fallback for CPUs that do not.
+		static bool bHardwareCrc32 = crc32c_hw_available(); // Cached in a static so __cpuid isn't executed on every call
+ if (bHardwareCrc32) {
+ return crc32c_append_hw(seed, (uint8_t*)input, (size_t)length);
+ }
+
XXHash32 hasher(seed);
hasher.add(input, length);
return hasher.hash();
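
One consequence worth spelling out (a sketch, with HashMemory as a hypothetical caller): the same XXHash32::hash call now yields CRC-32C values on SSE4.2 CPUs and xxHash32 values elsewhere, so the resulting hashes are only meaningful for comparisons on the same machine:

    #include <stdint.h>
    #include "xxhash32.h"

    // Hypothetical caller: the call site is unchanged, only the backend differs.
    uint32_t HashMemory(const void* p, uint64_t n)
    {
        // On SSE4.2 CPUs this now returns CRC-32C; elsewhere it returns xxHash32.
        // The two differ, so hash values must not be persisted or compared
        // across machines with different CPU capabilities.
        return XXHash32::hash(p, n, 0);
    }
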