From 17c554c1656fc6ea2df1463a97628379954b65ed Mon Sep 17 00:00:00 2001 From: Shawn Hoffman Date: Fri, 22 Jul 2022 08:52:26 -0700 Subject: [PATCH] Common/Hash: use zlib-ng for adler32. small cleanups. --- Source/Core/Common/Hash.cpp | 387 +++++++------------ Source/Core/Common/Hash.h | 17 +- Source/Core/DiscIO/VolumeVerifier.cpp | 4 +- Source/Core/VideoCommon/NativeVertexFormat.h | 32 +- Source/Core/VideoCommon/TextureCacheBase.cpp | 2 - 5 files changed, 190 insertions(+), 252 deletions(-) diff --git a/Source/Core/Common/Hash.cpp b/Source/Core/Common/Hash.cpp index 62b56bc82c..f11ac4d62c 100644 --- a/Source/Core/Common/Hash.cpp +++ b/Source/Core/Common/Hash.cpp @@ -22,94 +22,28 @@ namespace Common { -static u64 (*ptrHashFunction)(const u8* src, u32 len, u32 samples) = nullptr; - -// uint32_t -// WARNING - may read one more byte! -// Implementation from Wikipedia. -u32 HashFletcher(const u8* data_u8, size_t length) -{ - const u16* data = (const u16*)data_u8; /* Pointer to the data to be summed */ - size_t len = (length + 1) / 2; /* Length in 16-bit words */ - u32 sum1 = 0xffff, sum2 = 0xffff; - - while (len) - { - size_t tlen = len > 360 ? 360 : len; - len -= tlen; - - do - { - sum1 += *data++; - sum2 += sum1; - } while (--tlen); - - sum1 = (sum1 & 0xffff) + (sum1 >> 16); - sum2 = (sum2 & 0xffff) + (sum2 >> 16); - } - - // Second reduction step to reduce sums to 16 bits - sum1 = (sum1 & 0xffff) + (sum1 >> 16); - sum2 = (sum2 & 0xffff) + (sum2 >> 16); - return (sum2 << 16 | sum1); -} - -// Implementation from Wikipedia -// Slightly slower than Fletcher above, but slightly more reliable. -// data: Pointer to the data to be summed; len is in bytes u32 HashAdler32(const u8* data, size_t len) { - static const u32 MOD_ADLER = 65521; - u32 a = 1, b = 0; - - while (len) - { - size_t tlen = len > 5550 ? 5550 : len; - len -= tlen; - - do - { - a += *data++; - b += a; - } while (--tlen); - - a = (a & 0xffff) + (a >> 16) * (65536 - MOD_ADLER); - b = (b & 0xffff) + (b >> 16) * (65536 - MOD_ADLER); - } - - // It can be shown that a <= 0x1013a here, so a single subtract will do. - if (a >= MOD_ADLER) - { - a -= MOD_ADLER; - } - - // It can be shown that b can reach 0xfff87 here. - b = (b & 0xffff) + (b >> 16) * (65536 - MOD_ADLER); - - if (b >= MOD_ADLER) - { - b -= MOD_ADLER; - } - - return ((b << 16) | a); + // Use fast implementation from zlib-ng + return adler32_z(1, data, len); } // Stupid hash - but can't go back now :) // Don't use for new things. At least it's reasonably fast. 
-u32 HashEctor(const u8* ptr, size_t length) +u32 HashEctor(const u8* data, size_t len) { u32 crc = 0; - for (size_t i = 0; i < length; i++) + for (size_t i = 0; i < len; i++) { - crc ^= ptr[i]; + crc ^= data[i]; crc = (crc << 3) | (crc >> 29); } - return (crc); + return crc; } -#if _ARCH_64 +#ifdef _ARCH_64 //----------------------------------------------------------------------------- // Block read - if your platform needs to do endian-swapping or can only @@ -250,133 +184,8 @@ static u64 GetMurmurHash3(const u8* src, u32 len, u32 samples) return h1; } -// CRC32 hash using the SSE4.2 instruction -#if defined(_M_X86_64) - -FUNCTION_TARGET_SSE42 -static u64 GetCRC32(const u8* src, u32 len, u32 samples) -{ - u64 h[4] = {len, 0, 0, 0}; - u32 Step = (len / 8); - const u64* data = (const u64*)src; - const u64* end = data + Step; - if (samples == 0) - samples = std::max(Step, 1u); - Step = Step / samples; - if (Step < 1) - Step = 1; - - while (data < end - Step * 3) - { - h[0] = _mm_crc32_u64(h[0], data[Step * 0]); - h[1] = _mm_crc32_u64(h[1], data[Step * 1]); - h[2] = _mm_crc32_u64(h[2], data[Step * 2]); - h[3] = _mm_crc32_u64(h[3], data[Step * 3]); - data += Step * 4; - } - if (data < end - Step * 0) - h[0] = _mm_crc32_u64(h[0], data[Step * 0]); - if (data < end - Step * 1) - h[1] = _mm_crc32_u64(h[1], data[Step * 1]); - if (data < end - Step * 2) - h[2] = _mm_crc32_u64(h[2], data[Step * 2]); - - if (len & 7) - { - u64 temp = 0; - memcpy(&temp, end, len & 7); - h[0] = _mm_crc32_u64(h[0], temp); - } - - // FIXME: is there a better way to combine these partial hashes? - return h[0] + (h[1] << 10) + (h[2] << 21) + (h[3] << 32); -} - -#elif defined(_M_ARM_64) - -static u64 GetCRC32(const u8* src, u32 len, u32 samples) -{ - u64 h[4] = {len, 0, 0, 0}; - u32 Step = (len / 8); - const u64* data = (const u64*)src; - const u64* end = data + Step; - if (samples == 0) - samples = std::max(Step, 1u); - Step = Step / samples; - if (Step < 1) - Step = 1; - - while (data < end - Step * 3) - { - h[0] = __crc32d(h[0], data[Step * 0]); - h[1] = __crc32d(h[1], data[Step * 1]); - h[2] = __crc32d(h[2], data[Step * 2]); - h[3] = __crc32d(h[3], data[Step * 3]); - data += Step * 4; - } - if (data < end - Step * 0) - h[0] = __crc32d(h[0], data[Step * 0]); - if (data < end - Step * 1) - h[1] = __crc32d(h[1], data[Step * 1]); - if (data < end - Step * 2) - h[2] = __crc32d(h[2], data[Step * 2]); - - if (len & 7) - { - u64 temp = 0; - memcpy(&temp, end, len & 7); - h[0] = __crc32d(h[0], temp); - } - - // FIXME: is there a better way to combine these partial hashes? 
- return h[0] + (h[1] << 10) + (h[2] << 21) + (h[3] << 32); -} - #else -static u64 GetCRC32(const u8* src, u32 len, u32 samples) -{ - return 0; -} - -#endif - -#else - -// CRC32 hash using the SSE4.2 instruction -#if defined(_M_X86) - -FUNCTION_TARGET_SSE42 -static u64 GetCRC32(const u8* src, u32 len, u32 samples) -{ - u32 h = len; - u32 Step = (len / 4); - const u32* data = (const u32*)src; - const u32* end = data + Step; - if (samples == 0) - samples = std::max(Step, 1u); - Step = Step / samples; - if (Step < 1) - Step = 1; - while (data < end) - { - h = _mm_crc32_u32(h, data[0]); - data += Step; - } - - const u8* data2 = (const u8*)end; - return (u64)_mm_crc32_u32(h, u32(data2[0])); -} - -#else - -static u64 GetCRC32(const u8* src, u32 len, u32 samples) -{ - return 0; -} - -#endif - //----------------------------------------------------------------------------- // Block read - if your platform needs to do endian-swapping or can only // handle aligned reads, do the conversion here @@ -504,55 +313,159 @@ static u64 GetMurmurHash3(const u8* src, u32 len, u32 samples) return *((u64*)&out); } + #endif +#if defined(_M_X86_64) + +FUNCTION_TARGET_SSE42 +static u64 GetHash64_SSE42_CRC32(const u8* src, u32 len, u32 samples) +{ + u64 h[4] = {len, 0, 0, 0}; + u32 Step = (len / 8); + const u64* data = (const u64*)src; + const u64* end = data + Step; + if (samples == 0) + samples = std::max(Step, 1u); + Step = Step / samples; + if (Step < 1) + Step = 1; + + while (data < end - Step * 3) + { + h[0] = _mm_crc32_u64(h[0], data[Step * 0]); + h[1] = _mm_crc32_u64(h[1], data[Step * 1]); + h[2] = _mm_crc32_u64(h[2], data[Step * 2]); + h[3] = _mm_crc32_u64(h[3], data[Step * 3]); + data += Step * 4; + } + if (data < end - Step * 0) + h[0] = _mm_crc32_u64(h[0], data[Step * 0]); + if (data < end - Step * 1) + h[1] = _mm_crc32_u64(h[1], data[Step * 1]); + if (data < end - Step * 2) + h[2] = _mm_crc32_u64(h[2], data[Step * 2]); + + if (len & 7) + { + u64 temp = 0; + memcpy(&temp, end, len & 7); + h[0] = _mm_crc32_u64(h[0], temp); + } + + // FIXME: is there a better way to combine these partial hashes? 
+  return h[0] + (h[1] << 10) + (h[2] << 21) + (h[3] << 32);
+}
+
+#elif defined(_M_X86)
+
+FUNCTION_TARGET_SSE42
+static u64 GetHash64_SSE42_CRC32(const u8* src, u32 len, u32 samples)
+{
+  u32 h = len;
+  u32 Step = (len / 4);
+  const u32* data = (const u32*)src;
+  const u32* end = data + Step;
+  if (samples == 0)
+    samples = std::max(Step, 1u);
+  Step = Step / samples;
+  if (Step < 1)
+    Step = 1;
+  while (data < end)
+  {
+    h = _mm_crc32_u32(h, data[0]);
+    data += Step;
+  }
+
+  const u8* data2 = (const u8*)end;
+  return (u64)_mm_crc32_u32(h, u32(data2[0]));
+}
+
+#elif defined(_M_ARM_64)
+
+static u64 GetHash64_ARMv8_CRC32(const u8* src, u32 len, u32 samples)
+{
+  u64 h[4] = {len, 0, 0, 0};
+  u32 Step = (len / 8);
+  const u64* data = (const u64*)src;
+  const u64* end = data + Step;
+  if (samples == 0)
+    samples = std::max(Step, 1u);
+  Step = Step / samples;
+  if (Step < 1)
+    Step = 1;
+
+  while (data < end - Step * 3)
+  {
+    h[0] = __crc32d(h[0], data[Step * 0]);
+    h[1] = __crc32d(h[1], data[Step * 1]);
+    h[2] = __crc32d(h[2], data[Step * 2]);
+    h[3] = __crc32d(h[3], data[Step * 3]);
+    data += Step * 4;
+  }
+  if (data < end - Step * 0)
+    h[0] = __crc32d(h[0], data[Step * 0]);
+  if (data < end - Step * 1)
+    h[1] = __crc32d(h[1], data[Step * 1]);
+  if (data < end - Step * 2)
+    h[2] = __crc32d(h[2], data[Step * 2]);
+
+  if (len & 7)
+  {
+    u64 temp = 0;
+    memcpy(&temp, end, len & 7);
+    h[0] = __crc32d(h[0], temp);
+  }
+
+  // FIXME: is there a better way to combine these partial hashes?
+  return h[0] + (h[1] << 10) + (h[2] << 21) + (h[3] << 32);
+}
+
+#endif
+
+using TextureHashFunction = u64 (*)(const u8* src, u32 len, u32 samples);
+static u64 SetHash64Function(const u8* src, u32 len, u32 samples);
+static TextureHashFunction s_texture_hash_func = SetHash64Function;
+
+static u64 SetHash64Function(const u8* src, u32 len, u32 samples)
+{
+  if (cpu_info.bCRC32)
+  {
+#if defined(_M_X86_64) || defined(_M_X86)
+    s_texture_hash_func = &GetHash64_SSE42_CRC32;
+#elif defined(_M_ARM_64)
+    s_texture_hash_func = &GetHash64_ARMv8_CRC32;
+#endif
+  }
+  else
+  {
+    s_texture_hash_func = &GetMurmurHash3;
+  }
+  return s_texture_hash_func(src, len, samples);
+}
+
 u64 GetHash64(const u8* src, u32 len, u32 samples)
 {
-  return ptrHashFunction(src, len, samples);
-}
-
-// sets the hash function used for the texture cache
-void SetHash64Function()
-{
-#if defined(_M_X86_64) || defined(_M_X86)
-  if (cpu_info.bSSE4_2)  // sse crc32 version
-  {
-    ptrHashFunction = &GetCRC32;
-  }
-  else
-#elif defined(_M_ARM_64)
-  if (cpu_info.bCRC32)
-  {
-    ptrHashFunction = &GetCRC32;
-  }
-  else
-#endif
-  {
-    ptrHashFunction = &GetMurmurHash3;
-  }
-}
-
-u32 ComputeCRC32(std::string_view data)
-{
-  return ComputeCRC32(reinterpret_cast<const u8*>(data.data()), static_cast<u32>(data.size()));
-}
-
-u32 ComputeCRC32(const u8* ptr, u32 length)
-{
-  return UpdateCRC32(StartCRC32(), ptr, length);
+  return s_texture_hash_func(src, len, samples);
 }
 
 u32 StartCRC32()
 {
-  return crc32(0L, Z_NULL, 0);
+  return crc32_z(0L, Z_NULL, 0);
 }
 
-u32 UpdateCRC32(u32 crc, const u8* ptr, u32 length)
+u32 UpdateCRC32(u32 crc, const u8* data, size_t len)
 {
-  static_assert(std::is_same_v<u8, Bytef>);
-  static_assert(std::is_same_v<u32, uInt>);
-  // Use zlib's crc32 implementation to compute the hash
-  // crc32_z (which takes a size_t) would be better, but it isn't available on Android
-  return crc32(crc, ptr, length);
+  return crc32_z(crc, data, len);
+}
+
+u32 ComputeCRC32(const u8* data, size_t len)
+{
+  return UpdateCRC32(StartCRC32(), data, len);
+}
+
+u32 ComputeCRC32(std::string_view data)
+{
+  return ComputeCRC32(reinterpret_cast<const u8*>(data.data()), data.size());
 }
 }  // namespace Common
diff --git a/Source/Core/Common/Hash.h b/Source/Core/Common/Hash.h
index c742ed0b64..6166b74bd9 100644
--- a/Source/Core/Common/Hash.h
+++ b/Source/Core/Common/Hash.h
@@ -10,14 +10,15 @@
 
 namespace Common
 {
-u32 HashFletcher(const u8* data_u8, size_t length);  // FAST. Length & 1 == 0.
-u32 HashAdler32(const u8* data, size_t len);  // Fairly accurate, slightly slower
-u32 HashEctor(const u8* ptr, size_t length);  // JUNK. DO NOT USE FOR NEW THINGS
-u64 GetHash64(const u8* src, u32 len, u32 samples);
-void SetHash64Function();
+u32 HashAdler32(const u8* data, size_t len);
+// JUNK. DO NOT USE FOR NEW THINGS
+u32 HashEctor(const u8* data, size_t len);
+
+// Specialized hash function used for the texture cache
+u64 GetHash64(const u8* src, u32 len, u32 samples);
 
-u32 ComputeCRC32(std::string_view data);
-u32 ComputeCRC32(const u8* ptr, u32 length);
 u32 StartCRC32();
-u32 UpdateCRC32(u32 crc, const u8* ptr, u32 length);
+u32 UpdateCRC32(u32 crc, const u8* data, size_t len);
+u32 ComputeCRC32(const u8* data, size_t len);
+u32 ComputeCRC32(std::string_view data);
 }  // namespace Common
diff --git a/Source/Core/DiscIO/VolumeVerifier.cpp b/Source/Core/DiscIO/VolumeVerifier.cpp
index 33828382a9..1e5f6515f0 100644
--- a/Source/Core/DiscIO/VolumeVerifier.cpp
+++ b/Source/Core/DiscIO/VolumeVerifier.cpp
@@ -1175,8 +1175,8 @@ void VolumeVerifier::Process()
   if (m_hashes_to_calculate.crc32)
   {
     m_crc32_future = std::async(std::launch::async, [this, byte_increment] {
-      m_crc32_context =
-          Common::UpdateCRC32(m_crc32_context, m_data.data(), static_cast<u32>(byte_increment));
+      m_crc32_context = Common::UpdateCRC32(m_crc32_context, m_data.data(),
+                                            static_cast<size_t>(byte_increment));
     });
   }
diff --git a/Source/Core/VideoCommon/NativeVertexFormat.h b/Source/Core/VideoCommon/NativeVertexFormat.h
index 94dc1a0fa2..55e1a40178 100644
--- a/Source/Core/VideoCommon/NativeVertexFormat.h
+++ b/Source/Core/VideoCommon/NativeVertexFormat.h
@@ -7,7 +7,6 @@
 #include <functional>  // for hash
 
 #include "Common/CommonTypes.h"
-#include "Common/Hash.h"
 #include "VideoCommon/CPMemory.h"  // m_components
 
@@ -79,10 +78,37 @@ namespace std
 template <>
 struct hash<PortableVertexDeclaration>
 {
-  size_t operator()(const PortableVertexDeclaration& decl) const
+  // Implementation from Wikipedia.
+  template <typename T>
+  u32 Fletcher32(const T& data) const
   {
-    return Common::HashFletcher(reinterpret_cast<const u8*>(&decl), sizeof(decl));
+    static_assert(sizeof(T) % sizeof(u16) == 0);
+
+    auto buf = reinterpret_cast<const u16*>(&data);
+    size_t len = sizeof(T) / sizeof(u16);
+    u32 sum1 = 0xffff, sum2 = 0xffff;
+
+    while (len)
+    {
+      size_t tlen = len > 360 ? 360 : len;
+      len -= tlen;
+
+      do
+      {
+        sum1 += *buf++;
+        sum2 += sum1;
+      } while (--tlen);
+
+      sum1 = (sum1 & 0xffff) + (sum1 >> 16);
+      sum2 = (sum2 & 0xffff) + (sum2 >> 16);
+    }
+
+    // Second reduction step to reduce sums to 16 bits
+    sum1 = (sum1 & 0xffff) + (sum1 >> 16);
+    sum2 = (sum2 & 0xffff) + (sum2 >> 16);
+    return (sum2 << 16 | sum1);
   }
+  size_t operator()(const PortableVertexDeclaration& decl) const { return Fletcher32(decl); }
 };
 }  // namespace std
diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp
index 07bbe0aa2e..b17f80ba6f 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.cpp
+++ b/Source/Core/VideoCommon/TextureCacheBase.cpp
@@ -94,8 +94,6 @@ TextureCacheBase::TextureCacheBase()
   HiresTexture::Init();
 
-  Common::SetHash64Function();
-
   TMEM::InvalidateAll();
 }
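
A minimal usage sketch of the reworked Common hashing API, for reference only and not part of the patch. The Example() function and its buffer are hypothetical; the Common:: functions, their signatures, and the lazy backend selection in GetHash64() are as declared in the diff above.

// Hypothetical caller; the real users live in DiscIO (VolumeVerifier) and VideoCommon (texture cache).
#include <string_view>

#include "Common/CommonTypes.h"
#include "Common/Hash.h"

static void Example()
{
  const u8 buf[8] = {0xDE, 0xAD, 0xBE, 0xEF, 0xCA, 0xFE, 0xBA, 0xBE};

  // Adler-32 now forwards to zlib-ng's adler32_z(), seeded with 1 as the Adler-32 definition requires.
  const u32 adler = Common::HashAdler32(buf, sizeof(buf));

  // CRC32 in one shot, or incrementally (as VolumeVerifier does per block); both give the same value.
  const u32 crc_once = Common::ComputeCRC32(buf, sizeof(buf));
  u32 crc = Common::StartCRC32();
  crc = Common::UpdateCRC32(crc, buf, 4);
  crc = Common::UpdateCRC32(crc, buf + 4, sizeof(buf) - 4);

  // There is also a std::string_view convenience overload.
  const u32 crc_sv = Common::ComputeCRC32(std::string_view("hello"));

  // GetHash64() now installs its backend (hardware CRC32 or MurmurHash3) on first use,
  // so callers no longer need the old Common::SetHash64Function() init step.
  const u64 tex_hash = Common::GetHash64(buf, sizeof(buf), 0);

  (void)adler;
  (void)crc_once;
  (void)crc;
  (void)crc_sv;
  (void)tex_hash;
}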