From 3a3b782f3cf57464fb1e44f7925fb759cc1e0efb Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 8 Jun 2015 01:20:24 -0500 Subject: [PATCH 1/2] [AArch64] Detect AES/SHA1/SHA2/CRC32 in CPUDetect. --- Source/Core/Common/ArmCPUDetect.cpp | 6 ++++++ Source/Core/Common/CPUDetect.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/Source/Core/Common/ArmCPUDetect.cpp b/Source/Core/Common/ArmCPUDetect.cpp index e17eb30f6b..aab1787ba6 100644 --- a/Source/Core/Common/ArmCPUDetect.cpp +++ b/Source/Core/Common/ArmCPUDetect.cpp @@ -239,8 +239,14 @@ void CPUInfo::Detect() if (GetCPUImplementer() == 0x51 && GetCPUPart() == 0x6F) // Krait(300) is 0x6F, Scorpion is 0x4D bIDIVa = bIDIVt = true; // These two require ARMv8 or higher +#ifdef _M_ARM_64 bFP = CheckCPUFeature("fp"); bASIMD = CheckCPUFeature("asimd"); + bAES = CheckCPUFeature("aes"); + bCRC32 = CheckCPUFeature("crc32"); + bSHA1 = CheckCPUFeature("sha1"); + bSHA2 = CheckCPUFeature("sha2"); +#endif #endif // On android, we build a separate library for ARMv7 so this is fine. // TODO: Check for ARMv7 on other platforms. diff --git a/Source/Core/Common/CPUDetect.h b/Source/Core/Common/CPUDetect.h index 28263ceeb2..dd4922750e 100644 --- a/Source/Core/Common/CPUDetect.h +++ b/Source/Core/Common/CPUDetect.h @@ -75,6 +75,9 @@ struct CPUInfo // ARMv8 specific bool bFP; bool bASIMD; + bool bCRC32; + bool bSHA1; + bool bSHA2; // Call Detect() explicit CPUInfo(); From ffe085f5eaf0546018bdda342c4ce2753e4016f6 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Mon, 8 Jun 2015 01:21:04 -0500 Subject: [PATCH 2/2] [AArch64] Implement CRC32 texture hashing. In a particular hashing-heavy scene in Crazy Taxi the Murmur3 hash used 3.11% CPU time. The new CRC32 hash in the same scene used 1.86%. This was tested on an Nvidia SHIELD Android TV with Cortex-A57s. This will be a bit slower on the Nexus 9; the Denver CPU core is a bit slower with CRC32 texture hashing than Murmur3 texture hashing. 
--- Source/Core/Common/Hash.cpp | 65 ++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/Source/Core/Common/Hash.cpp b/Source/Core/Common/Hash.cpp index 844dd31f35..868de4e68f 100644 --- a/Source/Core/Common/Hash.cpp +++ b/Source/Core/Common/Hash.cpp @@ -228,7 +228,7 @@ u64 GetMurmurHash3(const u8 *src, u32 len, u32 samples) // CRC32 hash using the SSE4.2 instruction u64 GetCRC32(const u8 *src, u32 len, u32 samples) { -#if _M_SSE >= 0x402 +#if _M_SSE >= 0x402 || defined(_M_ARM_64) u64 h[4] = { len, 0, 0, 0 }; u32 Step = (len / 8); const u64 *data = (const u64 *)src; @@ -238,6 +238,9 @@ u64 GetCRC32(const u8 *src, u32 len, u32 samples) Step = Step / samples; if (Step < 1) Step = 1; +#endif + +#if _M_SSE >= 0x402 while (data < end - Step * 3) { h[0] = _mm_crc32_u64(h[0], data[Step * 0]); @@ -259,12 +262,66 @@ u64 GetCRC32(const u8 *src, u32 len, u32 samples) memcpy(&temp, end, len & 7); h[0] = _mm_crc32_u64(h[0], temp); } +#elif defined(_M_ARM_64) + // We should be able to use intrinsics for this + // Too bad the intrinsics for this instruction was added in GCC 4.9.1 + // The Android NDK (as of r10e) only has GCC 4.9 + // Once the Android NDK has a newer GCC version, update these to use intrinsics + while (data < end - Step * 3) + { + asm ("crc32x %w[res], %w[two], %x[three]" + : [res] "=r" (h[0]) + : [two] "r" (h[0]), + [three] "r" (data[Step * 0])); + asm ("crc32x %w[res], %w[two], %x[three]" + : [res] "=r" (h[1]) + : [two] "r" (h[1]), + [three] "r" (data[Step * 1])); + asm ("crc32x %w[res], %w[two], %x[three]" + : [res] "=r" (h[2]) + : [two] "r" (h[2]), + [three] "r" (data[Step * 2])); + asm ("crc32x %w[res], %w[two], %x[three]" + : [res] "=r" (h[3]) + : [two] "r" (h[3]), + [three] "r" (data[Step * 3])); + data += Step * 4; + } + if (data < end - Step * 0) + asm ("crc32x %w[res], %w[two], %x[three]" + : [res] "=r" (h[0]) + : [two] "r" (h[0]), + [three] "r" (data[Step * 0])); + if (data < end - Step * 1) + asm ("crc32x 
%w[res], %w[two], %x[three]" + : [res] "=r" (h[1]) + : [two] "r" (h[1]), + [three] "r" (data[Step * 1])); + if (data < end - Step * 2) + asm ("crc32x %w[res], %w[two], %x[three]" + : [res] "=r" (h[2]) + : [two] "r" (h[2]), + [three] "r" (data[Step * 2])); + + if (len & 7) + { + u64 temp = 0; + memcpy(&temp, end, len & 7); + asm ("crc32x %w[res], %w[two], %x[three]" + : [res] "=r" (h[0]) + : [two] "r" (h[0]), + [three] "r" (temp)); + } +#endif + +#if _M_SSE >= 0x402 || defined(_M_ARM_64) // FIXME: is there a better way to combine these partial hashes? return h[0] + (h[1] << 10) + (h[2] << 21) + (h[3] << 32); #else return 0; #endif + } @@ -532,6 +589,12 @@ void SetHash64Function() ptrHashFunction = &GetCRC32; } else +#elif defined(_M_ARM_64) + if (cpu_info.bCRC32) + { + ptrHashFunction = &GetCRC32; + } + else #endif { ptrHashFunction = &GetMurmurHash3;