From 0f1b5a05ea9bef45a381cf39380ee04b75c17a56 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Fri, 23 Dec 2016 23:41:54 +1000 Subject: [PATCH 1/5] Common: Support for function-specific instruction set target on x64 --- Source/Core/Common/Intrinsics.h | 79 +++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 19 deletions(-) diff --git a/Source/Core/Common/Intrinsics.h b/Source/Core/Common/Intrinsics.h index 6ea9038c67..3e3fdb5d29 100644 --- a/Source/Core/Common/Intrinsics.h +++ b/Source/Core/Common/Intrinsics.h @@ -4,28 +4,69 @@ #pragma once -#ifdef _M_X86 +#if defined(_M_X86) + +/** + * It is assumed that all compilers used to build Dolphin support intrinsics up to and including + * SSE 4.2 on x86/x64. + */ + +#if defined(__GNUC__) || defined(__clang__) + +/** + * Due to limitations in GCC, SSE intrinsics are only available when compiling with the + * corresponding instruction set enabled. However, using the target attribute, we can compile + * single functions with a different target instruction set, while still creating a generic build. + * + * Since this instruction set is enabled per-function, any callers should verify that the + * instruction set is supported at runtime before calling it, and provide a fallback implementation + * when not supported. + * + * When building with -march=native, or enabling the instruction sets in the compile flags, permit + * usage of the instrinsics without any function attributes. If the command-line architecture does + * not support this instruction set, enable it via function targeting. 
+*/ -#ifdef _MSC_VER -#include <intrin.h> -#else #include <x86intrin.h> +#ifndef __SSE4_2__ +#define FUNCTION_TARGET_SSE42 [[gnu::target("sse4.2")]] +#endif +#ifndef __SSE4_1__ +#define FUNCTION_TARGET_SSE41 [[gnu::target("sse4.1")]] +#endif +#ifndef __SSSE3__ +#define FUNCTION_TARGET_SSSE3 [[gnu::target("ssse3")]] +#endif +#ifndef __SSE3__ +#define FUNCTION_TARGET_SSE3 [[gnu::target("sse3")]] #endif -#if defined _M_GENERIC -#define _M_SSE 0 -#elif _MSC_VER || __INTEL_COMPILER -#define _M_SSE 0x402 -#elif defined __GNUC__ -#if defined __SSE4_2__ -#define _M_SSE 0x402 -#elif defined __SSE4_1__ -#define _M_SSE 0x401 -#elif defined __SSSE3__ -#define _M_SSE 0x301 -#elif defined __SSE3__ -#define _M_SSE 0x300 -#endif -#endif +#elif defined(_MSC_VER) || defined(__INTEL_COMPILER) + +/** + * MSVC and ICC support intrinsics for any instruction set without any function attributes. + */ +#include <intrin.h> + +#endif // defined(_MSC_VER) || defined(__INTEL_COMPILER) #endif // _M_X86 + +/** + * Define the FUNCTION_TARGET macros to nothing if they are not needed, or not on an X86 platform. + * This way when a function is defined with FUNCTION_TARGET you don't need to define a second + * version without the macro around a #ifdef guard. Be careful when using intrinsics, as all use + * should still be placed around a #ifdef _M_X86 if the file is compiled on all architectures. 
+ */ +#ifndef FUNCTION_TARGET_SSE42 +#define FUNCTION_TARGET_SSE42 +#endif +#ifndef FUNCTION_TARGET_SSE41 +#define FUNCTION_TARGET_SSE41 +#endif +#ifndef FUNCTION_TARGET_SSSE3 +#define FUNCTION_TARGET_SSSE3 +#endif +#ifndef FUNCTION_TARGET_SSE3 +#define FUNCTION_TARGET_SSE3 +#endif From b74029ec575e2009057f046c8fa8a1840fc1010a Mon Sep 17 00:00:00 2001 From: Stenzek Date: Fri, 23 Dec 2016 23:45:54 +1000 Subject: [PATCH 2/5] TextureDecoder: Use target attributes on SSSE3 decoders --- .../Core/VideoCommon/TextureDecoder_x64.cpp | 33 ++++++------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/Source/Core/VideoCommon/TextureDecoder_x64.cpp b/Source/Core/VideoCommon/TextureDecoder_x64.cpp index 4697e8ba1b..935c509f24 100644 --- a/Source/Core/VideoCommon/TextureDecoder_x64.cpp +++ b/Source/Core/VideoCommon/TextureDecoder_x64.cpp @@ -249,11 +249,11 @@ static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int hei } } +FUNCTION_TARGET_SSSE3 static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8) { -#if _M_SSE >= 0x301 const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL); const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L); @@ -296,7 +296,6 @@ static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, i } } } -#endif } static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int height, int texformat, @@ -389,11 +388,11 @@ static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int hei } } +FUNCTION_TARGET_SSSE3 static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8) { -#if _M_SSE >= 0x301 // xsacha optimized with SSSE3 intrinsics // Produces a ~10% speed improvement over SSE2 implementation for (int y = 0; y < height; y += 4) @@ -418,7 +417,6 @@ static void 
TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, i } } } -#endif } static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int height, int texformat, @@ -572,11 +570,11 @@ static void TexDecoder_DecodeImpl_IA4(u32* dst, const u8* src, int width, int he } } +FUNCTION_TARGET_SSSE3 static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8) { -#if _M_SSE >= 0x301 // xsacha optimized with SSSE3 intrinsics. // Produces an ~50% speed improvement over SSE2 implementation. for (int y = 0; y < height; y += 4) @@ -595,7 +593,6 @@ static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width, } } } -#endif } static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int height, int texformat, @@ -767,11 +764,11 @@ static void TexDecoder_DecodeImpl_RGB565(u32* dst, const u8* src, int width, int } } +FUNCTION_TARGET_SSSE3 static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8) { -#if _M_SSE >= 0x301 const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL); const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL); const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L); @@ -872,7 +869,6 @@ static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int widt } } } -#endif } static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int height, @@ -995,11 +991,11 @@ static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int } } +FUNCTION_TARGET_SSSE3 static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width, int height, int texformat, const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8) { -#if _M_SSE >= 0x301 // xsacha optimized with SSSE3 instrinsics // Produces a ~30% speed improvement over SSE2 implementation for 
(int y = 0; y < height; y += 4) @@ -1028,7 +1024,6 @@ static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width _mm_storeu_si128(dst128, rgba11); } } -#endif } static void TexDecoder_DecodeImpl_RGBA8(u32* dst, const u8* src, int width, int height, @@ -1414,14 +1409,6 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int int Wsteps4 = (width + 3) / 4; int Wsteps8 = (width + 7) / 8; -// If the binary was not compiled with SSSE3 support, the functions turn into no-ops. -// Therefore, we shouldn't call them based on what the CPU reports at runtime alone. -#if _M_SSE >= 0x301 - bool has_SSSE3 = cpu_info.bSSSE3; -#else - bool has_SSSE3 = false; -#endif - switch (texformat) { case GX_TF_C4: @@ -1429,7 +1416,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int break; case GX_TF_I4: - if (has_SSSE3) + if (cpu_info.bSSSE3) TexDecoder_DecodeImpl_I4_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8); else @@ -1437,7 +1424,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int break; case GX_TF_I8: - if (has_SSSE3) + if (cpu_info.bSSSE3) TexDecoder_DecodeImpl_I8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8); else @@ -1453,7 +1440,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int break; case GX_TF_IA8: - if (has_SSSE3) + if (cpu_info.bSSSE3) TexDecoder_DecodeImpl_IA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8); else @@ -1472,7 +1459,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int break; case GX_TF_RGB5A3: - if (has_SSSE3) + if (cpu_info.bSSSE3) TexDecoder_DecodeImpl_RGB5A3_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8); else @@ -1481,7 +1468,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int break; case GX_TF_RGBA8: - if (has_SSSE3) + if 
(cpu_info.bSSSE3) TexDecoder_DecodeImpl_RGBA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8); else From 0f978227f56565644a1d653e56509799b846819c Mon Sep 17 00:00:00 2001 From: Stenzek Date: Fri, 23 Dec 2016 23:48:55 +1000 Subject: [PATCH 3/5] Common: Use function-level targeting for CRC32 (SSE4.2) --- Source/Core/Common/Hash.cpp | 57 ++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/Source/Core/Common/Hash.cpp b/Source/Core/Common/Hash.cpp index 27d0e412bc..b6d00cfa39 100644 --- a/Source/Core/Common/Hash.cpp +++ b/Source/Core/Common/Hash.cpp @@ -238,9 +238,11 @@ u64 GetMurmurHash3(const u8* src, u32 len, u32 samples) } // CRC32 hash using the SSE4.2 instruction +#if defined(_M_X86_64) + +FUNCTION_TARGET_SSE42 u64 GetCRC32(const u8* src, u32 len, u32 samples) { -#if _M_SSE >= 0x402 || defined(_M_ARM_64) u64 h[4] = {len, 0, 0, 0}; u32 Step = (len / 8); const u64* data = (const u64*)src; @@ -250,9 +252,7 @@ u64 GetCRC32(const u8* src, u32 len, u32 samples) Step = Step / samples; if (Step < 1) Step = 1; -#endif -#if _M_SSE >= 0x402 while (data < end - Step * 3) { h[0] = _mm_crc32_u64(h[0], data[Step * 0]); @@ -274,7 +274,25 @@ u64 GetCRC32(const u8* src, u32 len, u32 samples) memcpy(&temp, end, len & 7); h[0] = _mm_crc32_u64(h[0], temp); } + + // FIXME: is there a better way to combine these partial hashes? 
+ return h[0] + (h[1] << 10) + (h[2] << 21) + (h[3] << 32); +} + #elif defined(_M_ARM_64) + +u64 GetCRC32(const u8* src, u32 len, u32 samples) +{ + u64 h[4] = {len, 0, 0, 0}; + u32 Step = (len / 8); + const u64* data = (const u64*)src; + const u64* end = data + Step; + if (samples == 0) + samples = std::max(Step, 1u); + Step = Step / samples; + if (Step < 1) + Step = 1; + // We should be able to use intrinsics for this // Too bad the intrinsics for this instruction was added in GCC 4.9.1 // The Android NDK (as of r10e) only has GCC 4.9 @@ -317,16 +335,20 @@ u64 GetCRC32(const u8* src, u32 len, u32 samples) : [res] "=r"(h[0]) : [two] "r"(h[0]), [three] "r"(temp)); } -#endif -#if _M_SSE >= 0x402 || defined(_M_ARM_64) // FIXME: is there a better way to combine these partial hashes? return h[0] + (h[1] << 10) + (h[2] << 21) + (h[3] << 32); -#else - return 0; -#endif } +#else + +u64 GetCRC32(const u8* src, u32 len, u32 samples) +{ + return 0; +} + +#endif + /* * NOTE: This hash function is used for custom texture loading/dumping, so * it should not be changed, which would require all custom textures to be @@ -386,10 +408,13 @@ u64 GetHashHiresTexture(const u8* src, u32 len, u32 samples) return h; } #else + // CRC32 hash using the SSE4.2 instruction +#if defined(_M_X86) + +FUNCTION_TARGET_SSE42 u64 GetCRC32(const u8* src, u32 len, u32 samples) { -#if _M_SSE >= 0x402 u32 h = len; u32 Step = (len / 4); const u32* data = (const u32*)src; @@ -407,11 +432,17 @@ u64 GetCRC32(const u8* src, u32 len, u32 samples) const u8* data2 = (const u8*)end; return (u64)_mm_crc32_u32(h, u32(data2[0])); -#else - return 0; -#endif } +#else + +u64 GetCRC32(const u8* src, u32 len, u32 samples) +{ + return 0; +} + +#endif + //----------------------------------------------------------------------------- // Block read - if your platform needs to do endian-swapping or can only // handle aligned reads, do the conversion here @@ -606,7 +637,7 @@ u64 GetHash64(const u8* src, u32 len, u32 samples) // 
sets the hash function used for the texture cache void SetHash64Function() { -#if _M_SSE >= 0x402 +#if defined(_M_X86_64) || defined(_M_X86) if (cpu_info.bSSE4_2) // sse crc32 version { ptrHashFunction = &GetCRC32; From d315052552de6933de08f69c53dbb6c1873af5c8 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sat, 24 Dec 2016 17:24:36 +1000 Subject: [PATCH 4/5] AudioCommon: Remove unused _M_SSE test --- Source/Core/AudioCommon/Mixer.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/Source/Core/AudioCommon/Mixer.cpp b/Source/Core/AudioCommon/Mixer.cpp index de5a4fdde8..24cb18d172 100644 --- a/Source/Core/AudioCommon/Mixer.cpp +++ b/Source/Core/AudioCommon/Mixer.cpp @@ -12,10 +12,6 @@ #include "Common/MathUtil.h" #include "Core/ConfigManager.h" -#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) -#include <tmmintrin.h> -#endif - CMixer::CMixer(unsigned int BackendSampleRate) : m_sampleRate(BackendSampleRate) { INFO_LOG(AUDIO_INTERFACE, "Mixer is initialized"); From 214aea1aeac5de10b285c288d0e8f01b3a19ba25 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sat, 24 Dec 2016 17:34:33 +1000 Subject: [PATCH 5/5] DSPHWInterface: Use SSSE3 function targeting --- Source/Core/Core/DSP/DSPHWInterface.cpp | 46 ++++++++++++++++--------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/Source/Core/Core/DSP/DSPHWInterface.cpp b/Source/Core/Core/DSP/DSPHWInterface.cpp index 7620f3bee9..5e19da22db 100644 --- a/Source/Core/Core/DSP/DSPHWInterface.cpp +++ b/Source/Core/Core/DSP/DSPHWInterface.cpp @@ -252,25 +252,41 @@ static const u8* gdsp_idma_out(u16 dsp_addr, u32 addr, u32 size) return nullptr; } -#if _M_SSE >= 0x301 +#if defined(_M_X86) || defined(_M_X86_64) static const __m128i s_mask = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L); + +FUNCTION_TARGET_SSSE3 +static void gdsp_ddma_in_SSSE3(u16 dsp_addr, u32 addr, u32 size, u8* dst) +{ + for (u32 i = 0; i < size; i += 16) + { + _mm_storeu_si128( + (__m128i*)&dst[dsp_addr + i], + 
_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]), + s_mask)); + } +} + +FUNCTION_TARGET_SSSE3 +static void gdsp_ddma_out_SSSE3(u16 dsp_addr, u32 addr, u32 size, const u8* src) +{ + for (u32 i = 0; i < size; i += 16) + { + _mm_storeu_si128((__m128i*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF], + _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[dsp_addr + i]), s_mask)); + } +} #endif // TODO: These should eat clock cycles. static const u8* gdsp_ddma_in(u16 dsp_addr, u32 addr, u32 size) { - u8* dst = ((u8*)g_dsp.dram); + u8* dst = reinterpret_cast<u8*>(g_dsp.dram); -#if _M_SSE >= 0x301 +#if defined(_M_X86) || defined(_M_X86_64) if (cpu_info.bSSSE3 && !(size % 16)) { - for (u32 i = 0; i < size; i += 16) - { - _mm_storeu_si128( - (__m128i*)&dst[dsp_addr + i], - _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF]), - s_mask)); - } + gdsp_ddma_in_SSSE3(dsp_addr, addr, size, dst); } else #endif @@ -289,16 +305,12 @@ static const u8* gdsp_ddma_in(u16 dsp_addr, u32 addr, u32 size) static const u8* gdsp_ddma_out(u16 dsp_addr, u32 addr, u32 size) { - const u8* src = ((const u8*)g_dsp.dram); + const u8* src = reinterpret_cast<const u8*>(g_dsp.dram); -#if _M_SSE >= 0x301 +#ifdef _M_X86 if (cpu_info.bSSSE3 && !(size % 16)) { - for (u32 i = 0; i < size; i += 16) - { - _mm_storeu_si128((__m128i*)&g_dsp.cpu_ram[(addr + i) & 0x7FFFFFFF], - _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)&src[dsp_addr + i]), s_mask)); - } + gdsp_ddma_out_SSSE3(dsp_addr, addr, size, src); } else #endif