diff --git a/src/core/hle/D3D8/Direct3D9/Direct3D9.cpp b/src/core/hle/D3D8/Direct3D9/Direct3D9.cpp index f027d3ac4..d82092b39 100644 --- a/src/core/hle/D3D8/Direct3D9/Direct3D9.cpp +++ b/src/core/hle/D3D8/Direct3D9/Direct3D9.cpp @@ -7053,7 +7053,7 @@ void XTL::CxbxDrawIndexed(CxbxDrawContext &DrawContext) //Walk through index buffer // Determine highest and lowest index in use : INDEX16 LowIndex, HighIndex; - WalkIndexBuffer_SIMD(LowIndex, HighIndex, &(DrawContext.pIndexData[DrawContext.dwStartVertex]), DrawContext.dwVertexCount); + WalkIndexBuffer(LowIndex, HighIndex, &(DrawContext.pIndexData[DrawContext.dwStartVertex]), DrawContext.dwVertexCount); VertexBufferConverter.Apply(&DrawContext, LowIndex); if (DrawContext.XboxPrimitiveType == X_D3DPT_QUADLIST) { @@ -7572,7 +7572,7 @@ VOID WINAPI XTL::EMUPATCH(D3DDevice_DrawIndexedVerticesUP) else { // Walk through the index buffer INDEX16 LowIndex, HighIndex; - WalkIndexBuffer_SIMD(LowIndex, HighIndex, (INDEX16*)pIndexData, DrawContext.dwVertexCount); + WalkIndexBuffer(LowIndex, HighIndex, (INDEX16*)pIndexData, DrawContext.dwVertexCount); // LOG_TEST_CASE("DrawIndexedPrimitiveUP"); // Test-case : Burnout, Namco Museum 50th Anniversary HRESULT hRet = g_pD3DDevice->DrawIndexedPrimitiveUP( diff --git a/src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.cpp b/src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.cpp index c5598058c..2183cfc34 100644 --- a/src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.cpp +++ b/src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.cpp @@ -5,33 +5,13 @@ #include // SSE4.1 //#include // SSE4.2 //#include // AVX - -#include "core\kernel\support\Emu.h" -#include "core\kernel\support\EmuXTL.h" - #include "common\util\CPUID.h" #include "WalkIndexBuffer.h" -void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount); +// Walk an index buffer to find the minimum and maximum indices -void Init_SIMD -( - XTL::INDEX16 &LowIndex, - XTL::INDEX16 &HighIndex, - XTL::INDEX16 *pIndexData, - DWORD dwIndexCount -) -{ - SimdCaps supports; - if (supports.SSE41()) - WalkIndexBuffer_SIMD = WalkIndexBuffer_SSE41; - else - WalkIndexBuffer_SIMD = WalkIndexBuffer; - - WalkIndexBuffer_SIMD(LowIndex, HighIndex, pIndexData, dwIndexCount); -} - -void WalkIndexBuffer(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount) +// Default implementation +void WalkIndexBuffer_NoSIMD(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount) { // Determine highest and lowest index in use LowIndex = pIndexData[0]; @@ -45,6 +25,7 @@ void WalkIndexBuffer(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::IND } } +//SSE 4.1 implementation void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XTL::INDEX16 * pIndexData, DWORD dwIndexCount) { // We can fit 8 ushorts into 128 bit SIMD registers @@ -53,12 +34,13 @@ void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XT // Fallback to basic function if we can't even min / max 2 registers together if (iterations < 2) { - WalkIndexBuffer(LowIndex, HighIndex, pIndexData, dwIndexCount); + WalkIndexBuffer_NoSIMD(LowIndex, HighIndex, pIndexData, dwIndexCount); return; } - __m128i *unalignedIndices = (__m128i*) pIndexData;\ - __m128i min = _mm_set1_epi16(USHRT_MAX); + // Initialize mins and maxes + __m128i *unalignedIndices = (__m128i*) pIndexData; + __m128i min = _mm_set1_epi16(static_cast(USHRT_MAX)); // cast as set1 only takes signed shorts __m128i max = _mm_setzero_si128(); // Min / max over index data @@ -71,13 +53,13 @@ void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XT // horizontal min min = _mm_minpos_epu16(min); - // horizontal max (using minpos) - max = _mm_subs_epu16(_mm_set1_epi16(USHRT_MAX), max); //invert + // horizontal max (no maxpos, we invert and use minpos) + max = _mm_subs_epu16(_mm_set1_epi16(static_cast(USHRT_MAX)), max); //invert max = _mm_minpos_epu16(max); // Get the min and max out LowIndex = (XTL::INDEX16) _mm_cvtsi128_si32(min); - HighIndex = (XTL::INDEX16) USHRT_MAX - _mm_cvtsi128_si32(max); + HighIndex = (XTL::INDEX16) USHRT_MAX - _mm_cvtsi128_si32(max); // invert back // Compare with the remaining values that didn't fit neatly into the SIMD registers for (DWORD i = dwIndexCount - remainder; i < dwIndexCount; i++) { @@ -89,4 +71,17 @@ void WalkIndexBuffer_SSE41(XTL::INDEX16 & LowIndex, XTL::INDEX16 & HighIndex, XT } } -// TODO AVX2, AVX512 +// TODO AVX2, AVX512 implementations + +// Detect SSE support to select real implementation on first call +void(*WalkIndexBuffer)(XTL::INDEX16 &, XTL::INDEX16 &, XTL::INDEX16 *, DWORD) = +[](XTL::INDEX16 &LowIndex, XTL::INDEX16 &HighIndex, XTL::INDEX16 *pIndexData, DWORD dwIndexCount) +{ + SimdCaps supports; + if (supports.SSE41()) + WalkIndexBuffer = WalkIndexBuffer_SSE41; + else + WalkIndexBuffer = WalkIndexBuffer_NoSIMD; + + WalkIndexBuffer(LowIndex, HighIndex, pIndexData, dwIndexCount); +}; diff --git a/src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.h b/src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.h index e2482b21b..d064c7a4a 100644 --- a/src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.h +++ b/src/core/hle/D3D8/Direct3D9/WalkIndexBuffer.h @@ -4,7 +4,7 @@ #include "core\kernel\support\Emu.h" #include "core\kernel\support\EmuXTL.h" -void WalkIndexBuffer +extern void(*WalkIndexBuffer) ( XTL::INDEX16 &LowIndex, XTL::INDEX16 &HighIndex, @@ -12,22 +12,4 @@ void WalkIndexBuffer DWORD dwIndexCount ); - -void Init_SIMD -( - XTL::INDEX16 &LowIndex, - XTL::INDEX16 &HighIndex, - XTL::INDEX16 *pIndexData, - DWORD dwIndexCount -); - -static void(*WalkIndexBuffer_SIMD) -( - XTL::INDEX16 &LowIndex, - XTL::INDEX16 &HighIndex, - XTL::INDEX16 *pIndexData, - DWORD dwIndexCount -) = Init_SIMD; - - #endif