diff --git a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp index c712115282..0612a743da 100644 --- a/Source/Core/VideoBackends/Software/SWVertexLoader.cpp +++ b/Source/Core/VideoBackends/Software/SWVertexLoader.cpp @@ -42,15 +42,15 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType) { m_CurrentVat = &g_main_cp_state.vtx_attr[attributeIndex]; - posScale = 1.0f / float(1 << m_CurrentVat->g0.PosFrac); - tcScale[0] = 1.0f / float(1 << m_CurrentVat->g0.Tex0Frac); - tcScale[1] = 1.0f / float(1 << m_CurrentVat->g1.Tex1Frac); - tcScale[2] = 1.0f / float(1 << m_CurrentVat->g1.Tex2Frac); - tcScale[3] = 1.0f / float(1 << m_CurrentVat->g1.Tex3Frac); - tcScale[4] = 1.0f / float(1 << m_CurrentVat->g2.Tex4Frac); - tcScale[5] = 1.0f / float(1 << m_CurrentVat->g2.Tex5Frac); - tcScale[6] = 1.0f / float(1 << m_CurrentVat->g2.Tex6Frac); - tcScale[7] = 1.0f / float(1 << m_CurrentVat->g2.Tex7Frac); + posScale[0] = posScale[1] = posScale[2] = posScale[3] = 1.0f / float(1 << m_CurrentVat->g0.PosFrac); + tcScale[0][0] = tcScale[0][1] = 1.0f / float(1 << m_CurrentVat->g0.Tex0Frac); + tcScale[1][0] = tcScale[1][1] = 1.0f / float(1 << m_CurrentVat->g1.Tex1Frac); + tcScale[2][0] = tcScale[2][1] = 1.0f / float(1 << m_CurrentVat->g1.Tex2Frac); + tcScale[3][0] = tcScale[3][1] = 1.0f / float(1 << m_CurrentVat->g1.Tex3Frac); + tcScale[4][0] = tcScale[4][1] = 1.0f / float(1 << m_CurrentVat->g2.Tex4Frac); + tcScale[5][0] = tcScale[5][1] = 1.0f / float(1 << m_CurrentVat->g2.Tex5Frac); + tcScale[6][0] = tcScale[6][1] = 1.0f / float(1 << m_CurrentVat->g2.Tex6Frac); + tcScale[7][0] = tcScale[7][1] = 1.0f / float(1 << m_CurrentVat->g2.Tex7Frac); //TexMtx const u64 tmDesc[8] = { diff --git a/Source/Core/VideoCommon/VertexLoader.cpp b/Source/Core/VideoCommon/VertexLoader.cpp index 373dfae16b..80a891ff26 100644 --- a/Source/Core/VideoCommon/VertexLoader.cpp +++ b/Source/Core/VideoCommon/VertexLoader.cpp @@ -41,8 +41,9 @@ static int s_texmtxread = 0; int tcIndex; int colIndex; int colElements[2]; -float posScale; -float tcScale[8]; +// Duplicated (4x and 2x respectively) and used in SSE code in the vertex loader JIT +GC_ALIGNED128(float posScale[4]); +GC_ALIGNED64(float tcScale[8][2]); static const float fractionTable[32] = { 1.0f / (1U << 0), 1.0f / (1U << 1), 1.0f / (1U << 2), 1.0f / (1U << 3), @@ -65,10 +66,8 @@ static void LOADERDECL PosMtx_ReadDirect_UByte() static void LOADERDECL PosMtx_Write() { - DataWrite(s_curposmtx); - DataWrite(0); - DataWrite(0); - DataWrite(0); + // u8, 0, 0, 0 + DataWrite(s_curposmtx); } static void LOADERDECL TexMtx_ReadDirect_UByte() @@ -451,10 +450,10 @@ void VertexLoader::SetupRunVertices(const VAT& vat, int primitive, int const cou m_VtxAttr.texCoord[6].Frac = vat.g2.Tex6Frac; m_VtxAttr.texCoord[7].Frac = vat.g2.Tex7Frac; - posScale = fractionTable[m_VtxAttr.PosFrac]; + posScale[0] = posScale[1] = posScale[2] = posScale[3] = fractionTable[m_VtxAttr.PosFrac]; if (m_native_components & VB_HAS_UVALL) for (int i = 0; i < 8; i++) - tcScale[i] = fractionTable[m_VtxAttr.texCoord[i].Frac]; + tcScale[i][0] = tcScale[i][1] = fractionTable[m_VtxAttr.texCoord[i].Frac]; for (int i = 0; i < 2; i++) colElements[i] = m_VtxAttr.color[i].Elements; diff --git a/Source/Core/VideoCommon/VertexLoader.h b/Source/Core/VideoCommon/VertexLoader.h index c81be52556..71159b60d7 100644 --- a/Source/Core/VideoCommon/VertexLoader.h +++ b/Source/Core/VideoCommon/VertexLoader.h @@ -19,6 +19,13 @@ #include "VideoCommon/DataReader.h" #include "VideoCommon/NativeVertexFormat.h" +#if _M_SSE >= 0x401 +#include +#include +#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) +#include +#endif + #ifdef _M_X86 #define USE_VERTEX_LOADER_JIT #endif @@ -27,8 +34,8 @@ extern int tcIndex; extern int colIndex; extern int colElements[2]; -extern float posScale; -extern float tcScale[8]; +GC_ALIGNED128(extern float posScale[4]); +GC_ALIGNED64(extern float tcScale[8][2]); class VertexLoaderUID { @@ -155,3 +162,61 @@ private: void WriteSetVariable(int bits, void *address, Gen::OpArg dest); #endif }; + +#if _M_SSE >= 0x301 +static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L); +static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); +static const __m128i kMaskSwap16to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L); +static const __m128i kMaskSwap16to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L); +static const __m128i kMaskSwap16to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL); +static const __m128i kMaskSwap16to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL); +static const __m128i kMask8to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L); +static const __m128i kMask8to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L); +static const __m128i kMask8to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL); +static const __m128i kMask8to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL); + +template +__forceinline void Vertex_Read_SSSE3(const T* pData, __m128 scale) +{ + __m128i coords, mask; + + int loadBytes = sizeof(T) * (2 + threeIn); + if (loadBytes > 8) + coords = _mm_loadu_si128((__m128i*)pData); + else if (loadBytes > 4) + coords = _mm_loadl_epi64((__m128i*)pData); + else + coords = _mm_cvtsi32_si128(*(u32*)pData); + + // Float case (no scaling) + if (sizeof(T) == 4) + { + coords = _mm_shuffle_epi8(coords, threeIn ? kMaskSwap32_3 : kMaskSwap32_2); + if (threeOut) + _mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, coords); + else + _mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, coords); + } + else + { + // Byte swap, unpack, and move to high bytes for sign extend. + if (std::is_unsigned::value) + mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32l_3 : kMaskSwap16to32l_2) : (threeIn ? kMask8to32l_3 : kMask8to32l_2); + else + mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32h_3 : kMaskSwap16to32h_2) : (threeIn ? kMask8to32h_3 : kMask8to32h_2); + coords = _mm_shuffle_epi8(coords, mask); + + // Sign extend + if (std::is_signed::value) + coords = _mm_srai_epi32(coords, 32 - sizeof(T) * 8); + + __m128 out = _mm_mul_ps(_mm_cvtepi32_ps(coords), scale); + if (threeOut) + _mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, out); + else + _mm_storel_pi((__m64*)VertexManager::s_pCurBufferPointer, out); + } + + VertexManager::s_pCurBufferPointer += sizeof(float) * (2 + threeOut); +} +#endif \ No newline at end of file diff --git a/Source/Core/VideoCommon/VertexLoader_Normal.cpp b/Source/Core/VideoCommon/VertexLoader_Normal.cpp index 3d58592d70..8ff5a767e7 100644 --- a/Source/Core/VideoCommon/VertexLoader_Normal.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Normal.cpp @@ -3,7 +3,7 @@ // Refer to the license.txt file included. #include -#include +#include #include "Common/CommonTypes.h" #include "Common/CPUDetect.h" @@ -13,13 +13,6 @@ #include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VideoCommon.h" -#if _M_SSE >= 0x401 -#include -#include -#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) -#include -#endif - // warning: mapping buffer should be disabled to use this #define LOG_NORM() // PRIM_LOG("norm: %f %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[-3], ((float*)VertexManager::s_pCurBufferPointer)[-2], ((float*)VertexManager::s_pCurBufferPointer)[-1]); @@ -37,7 +30,7 @@ __forceinline float FracAdjust(T val) //auto const U16FRAC = 1.f / (1u << 15); // TODO: is this right? - return val / float(1u << (sizeof(T) * 8 - std::numeric_limits::is_signed - 1)); + return val / float(1u << (sizeof(T) * 8 - std::is_signed::value - 1)); } template <> @@ -76,11 +69,11 @@ struct Normal_Direct template __forceinline void Normal_Index_Offset() { - static_assert(!std::numeric_limits::is_signed, "Only unsigned I is sane!"); + static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); auto const index = DataRead(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_NORMAL] - + (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset); + + (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset); ReadIndirect(data); } @@ -108,6 +101,63 @@ struct Normal_Index_Indices3 static const int size = sizeof(I) * 3; }; +#if _M_SSE >= 0x301 +template +struct Normal_Direct_SSSE3 +{ + static void LOADERDECL function() + { + const T* pData = reinterpret_cast(DataGetPosition()); + const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed::value - 1)); + const __m128 scale = _mm_set_ps(frac, frac, frac, frac); + for (int i = 0; i < N; i++, pData += 3) + Vertex_Read_SSSE3(pData, scale); + DataSkip(); + } + + static const int size = sizeof(T) * N * 3; +}; + +template +__forceinline void Normal_Index_Offset_SSSE3() +{ + static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); + + auto const index = DataRead(); + const T* pData = (const T*)(cached_arraybases[ARRAY_NORMAL] + + (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset); + const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed::value - 1)); + const __m128 scale = _mm_set_ps(frac, frac, frac, frac); + for (int i = 0; i < N; i++, pData += 3) + Vertex_Read_SSSE3(pData, scale); +} + +template +struct Normal_Index_SSSE3 +{ + static void LOADERDECL function() + { + Normal_Index_Offset_SSSE3(); + } + + static const int size = sizeof(I); +}; + +template +struct Normal_Index_Indices3_SSSE3 +{ + static void LOADERDECL function() + { + Normal_Index_Offset_SSSE3(); + Normal_Index_Offset_SSSE3(); + Normal_Index_Offset_SSSE3(); + } + + static const int size = sizeof(I) * 3; +}; + +#endif + } void VertexLoader_Normal::Init() @@ -180,6 +230,77 @@ void VertexLoader_Normal::Init() m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3(); m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3(); m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3(); + +#if _M_SSE >= 0x301 + if (cpu_info.bSSSE3) + { + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Direct_SSSE3(); + + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Direct_SSSE3(); + m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Direct_SSSE3(); + + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_SSSE3(); + + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_Indices3_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Index_Indices3_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3_SSSE3(); + m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3_SSSE3(); + + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_SSSE3(); + + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_Indices3_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Index_Indices3_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3_SSSE3(); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3_SSSE3(); + } +#endif } unsigned int VertexLoader_Normal::GetSize(u64 _type, diff --git a/Source/Core/VideoCommon/VertexLoader_Position.cpp b/Source/Core/VideoCommon/VertexLoader_Position.cpp index a38d429d58..6c50dd1560 100644 --- a/Source/Core/VideoCommon/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/VertexLoader_Position.cpp @@ -2,7 +2,7 @@ // Licensed under GPLv2 // Refer to the license.txt file included. -#include +#include #include "Common/CommonTypes.h" #include "Common/CPUDetect.h" @@ -74,7 +74,7 @@ template void LOADERDECL Pos_ReadDirect() { static_assert(N <= 3, "N > 3 is not sane!"); - auto const scale = posScale; + auto const scale = posScale[0]; DataWriter dst; DataReader src; @@ -87,12 +87,12 @@ void LOADERDECL Pos_ReadDirect() template void LOADERDECL Pos_ReadIndex() { - static_assert(!std::numeric_limits::is_signed, "Only unsigned I is sane!"); + static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); static_assert(N <= 3, "N > 3 is not sane!"); auto const index = DataRead(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); - auto const scale = posScale; + auto const scale = posScale[0]; DataWriter dst; for (int i = 0; i < 3; ++i) @@ -102,18 +102,22 @@ void LOADERDECL Pos_ReadIndex() } #if _M_SSE >= 0x301 -static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L); -static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); - -template -void LOADERDECL Pos_ReadIndex_Float_SSSE3() +template +void LOADERDECL Pos_ReadDirect_SSSE3() { + const T* pData = reinterpret_cast(DataGetPosition()); + Vertex_Read_SSSE3(pData, *(__m128*)posScale); + DataSkip<(2 + three) * sizeof(T)>(); + LOG_VTX(); +} + +template +void LOADERDECL Pos_ReadIndex_SSSE3() +{ + static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); auto const index = DataRead(); - const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); - GC_ALIGNED128(const __m128i a = _mm_loadu_si128((__m128i*)pData)); - GC_ALIGNED128(__m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2)); - _mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b); - VertexManager::s_pCurBufferPointer += sizeof(float) * 3; + const T* pData = (const T*)(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); + Vertex_Read_SSSE3(pData, *(__m128*)posScale); LOG_VTX(); } #endif @@ -169,15 +173,39 @@ void VertexLoader_Position::Init() { #if _M_SSE >= 0x301 - if (cpu_info.bSSSE3) { - tableReadPosition[2][4][0] = Pos_ReadIndex_Float_SSSE3; - tableReadPosition[2][4][1] = Pos_ReadIndex_Float_SSSE3; - tableReadPosition[3][4][0] = Pos_ReadIndex_Float_SSSE3; - tableReadPosition[3][4][1] = Pos_ReadIndex_Float_SSSE3; + tableReadPosition[1][0][0] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][0][1] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][1][0] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][1][1] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][2][0] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][2][1] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][3][0] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][3][1] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][4][0] = Pos_ReadDirect_SSSE3; + tableReadPosition[1][4][1] = Pos_ReadDirect_SSSE3; + tableReadPosition[2][0][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][0][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][0][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][0][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][1][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][1][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][1][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][1][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][2][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][2][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][2][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][2][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][3][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][3][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][3][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][3][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][4][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[2][4][1] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][4][0] = Pos_ReadIndex_SSSE3; + tableReadPosition[3][4][1] = Pos_ReadIndex_SSSE3; } - #endif } diff --git a/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp b/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp index 25114ac82d..8acebd9213 100644 --- a/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp +++ b/Source/Core/VideoCommon/VertexLoader_TextCoord.cpp @@ -2,6 +2,8 @@ // Licensed under GPLv2 // Refer to the license.txt file included. +#include + #include "Common/CommonTypes.h" #include "Common/CPUDetect.h" @@ -10,13 +12,6 @@ #include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VideoCommon.h" - -#if _M_SSE >= 0x401 -#include -#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__) -#include -#endif - template void LOG_TEX(); @@ -54,7 +49,7 @@ float TCScale(float val, float scale) template void LOADERDECL TexCoord_ReadDirect() { - auto const scale = tcScale[tcIndex]; + auto const scale = tcScale[tcIndex][0]; DataWriter dst; DataReader src; @@ -69,12 +64,12 @@ void LOADERDECL TexCoord_ReadDirect() template void LOADERDECL TexCoord_ReadIndex() { - static_assert(!std::numeric_limits::is_signed, "Only unsigned I is sane!"); + static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); auto const index = DataRead(); auto const data = reinterpret_cast(cached_arraybases[ARRAY_TEXCOORD0 + tcIndex] - + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex])); - auto const scale = tcScale[tcIndex]; + + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex])); + auto const scale = tcScale[tcIndex][0]; DataWriter dst; for (int i = 0; i != N; ++i) @@ -84,44 +79,27 @@ void LOADERDECL TexCoord_ReadIndex() ++tcIndex; } -#if _M_SSE >= 0x401 -static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L); - -template -void LOADERDECL TexCoord_ReadIndex_Short2_SSE4() +#if _M_SSE >= 0x301 +template +void LOADERDECL TexCoord_ReadDirect2_SSSE3() { - static_assert(!std::numeric_limits::is_signed, "Only unsigned I is sane!"); - - // Heavy in ZWW - auto const index = DataRead(); - const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex])); - const __m128i a = _mm_cvtsi32_si128(*pData); - const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2); - const __m128i c = _mm_cvtepi16_epi32(b); - const __m128 d = _mm_cvtepi32_ps(c); - const __m128 e = _mm_load1_ps(&tcScale[tcIndex]); - const __m128 f = _mm_mul_ps(d, e); - _mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, f); - VertexManager::s_pCurBufferPointer += sizeof(float) * 2; + const T* pData = reinterpret_cast(DataGetPosition()); + __m128 scale = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)tcScale[tcIndex])); + Vertex_Read_SSSE3(pData, scale); + DataSkip<2 * sizeof(T)>(); LOG_TEX<2>(); tcIndex++; } -#endif -#if _M_SSE >= 0x301 -static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); - -template -void LOADERDECL TexCoord_ReadIndex_Float2_SSSE3() +template +void LOADERDECL TexCoord_ReadIndex2_SSSE3() { - static_assert(!std::numeric_limits::is_signed, "Only unsigned I is sane!"); + static_assert(std::is_unsigned::value, "Only unsigned I is sane!"); auto const index = DataRead(); - const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex])); - GC_ALIGNED128(const __m128i a = _mm_loadl_epi64((__m128i*)pData)); - GC_ALIGNED128(const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32)); - _mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b); - VertexManager::s_pCurBufferPointer += sizeof(float) * 2; + const T* pData = (const T*)(cached_arraybases[ARRAY_TEXCOORD0 + tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex])); + __m128 scale = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)tcScale[tcIndex])); + Vertex_Read_SSSE3(pData, scale); LOG_TEX<2>(); tcIndex++; } @@ -177,23 +155,24 @@ void VertexLoader_TextCoord::Init() { #if _M_SSE >= 0x301 - if (cpu_info.bSSSE3) { - tableReadTexCoord[2][4][1] = TexCoord_ReadIndex_Float2_SSSE3; - tableReadTexCoord[3][4][1] = TexCoord_ReadIndex_Float2_SSSE3; + tableReadTexCoord[1][0][1] = TexCoord_ReadDirect2_SSSE3; + tableReadTexCoord[1][1][1] = TexCoord_ReadDirect2_SSSE3; + tableReadTexCoord[1][2][1] = TexCoord_ReadDirect2_SSSE3; + tableReadTexCoord[1][3][1] = TexCoord_ReadDirect2_SSSE3; + tableReadTexCoord[1][4][1] = TexCoord_ReadDirect2_SSSE3; + tableReadTexCoord[2][0][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[3][0][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[2][1][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[3][1][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[2][2][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[3][2][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[2][3][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[3][3][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[2][4][1] = TexCoord_ReadIndex2_SSSE3; + tableReadTexCoord[3][4][1] = TexCoord_ReadIndex2_SSSE3; } - -#endif - -#if _M_SSE >= 0x401 - - if (cpu_info.bSSE4_1) - { - tableReadTexCoord[2][3][1] = TexCoord_ReadIndex_Short2_SSE4; - tableReadTexCoord[3][3][1] = TexCoord_ReadIndex_Short2_SSE4; - } - #endif } diff --git a/Source/Core/VideoCommon/VertexManagerBase.cpp b/Source/Core/VideoCommon/VertexManagerBase.cpp index 7a18ba435b..dcd8780ede 100644 --- a/Source/Core/VideoCommon/VertexManagerBase.cpp +++ b/Source/Core/VideoCommon/VertexManagerBase.cpp @@ -53,7 +53,8 @@ u32 VertexManager::GetRemainingSize() void VertexManager::PrepareForAdditionalData(int primitive, u32 count, u32 stride) { - u32 const needed_vertex_bytes = count * stride; + // The SSE vertex loader can write up to 4 bytes past the end + u32 const needed_vertex_bytes = count * stride + 4; // We can't merge different kinds of primitives, so we have to flush here if (current_primitive_type != primitive_from_gx[primitive])