From 956b8eb54db07e98f694082f5f14a9558e393a88 Mon Sep 17 00:00:00 2001 From: nodchip Date: Fri, 9 Apr 2010 15:13:42 +0000 Subject: [PATCH] VideoCommon: Added automatic selection routines for SSSE3/SSE4.1 codes. It selects SSSE3/SSE4.1 codes only if a proper preprocessor definition is defined and the target cpu supports SSSE3/SSE4.1. The selection routines in VertexLoader_* use function pointers. TextureDecoder uses a combination of "#if" and "if" statements. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@5302 8ced0084-cf51-0410-be5f-012b33b47a6e --- .../Core/VideoCommon/Src/TextureDecoder.cpp | 184 +++++++++++------- Source/Core/VideoCommon/Src/VertexLoader.cpp | 12 +- .../VideoCommon/Src/VertexLoader_Normal.cpp | 44 +++-- .../VideoCommon/Src/VertexLoader_Normal.h | 5 + .../VideoCommon/Src/VertexLoader_Position.cpp | 70 +++++-- .../VideoCommon/Src/VertexLoader_Position.h | 20 +- .../Src/VertexLoader_TextCoord.cpp | 93 ++++++--- .../VideoCommon/Src/VertexLoader_TextCoord.h | 25 +-- 8 files changed, 295 insertions(+), 158 deletions(-) diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp index 7cc0c05247..e10292ae3b 100644 --- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp @@ -366,37 +366,41 @@ inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) } } -#if _M_SSE >= 0x301 - static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L); -#endif - inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); + for (int x = 0; x < 8; x++) + { + u8 val = src[x]; + *dst++ = Common::swap16(tlut[val]); + } +} #if _M_SSE >= 0x301 +static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L); + +inline void decodebytesC8_To_Raw16_SSSE3(u16* dst, const u8* src, int tlutaddr) +{ + u16* tlut = (u16*)(texMem + tlutaddr); // Make 8 16-bits unsigned integer values - const __m128i a = _mm_set_epi16(tlut[src[7]], tlut[src[6]], tlut[src[5]], tlut[src[4]], tlut[src[3]], tlut[src[2]], tlut[src[1]], tlut[src[0]]); + __m128i a = _mm_setzero_si128(); + a = _mm_insert_epi16(a, tlut[src[0]], 0); + a = _mm_insert_epi16(a, tlut[src[1]], 1); + a = _mm_insert_epi16(a, tlut[src[2]], 2); + a = _mm_insert_epi16(a, tlut[src[3]], 3); + a = _mm_insert_epi16(a, tlut[src[4]], 4); + a = _mm_insert_epi16(a, tlut[src[5]], 5); + a = _mm_insert_epi16(a, tlut[src[6]], 6); + a = _mm_insert_epi16(a, tlut[src[7]], 7); // Apply Common::swap16() to 16-bits unsigned integers at once const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16); // Store values to dst without polluting the caches _mm_stream_si128((__m128i*)dst, b); - -#else - - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val]); - } - -#endif - } - +#endif inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr) { @@ -958,10 +962,26 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh } else { - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr); + +#if _M_SSE >= 0x301 + + if (cpu_info.bSSSE3) { + for (int y = 0; y < height; y += 4) + for (int x = 0; x < width; x += 8) + for (int iy = 0; iy < 4; iy++, src += 8) + decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src, tlutaddr); + break; + } else + +#endif + + { + for (int y = 0; y < height; y += 4) + for (int x = 0; x < width; x += 8) + for (int iy = 0; iy < 4; iy++, src += 8) + decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src, tlutaddr); + + } } return GetPCFormatFromTLUTFormat(tlutfmt); case GX_TF_IA4: @@ -1028,59 +1048,93 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh case GX_TF_RGBA8: // speed critical { -#if _M_SSE >= 0x301 - - for (int y = 0; y < height; y += 4) { - __m128i* p = (__m128i*)(src + y * width * 4); - for (int x = 0; x < width; x += 4) { + // FIXME(nodchip): the following code is too complicated. #if _M_SSE >= 0x401 - // Load 64-bytes at once. - const __m128i a0 = _mm_stream_load_si128(p++); - const __m128i a1 = _mm_stream_load_si128(p++); - const __m128i a2 = _mm_stream_load_si128(p++); - const __m128i a3 = _mm_stream_load_si128(p++); -#else - const __m128i a0 = _mm_load_si128(p++); - const __m128i a1 = _mm_load_si128(p++); - const __m128i a2 = _mm_load_si128(p++); - const __m128i a3 = _mm_load_si128(p++); + + if (cpu_info.bSSE4_1) { + for (int y = 0; y < height; y += 4) { + __m128i* p = (__m128i*)(src + y * width * 4); + for (int x = 0; x < width; x += 4) { + + // Load 64-bytes at once. + const __m128i a0 = _mm_stream_load_si128(p++); + const __m128i a1 = _mm_stream_load_si128(p++); + const __m128i a2 = _mm_stream_load_si128(p++); + const __m128i a3 = _mm_stream_load_si128(p++); + + // Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(), + // apply Common::swap32() by _mm_shuffle_epi8() and + // store them by _mm_stream_si128(). + // See decodebytesARGB8_4() about the idea. + const __m128i b0 = _mm_unpacklo_epi16(a0, a2); + const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0); + + const __m128i b1 = _mm_unpackhi_epi16(a0, a2); + const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1); + + const __m128i b2 = _mm_unpacklo_epi16(a1, a3); + const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2); + + const __m128i b3 = _mm_unpackhi_epi16(a1, a3); + const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3); + } + } + } else + #endif - // Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(), - // apply Common::swap32() by _mm_shuffle_epi8() and - // store them by _mm_stream_si128(). - // See decodebytesARGB8_4() about the idea. - const __m128i b0 = _mm_unpacklo_epi16(a0, a2); - const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0); +#if _M_SSE >= 0x301 - const __m128i b1 = _mm_unpackhi_epi16(a0, a2); - const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1); + if (cpu_info.bSSSE3) { + for (int y = 0; y < height; y += 4) { + __m128i* p = (__m128i*)(src + y * width * 4); + for (int x = 0; x < width; x += 4) { - const __m128i b2 = _mm_unpacklo_epi16(a1, a3); - const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2); + const __m128i a0 = _mm_load_si128(p++); + const __m128i a1 = _mm_load_si128(p++); + const __m128i a2 = _mm_load_si128(p++); + const __m128i a3 = _mm_load_si128(p++); - const __m128i b3 = _mm_unpackhi_epi16(a1, a3); - const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3); + // Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(), + // apply Common::swap32() by _mm_shuffle_epi8() and + // store them by _mm_stream_si128(). + // See decodebytesARGB8_4() about the idea. + const __m128i b0 = _mm_unpacklo_epi16(a0, a2); + const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0); + + const __m128i b1 = _mm_unpackhi_epi16(a0, a2); + const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1); + + const __m128i b2 = _mm_unpacklo_epi16(a1, a3); + const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2); + + const __m128i b3 = _mm_unpackhi_epi16(a1, a3); + const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32); + _mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3); + } } + } else + +#endif + + { + for (int y = 0; y < height; y += 4) + for (int x = 0; x < width; x += 4) + { + for (int iy = 0; iy < 4; iy++) + decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); + src += 64; + } } - -#else - - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 4) - { - for (int iy = 0; iy < 4; iy++) - decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); - src += 64; - } - -#endif - } + } return PC_TEX_FMT_BGRA32; case GX_TF_CMPR: // speed critical // The metroid games use this format almost exclusively. diff --git a/Source/Core/VideoCommon/Src/VertexLoader.cpp b/Source/Core/VideoCommon/Src/VertexLoader.cpp index 7937b20750..a2a70483c6 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader.cpp +++ b/Source/Core/VideoCommon/Src/VertexLoader.cpp @@ -175,6 +175,8 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr) m_NativeFmt = NativeVertexFormat::Create(); loop_counter = 0; VertexLoader_Normal::Init(); + VertexLoader_Position::Init(); + VertexLoader_TextCoord::Init(); m_VtxDesc = vtx_desc; SetVAT(vtx_attr.g0.Hex, vtx_attr.g1.Hex, vtx_attr.g2.Hex); @@ -268,8 +270,8 @@ void VertexLoader::CompileVertexTranslator() _assert_msg_(VIDEO, FORMAT_UBYTE <= m_VtxAttr.PosFormat && m_VtxAttr.PosFormat <= FORMAT_FLOAT, "Invalid vertex position format!\n(m_VtxAttr.PosFormat = %d)", m_VtxAttr.PosFormat); _assert_msg_(VIDEO, 0 <= m_VtxAttr.PosElements && m_VtxAttr.PosElements <= 1, "Invalid number of vertex position elemnts!\n(m_VtxAttr.PosElements = %d)", m_VtxAttr.PosElements); - WriteCall(tableReadPosition[m_VtxDesc.Position][m_VtxAttr.PosFormat][m_VtxAttr.PosElements]); - m_VertexSize += tableReadPositionVertexSize[m_VtxDesc.Position][m_VtxAttr.PosFormat][m_VtxAttr.PosElements]; + WriteCall(VertexLoader_Position::GetFunction(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements)); + m_VertexSize += VertexLoader_Position::GetSize(m_VtxDesc.Position, m_VtxAttr.PosFormat, m_VtxAttr.PosElements); nat_offset += 12; // OK, so we just got a point. Let's go back and read it for the bounding box. @@ -423,8 +425,8 @@ void VertexLoader::CompileVertexTranslator() _assert_msg_(VIDEO, 0 <= elements && elements <= 1, "Invalid number of texture coordinates elemnts!\n(elements = %d)", elements); m_NativeFmt->m_components |= VB_HAS_UV0 << i; - WriteCall(tableReadTexCoord[tc[i]][format][elements]); - m_VertexSize += tableReadTexCoordVertexSize[tc[i]][format][elements]; + WriteCall(VertexLoader_TextCoord::GetFunction(tc[i], format, elements)); + m_VertexSize += VertexLoader_TextCoord::GetSize(tc[i], format, elements); } if (m_NativeFmt->m_components & (VB_HAS_TEXMTXIDX0 << i)) { @@ -459,7 +461,7 @@ void VertexLoader::CompileVertexTranslator() int j = i + 1; for (; j < 8; ++j) { if (tc[j] != NOT_PRESENT) { - WriteCall(TexCoord_Read_Dummy); // important to get indices right! + WriteCall(VertexLoader_TextCoord::GetDummyFunction()); // important to get indices right! break; } } diff --git a/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp b/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp index 4cee83d9f6..225713a335 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp +++ b/Source/Core/VideoCommon/Src/VertexLoader_Normal.cpp @@ -20,6 +20,7 @@ #include "VertexLoader.h" #include "VertexLoader_Normal.h" #include "NativeVertexWriter.h" +#include "CPUDetect.h" #if _M_SSE >= 0x301 #include @@ -114,6 +115,18 @@ void VertexLoader_Normal::Init(void) m_TableExpand16[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Set(2, Normal_Index16_Byte3_Indices1_Expand16); m_TableExpand16[NRM_INDEX16][NRM_INDICES3][NRM_NBT] [FORMAT_BYTE] = Set(2, Normal_Index16_Byte_Expand16); m_TableExpand16[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Set(6, Normal_Index16_Byte3_Indices3_Expand16); + +#if _M_SSE >= 0x301 + + if (cpu_info.bSSSE3) { + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT] [FORMAT_USHORT] = Set(2, Normal_Index16_Short_SSSE3); //HACK + m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT] [FORMAT_SHORT] = Set(2, Normal_Index16_Short_SSSE3); + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT] [FORMAT_USHORT] = Set(2, Normal_Index16_Short_SSSE3); //HACK + m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT] [FORMAT_SHORT] = Set(2, Normal_Index16_Short_SSSE3); + } + +#endif + } unsigned int VertexLoader_Normal::GetSize(unsigned int _type, unsigned int _format, unsigned int _elements, unsigned int _index3) @@ -415,34 +428,33 @@ void LOADERDECL VertexLoader_Normal::Normal_Index16_Byte_Expand16() LOG_NORM16(); } -#if _M_SSE >= 0x301 -static const __m128i kMaskSwap16_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0FFFF0405L, 0x02030001L); -#endif - void LOADERDECL VertexLoader_Normal::Normal_Index16_Short() { u16 Index = DataReadU16(); const u16* pData = (const u16 *)(cached_arraybases[ARRAY_NORMAL] + (Index * arraystrides[ARRAY_NORMAL])); - -#if _M_SSE >= 0x301 - - __m128i a = _mm_loadl_epi64((__m128i*)pData); - __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_3); - _mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b); - -#else - ((u16*)VertexManager::s_pCurBufferPointer)[0] = Common::swap16(pData[0]); ((u16*)VertexManager::s_pCurBufferPointer)[1] = Common::swap16(pData[1]); ((u16*)VertexManager::s_pCurBufferPointer)[2] = Common::swap16(pData[2]); ((u16*)VertexManager::s_pCurBufferPointer)[3] = 0; - -#endif - VertexManager::s_pCurBufferPointer += 8; LOG_NORM16(); } +#if _M_SSE >= 0x301 +static const __m128i kMaskSwap16_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0FFFF0405L, 0x02030001L); + +void LOADERDECL VertexLoader_Normal::Normal_Index16_Short_SSSE3() +{ + u16 Index = DataReadU16(); + const u16* pData = (const u16 *)(cached_arraybases[ARRAY_NORMAL] + (Index * arraystrides[ARRAY_NORMAL])); + __m128i a = _mm_loadl_epi64((__m128i*)pData); + __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_3); + _mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b); + VertexManager::s_pCurBufferPointer += 8; + LOG_NORM16(); +} +#endif + void LOADERDECL VertexLoader_Normal::Normal_Index16_Float() { u16 Index = DataReadU16(); diff --git a/Source/Core/VideoCommon/Src/VertexLoader_Normal.h b/Source/Core/VideoCommon/Src/VertexLoader_Normal.h index 947c83fce8..26cd1c300d 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_Normal.h +++ b/Source/Core/VideoCommon/Src/VertexLoader_Normal.h @@ -117,6 +117,11 @@ private: static void LOADERDECL Normal_Index16_Byte3_Indices3_Expand16(); static void LOADERDECL Normal_Index16_Short3_Indices3(); static void LOADERDECL Normal_Index16_Float3_Indices3(); + +#if _M_SSE >= 0x301 + static void LOADERDECL Normal_Index16_Short_SSSE3(); +#endif + }; #endif diff --git a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp index 6ff0884cec..7eb2767e7f 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp +++ b/Source/Core/VideoCommon/Src/VertexLoader_Position.cpp @@ -20,8 +20,9 @@ #include "VertexLoader.h" #include "VertexLoader_Position.h" #include "NativeVertexWriter.h" +#include "CPUDetect.h" -#if _M_SSE >= 301 +#if _M_SSE >= 0x301 #include #endif @@ -150,37 +151,36 @@ inline void Pos_ReadIndex_Short(int Index) VertexManager::s_pCurBufferPointer += 12; } -#if _M_SSE >= 0x301 -static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L); -static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); -#endif - template inline void Pos_ReadIndex_Float(int Index) { const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION])); - -#if _M_SSE >= 0x301 - - const __m128i a = _mm_loadu_si128((__m128i*)pData); - __m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2); - _mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b); - -#else - ((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); if (three) ((u32*)VertexManager::s_pCurBufferPointer)[2] = Common::swap32(pData[2]); else ((float*)VertexManager::s_pCurBufferPointer)[2] = 0.0f; - -#endif - LOG_VTX(); VertexManager::s_pCurBufferPointer += 12; } +#if _M_SSE >= 0x301 +static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L); +static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); + +template +inline void Pos_ReadIndex_Float_SSSE3(int Index) +{ + const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (Index * arraystrides[ARRAY_POSITION])); + const __m128i a = _mm_loadu_si128((__m128i*)pData); + __m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2); + _mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b); + LOG_VTX(); + VertexManager::s_pCurBufferPointer += 12; +} +#endif + // ============================================================================== // Index 8 // ============================================================================== @@ -209,7 +209,14 @@ void LOADERDECL Pos_ReadIndex16_UShort2() {Pos_ReadIndex_Short(DataR void LOADERDECL Pos_ReadIndex16_Short2() {Pos_ReadIndex_Short(DataReadU16());} void LOADERDECL Pos_ReadIndex16_Float2() {Pos_ReadIndex_Float (DataReadU16());} -ReadPosision tableReadPosition[4][8][2] = { +#if _M_SSE >= 0x301 +void LOADERDECL Pos_ReadIndex8_Float3_SSSE3() {Pos_ReadIndex_Float_SSSE3 (DataReadU8());} +void LOADERDECL Pos_ReadIndex8_Float2_SSSE3() {Pos_ReadIndex_Float_SSSE3 (DataReadU8());} +void LOADERDECL Pos_ReadIndex16_Float3_SSSE3() {Pos_ReadIndex_Float_SSSE3 (DataReadU16());} +void LOADERDECL Pos_ReadIndex16_Float2_SSSE3() {Pos_ReadIndex_Float_SSSE3 (DataReadU16());} +#endif + +static TPipelineFunction tableReadPosition[4][8][2] = { { {NULL, NULL,}, {NULL, NULL,}, @@ -240,7 +247,7 @@ ReadPosision tableReadPosition[4][8][2] = { }, }; -int tableReadPositionVertexSize[4][8][2] = { +static int tableReadPositionVertexSize[4][8][2] = { { {0, 0,}, {0, 0,}, @@ -271,3 +278,26 @@ int tableReadPositionVertexSize[4][8][2] = { }, }; + +void VertexLoader_Position::Init(void) { + +#if _M_SSE >= 0x301 + + if (cpu_info.bSSSE3) { + tableReadPosition[2][4][0] = Pos_ReadIndex8_Float2_SSSE3; + tableReadPosition[2][4][1] = Pos_ReadIndex8_Float3_SSSE3; + tableReadPosition[3][4][0] = Pos_ReadIndex16_Float2_SSSE3; + tableReadPosition[3][4][1] = Pos_ReadIndex16_Float3_SSSE3; + } + +#endif + +} + +unsigned int VertexLoader_Position::GetSize(unsigned int _type, unsigned int _format, unsigned int _elements) { + return tableReadPositionVertexSize[_type][_format][_elements]; +} + +TPipelineFunction VertexLoader_Position::GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements) { + return tableReadPosition[_type][_format][_elements]; +} diff --git a/Source/Core/VideoCommon/Src/VertexLoader_Position.h b/Source/Core/VideoCommon/Src/VertexLoader_Position.h index 027e739225..db8d8a38b5 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_Position.h +++ b/Source/Core/VideoCommon/Src/VertexLoader_Position.h @@ -18,17 +18,17 @@ #ifndef VERTEXLOADER_POSITION_H #define VERTEXLOADER_POSITION_H -typedef void (LOADERDECL *ReadPosision)(); +class VertexLoader_Position { +public: -// Hold function pointers of vertex loaders. -// The first dimension corresponds to TVtxDesc.Position. -// The second dimension corresponds to TVtxAttr.PosFormat. -// The third dimension corresponds to TVtxAttr.PosElements. -// The dimensions are aligned to 2^n for speed up. -extern ReadPosision tableReadPosition[4][8][2]; + // Init + static void Init(void); -// Hold vertex size of each vertex format. -// The dimensions are same as tableReadPosition. -extern int tableReadPositionVertexSize[4][8][2]; + // GetSize + static unsigned int GetSize(unsigned int _type, unsigned int _format, unsigned int _elements); + + // GetFunction + static TPipelineFunction GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements); +}; #endif diff --git a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp index ae4e6d0bb8..08b5f706b1 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp +++ b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.cpp @@ -20,6 +20,7 @@ #include "VertexLoader.h" #include "VertexLoader_TextCoord.h" #include "NativeVertexWriter.h" +#include "CPUDetect.h" #if _M_SSE >= 0x401 #include @@ -291,17 +292,25 @@ void LOADERDECL TexCoord_ReadIndex16_Short1() tcIndex++; } -#if _M_SSE >= 0x401 -static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L); -#endif - void LOADERDECL TexCoord_ReadIndex16_Short2() { // Heavy in ZWW - u16 Index = DataReadU16(); + u16 Index = DataReadU16(); + const u16 *pData = (const u16 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); + ((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(s16)Common::swap16(pData[0]) * tcScale[tcIndex]; + ((float*)VertexManager::s_pCurBufferPointer)[1] = (float)(s16)Common::swap16(pData[1]) * tcScale[tcIndex]; + LOG_TEX2(); + VertexManager::s_pCurBufferPointer += 8; + tcIndex++; +} #if _M_SSE >= 0x401 +static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L); +void LOADERDECL TexCoord_ReadIndex16_Short2_SSE4() +{ + // Heavy in ZWW + u16 Index = DataReadU16(); const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); const __m128i a = _mm_cvtsi32_si128(*pData); const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2); @@ -310,19 +319,11 @@ void LOADERDECL TexCoord_ReadIndex16_Short2() const __m128 e = _mm_load1_ps(&tcScale[tcIndex]); const __m128 f = _mm_mul_ps(d, e); _mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, f); - -#else - - const u16 *pData = (const u16 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); - ((float*)VertexManager::s_pCurBufferPointer)[0] = (float)(s16)Common::swap16(pData[0]) * tcScale[tcIndex]; - ((float*)VertexManager::s_pCurBufferPointer)[1] = (float)(s16)Common::swap16(pData[1]) * tcScale[tcIndex]; - -#endif - LOG_TEX2(); VertexManager::s_pCurBufferPointer += 8; tcIndex++; } +#endif void LOADERDECL TexCoord_ReadIndex16_Float1() { @@ -334,17 +335,24 @@ void LOADERDECL TexCoord_ReadIndex16_Float1() tcIndex++; } -#if _M_SSE >= 0x301 -static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); -#endif - -void LOADERDECL TexCoord_ReadIndex16_Float2() +void LOADERDECL TexCoord_ReadIndex16_Float2() { u16 Index = DataReadU16(); const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); + ((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); + ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); + LOG_TEX2(); + VertexManager::s_pCurBufferPointer += 8; + tcIndex++; +} #if _M_SSE >= 0x301 +static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L); +void LOADERDECL TexCoord_ReadIndex16_Float2_SSSE3() +{ + u16 Index = DataReadU16(); + const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (Index * arraystrides[ARRAY_TEXCOORD0+tcIndex])); const __m128i a = _mm_loadl_epi64((__m128i*)pData); const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32); u8* p = VertexManager::s_pCurBufferPointer; @@ -353,19 +361,10 @@ void LOADERDECL TexCoord_ReadIndex16_Float2() p += 8; VertexManager::s_pCurBufferPointer = p; tcIndex++; - -#else - - ((u32*)VertexManager::s_pCurBufferPointer)[0] = Common::swap32(pData[0]); - ((u32*)VertexManager::s_pCurBufferPointer)[1] = Common::swap32(pData[1]); - LOG_TEX2(); - VertexManager::s_pCurBufferPointer += 8; - tcIndex++; - -#endif } +#endif -ReadTexCoord tableReadTexCoord[4][8][2] = { +static TPipelineFunction tableReadTexCoord[4][8][2] = { { {NULL, NULL,}, {NULL, NULL,}, @@ -396,7 +395,7 @@ ReadTexCoord tableReadTexCoord[4][8][2] = { }, }; -int tableReadTexCoordVertexSize[4][8][2] = { +static int tableReadTexCoordVertexSize[4][8][2] = { { {0, 0,}, {0, 0,}, @@ -426,3 +425,35 @@ int tableReadTexCoordVertexSize[4][8][2] = { {2, 2,}, }, }; + +void VertexLoader_TextCoord::Init(void) { + +#if _M_SSE >= 0x301 + + if (cpu_info.bSSSE3) { + tableReadTexCoord[3][4][1] = TexCoord_ReadIndex16_Float2_SSSE3; + } + +#endif + +#if _M_SSE >= 0x401 + + if (cpu_info.bSSE4_1) { + tableReadTexCoord[3][3][1] = TexCoord_ReadIndex16_Short2_SSE4; + } + +#endif + +} + +unsigned int VertexLoader_TextCoord::GetSize(unsigned int _type, unsigned int _format, unsigned int _elements) { + return tableReadTexCoordVertexSize[_type][_format][_elements]; +} + +TPipelineFunction VertexLoader_TextCoord::GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements) { + return tableReadTexCoord[_type][_format][_elements]; +} + +TPipelineFunction VertexLoader_TextCoord::GetDummyFunction() { + return TexCoord_Read_Dummy; +} diff --git a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.h b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.h index 6d94a0076b..f7d5ee7ee7 100644 --- a/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.h +++ b/Source/Core/VideoCommon/Src/VertexLoader_TextCoord.h @@ -20,19 +20,22 @@ #include "NativeVertexFormat.h" -typedef void (LOADERDECL *ReadTexCoord)(); +class VertexLoader_TextCoord +{ +public: -// Hold function pointers of texture coordinates loaders. -// The first dimension corresponds to TVtxDesc.Tex?Coord. -// The second dimension corresponds to TVtxAttr.texCoord[?].Format. -// The third dimension corresponds to TVtxAttr.texCoord[?].Elements. -// The dimensions are aligned to 2^n for speed up. -extern ReadTexCoord tableReadTexCoord[4][8][2]; + // Init + static void Init(void); -// Hold vertex size of each vertex format. -// The dimensions are same as tableReadPosition. -extern int tableReadTexCoordVertexSize[4][8][2]; + // GetSize + static unsigned int GetSize(unsigned int _type, unsigned int _format, unsigned int _elements); -void LOADERDECL TexCoord_Read_Dummy(); + // GetFunction + static TPipelineFunction GetFunction(unsigned int _type, unsigned int _format, unsigned int _elements); + + // GetDummyFunction + // It is important to synchronize tcIndex. + static TPipelineFunction GetDummyFunction(); +}; #endif