VertexLoader: remove non-JIT SSE code

This commit is contained in:
Tillmann Karras 2015-01-19 18:33:08 +01:00
parent 38cb9bbfae
commit 873902b4a3
6 changed files with 8 additions and 349 deletions

View File

@ -17,8 +17,6 @@ namespace FifoAnalyzer
void Init() void Init()
{ {
VertexLoader_Normal::Init(); VertexLoader_Normal::Init();
VertexLoader_Position::Init();
VertexLoader_TextCoord::Init();
} }
u8 ReadFifo8(u8 *&data) u8 ReadFifo8(u8 *&data)

View File

@ -28,16 +28,6 @@
u8* g_video_buffer_read_ptr; u8* g_video_buffer_read_ptr;
u8* g_vertex_manager_write_ptr; u8* g_vertex_manager_write_ptr;
// VertexLoader objects are not placed on the default heap: their storage is
// requested from AllocateAlignedMemory with an alignment argument of 16.
void* VertexLoader::operator new (size_t size)
{
	void* const storage = AllocateAlignedMemory(size, 16);
	return storage;
}

// Counterpart of the class-specific operator new: hands the storage back to
// FreeAlignedMemory.
void VertexLoader::operator delete (void *p)
{
	FreeAlignedMemory(p);
}
static void LOADERDECL PosMtx_ReadDirect_UByte(VertexLoader* loader) static void LOADERDECL PosMtx_ReadDirect_UByte(VertexLoader* loader)
{ {
u8 posmtx = BoundingBox::posMtxIdx = DataReadU8() & 0x3f; u8 posmtx = BoundingBox::posMtxIdx = DataReadU8() & 0x3f;
@ -66,15 +56,9 @@ static void LOADERDECL TexMtx_Write_Float2(VertexLoader* loader)
static void LOADERDECL TexMtx_Write_Float3(VertexLoader* loader) static void LOADERDECL TexMtx_Write_Float3(VertexLoader* loader)
{ {
#if _M_SSE >= 0x200
__m128 output = _mm_cvtsi32_ss(_mm_castsi128_ps(_mm_setzero_si128()), loader->m_curtexmtx[loader->m_texmtxwrite++]);
_mm_storeu_ps((float*)g_vertex_manager_write_ptr, _mm_shuffle_ps(output, output, 0x45 /* 1, 1, 0, 1 */));
g_vertex_manager_write_ptr += sizeof(float) * 3;
#else
DataWrite(0.f); DataWrite(0.f);
DataWrite(0.f); DataWrite(0.f);
DataWrite(float(loader->m_curtexmtx[loader->m_texmtxwrite++])); DataWrite(float(loader->m_curtexmtx[loader->m_texmtxwrite++]));
#endif
} }
static void LOADERDECL SkipVertex(VertexLoader* loader) static void LOADERDECL SkipVertex(VertexLoader* loader)
@ -92,15 +76,13 @@ VertexLoader::VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr)
: VertexLoaderBase(vtx_desc, vtx_attr) : VertexLoaderBase(vtx_desc, vtx_attr)
{ {
VertexLoader_Normal::Init(); VertexLoader_Normal::Init();
VertexLoader_Position::Init();
VertexLoader_TextCoord::Init();
CompileVertexTranslator(); CompileVertexTranslator();
// generate frac factors // generate frac factors
m_posScale[0] = m_posScale[1] = m_posScale[2] = m_posScale[3] = 1.0f / (1U << m_VtxAttr.PosFrac); m_posScale = 1.0f / (1U << m_VtxAttr.PosFrac);
for (int i = 0; i < 8; i++) for (int i = 0; i < 8; i++)
m_tcScale[i][0] = m_tcScale[i][1] = 1.0f / (1U << m_VtxAttr.texCoord[i].Frac); m_tcScale[i] = 1.0f / (1U << m_VtxAttr.texCoord[i].Frac);
for (int i = 0; i < 2; i++) for (int i = 0; i < 2; i++)
m_colElements[i] = m_VtxAttr.color[i].Elements; m_colElements[i] = m_VtxAttr.color[i].Elements;

View File

@ -18,13 +18,6 @@
#include "VideoCommon/VertexLoaderBase.h" #include "VideoCommon/VertexLoaderBase.h"
#include "VideoCommon/VertexLoaderUtils.h" #include "VideoCommon/VertexLoaderUtils.h"
#if _M_SSE >= 0x401
#include <smmintrin.h>
#include <emmintrin.h>
#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
#include <tmmintrin.h>
#endif
#ifdef WIN32 #ifdef WIN32
#define LOADERDECL __cdecl #define LOADERDECL __cdecl
#else #else
@ -37,11 +30,6 @@ typedef void (LOADERDECL *TPipelineFunction)(VertexLoader* loader);
class VertexLoader : public VertexLoaderBase class VertexLoader : public VertexLoaderBase
{ {
public: public:
// This class needs 16-byte alignment. Since that is broken on
// MSVC right now (Dec 2014), we use custom allocation.
void* operator new (size_t size);
void operator delete (void *p);
VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr); VertexLoader(const TVtxDesc &vtx_desc, const VAT &vtx_attr);
int RunVertices(int primitive, int count, DataReader src, DataReader dst) override; int RunVertices(int primitive, int count, DataReader src, DataReader dst) override;
@ -49,9 +37,8 @@ public:
bool IsInitialized() override { return true; } // This vertex loader supports all formats bool IsInitialized() override { return true; } // This vertex loader supports all formats
// They are used for the communication with the loader functions // They are used for the communication with the loader functions
// Duplicated (4x and 2x respectively) and used in SSE code in the vertex loader JIT float m_posScale;
GC_ALIGNED128(float m_posScale[4]); float m_tcScale[8];
GC_ALIGNED64(float m_tcScale[8][2]);
int m_tcIndex; int m_tcIndex;
int m_colIndex; int m_colIndex;
int m_colElements[2]; int m_colElements[2];
@ -73,61 +60,3 @@ private:
void WriteCall(TPipelineFunction); void WriteCall(TPipelineFunction);
}; };
#if _M_SSE >= 0x301
// Control masks for _mm_shuffle_epi8, consumed by Vertex_Read_SSSE3.
// Every mask byte names the source byte copied into that output position; a
// mask byte with its top bit set (0xFF here) produces a zero byte instead.
// _mm_set_epi32 lists the 32-bit lanes from highest to lowest, so the last
// argument describes the first component. The _3 variants convert three
// components and the _2 variants two; the remaining lanes are zeroed.
//
// kMaskSwap32_*:      reverse the byte order of each 32-bit element.
// kMaskSwap16to32l_*: byte-swap each 16-bit element into the low half of a
//                     32-bit lane, leaving the upper half zero.
// kMaskSwap16to32h_*: byte-swap each 16-bit element into the high half of a
//                     32-bit lane, leaving the low half zero.
// kMask8to32l_*:      place each 8-bit element in the lowest byte of a lane.
// kMask8to32h_*:      place each 8-bit element in the highest byte of a lane.
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
static const __m128i kMaskSwap16to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L);
static const __m128i kMaskSwap16to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L);
static const __m128i kMaskSwap16to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL);
static const __m128i kMaskSwap16to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL);
static const __m128i kMask8to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L);
static const __m128i kMask8to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L);
static const __m128i kMask8to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);
static const __m128i kMask8to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);
// Converts one attribute of two or three components of type T, read from
// pData, into floats written at g_vertex_manager_write_ptr, then advances
// that pointer by the number of floats produced.
//
// T        - source component type; a 4-byte T takes the float path, any
//            other size is treated as an integer and multiplied by `scale`.
// threeIn  - true when three components are read, false for two.
// threeOut - true when three floats are emitted, false for two. With
//            threeIn == false the zeroed mask lane yields 0 for the third.
// scale    - per-lane multiplier applied to integer inputs only.
//
// NOTE: the loads and stores are wider than the data they cover: e.g. three
// floats (12 bytes) are fetched with a 16-byte load, three u8 with a 4-byte
// load, and threeOut stores 16 bytes while advancing only 12. Both buffers
// therefore need readable/writable slack past the attribute.
template <typename T, bool threeIn, bool threeOut>
__forceinline void Vertex_Read_SSSE3(const T* pData, __m128 scale)
{
__m128i coords, mask;
// Pick the narrowest of the 16/8/4-byte loads that covers all input components.
int loadBytes = sizeof(T) * (2 + threeIn);
if (loadBytes > 8)
coords = _mm_loadu_si128((__m128i*)pData);
else if (loadBytes > 4)
coords = _mm_loadl_epi64((__m128i*)pData);
else
coords = _mm_cvtsi32_si128(*(u32*)pData);
// Float case (no scaling)
if (sizeof(T) == 4)
{
// Only the byte order of each 32-bit element changes; the bits are stored as-is.
coords = _mm_shuffle_epi8(coords, threeIn ? kMaskSwap32_3 : kMaskSwap32_2);
if (threeOut)
_mm_storeu_si128((__m128i*)g_vertex_manager_write_ptr, coords);
else
_mm_storel_epi64((__m128i*)g_vertex_manager_write_ptr, coords);
}
else
{
// Byte swap, unpack, and move to high bytes for sign extend.
// Unsigned types use the "l" masks (value lands in the low bytes, upper bytes zero);
// signed types use the "h" masks so the shift below can replicate the sign bit.
if (std::is_unsigned<T>::value)
mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32l_3 : kMaskSwap16to32l_2) : (threeIn ? kMask8to32l_3 : kMask8to32l_2);
else
mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32h_3 : kMaskSwap16to32h_2) : (threeIn ? kMask8to32h_3 : kMask8to32h_2);
coords = _mm_shuffle_epi8(coords, mask);
// Sign extend
// The arithmetic right shift moves the value back down to the low bits of each lane.
if (std::is_signed<T>::value)
coords = _mm_srai_epi32(coords, 32 - sizeof(T) * 8);
// Integer lanes -> float, then apply the caller-supplied scale.
__m128 out = _mm_mul_ps(_mm_cvtepi32_ps(coords), scale);
if (threeOut)
_mm_storeu_ps((float*)g_vertex_manager_write_ptr, out);
else
_mm_storel_pi((__m64*)g_vertex_manager_write_ptr, out);
}
// Advance by the floats actually emitted (2 or 3), not by the width of the store.
g_vertex_manager_write_ptr += sizeof(float) * (2 + threeOut);
}
#endif

View File

@ -6,8 +6,6 @@
#include <type_traits> #include <type_traits>
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/CPUDetect.h"
#include "VideoCommon/VertexLoader.h" #include "VideoCommon/VertexLoader.h"
#include "VideoCommon/VertexLoader_Normal.h" #include "VideoCommon/VertexLoader_Normal.h"
#include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VertexManagerBase.h"
@ -102,63 +100,6 @@ struct Normal_Index_Indices3
static const int size = sizeof(I) * 3; static const int size = sizeof(I) * 3;
}; };
#if _M_SSE >= 0x301
// SSSE3 loader for N three-component normal vectors of type T stored inline
// in the vertex stream at the current read position.
template <typename T, int N>
struct Normal_Direct_SSSE3
{
	// Bytes consumed from the vertex stream per vertex.
	static const int size = sizeof(T) * N * 3;

	static void LOADERDECL function(VertexLoader* loader)
	{
		// Scale factor derived from the bit width of T, minus one bit, and
		// minus a further bit for signed types.
		const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed<T>::value - 1));
		const __m128 scale = _mm_set_ps1(frac);

		const T* const first = reinterpret_cast<const T*>(DataGetPosition());
		for (int n = 0; n != N; ++n)
			Vertex_Read_SSSE3<T, true, true>(first + 3 * n, scale);

		// The vectors were read in place; now step the read position past them.
		DataSkip<N * 3 * sizeof(T)>();
	}
};
// Reads an index of type I from the vertex stream and converts N
// three-component vectors of type T from the normal array, starting Offset
// vectors past the indexed element.
template <typename I, typename T, int N, int Offset>
__forceinline void Normal_Index_Offset_SSSE3()
{
	static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");

	auto const index = DataRead<I>();

	// Scale factor derived from the bit width of T, minus one bit, and minus
	// a further bit for signed types.
	const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed<T>::value - 1));
	const __m128 scale = _mm_set_ps1(frac);

	// Array base + index * stride selects the element; Offset then selects
	// which vector inside that element to start from.
	const T* const first = (const T*)(cached_arraybases[ARRAY_NORMAL]
		+ (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset);

	for (int n = 0; n != N; ++n)
		Vertex_Read_SSSE3<T, true, true>(first + 3 * n, scale);
}
// SSSE3 loader for N normal vectors of type T addressed through a single
// index of type I.
template <typename I, typename T, int N>
struct Normal_Index_SSSE3
{
	// Only the index itself is consumed from the vertex stream.
	static const int size = sizeof(I);

	static void LOADERDECL function(VertexLoader* loader)
	{
		// One index; the N vectors follow each other starting at offset 0.
		Normal_Index_Offset_SSSE3<I, T, N, 0>();
	}
};
// SSSE3 loader for three normal vectors of type T where each vector has its
// own index of type I in the vertex stream.
template <typename I, typename T>
struct Normal_Index_Indices3_SSSE3
{
	// Three indices are consumed from the vertex stream.
	static const int size = sizeof(I) * 3;

	static void LOADERDECL function(VertexLoader* loader)
	{
		// Each call reads its own index and fetches one vector, taken from
		// vector slot 0, 1 and 2 of the indexed element respectively.
		Normal_Index_Offset_SSSE3<I, T, 1, 0>();
		Normal_Index_Offset_SSSE3<I, T, 1, 1>();
		Normal_Index_Offset_SSSE3<I, T, 1, 2>();
	}
};
#endif
} }
void VertexLoader_Normal::Init() void VertexLoader_Normal::Init()
@ -231,77 +172,6 @@ void VertexLoader_Normal::Init()
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3<u16, u16>(); m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3<u16, u16>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3<u16, s16>(); m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3<u16, s16>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3<u16, float>(); m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3<u16, float>();
#if _M_SSE >= 0x301
if (cpu_info.bSSSE3)
{
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Direct_SSSE3<u8, 1>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Direct_SSSE3<s8, 1>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Direct_SSSE3<u16, 1>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Direct_SSSE3<s16, 1>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Direct_SSSE3<float, 1>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Direct_SSSE3<u8, 3>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Direct_SSSE3<s8, 3>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Direct_SSSE3<u16, 3>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Direct_SSSE3<s16, 3>();
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Direct_SSSE3<float, 3>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Direct_SSSE3<u8, 1>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Direct_SSSE3<s8, 1>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Direct_SSSE3<u16, 1>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Direct_SSSE3<s16, 1>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Direct_SSSE3<float, 1>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Direct_SSSE3<u8, 3>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Direct_SSSE3<s8, 3>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Direct_SSSE3<u16, 3>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Direct_SSSE3<s16, 3>();
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Direct_SSSE3<float, 3>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3<u8, u8, 1>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3<u8, s8, 1>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3<u8, u16, 1>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3<u8, s16, 1>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3<u8, float, 1>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_SSSE3<u8, u8, 3>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Index_SSSE3<u8, s8, 3>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Index_SSSE3<u8, u16, 3>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Index_SSSE3<u8, s16, 3>();
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_SSSE3<u8, float, 3>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3<u8, u8, 1>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3<u8, s8, 1>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3<u8, u16, 1>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3<u8, s16, 1>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3<u8, float, 1>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_Indices3_SSSE3<u8, u8>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Index_Indices3_SSSE3<u8, s8>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3_SSSE3<u8, u16>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3_SSSE3<u8, s16>();
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3_SSSE3<u8, float>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3<u16, u8, 1>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3<u16, s8, 1>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3<u16, u16, 1>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3<u16, s16, 1>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3<u16, float, 1>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_SSSE3<u16, u8, 3>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Index_SSSE3<u16, s8, 3>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Index_SSSE3<u16, u16, 3>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Index_SSSE3<u16, s16, 3>();
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_SSSE3<u16, float, 3>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3<u16, u8, 1>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3<u16, s8, 1>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3<u16, u16, 1>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3<u16, s16, 1>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3<u16, float, 1>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_Indices3_SSSE3<u16, u8>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Index_Indices3_SSSE3<u16, s8>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3_SSSE3<u16, u16>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3_SSSE3<u16, s16>();
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3_SSSE3<u16, float>();
}
#endif
} }
unsigned int VertexLoader_Normal::GetSize(u64 _type, unsigned int VertexLoader_Normal::GetSize(u64 _type,

View File

@ -5,8 +5,6 @@
#include <type_traits> #include <type_traits>
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/CPUDetect.h"
#include "VideoCommon/VertexLoader.h" #include "VideoCommon/VertexLoader.h"
#include "VideoCommon/VertexLoader_Position.h" #include "VideoCommon/VertexLoader_Position.h"
#include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VertexManagerBase.h"
@ -28,7 +26,7 @@ template <typename T, int N>
void LOADERDECL Pos_ReadDirect(VertexLoader* loader) void LOADERDECL Pos_ReadDirect(VertexLoader* loader)
{ {
static_assert(N <= 3, "N > 3 is not sane!"); static_assert(N <= 3, "N > 3 is not sane!");
auto const scale = loader->m_posScale[0];; auto const scale = loader->m_posScale;
DataReader dst(g_vertex_manager_write_ptr, nullptr); DataReader dst(g_vertex_manager_write_ptr, nullptr);
DataReader src(g_video_buffer_read_ptr, nullptr); DataReader src(g_video_buffer_read_ptr, nullptr);
@ -49,7 +47,7 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader)
auto const index = DataRead<I>(); auto const index = DataRead<I>();
loader->m_vertexSkip = index == std::numeric_limits<I>::max(); loader->m_vertexSkip = index == std::numeric_limits<I>::max();
auto const data = reinterpret_cast<const T*>(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION])); auto const data = reinterpret_cast<const T*>(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION]));
auto const scale = loader->m_posScale[0]; auto const scale = loader->m_posScale;
DataReader dst(g_vertex_manager_write_ptr, nullptr); DataReader dst(g_vertex_manager_write_ptr, nullptr);
for (int i = 0; i < 3; ++i) for (int i = 0; i < 3; ++i)
@ -59,28 +57,6 @@ void LOADERDECL Pos_ReadIndex(VertexLoader* loader)
LOG_VTX(); LOG_VTX();
} }
#if _M_SSE >= 0x301
// SSSE3 loader for a position stored inline in the vertex stream.
// T is the component type; `three` selects three input components instead of
// two. Three floats are always written.
template <typename T, bool three>
void LOADERDECL Pos_ReadDirect_SSSE3(VertexLoader* loader)
{
	// m_posScale is read as one full 128-bit vector.
	const __m128 posScale = *(__m128*)loader->m_posScale;
	const T* const src = reinterpret_cast<const T*>(DataGetPosition());

	Vertex_Read_SSSE3<T, three, true>(src, posScale);

	// Step the read position past the components just converted.
	DataSkip<(2 + three) * sizeof(T)>();
	LOG_VTX();
}
// SSSE3 loader for a position fetched from the position array through an
// index of type I read from the vertex stream.
// T is the component type; `three` selects three input components instead of
// two. Three floats are always written.
template <typename I, typename T, bool three>
void LOADERDECL Pos_ReadIndex_SSSE3(VertexLoader* loader)
{
	static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");

	auto const index = DataRead<I>();

	// The largest value of I flags the vertex as skipped; the position is
	// still fetched and written below.
	const bool isSkipMarker = (index == std::numeric_limits<I>::max());
	loader->m_vertexSkip = isSkipMarker;

	const T* const src = (const T*)(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION]));
	const __m128 posScale = *(__m128*)loader->m_posScale;
	Vertex_Read_SSSE3<T, three, true>(src, posScale);
	LOG_VTX();
}
#endif
static TPipelineFunction tableReadPosition[4][8][2] = { static TPipelineFunction tableReadPosition[4][8][2] = {
{ {
{nullptr, nullptr,}, {nullptr, nullptr,},
@ -127,48 +103,6 @@ static int tableReadPositionVertexSize[4][8][2] = {
}, },
}; };
// Replaces the entries of tableReadPosition with their SSSE3 variants when
// the build targets SSSE3 and the running CPU reports support for it.
// Otherwise the table is left unchanged.
// Table layout as used here: [source][component format][element count], with
// source 1 = direct, 2 = u8 index, 3 = u16 index; format 0..4 = u8, s8, u16,
// s16, float; element count 0 = two components, 1 = three.
void VertexLoader_Position::Init()
{
#if _M_SSE >= 0x301
	if (!cpu_info.bSSSE3)
		return;

	// Direct data.
	tableReadPosition[1][0][0] = Pos_ReadDirect_SSSE3<u8, false>;
	tableReadPosition[1][0][1] = Pos_ReadDirect_SSSE3<u8, true>;
	tableReadPosition[1][1][0] = Pos_ReadDirect_SSSE3<s8, false>;
	tableReadPosition[1][1][1] = Pos_ReadDirect_SSSE3<s8, true>;
	tableReadPosition[1][2][0] = Pos_ReadDirect_SSSE3<u16, false>;
	tableReadPosition[1][2][1] = Pos_ReadDirect_SSSE3<u16, true>;
	tableReadPosition[1][3][0] = Pos_ReadDirect_SSSE3<s16, false>;
	tableReadPosition[1][3][1] = Pos_ReadDirect_SSSE3<s16, true>;
	tableReadPosition[1][4][0] = Pos_ReadDirect_SSSE3<float, false>;
	tableReadPosition[1][4][1] = Pos_ReadDirect_SSSE3<float, true>;

	// Indexed through a u8 index.
	tableReadPosition[2][0][0] = Pos_ReadIndex_SSSE3<u8, u8, false>;
	tableReadPosition[2][0][1] = Pos_ReadIndex_SSSE3<u8, u8, true>;
	tableReadPosition[2][1][0] = Pos_ReadIndex_SSSE3<u8, s8, false>;
	tableReadPosition[2][1][1] = Pos_ReadIndex_SSSE3<u8, s8, true>;
	tableReadPosition[2][2][0] = Pos_ReadIndex_SSSE3<u8, u16, false>;
	tableReadPosition[2][2][1] = Pos_ReadIndex_SSSE3<u8, u16, true>;
	tableReadPosition[2][3][0] = Pos_ReadIndex_SSSE3<u8, s16, false>;
	tableReadPosition[2][3][1] = Pos_ReadIndex_SSSE3<u8, s16, true>;
	tableReadPosition[2][4][0] = Pos_ReadIndex_SSSE3<u8, float, false>;
	tableReadPosition[2][4][1] = Pos_ReadIndex_SSSE3<u8, float, true>;

	// Indexed through a u16 index.
	tableReadPosition[3][0][0] = Pos_ReadIndex_SSSE3<u16, u8, false>;
	tableReadPosition[3][0][1] = Pos_ReadIndex_SSSE3<u16, u8, true>;
	tableReadPosition[3][1][0] = Pos_ReadIndex_SSSE3<u16, s8, false>;
	tableReadPosition[3][1][1] = Pos_ReadIndex_SSSE3<u16, s8, true>;
	tableReadPosition[3][2][0] = Pos_ReadIndex_SSSE3<u16, u16, false>;
	tableReadPosition[3][2][1] = Pos_ReadIndex_SSSE3<u16, u16, true>;
	tableReadPosition[3][3][0] = Pos_ReadIndex_SSSE3<u16, s16, false>;
	tableReadPosition[3][3][1] = Pos_ReadIndex_SSSE3<u16, s16, true>;
	tableReadPosition[3][4][0] = Pos_ReadIndex_SSSE3<u16, float, false>;
	tableReadPosition[3][4][1] = Pos_ReadIndex_SSSE3<u16, float, true>;
#endif
}
unsigned int VertexLoader_Position::GetSize(u64 _type, unsigned int _format, unsigned int _elements) unsigned int VertexLoader_Position::GetSize(u64 _type, unsigned int _format, unsigned int _elements)
{ {
return tableReadPositionVertexSize[_type][_format][_elements]; return tableReadPositionVertexSize[_type][_format][_elements];

View File

@ -5,8 +5,6 @@
#include <type_traits> #include <type_traits>
#include "Common/CommonTypes.h" #include "Common/CommonTypes.h"
#include "Common/CPUDetect.h"
#include "VideoCommon/VertexLoader.h" #include "VideoCommon/VertexLoader.h"
#include "VideoCommon/VertexLoader_TextCoord.h" #include "VideoCommon/VertexLoader_TextCoord.h"
#include "VideoCommon/VertexManagerBase.h" #include "VideoCommon/VertexManagerBase.h"
@ -49,7 +47,7 @@ float TCScale(float val, float scale)
template <typename T, int N> template <typename T, int N>
void LOADERDECL TexCoord_ReadDirect(VertexLoader* loader) void LOADERDECL TexCoord_ReadDirect(VertexLoader* loader)
{ {
auto const scale = loader->m_tcScale[loader->m_tcIndex][0]; auto const scale = loader->m_tcScale[loader->m_tcIndex];
DataReader dst(g_vertex_manager_write_ptr, nullptr); DataReader dst(g_vertex_manager_write_ptr, nullptr);
DataReader src(g_video_buffer_read_ptr, nullptr); DataReader src(g_video_buffer_read_ptr, nullptr);
@ -71,7 +69,7 @@ void LOADERDECL TexCoord_ReadIndex(VertexLoader* loader)
auto const index = DataRead<I>(); auto const index = DataRead<I>();
auto const data = reinterpret_cast<const T*>(cached_arraybases[ARRAY_TEXCOORD0 + loader->m_tcIndex] auto const data = reinterpret_cast<const T*>(cached_arraybases[ARRAY_TEXCOORD0 + loader->m_tcIndex]
+ (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + loader->m_tcIndex])); + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + loader->m_tcIndex]));
auto const scale = loader->m_tcScale[loader->m_tcIndex][0]; auto const scale = loader->m_tcScale[loader->m_tcIndex];
DataReader dst(g_vertex_manager_write_ptr, nullptr); DataReader dst(g_vertex_manager_write_ptr, nullptr);
for (int i = 0; i != N; ++i) for (int i = 0; i != N; ++i)
@ -82,32 +80,6 @@ void LOADERDECL TexCoord_ReadIndex(VertexLoader* loader)
++loader->m_tcIndex; ++loader->m_tcIndex;
} }
#if _M_SSE >= 0x301
// SSSE3 loader for a two-component texture coordinate of type T stored
// inline in the vertex stream. Uses the scale pair of the texture coordinate
// slot selected by m_tcIndex and then moves on to the next slot.
template <typename T>
void LOADERDECL TexCoord_ReadDirect2_SSSE3(VertexLoader* loader)
{
	const T* const src = reinterpret_cast<const T*>(DataGetPosition());

	// 64-bit load: picks up both floats of m_tcScale[m_tcIndex] at once.
	const __m128i scaleBits = _mm_loadl_epi64((__m128i*)loader->m_tcScale[loader->m_tcIndex]);
	const __m128 scale = _mm_castsi128_ps(scaleBits);

	Vertex_Read_SSSE3<T, false, false>(src, scale);

	// Step the read position past the two components just converted.
	DataSkip<2 * sizeof(T)>();
	LOG_TEX<2>();
	++loader->m_tcIndex;
}
// SSSE3 loader for a two-component texture coordinate of type T fetched from
// the texture coordinate array of the current slot through an index of type I
// read from the vertex stream. Moves on to the next slot afterwards.
template <typename I, typename T>
void LOADERDECL TexCoord_ReadIndex2_SSSE3(VertexLoader* loader)
{
	static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");

	auto const index = DataRead<I>();

	// Array base + index * stride, both taken for slot ARRAY_TEXCOORD0 + m_tcIndex.
	const T* const src = (const T*)(cached_arraybases[ARRAY_TEXCOORD0 + loader->m_tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + loader->m_tcIndex]));

	// 64-bit load: picks up both floats of m_tcScale[m_tcIndex] at once.
	const __m128i scaleBits = _mm_loadl_epi64((__m128i*)loader->m_tcScale[loader->m_tcIndex]);
	const __m128 scale = _mm_castsi128_ps(scaleBits);

	Vertex_Read_SSSE3<T, false, false>(src, scale);
	LOG_TEX<2>();
	++loader->m_tcIndex;
}
#endif
static TPipelineFunction tableReadTexCoord[4][8][2] = { static TPipelineFunction tableReadTexCoord[4][8][2] = {
{ {
{nullptr, nullptr,}, {nullptr, nullptr,},
@ -154,32 +126,6 @@ static int tableReadTexCoordVertexSize[4][8][2] = {
}, },
}; };
// Replaces the two-component entries of tableReadTexCoord with their SSSE3
// variants when the build targets SSSE3 and the running CPU reports support
// for it. Otherwise the table is left unchanged.
// Table layout as used here: [source][component format][element count], with
// source 1 = direct, 2 = u8 index, 3 = u16 index; format 0..4 = u8, s8, u16,
// s16, float. Only element-count slot 1 is overridden.
void VertexLoader_TextCoord::Init()
{
#if _M_SSE >= 0x301
	if (!cpu_info.bSSSE3)
		return;

	// Direct data.
	tableReadTexCoord[1][0][1] = TexCoord_ReadDirect2_SSSE3<u8>;
	tableReadTexCoord[1][1][1] = TexCoord_ReadDirect2_SSSE3<s8>;
	tableReadTexCoord[1][2][1] = TexCoord_ReadDirect2_SSSE3<u16>;
	tableReadTexCoord[1][3][1] = TexCoord_ReadDirect2_SSSE3<s16>;
	tableReadTexCoord[1][4][1] = TexCoord_ReadDirect2_SSSE3<float>;

	// Indexed through a u8 index.
	tableReadTexCoord[2][0][1] = TexCoord_ReadIndex2_SSSE3<u8, u8>;
	tableReadTexCoord[2][1][1] = TexCoord_ReadIndex2_SSSE3<u8, s8>;
	tableReadTexCoord[2][2][1] = TexCoord_ReadIndex2_SSSE3<u8, u16>;
	tableReadTexCoord[2][3][1] = TexCoord_ReadIndex2_SSSE3<u8, s16>;
	tableReadTexCoord[2][4][1] = TexCoord_ReadIndex2_SSSE3<u8, float>;

	// Indexed through a u16 index.
	tableReadTexCoord[3][0][1] = TexCoord_ReadIndex2_SSSE3<u16, u8>;
	tableReadTexCoord[3][1][1] = TexCoord_ReadIndex2_SSSE3<u16, s8>;
	tableReadTexCoord[3][2][1] = TexCoord_ReadIndex2_SSSE3<u16, u16>;
	tableReadTexCoord[3][3][1] = TexCoord_ReadIndex2_SSSE3<u16, s16>;
	tableReadTexCoord[3][4][1] = TexCoord_ReadIndex2_SSSE3<u16, float>;
#endif
}
unsigned int VertexLoader_TextCoord::GetSize(u64 _type, unsigned int _format, unsigned int _elements) unsigned int VertexLoader_TextCoord::GetSize(u64 _type, unsigned int _format, unsigned int _elements)
{ {
return tableReadTexCoordVertexSize[_type][_format][_elements]; return tableReadTexCoordVertexSize[_type][_format][_elements];