Vertex Loader: add SSE implementations of more position/texcoord/normal formats
~35-45% faster in NFS:HP2, and possibly in other vertex-bound games.
This commit is contained in:
parent
da962a3d2b
commit
3ddf82a318
|
@ -42,15 +42,15 @@ void SWVertexLoader::SetFormat(u8 attributeIndex, u8 primitiveType)
|
|||
{
|
||||
m_CurrentVat = &g_main_cp_state.vtx_attr[attributeIndex];
|
||||
|
||||
posScale = 1.0f / float(1 << m_CurrentVat->g0.PosFrac);
|
||||
tcScale[0] = 1.0f / float(1 << m_CurrentVat->g0.Tex0Frac);
|
||||
tcScale[1] = 1.0f / float(1 << m_CurrentVat->g1.Tex1Frac);
|
||||
tcScale[2] = 1.0f / float(1 << m_CurrentVat->g1.Tex2Frac);
|
||||
tcScale[3] = 1.0f / float(1 << m_CurrentVat->g1.Tex3Frac);
|
||||
tcScale[4] = 1.0f / float(1 << m_CurrentVat->g2.Tex4Frac);
|
||||
tcScale[5] = 1.0f / float(1 << m_CurrentVat->g2.Tex5Frac);
|
||||
tcScale[6] = 1.0f / float(1 << m_CurrentVat->g2.Tex6Frac);
|
||||
tcScale[7] = 1.0f / float(1 << m_CurrentVat->g2.Tex7Frac);
|
||||
posScale[0] = posScale[1] = posScale[2] = posScale[3] = 1.0f / float(1 << m_CurrentVat->g0.PosFrac);
|
||||
tcScale[0][0] = tcScale[0][1] = 1.0f / float(1 << m_CurrentVat->g0.Tex0Frac);
|
||||
tcScale[1][0] = tcScale[1][1] = 1.0f / float(1 << m_CurrentVat->g1.Tex1Frac);
|
||||
tcScale[2][0] = tcScale[2][1] = 1.0f / float(1 << m_CurrentVat->g1.Tex2Frac);
|
||||
tcScale[3][0] = tcScale[3][1] = 1.0f / float(1 << m_CurrentVat->g1.Tex3Frac);
|
||||
tcScale[4][0] = tcScale[4][1] = 1.0f / float(1 << m_CurrentVat->g2.Tex4Frac);
|
||||
tcScale[5][0] = tcScale[5][1] = 1.0f / float(1 << m_CurrentVat->g2.Tex5Frac);
|
||||
tcScale[6][0] = tcScale[6][1] = 1.0f / float(1 << m_CurrentVat->g2.Tex6Frac);
|
||||
tcScale[7][0] = tcScale[7][1] = 1.0f / float(1 << m_CurrentVat->g2.Tex7Frac);
|
||||
|
||||
//TexMtx
|
||||
const u64 tmDesc[8] = {
|
||||
|
|
|
@ -41,8 +41,9 @@ static int s_texmtxread = 0;
|
|||
int tcIndex;
|
||||
int colIndex;
|
||||
int colElements[2];
|
||||
float posScale;
|
||||
float tcScale[8];
|
||||
// Duplicated (4x and 2x respectively) and used in SSE code in the vertex loader JIT
|
||||
GC_ALIGNED128(float posScale[4]);
|
||||
GC_ALIGNED64(float tcScale[8][2]);
|
||||
|
||||
static const float fractionTable[32] = {
|
||||
1.0f / (1U << 0), 1.0f / (1U << 1), 1.0f / (1U << 2), 1.0f / (1U << 3),
|
||||
|
@ -65,10 +66,8 @@ static void LOADERDECL PosMtx_ReadDirect_UByte()
|
|||
|
||||
static void LOADERDECL PosMtx_Write()
|
||||
{
|
||||
DataWrite<u8>(s_curposmtx);
|
||||
DataWrite<u8>(0);
|
||||
DataWrite<u8>(0);
|
||||
DataWrite<u8>(0);
|
||||
// u8, 0, 0, 0
|
||||
DataWrite<u32>(s_curposmtx);
|
||||
}
|
||||
|
||||
static void LOADERDECL TexMtx_ReadDirect_UByte()
|
||||
|
@ -451,10 +450,10 @@ void VertexLoader::SetupRunVertices(const VAT& vat, int primitive, int const cou
|
|||
m_VtxAttr.texCoord[6].Frac = vat.g2.Tex6Frac;
|
||||
m_VtxAttr.texCoord[7].Frac = vat.g2.Tex7Frac;
|
||||
|
||||
posScale = fractionTable[m_VtxAttr.PosFrac];
|
||||
posScale[0] = posScale[1] = posScale[2] = posScale[3] = fractionTable[m_VtxAttr.PosFrac];
|
||||
if (m_native_components & VB_HAS_UVALL)
|
||||
for (int i = 0; i < 8; i++)
|
||||
tcScale[i] = fractionTable[m_VtxAttr.texCoord[i].Frac];
|
||||
tcScale[i][0] = tcScale[i][1] = fractionTable[m_VtxAttr.texCoord[i].Frac];
|
||||
for (int i = 0; i < 2; i++)
|
||||
colElements[i] = m_VtxAttr.color[i].Elements;
|
||||
|
||||
|
|
|
@ -19,6 +19,13 @@
|
|||
#include "VideoCommon/DataReader.h"
|
||||
#include "VideoCommon/NativeVertexFormat.h"
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
#include <smmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef _M_X86
|
||||
#define USE_VERTEX_LOADER_JIT
|
||||
#endif
|
||||
|
@ -27,8 +34,8 @@
|
|||
extern int tcIndex;
|
||||
extern int colIndex;
|
||||
extern int colElements[2];
|
||||
extern float posScale;
|
||||
extern float tcScale[8];
|
||||
GC_ALIGNED128(extern float posScale[4]);
|
||||
GC_ALIGNED64(extern float tcScale[8][2]);
|
||||
|
||||
class VertexLoaderUID
|
||||
{
|
||||
|
@ -155,3 +162,61 @@ private:
|
|||
void WriteSetVariable(int bits, void *address, Gen::OpArg dest);
|
||||
#endif
|
||||
};
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
// Byte-shuffle control masks for _mm_shuffle_epi8, used by Vertex_Read_SSSE3.
// Each byte of a mask selects which source byte lands in that output byte; a
// selector of 0xFF (high bit set) makes the shuffle write a zero byte instead.
// _mm_set_epi32 lists its arguments from the highest 32-bit lane to the lowest,
// so the rightmost argument describes output lane 0.
// Naming: "_3" masks fill three 32-bit lanes, "_2" masks fill two; every
// remaining lane is all 0xFF and therefore comes out as zero.
//
// kMaskSwap32_*: reverse the four bytes of each 32-bit source element.
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
|
||||
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
|
||||
// kMaskSwap16to32l_*: byte-swap each 16-bit source element into the LOW half
// of its own 32-bit lane; the upper half is zeroed (selected for unsigned T).
static const __m128i kMaskSwap16to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFF0405L, 0xFFFF0203L, 0xFFFF0001L);
|
||||
static const __m128i kMaskSwap16to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFF0203L, 0xFFFF0001L);
|
||||
// kMaskSwap16to32h_*: same byte swap, but into the HIGH half of the lane so a
// later arithmetic right shift can sign-extend (selected for signed T).
static const __m128i kMaskSwap16to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x0405FFFFL, 0x0203FFFFL, 0x0001FFFFL);
|
||||
static const __m128i kMaskSwap16to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x0203FFFFL, 0x0001FFFFL);
|
||||
// kMask8to32l_*: place each source byte in the LOWEST byte of its own lane.
static const __m128i kMask8to32l_3 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFF02L, 0xFFFFFF01L, 0xFFFFFF00L);
|
||||
static const __m128i kMask8to32l_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFF01L, 0xFFFFFF00L);
|
||||
// kMask8to32h_*: place each source byte in the HIGHEST byte of its own lane.
static const __m128i kMask8to32h_3 = _mm_set_epi32(0xFFFFFFFFL, 0x02FFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);
|
||||
static const __m128i kMask8to32h_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x01FFFFFFL, 0x00FFFFFFL);
|
||||
|
||||
// Reads two or three elements of type T from pData, rearranges their bytes with
// one of the shuffle masks declared alongside this function, and writes the
// result as 32-bit values at VertexManager::s_pCurBufferPointer, which is then
// advanced.
//
// Template parameters:
//   T        - source element type; sizeof(T) == 4 takes the "float" path.
//   threeIn  - true: three source elements are consumed; false: two.
//   threeOut - true: three 32-bit values are emitted; false: two.
// Parameters:
//   pData - first source element. The load is rounded up to 4, 8 or 16 bytes,
//           so it can read bytes beyond the 2 or 3 elements actually used.
//   scale - per-lane multiplier applied on the integer path only.
//
// This function does not advance the input stream; callers do that themselves.
template <typename T, bool threeIn, bool threeOut>
|
||||
__forceinline void Vertex_Read_SSSE3(const T* pData, __m128 scale)
|
||||
{
|
||||
__m128i coords, mask;
|
||||
|
||||
// Pick the narrowest of the 4/8/16-byte loads that covers all input elements.
int loadBytes = sizeof(T) * (2 + threeIn);
|
||||
if (loadBytes > 8)
|
||||
coords = _mm_loadu_si128((__m128i*)pData);
|
||||
|
||||
else if (loadBytes > 4)
|
||||
coords = _mm_loadl_epi64((__m128i*)pData);
|
||||
|
||||
else
|
||||
coords = _mm_cvtsi32_si128(*(u32*)pData);
|
||||
|
||||
|
||||
// Float case (no scaling)
// Only the bytes of each 32-bit element are reversed; `scale` is ignored here.
|
||||
if (sizeof(T) == 4)
|
||||
{
|
||||
coords = _mm_shuffle_epi8(coords, threeIn ? kMaskSwap32_3 : kMaskSwap32_2);
|
||||
if (threeOut)
|
||||
// Full 16-byte store: one lane more than the 12 bytes the pointer advances by.
_mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, coords);
|
||||
else
|
||||
_mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, coords);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Byte swap, unpack, and move to high bytes for sign extend.
// Unsigned types use the "l" masks (value lands in the low bytes of the lane);
// signed types use the "h" masks (value lands in the high bytes).
|
||||
if (std::is_unsigned<T>::value)
|
||||
mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32l_3 : kMaskSwap16to32l_2) : (threeIn ? kMask8to32l_3 : kMask8to32l_2);
|
||||
else
|
||||
mask = sizeof(T) == 2 ? (threeIn ? kMaskSwap16to32h_3 : kMaskSwap16to32h_2) : (threeIn ? kMask8to32h_3 : kMask8to32h_2);
|
||||
coords = _mm_shuffle_epi8(coords, mask);
|
||||
|
||||
// Sign extend
// Arithmetic shift brings the value down from the high bytes, replicating its sign bit.
|
||||
if (std::is_signed<T>::value)
|
||||
coords = _mm_srai_epi32(coords, 32 - sizeof(T) * 8);
|
||||
|
||||
// Convert the 32-bit integers to float and apply the per-lane scale.
__m128 out = _mm_mul_ps(_mm_cvtepi32_ps(coords), scale);
|
||||
if (threeOut)
|
||||
// Full 16-byte store: one lane more than the 12 bytes the pointer advances by.
_mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, out);
|
||||
else
|
||||
_mm_storel_pi((__m64*)VertexManager::s_pCurBufferPointer, out);
|
||||
}
|
||||
|
||||
// Advance the output cursor by the number of floats logically emitted (2 or 3).
VertexManager::s_pCurBufferPointer += sizeof(float) * (2 + threeOut);
|
||||
}
|
||||
#endif
|
|
@ -3,7 +3,7 @@
|
|||
// Refer to the license.txt file included.
|
||||
|
||||
#include <cmath>
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
|
||||
#include "Common/CommonTypes.h"
|
||||
#include "Common/CPUDetect.h"
|
||||
|
@ -13,13 +13,6 @@
|
|||
#include "VideoCommon/VertexManagerBase.h"
|
||||
#include "VideoCommon/VideoCommon.h"
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
#include <smmintrin.h>
|
||||
#include <emmintrin.h>
|
||||
#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
// warning: mapping buffer should be disabled to use this
|
||||
#define LOG_NORM() // PRIM_LOG("norm: %f %f %f, ", ((float*)VertexManager::s_pCurBufferPointer)[-3], ((float*)VertexManager::s_pCurBufferPointer)[-2], ((float*)VertexManager::s_pCurBufferPointer)[-1]);
|
||||
|
||||
|
@ -37,7 +30,7 @@ __forceinline float FracAdjust(T val)
|
|||
//auto const U16FRAC = 1.f / (1u << 15);
|
||||
|
||||
// TODO: is this right?
|
||||
return val / float(1u << (sizeof(T) * 8 - std::numeric_limits<T>::is_signed - 1));
|
||||
return val / float(1u << (sizeof(T) * 8 - std::is_signed<T>::value - 1));
|
||||
}
|
||||
|
||||
template <>
|
||||
|
@ -76,7 +69,7 @@ struct Normal_Direct
|
|||
template <typename I, typename T, int N, int Offset>
|
||||
__forceinline void Normal_Index_Offset()
|
||||
{
|
||||
static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");
|
||||
static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");
|
||||
|
||||
auto const index = DataRead<I>();
|
||||
auto const data = reinterpret_cast<const T*>(cached_arraybases[ARRAY_NORMAL]
|
||||
|
@ -108,6 +101,63 @@ struct Normal_Index_Indices3
|
|||
static const int size = sizeof(I) * 3;
|
||||
};
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
// SSSE3 loader for normals stored inline in the input stream.
// function() converts N consecutive three-component vectors of type T, starting
// at DataGetPosition(), through Vertex_Read_SSSE3 and then skips the consumed
// bytes. `size` is the number of input bytes consumed: sizeof(T) * N * 3.
template <typename T, int N>
|
||||
struct Normal_Direct_SSSE3
|
||||
{
|
||||
static void LOADERDECL function()
|
||||
{
|
||||
const T* pData = reinterpret_cast<const T*>(DataGetPosition());
|
||||
// Scale is 1 / 2^(bit width of T - 1 - (1 if T is signed)), e.g. 1/128 for u8, 1/64 for s8.
const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed<T>::value - 1));
|
||||
// The same scale is broadcast to all four lanes.
const __m128 scale = _mm_set_ps(frac, frac, frac, frac);
|
||||
// Each iteration converts one three-component vector and steps three elements forward.
for (int i = 0; i < N; i++, pData += 3)
|
||||
Vertex_Read_SSSE3<T, true, true>(pData, scale);
|
||||
// The reads above went through a raw pointer, so advance the stream explicitly.
DataSkip<N * 3 * sizeof(T)>();
|
||||
}
|
||||
|
||||
static const int size = sizeof(T) * N * 3;
|
||||
};
|
||||
|
||||
// SSSE3 helper for indexed normals.
// Reads one index of type I from the input stream, locates the matching entry
// in the ARRAY_NORMAL array (base + index * stride), skips `Offset`
// three-component vectors of type T within that entry, and converts the next N
// vectors through Vertex_Read_SSSE3.
template <typename I, typename T, int N, int Offset>
|
||||
__forceinline void Normal_Index_Offset_SSSE3()
|
||||
{
|
||||
static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");
|
||||
|
||||
auto const index = DataRead<I>();
|
||||
const T* pData = (const T*)(cached_arraybases[ARRAY_NORMAL]
|
||||
+ (index * g_main_cp_state.array_strides[ARRAY_NORMAL]) + sizeof(T) * 3 * Offset);
|
||||
// Scale is 1 / 2^(bit width of T - 1 - (1 if T is signed)), e.g. 1/128 for u8, 1/64 for s8.
const float frac = 1. / float(1u << (sizeof(T) * 8 - std::is_signed<T>::value - 1));
|
||||
// The same scale is broadcast to all four lanes.
const __m128 scale = _mm_set_ps(frac, frac, frac, frac);
|
||||
// Each iteration converts one three-component vector and steps three elements forward.
for (int i = 0; i < N; i++, pData += 3)
|
||||
Vertex_Read_SSSE3<T, true, true>(pData, scale);
|
||||
}
|
||||
|
||||
// SSSE3 loader for normals addressed by a single index of type I.
// function() reads that one index and converts N three-component vectors of
// type T from the indexed array entry (offset 0). `size` is the number of
// input-stream bytes consumed: sizeof(I).
template <typename I, typename T, int N>
|
||||
struct Normal_Index_SSSE3
|
||||
{
|
||||
static void LOADERDECL function()
|
||||
{
|
||||
Normal_Index_Offset_SSSE3<I, T, N, 0>();
|
||||
}
|
||||
|
||||
static const int size = sizeof(I);
|
||||
};
|
||||
|
||||
// SSSE3 loader for normals addressed by three separate indices of type I.
// function() makes three helper calls; each call reads its own index from the
// input stream and converts one three-component vector of type T, taken at
// vector offset 0, 1 and 2 within the indexed entry respectively. `size` is
// the number of input-stream bytes consumed: sizeof(I) * 3.
template <typename I, typename T>
|
||||
struct Normal_Index_Indices3_SSSE3
|
||||
{
|
||||
static void LOADERDECL function()
|
||||
{
|
||||
Normal_Index_Offset_SSSE3<I, T, 1, 0>();
|
||||
Normal_Index_Offset_SSSE3<I, T, 1, 1>();
|
||||
Normal_Index_Offset_SSSE3<I, T, 1, 2>();
|
||||
}
|
||||
|
||||
static const int size = sizeof(I) * 3;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void VertexLoader_Normal::Init()
|
||||
|
@ -180,6 +230,77 @@ void VertexLoader_Normal::Init()
|
|||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3<u16, u16>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3<u16, s16>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3<u16, float>();
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
if (cpu_info.bSSSE3)
|
||||
{
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Direct_SSSE3<u8, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Direct_SSSE3<s8, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Direct_SSSE3<u16, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Direct_SSSE3<s16, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Direct_SSSE3<float, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Direct_SSSE3<u8, 3>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Direct_SSSE3<s8, 3>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Direct_SSSE3<u16, 3>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Direct_SSSE3<s16, 3>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Direct_SSSE3<float, 3>();
|
||||
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Direct_SSSE3<u8, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Direct_SSSE3<s8, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Direct_SSSE3<u16, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Direct_SSSE3<s16, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Direct_SSSE3<float, 1>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Direct_SSSE3<u8, 3>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Direct_SSSE3<s8, 3>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Direct_SSSE3<u16, 3>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Direct_SSSE3<s16, 3>();
|
||||
m_Table[NRM_DIRECT][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Direct_SSSE3<float, 3>();
|
||||
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3<u8, u8, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3<u8, s8, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3<u8, u16, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3<u8, s16, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3<u8, float, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_SSSE3<u8, u8, 3>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Index_SSSE3<u8, s8, 3>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Index_SSSE3<u8, u16, 3>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Index_SSSE3<u8, s16, 3>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_SSSE3<u8, float, 3>();
|
||||
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3<u8, u8, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3<u8, s8, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3<u8, u16, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3<u8, s16, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3<u8, float, 1>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_Indices3_SSSE3<u8, u8>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Index_Indices3_SSSE3<u8, s8>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3_SSSE3<u8, u16>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3_SSSE3<u8, s16>();
|
||||
m_Table[NRM_INDEX8][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3_SSSE3<u8, float>();
|
||||
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3<u16, u8, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3<u16, s8, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3<u16, u16, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3<u16, s16, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3<u16, float, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_SSSE3<u16, u8, 3>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_BYTE] = Normal_Index_SSSE3<u16, s8, 3>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_USHORT] = Normal_Index_SSSE3<u16, u16, 3>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_SHORT] = Normal_Index_SSSE3<u16, s16, 3>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES1][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_SSSE3<u16, float, 3>();
|
||||
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_UBYTE] = Normal_Index_SSSE3<u16, u8, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_BYTE] = Normal_Index_SSSE3<u16, s8, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_USHORT] = Normal_Index_SSSE3<u16, u16, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_SHORT] = Normal_Index_SSSE3<u16, s16, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT][FORMAT_FLOAT] = Normal_Index_SSSE3<u16, float, 1>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_UBYTE] = Normal_Index_Indices3_SSSE3<u16, u8>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_BYTE] = Normal_Index_Indices3_SSSE3<u16, s8>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_USHORT] = Normal_Index_Indices3_SSSE3<u16, u16>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_SHORT] = Normal_Index_Indices3_SSSE3<u16, s16>();
|
||||
m_Table[NRM_INDEX16][NRM_INDICES3][NRM_NBT3][FORMAT_FLOAT] = Normal_Index_Indices3_SSSE3<u16, float>();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
unsigned int VertexLoader_Normal::GetSize(u64 _type,
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
// Licensed under GPLv2
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include <limits>
|
||||
#include <type_traits>
|
||||
|
||||
#include "Common/CommonTypes.h"
|
||||
#include "Common/CPUDetect.h"
|
||||
|
@ -74,7 +74,7 @@ template <typename T, int N>
|
|||
void LOADERDECL Pos_ReadDirect()
|
||||
{
|
||||
static_assert(N <= 3, "N > 3 is not sane!");
|
||||
auto const scale = posScale;
|
||||
auto const scale = posScale[0];
|
||||
DataWriter dst;
|
||||
DataReader src;
|
||||
|
||||
|
@ -87,12 +87,12 @@ void LOADERDECL Pos_ReadDirect()
|
|||
template <typename I, typename T, int N>
|
||||
void LOADERDECL Pos_ReadIndex()
|
||||
{
|
||||
static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");
|
||||
static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");
|
||||
static_assert(N <= 3, "N > 3 is not sane!");
|
||||
|
||||
auto const index = DataRead<I>();
|
||||
auto const data = reinterpret_cast<const T*>(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION]));
|
||||
auto const scale = posScale;
|
||||
auto const scale = posScale[0];
|
||||
DataWriter dst;
|
||||
|
||||
for (int i = 0; i < 3; ++i)
|
||||
|
@ -102,18 +102,22 @@ void LOADERDECL Pos_ReadIndex()
|
|||
}
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap32_3 = _mm_set_epi32(0xFFFFFFFFL, 0x08090A0BL, 0x04050607L, 0x00010203L);
|
||||
static const __m128i kMaskSwap32_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
|
||||
|
||||
template <typename I, bool three>
|
||||
void LOADERDECL Pos_ReadIndex_Float_SSSE3()
|
||||
// SSSE3 loader for positions stored inline in the input stream.
// Converts two (three == false) or three (three == true) elements of type T at
// DataGetPosition() through Vertex_Read_SSSE3, always requesting three output
// values, then skips the consumed input bytes.
template <typename T, bool three>
|
||||
void LOADERDECL Pos_ReadDirect_SSSE3()
|
||||
{
|
||||
const T* pData = reinterpret_cast<const T*>(DataGetPosition());
|
||||
// posScale is reinterpreted as one 4-float vector; this dereference
// presumably relies on the array being 16-byte aligned - TODO confirm at its definition.
Vertex_Read_SSSE3<T, three, true>(pData, *(__m128*)posScale);
|
||||
// The read above went through a raw pointer, so advance the stream explicitly.
DataSkip<(2 + three) * sizeof(T)>();
|
||||
LOG_VTX();
|
||||
}
|
||||
|
||||
template <typename I, typename T, bool three>
|
||||
void LOADERDECL Pos_ReadIndex_SSSE3()
|
||||
{
|
||||
static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");
|
||||
auto const index = DataRead<I>();
|
||||
const u32* pData = (const u32 *)(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION]));
|
||||
GC_ALIGNED128(const __m128i a = _mm_loadu_si128((__m128i*)pData));
|
||||
GC_ALIGNED128(__m128i b = _mm_shuffle_epi8(a, three ? kMaskSwap32_3 : kMaskSwap32_2));
|
||||
_mm_storeu_si128((__m128i*)VertexManager::s_pCurBufferPointer, b);
|
||||
VertexManager::s_pCurBufferPointer += sizeof(float) * 3;
|
||||
const T* pData = (const T*)(cached_arraybases[ARRAY_POSITION] + (index * g_main_cp_state.array_strides[ARRAY_POSITION]));
|
||||
Vertex_Read_SSSE3<T, three, true>(pData, *(__m128*)posScale);
|
||||
LOG_VTX();
|
||||
}
|
||||
#endif
|
||||
|
@ -169,15 +173,39 @@ void VertexLoader_Position::Init()
|
|||
{
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
if (cpu_info.bSSSE3)
|
||||
{
|
||||
tableReadPosition[2][4][0] = Pos_ReadIndex_Float_SSSE3<u8, false>;
|
||||
tableReadPosition[2][4][1] = Pos_ReadIndex_Float_SSSE3<u8, true>;
|
||||
tableReadPosition[3][4][0] = Pos_ReadIndex_Float_SSSE3<u16, false>;
|
||||
tableReadPosition[3][4][1] = Pos_ReadIndex_Float_SSSE3<u16, true>;
|
||||
tableReadPosition[1][0][0] = Pos_ReadDirect_SSSE3<u8, false>;
|
||||
tableReadPosition[1][0][1] = Pos_ReadDirect_SSSE3<u8, true>;
|
||||
tableReadPosition[1][1][0] = Pos_ReadDirect_SSSE3<s8, false>;
|
||||
tableReadPosition[1][1][1] = Pos_ReadDirect_SSSE3<s8, true>;
|
||||
tableReadPosition[1][2][0] = Pos_ReadDirect_SSSE3<u16, false>;
|
||||
tableReadPosition[1][2][1] = Pos_ReadDirect_SSSE3<u16, true>;
|
||||
tableReadPosition[1][3][0] = Pos_ReadDirect_SSSE3<s16, false>;
|
||||
tableReadPosition[1][3][1] = Pos_ReadDirect_SSSE3<s16, true>;
|
||||
tableReadPosition[1][4][0] = Pos_ReadDirect_SSSE3<float, false>;
|
||||
tableReadPosition[1][4][1] = Pos_ReadDirect_SSSE3<float, true>;
|
||||
tableReadPosition[2][0][0] = Pos_ReadIndex_SSSE3<u8, u8, false>;
|
||||
tableReadPosition[2][0][1] = Pos_ReadIndex_SSSE3<u8, u8, true>;
|
||||
tableReadPosition[3][0][0] = Pos_ReadIndex_SSSE3<u16, u8, false>;
|
||||
tableReadPosition[3][0][1] = Pos_ReadIndex_SSSE3<u16, u8, true>;
|
||||
tableReadPosition[2][1][0] = Pos_ReadIndex_SSSE3<u8, s8, false>;
|
||||
tableReadPosition[2][1][1] = Pos_ReadIndex_SSSE3<u8, s8, true>;
|
||||
tableReadPosition[3][1][0] = Pos_ReadIndex_SSSE3<u16, s8, false>;
|
||||
tableReadPosition[3][1][1] = Pos_ReadIndex_SSSE3<u16, s8, true>;
|
||||
tableReadPosition[2][2][0] = Pos_ReadIndex_SSSE3<u8, u16, false>;
|
||||
tableReadPosition[2][2][1] = Pos_ReadIndex_SSSE3<u8, u16, true>;
|
||||
tableReadPosition[3][2][0] = Pos_ReadIndex_SSSE3<u16, u16, false>;
|
||||
tableReadPosition[3][2][1] = Pos_ReadIndex_SSSE3<u16, u16, true>;
|
||||
tableReadPosition[2][3][0] = Pos_ReadIndex_SSSE3<u8, s16, false>;
|
||||
tableReadPosition[2][3][1] = Pos_ReadIndex_SSSE3<u8, s16, true>;
|
||||
tableReadPosition[3][3][0] = Pos_ReadIndex_SSSE3<u16, s16, false>;
|
||||
tableReadPosition[3][3][1] = Pos_ReadIndex_SSSE3<u16, s16, true>;
|
||||
tableReadPosition[2][4][0] = Pos_ReadIndex_SSSE3<u8, float, false>;
|
||||
tableReadPosition[2][4][1] = Pos_ReadIndex_SSSE3<u8, float, true>;
|
||||
tableReadPosition[3][4][0] = Pos_ReadIndex_SSSE3<u16, float, false>;
|
||||
tableReadPosition[3][4][1] = Pos_ReadIndex_SSSE3<u16, float, true>;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
@ -2,6 +2,8 @@
|
|||
// Licensed under GPLv2
|
||||
// Refer to the license.txt file included.
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
#include "Common/CommonTypes.h"
|
||||
#include "Common/CPUDetect.h"
|
||||
|
||||
|
@ -10,13 +12,6 @@
|
|||
#include "VideoCommon/VertexManagerBase.h"
|
||||
#include "VideoCommon/VideoCommon.h"
|
||||
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
#include <smmintrin.h>
|
||||
#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
template <int N>
|
||||
void LOG_TEX();
|
||||
|
||||
|
@ -54,7 +49,7 @@ float TCScale(float val, float scale)
|
|||
template <typename T, int N>
|
||||
void LOADERDECL TexCoord_ReadDirect()
|
||||
{
|
||||
auto const scale = tcScale[tcIndex];
|
||||
auto const scale = tcScale[tcIndex][0];
|
||||
DataWriter dst;
|
||||
DataReader src;
|
||||
|
||||
|
@ -69,12 +64,12 @@ void LOADERDECL TexCoord_ReadDirect()
|
|||
template <typename I, typename T, int N>
|
||||
void LOADERDECL TexCoord_ReadIndex()
|
||||
{
|
||||
static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");
|
||||
static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");
|
||||
|
||||
auto const index = DataRead<I>();
|
||||
auto const data = reinterpret_cast<const T*>(cached_arraybases[ARRAY_TEXCOORD0 + tcIndex]
|
||||
+ (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex]));
|
||||
auto const scale = tcScale[tcIndex];
|
||||
auto const scale = tcScale[tcIndex][0];
|
||||
DataWriter dst;
|
||||
|
||||
for (int i = 0; i != N; ++i)
|
||||
|
@ -84,44 +79,27 @@ void LOADERDECL TexCoord_ReadIndex()
|
|||
++tcIndex;
|
||||
}
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
static const __m128i kMaskSwap16_2 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0x02030001L);
|
||||
|
||||
template <typename I>
|
||||
void LOADERDECL TexCoord_ReadIndex_Short2_SSE4()
|
||||
#if _M_SSE >= 0x301
|
||||
template <typename T>
|
||||
void LOADERDECL TexCoord_ReadDirect2_SSSE3()
|
||||
{
|
||||
static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");
|
||||
|
||||
// Heavy in ZWW
|
||||
auto const index = DataRead<I>();
|
||||
const s32 *pData = (const s32*)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex]));
|
||||
const __m128i a = _mm_cvtsi32_si128(*pData);
|
||||
const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16_2);
|
||||
const __m128i c = _mm_cvtepi16_epi32(b);
|
||||
const __m128 d = _mm_cvtepi32_ps(c);
|
||||
const __m128 e = _mm_load1_ps(&tcScale[tcIndex]);
|
||||
const __m128 f = _mm_mul_ps(d, e);
|
||||
_mm_storeu_ps((float*)VertexManager::s_pCurBufferPointer, f);
|
||||
VertexManager::s_pCurBufferPointer += sizeof(float) * 2;
|
||||
const T* pData = reinterpret_cast<const T*>(DataGetPosition());
|
||||
__m128 scale = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)tcScale[tcIndex]));
|
||||
Vertex_Read_SSSE3<T, false, false>(pData, scale);
|
||||
DataSkip<2 * sizeof(T)>();
|
||||
LOG_TEX<2>();
|
||||
tcIndex++;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
static const __m128i kMaskSwap32 = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0x04050607L, 0x00010203L);
|
||||
|
||||
template <typename I>
|
||||
void LOADERDECL TexCoord_ReadIndex_Float2_SSSE3()
|
||||
template <typename I, typename T>
|
||||
void LOADERDECL TexCoord_ReadIndex2_SSSE3()
|
||||
{
|
||||
static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");
|
||||
static_assert(std::is_unsigned<I>::value, "Only unsigned I is sane!");
|
||||
|
||||
auto const index = DataRead<I>();
|
||||
const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0+tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0+tcIndex]));
|
||||
GC_ALIGNED128(const __m128i a = _mm_loadl_epi64((__m128i*)pData));
|
||||
GC_ALIGNED128(const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32));
|
||||
_mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b);
|
||||
VertexManager::s_pCurBufferPointer += sizeof(float) * 2;
|
||||
const T* pData = (const T*)(cached_arraybases[ARRAY_TEXCOORD0 + tcIndex] + (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex]));
|
||||
__m128 scale = _mm_castsi128_ps(_mm_loadl_epi64((__m128i*)tcScale[tcIndex]));
|
||||
Vertex_Read_SSSE3<T, false, false>(pData, scale);
|
||||
LOG_TEX<2>();
|
||||
tcIndex++;
|
||||
}
|
||||
|
@ -177,23 +155,24 @@ void VertexLoader_TextCoord::Init()
|
|||
{
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
if (cpu_info.bSSSE3)
|
||||
{
|
||||
tableReadTexCoord[2][4][1] = TexCoord_ReadIndex_Float2_SSSE3<u8>;
|
||||
tableReadTexCoord[3][4][1] = TexCoord_ReadIndex_Float2_SSSE3<u16>;
|
||||
tableReadTexCoord[1][0][1] = TexCoord_ReadDirect2_SSSE3<u8>;
|
||||
tableReadTexCoord[1][1][1] = TexCoord_ReadDirect2_SSSE3<s8>;
|
||||
tableReadTexCoord[1][2][1] = TexCoord_ReadDirect2_SSSE3<u16>;
|
||||
tableReadTexCoord[1][3][1] = TexCoord_ReadDirect2_SSSE3<s16>;
|
||||
tableReadTexCoord[1][4][1] = TexCoord_ReadDirect2_SSSE3<float>;
|
||||
tableReadTexCoord[2][0][1] = TexCoord_ReadIndex2_SSSE3<u8, u8>;
|
||||
tableReadTexCoord[3][0][1] = TexCoord_ReadIndex2_SSSE3<u16, u8>;
|
||||
tableReadTexCoord[2][1][1] = TexCoord_ReadIndex2_SSSE3<u8, s8>;
|
||||
tableReadTexCoord[3][1][1] = TexCoord_ReadIndex2_SSSE3<u16, s8>;
|
||||
tableReadTexCoord[2][2][1] = TexCoord_ReadIndex2_SSSE3<u8, u16>;
|
||||
tableReadTexCoord[3][2][1] = TexCoord_ReadIndex2_SSSE3<u16, u16>;
|
||||
tableReadTexCoord[2][3][1] = TexCoord_ReadIndex2_SSSE3<u8, s16>;
|
||||
tableReadTexCoord[3][3][1] = TexCoord_ReadIndex2_SSSE3<u16, s16>;
|
||||
tableReadTexCoord[2][4][1] = TexCoord_ReadIndex2_SSSE3<u8, float>;
|
||||
tableReadTexCoord[3][4][1] = TexCoord_ReadIndex2_SSSE3<u16, float>;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if _M_SSE >= 0x401
|
||||
|
||||
if (cpu_info.bSSE4_1)
|
||||
{
|
||||
tableReadTexCoord[2][3][1] = TexCoord_ReadIndex_Short2_SSE4<u8>;
|
||||
tableReadTexCoord[3][3][1] = TexCoord_ReadIndex_Short2_SSE4<u16>;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
}
|
||||
|
|
|
@ -53,7 +53,8 @@ u32 VertexManager::GetRemainingSize()
|
|||
|
||||
void VertexManager::PrepareForAdditionalData(int primitive, u32 count, u32 stride)
|
||||
{
|
||||
u32 const needed_vertex_bytes = count * stride;
|
||||
// The SSE vertex loader can write up to 4 bytes past the end
|
||||
u32 const needed_vertex_bytes = count * stride + 4;
|
||||
|
||||
// We can't merge different kinds of primitives, so we have to flush here
|
||||
if (current_primitive_type != primitive_from_gx[primitive])
|
||||
|
|
Loading…
Reference in New Issue