Use SSSE3 shuffle for DataReader's DataReadU32xN in VideoCommon. The function reads up to 16 u32s at a time (512 bits) and then converts their endianness.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6802 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
f49efa8868
commit
f9e4e73e42
|
@ -305,7 +305,6 @@ u8 AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
|
|||
u32 Cmd2 = DataReadU32();
|
||||
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
|
||||
u32 xf_address = Cmd2 & 0xFFFF;
|
||||
// TODO - speed this up. pshufb?
|
||||
u32 data_buffer[16];
|
||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||
LoadXFReg(transfer_size, xf_address, data_buffer);
|
||||
|
@ -453,7 +452,6 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
|
|||
u32 Cmd2 = DataReadU32();
|
||||
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
|
||||
u32 xf_address = Cmd2 & 0xFFFF;
|
||||
// TODO - speed this up. pshufb?
|
||||
ReferencedDataRegion* NewRegion = new ReferencedDataRegion;
|
||||
NewRegion->MustClean = true;
|
||||
NewRegion->size = transfer_size * 4;
|
||||
|
|
|
@ -15,11 +15,17 @@
|
|||
// Official SVN repository and contact information can be found at
|
||||
// http://code.google.com/p/dolphin-emu/
|
||||
|
||||
|
||||
|
||||
#ifndef _DATAREADER_H
|
||||
#define _DATAREADER_H
|
||||
|
||||
extern u8* g_pVideoData;
|
||||
|
||||
#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
__forceinline void DataSkip(u32 skip)
|
||||
{
|
||||
g_pVideoData += skip;
|
||||
|
@ -64,6 +70,49 @@ __forceinline u32 DataReadU32()
|
|||
return tmp;
|
||||
}
|
||||
|
||||
// Pointer to a reader that fills `buf` with a fixed number of u32 values.
typedef void (*DataReadU32xNfunc)(u32 *buf);
|
||||
// Table of 16 readers; callers in this change index it with
// (transfer_size - 1), where transfer_size is in the range 1..16.
extern DataReadU32xNfunc DataReadU32xFuncs[16];
|
||||
|
||||
#if _M_SSE >= 0x301
|
||||
|
||||
// Byte-shuffle control masks for _mm_shuffle_epi8, written lowest byte first:
// destination byte i is taken from source byte mask[i].
// maskK reverses the byte order of the first K 32-bit words of a 16-byte
// vector and passes the remaining bytes through unchanged.
const __m128i mask1 = _mm_setr_epi8(3, 2, 1, 0,  4, 5, 6, 7,  8,  9, 10, 11,  12, 13, 14, 15);
const __m128i mask2 = _mm_setr_epi8(3, 2, 1, 0,  7, 6, 5, 4,  8,  9, 10, 11,  12, 13, 14, 15);
const __m128i mask3 = _mm_setr_epi8(3, 2, 1, 0,  7, 6, 5, 4,  11, 10, 9, 8,   12, 13, 14, 15);
const __m128i mask4 = _mm_setr_epi8(3, 2, 1, 0,  7, 6, 5, 4,  11, 10, 9, 8,   15, 14, 13, 12);
|
||||
|
||||
template<unsigned int N>
|
||||
void DataReadU32xN_SSSE3(u32 *bufx16)
|
||||
{
|
||||
__m128i* store = (__m128i *)bufx16;
|
||||
__m128i* load = (__m128i *)g_pVideoData;
|
||||
switch(N)
|
||||
{
|
||||
case 13: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 9: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 5: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 1: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask1));
|
||||
break;
|
||||
case 14: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 10: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 6: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 2: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask2));
|
||||
break;
|
||||
case 15: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 11: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 7: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 3: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask3));
|
||||
break;
|
||||
case 16: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 12: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 8: _mm_storeu_si128(store++, _mm_shuffle_epi8(_mm_loadu_si128(load++), mask4));
|
||||
case 4: _mm_storeu_si128(store, _mm_shuffle_epi8(_mm_loadu_si128(load), mask4));
|
||||
break;
|
||||
}
|
||||
g_pVideoData += (sizeof(u32) * N);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template<unsigned int N>
|
||||
void DataReadU32xN(u32 *bufx16)
|
||||
{
|
||||
|
@ -87,9 +136,6 @@ void DataReadU32xN(u32 *bufx16)
|
|||
g_pVideoData += (sizeof(u32) * N);
|
||||
}
|
||||
|
||||
typedef void (*DataReadU32xNfunc)(u32 *buf);
|
||||
extern DataReadU32xNfunc DataReadU32xFuncs[16];
|
||||
|
||||
__forceinline u32 DataReadU32Unswapped()
|
||||
{
|
||||
u32 tmp = *(u32*)g_pVideoData;
|
||||
|
|
|
@ -30,6 +30,7 @@
|
|||
#include "Profiler.h"
|
||||
#include "OpcodeDecoding.h"
|
||||
#include "CommandProcessor.h"
|
||||
#include "CPUDetect.h"
|
||||
|
||||
#include "VertexLoaderManager.h"
|
||||
|
||||
|
@ -47,6 +48,27 @@
|
|||
#include "VideoConfig.h"
|
||||
|
||||
u8* g_pVideoData = 0;
|
||||
#if _M_SSE >= 0x301
|
||||
// SSSE3 readers for every supported transfer size.
// Entry [n - 1] is the instantiation that reads n u32 values (n = 1..16).
DataReadU32xNfunc DataReadU32xFuncs_SSSE3[16] = {
	DataReadU32xN_SSSE3<1>,  DataReadU32xN_SSSE3<2>,  DataReadU32xN_SSSE3<3>,  DataReadU32xN_SSSE3<4>,
	DataReadU32xN_SSSE3<5>,  DataReadU32xN_SSSE3<6>,  DataReadU32xN_SSSE3<7>,  DataReadU32xN_SSSE3<8>,
	DataReadU32xN_SSSE3<9>,  DataReadU32xN_SSSE3<10>, DataReadU32xN_SSSE3<11>, DataReadU32xN_SSSE3<12>,
	DataReadU32xN_SSSE3<13>, DataReadU32xN_SSSE3<14>, DataReadU32xN_SSSE3<15>, DataReadU32xN_SSSE3<16>
};
|
||||
#endif
|
||||
|
||||
DataReadU32xNfunc DataReadU32xFuncs[16] = {
|
||||
DataReadU32xN<1>,
|
||||
DataReadU32xN<2>,
|
||||
|
@ -250,7 +272,6 @@ static void Decode()
|
|||
u32 Cmd2 = DataReadU32();
|
||||
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
|
||||
u32 xf_address = Cmd2 & 0xFFFF;
|
||||
// TODO - speed this up. pshufb?
|
||||
u32 data_buffer[16];
|
||||
DataReadU32xFuncs[transfer_size-1](data_buffer);
|
||||
|
||||
|
@ -401,6 +422,13 @@ void OpcodeDecoder_Init()
|
|||
{
|
||||
g_pVideoData = FAKE_GetFifoStartPtr();
|
||||
|
||||
#if _M_SSE >= 0x301
	// When the CPU reports SSSE3 support, switch every entry of the reader
	// table to its SSSE3 counterpart. The copy must be element by element:
	// `*DataReadU32xFuncs = *DataReadU32xFuncs_SSSE3` dereferences the decayed
	// arrays and therefore replaces only entry 0.
	// NOTE(review): the SSSE3 readers store whole 16-byte vectors, so the
	// destination must have room for the count rounded up to a multiple of
	// four u32s - confirm every caller's buffer satisfies that.
	if (cpu_info.bSSSE3)
	{
		for (int i = 0; i < 16; ++i)
		{
			DataReadU32xFuncs[i] = DataReadU32xFuncs_SSSE3[i];
		}
	}
#endif
|
||||
|
||||
if (g_Config.bEnableOpenCL)
|
||||
{
|
||||
OpenCL::Initialize();
|
||||
|
|
|
@ -22,10 +22,6 @@
|
|||
#include "VertexManagerBase.h"
|
||||
#include "CPUDetect.h"
|
||||
|
||||
#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
|
||||
#include <tmmintrin.h>
|
||||
#endif
|
||||
|
||||
extern float posScale;
|
||||
extern TVtxAttr *pVtxAttr;
|
||||
|
||||
|
|
Loading…
Reference in New Issue