Use the SSSE3 byte shuffle (pshufb) for DataReader's DataReadU32xN in VideoCommon. The function reads up to 16 u32 values (512 bits) at a time and converts their endianness.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6802 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
xsacha 2011-01-10 13:14:56 +00:00
parent f49efa8868
commit f9e4e73e42
4 changed files with 78 additions and 10 deletions

View File

@ -305,7 +305,6 @@ u8 AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
u32 Cmd2 = DataReadU32();
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
u32 xf_address = Cmd2 & 0xFFFF;
// TODO - speed this up. pshufb?
u32 data_buffer[16];
DataReadU32xFuncs[transfer_size-1](data_buffer);
LoadXFReg(transfer_size, xf_address, data_buffer);
@ -453,7 +452,6 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
u32 Cmd2 = DataReadU32();
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
u32 xf_address = Cmd2 & 0xFFFF;
// TODO - speed this up. pshufb?
ReferencedDataRegion* NewRegion = new ReferencedDataRegion;
NewRegion->MustClean = true;
NewRegion->size = transfer_size * 4;

View File

@ -15,11 +15,17 @@
// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/
#ifndef _DATAREADER_H
#define _DATAREADER_H
extern u8* g_pVideoData;
#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
#include <tmmintrin.h>
#endif
__forceinline void DataSkip(u32 skip)
{
g_pVideoData += skip;
@ -64,6 +70,49 @@ __forceinline u32 DataReadU32()
return tmp;
}
// Signature shared by the bulk u32 readers: each one fills 'buf' from the
// stream at g_pVideoData.
typedef void (*DataReadU32xNfunc)(u32 *buf);
// Dispatch table of bulk readers; callers index it with (transfer_size - 1),
// so entry i reads i + 1 u32 values.
extern DataReadU32xNfunc DataReadU32xFuncs[16];
#if _M_SSE >= 0x301
// Control masks for _mm_shuffle_epi8 (pshufb). _mm_set_epi8 lists its
// arguments from byte 15 down to byte 0, so the right-most four values
// describe the first u32 in memory.
// maskK reverses the byte order of the first K u32 lanes of a 16-byte chunk
// and leaves the remaining bytes where they are.
const __m128i mask1 = _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,0,1,2,3);
const __m128i mask2 = _mm_set_epi8(15,14,13,12,11,10,9,8,4,5,6,7,0,1,2,3);
const __m128i mask3 = _mm_set_epi8(15,14,13,12,8,9,10,11,4,5,6,7,0,1,2,3);
const __m128i mask4 = _mm_set_epi8(12,13,14,15,8,9,10,11,4,5,6,7,0,1,2,3);
// Copies N u32 values from the stream at g_pVideoData into bufx16, reversing
// the byte order of each value with pshufb, then advances g_pVideoData by
// N * sizeof(u32) bytes.
//
// Data is moved in unaligned 16-byte chunks. The last chunk is always loaded
// and stored in full, so up to 12 bytes beyond the N requested words are read
// from the stream and written (unswapped) to bufx16; the destination must have
// room for that final 16-byte store.
template<unsigned int N>
void DataReadU32xN_SSSE3(u32 *bufx16)
{
	// Counts outside 1..16 copy nothing; only the stream pointer moves.
	if (N >= 1 && N <= 16)
	{
		__m128i* dst = (__m128i *)bufx16;
		__m128i* src = (__m128i *)g_pVideoData;

		// Every chunk before the last one carries four complete u32 values.
		const unsigned int fullChunks = (N - 1) / 4;
		for (unsigned int chunk = 0; chunk < fullChunks; ++chunk)
		{
			const __m128i raw = _mm_loadu_si128(src + chunk);
			_mm_storeu_si128(dst + chunk, _mm_shuffle_epi8(raw, mask4));
		}

		// The final chunk holds between one and four values; pick the mask
		// that swaps exactly that many lanes.
		const unsigned int tailCount = N - fullChunks * 4;
		const __m128i tailMask = (tailCount == 1) ? mask1
		                       : (tailCount == 2) ? mask2
		                       : (tailCount == 3) ? mask3
		                       :                    mask4;
		const __m128i tail = _mm_loadu_si128(src + fullChunks);
		_mm_storeu_si128(dst + fullChunks, _mm_shuffle_epi8(tail, tailMask));
	}

	g_pVideoData += (sizeof(u32) * N);
}
#endif
template<unsigned int N>
void DataReadU32xN(u32 *bufx16)
{
@ -87,9 +136,6 @@ void DataReadU32xN(u32 *bufx16)
g_pVideoData += (sizeof(u32) * N);
}
typedef void (*DataReadU32xNfunc)(u32 *buf);
extern DataReadU32xNfunc DataReadU32xFuncs[16];
__forceinline u32 DataReadU32Unswapped()
{
u32 tmp = *(u32*)g_pVideoData;

View File

@ -30,6 +30,7 @@
#include "Profiler.h"
#include "OpcodeDecoding.h"
#include "CommandProcessor.h"
#include "CPUDetect.h"
#include "VertexLoaderManager.h"
@ -47,6 +48,27 @@
#include "VideoConfig.h"
u8* g_pVideoData = 0;
#if _M_SSE >= 0x301
// SSSE3 bulk readers, laid out like DataReadU32xFuncs: entry i reads i + 1
// u32 values.
DataReadU32xNfunc DataReadU32xFuncs_SSSE3[16] = {
	DataReadU32xN_SSSE3<1>,  DataReadU32xN_SSSE3<2>,
	DataReadU32xN_SSSE3<3>,  DataReadU32xN_SSSE3<4>,
	DataReadU32xN_SSSE3<5>,  DataReadU32xN_SSSE3<6>,
	DataReadU32xN_SSSE3<7>,  DataReadU32xN_SSSE3<8>,
	DataReadU32xN_SSSE3<9>,  DataReadU32xN_SSSE3<10>,
	DataReadU32xN_SSSE3<11>, DataReadU32xN_SSSE3<12>,
	DataReadU32xN_SSSE3<13>, DataReadU32xN_SSSE3<14>,
	DataReadU32xN_SSSE3<15>, DataReadU32xN_SSSE3<16>
};
#endif
DataReadU32xNfunc DataReadU32xFuncs[16] = {
DataReadU32xN<1>,
DataReadU32xN<2>,
@ -250,7 +272,6 @@ static void Decode()
u32 Cmd2 = DataReadU32();
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
u32 xf_address = Cmd2 & 0xFFFF;
// TODO - speed this up. pshufb?
u32 data_buffer[16];
DataReadU32xFuncs[transfer_size-1](data_buffer);
@ -401,6 +422,13 @@ void OpcodeDecoder_Init()
{
g_pVideoData = FAKE_GetFifoStartPtr();
#if _M_SSE >= 0x301
if (cpu_info.bSSSE3)
{
	// Install the SSSE3 readers for every transfer size. Both tables are
	// arrays of 16 function pointers, so they must be copied element by
	// element: assigning through the dereferenced array names would only
	// replace entry 0 (the single-u32 reader) and leave the other fifteen
	// on the generic path.
	for (int i = 0; i < 16; ++i)
	{
		DataReadU32xFuncs[i] = DataReadU32xFuncs_SSSE3[i];
	}
}
#endif
if (g_Config.bEnableOpenCL)
{
OpenCL::Initialize();

View File

@ -22,10 +22,6 @@
#include "VertexManagerBase.h"
#include "CPUDetect.h"
#if _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
#include <tmmintrin.h>
#endif
extern float posScale;
extern TVtxAttr *pVtxAttr;