TextureDecoder.cpp: new SSE2 optimized GX_TF_I8 decoder. Probably not ultimately optimal SSE2 code, but provably better (on my machine) than the memset version. Tested with __rdtsc counts in an independent project. I get about 6-7 FPS more on average during the intro movie playback in Mario Kart Wii. Hope this compiles for GCC okay.

TextureDecoder.cpp: merged two functionally identical decode5A3RGBA and decode5A3rgba methods.
OpcodeDecoding.cpp and DLCache.cpp: optimization for GX_LOAD_XF_REG. The PSHUFB solution sounds better for SSSE3, but this is a small win for the default case.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6692 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
james.jdunne 2010-12-30 19:17:08 +00:00
parent 6cf9b3688d
commit b038df64bf
5 changed files with 164 additions and 66 deletions

View File

@ -307,8 +307,7 @@ u8 AnalyzeAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
u32 xf_address = Cmd2 & 0xFFFF;
// TODO - speed this up. pshufb?
u32 data_buffer[16];
for (int i = 0; i < transfer_size; i++)
data_buffer[i] = DataReadU32();
DataReadU32xFuncs[transfer_size-1](data_buffer);
LoadXFReg(transfer_size, xf_address, data_buffer);
INCSTAT(stats.thisFrame.numXFLoads);
num_xf_reg++;
@ -462,8 +461,7 @@ bool CompileAndRunDisplayList(u32 address, int size, CachedDisplayList *dl)
NewRegion->hash = 0;
dl->InsertRegion(NewRegion);
u32 *data_buffer = (u32*)NewRegion->start_address;
for (int i = 0; i < transfer_size; i++)
data_buffer[i] = DataReadU32();
DataReadU32xFuncs[transfer_size-1](data_buffer);
LoadXFReg(transfer_size, xf_address, data_buffer);
INCSTAT(stats.thisFrame.numXFLoads);
// Compile

View File

@ -64,6 +64,32 @@ __forceinline u32 DataReadU32()
return tmp;
}
// Copies N consecutive 32-bit words from the location g_pVideoData points at
// into bufx16, byte-swaps the copied words in place with Common::swap32, and
// then advances g_pVideoData past the bytes that were consumed.
// N is a compile-time word count; bufx16 must have room for N words.
template<unsigned int N>
void DataReadU32xN(u32 *bufx16)
{
    // Bulk copy first so the source is read through memcpy, not a casted pointer.
    memcpy(bufx16, g_pVideoData, sizeof(u32) * N);

    // At most 16 words are swapped (the largest count the dispatch table provides).
    const unsigned int swapCount = (N < 16u) ? N : 16u;
    for (unsigned int word = 0; word < swapCount; ++word)
        bufx16[word] = Common::swap32(bufx16[word]);

    g_pVideoData += (sizeof(u32) * N);
}
// Function-pointer type matching the signature of DataReadU32xN<N>.
typedef void (*DataReadU32xNfunc)(u32 *buf);
// 16-entry table of reader functions, defined in a .cpp file.
// Callers index it with (word count - 1).
extern DataReadU32xNfunc DataReadU32xFuncs[16];
__forceinline u32 DataReadU32Unswapped()
{
u32 tmp = *(u32*)g_pVideoData;

View File

@ -49,6 +49,24 @@
#endif
u8* g_pVideoData = 0;
// Dispatch table of DataReadU32xN instantiations: entry [k] reads k+1 words.
// Callers select an entry with transfer_size-1, so a runtime word count of
// 1..16 maps onto a reader whose count is fixed at compile time.
DataReadU32xNfunc DataReadU32xFuncs[16] = {
DataReadU32xN<1>,
DataReadU32xN<2>,
DataReadU32xN<3>,
DataReadU32xN<4>,
DataReadU32xN<5>,
DataReadU32xN<6>,
DataReadU32xN<7>,
DataReadU32xN<8>,
DataReadU32xN<9>,
DataReadU32xN<10>,
DataReadU32xN<11>,
DataReadU32xN<12>,
DataReadU32xN<13>,
DataReadU32xN<14>,
DataReadU32xN<15>,
DataReadU32xN<16>
};
extern u8* FAKE_GetFifoStartPtr();
extern u8* FAKE_GetFifoEndPtr();
@ -233,12 +251,13 @@ static void Decode()
{
u32 Cmd2 = DataReadU32();
int transfer_size = ((Cmd2 >> 16) & 15) + 1;
u32 xf_address = Cmd2 & 0xFFFF;
u32 xf_address = Cmd2 & 0xFFFF;
// TODO - speed this up. pshufb?
u32 data_buffer[16];
for (int i = 0; i < transfer_size; i++)
data_buffer[i] = DataReadU32();
u32 data_buffer[16];
DataReadU32xFuncs[transfer_size-1](data_buffer);
LoadXFReg(transfer_size, xf_address, data_buffer);
INCSTAT(stats.thisFrame.numXFLoads);
}
break;
@ -317,7 +336,7 @@ static void DecodeSemiNop()
u8 sub_cmd = DataReadU8();
u32 value = DataReadU32();
LoadCPReg(sub_cmd, value);
INCSTAT(stats.thisFrame.numCPLoads);
INCSTAT(stats.thisFrame.numCPLoads);
}
break;
@ -328,10 +347,9 @@ static void DecodeSemiNop()
u32 address = Cmd2 & 0xFFFF;
// TODO - speed this up. pshufb?
u32 data_buffer[16];
for (int i = 0; i < transfer_size; i++)
data_buffer[i] = DataReadU32();
DataReadU32xFuncs[transfer_size-1](data_buffer);
LoadXFReg(transfer_size, address, data_buffer);
INCSTAT(stats.thisFrame.numXFLoads);
INCSTAT(stats.thisFrame.numXFLoads);
}
break;

View File

@ -32,6 +32,7 @@
#if _M_SSE >= 0x401
#include <smmintrin.h>
#include <emmintrin.h>
#elif _M_SSE >= 0x301 && !(defined __GNUC__ && !defined __SSSE3__)
#include <tmmintrin.h>
#endif
@ -209,43 +210,6 @@ inline u32 decode5A3(u16 val)
return (a << 24) | (r << 16) | (g << 8) | b;
}
// Decodes one 16-bit RGB5A3 texel into a 32-bit pixel packed as
// r | (g << 8) | (b << 16) | (a << 24).
//   bit 15 set   : opaque texel, 5 bits each of R, G, B (alpha forced to 0xFF)
//   bit 15 clear : 3 bits of alpha followed by 4 bits each of R, G, B
// Channels are kept unsigned so that placing alpha in bits 31..24 never
// shifts a set bit into the sign bit of a signed int.
inline u32 decode5A3rgba(u16 val)
{
    u32 r, g, b, a;
    if (val & 0x8000)
    {
        a = 0xFF;
        r = Convert5To8((val >> 10) & 0x1F);
        g = Convert5To8((val >> 5) & 0x1F);
        b = Convert5To8(val & 0x1F);
    }
    else
    {
        a = Convert3To8((val >> 12) & 0x7);
        r = Convert4To8((val >> 8) & 0xF);
        g = Convert4To8((val >> 4) & 0xF);
        b = Convert4To8(val & 0xF);
    }
    return (a << 24) | (b << 16) | (g << 8) | r;
}
// Expands one 16-bit 5:6:5 colour (red in bits 15..11, green in bits 10..5,
// blue in bits 4..0) into a 32-bit pixel packed as
// r | (g << 8) | (b << 16) | (a << 24), with alpha fixed at 0xFF.
// Channels are kept unsigned so that shifting alpha into bits 31..24 does not
// shift into the sign bit of a signed int.
inline u32 decode565RGBA(u16 val)
{
    const u32 r = Convert5To8((val >> 11) & 0x1f);
    const u32 g = Convert6To8((val >> 5 ) & 0x3f);
    const u32 b = Convert5To8((val      ) & 0x1f);
    const u32 a = 0xFF;
    return r | (g << 8) | (b << 16) | (a << 24);
}
// Expands a 16-bit value into a 32-bit pixel: the high byte is replicated
// into bits 7..0, 15..8 and 23..16, and the low byte is placed in bits 31..24.
// (Going by the IA8 name, the high byte is presumably intensity and the low
// byte alpha.)
// Arithmetic is unsigned so that shifting a byte >= 0x80 by 24 does not
// shift into the sign bit of a signed int.
inline u32 decodeIA8Swapped(u16 val)
{
    const u32 alpha = val & 0xFF;
    const u32 intensity = val >> 8;
    return intensity | (intensity << 8) | (intensity << 16) | (alpha << 24);
}
inline u32 decode5A3RGBA(u16 val)
{
int r,g,b,a;
@ -266,6 +230,23 @@ inline u32 decode5A3RGBA(u16 val)
return r | (g<<8) | (b << 16) | (a << 24);
}
// Expands one 16-bit 5:6:5 colour (red in bits 15..11, green in bits 10..5,
// blue in bits 4..0) into a 32-bit pixel packed as
// r | (g << 8) | (b << 16) | (a << 24), with alpha fixed at 0xFF.
// Channels are kept unsigned so that shifting alpha into bits 31..24 does not
// shift into the sign bit of a signed int.
inline u32 decode565RGBA(u16 val)
{
    const u32 r = Convert5To8((val >> 11) & 0x1f);
    const u32 g = Convert6To8((val >> 5 ) & 0x3f);
    const u32 b = Convert5To8((val      ) & 0x1f);
    const u32 a = 0xFF;
    return r | (g << 8) | (b << 16) | (a << 24);
}
}
// Expands a 16-bit value into a 32-bit pixel: the high byte is replicated
// into bits 7..0, 15..8 and 23..16, and the low byte is placed in bits 31..24.
// (Going by the IA8 name, the high byte is presumably intensity and the low
// byte alpha.)
// Arithmetic is unsigned so that shifting a byte >= 0x80 by 24 does not
// shift into the sign bit of a signed int.
inline u32 decodeIA8Swapped(u16 val)
{
    const u32 alpha = val & 0xFF;
    const u32 intensity = val >> 8;
    return intensity | (intensity << 8) | (intensity << 16) | (alpha << 24);
}
struct DXTBlock
@ -293,8 +274,8 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr)
for (int x = 0; x < 4; x++)
{
u8 val = src[x];
*dst++ = decode5A3rgba(Common::swap16(tlut[val >> 4]));
*dst++ = decode5A3rgba(Common::swap16(tlut[val & 0xF]));
*dst++ = decode5A3RGBA(Common::swap16(tlut[val >> 4]));
*dst++ = decode5A3RGBA(Common::swap16(tlut[val & 0xF]));
}
}
@ -348,7 +329,7 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr)
for (int x = 0; x < 8; x++)
{
u8 val = src[x];
*dst++ = decode5A3rgba(Common::swap16(tlut[val]));
*dst++ = decode5A3RGBA(Common::swap16(tlut[val]));
}
}
@ -422,7 +403,7 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr)
for (int x = 0; x < 4; x++)
{
u16 val = Common::swap16(src[x]);
*dst++ = decode5A3rgba(Common::swap16(tlut[(val & 0x3FFF)]));
*dst++ = decode5A3RGBA(Common::swap16(tlut[(val & 0x3FFF)]));
}
}
@ -481,23 +462,43 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src)
// Decodes four consecutive 16-bit texels from src into four 32-bit pixels in
// dst. Each texel is byte-swapped with Common::swap16 and passed to decode5A3.
// Written out by hand instead of as the equivalent loop
//     for (int x = 0; x < 4; x++) dst[x] = decode5A3(Common::swap16(src[x]));
inline void decodebytesRGB5A3(u32 *dst, const u16 *src)
{
    dst[0] = decode5A3(Common::swap16(src[0]));
    dst[1] = decode5A3(Common::swap16(src[1]));
    dst[2] = decode5A3(Common::swap16(src[2]));
    dst[3] = decode5A3(Common::swap16(src[3]));
}
// Decodes four consecutive 16-bit texels from src into four 32-bit pixels in
// dst. Each texel is byte-swapped with Common::swap16 and passed to
// decode5A3RGBA.
// Written out by hand instead of as the equivalent loop
//     for (int x = 0; x < 4; x++) dst[x] = decode5A3RGBA(Common::swap16(src[x]));
inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src)
{
    dst[0] = decode5A3RGBA(Common::swap16(src[0]));
    dst[1] = decode5A3RGBA(Common::swap16(src[1]));
    dst[2] = decode5A3RGBA(Common::swap16(src[2]));
    dst[3] = decode5A3RGBA(Common::swap16(src[3]));
}
// This one is used by many video formats. It'd therefore be good if it was fast.
// Needs more speed.
inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2)
{
for (int x = 0; x < 4; x++) {
#if 0
for (int x = 0; x < 4; x++)
dst[x] = Common::swap32((src2[x] << 16) | src[x]);
}
#else
dst[0] = Common::swap32((src2[0] << 16) | src[0]);
dst[1] = Common::swap32((src2[1] << 16) | src[1]);
dst[2] = Common::swap32((src2[2] << 16) | src[2]);
dst[3] = Common::swap32((src2[3] << 16) | src[3]);
#endif
// This can probably be done in a few SSE pack/unpack instructions + pshufb
// some unpack instruction x2:
@ -508,11 +509,18 @@ inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2)
// and we are done.
}
inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 *src2)
inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2)
{
for (int x = 0; x < 4; x++) {
#if 0
for (int x = 0; x < 4; x++) {
dst[x] = ((src[x] & 0xFF) << 24) | ((src[x] & 0xFF00)>>8) | (src2[x] << 8);
}
}
#else
dst[0] = ((src[0] & 0xFF) << 24) | ((src[0] & 0xFF00)>>8) | (src2[0] << 8);
dst[1] = ((src[1] & 0xFF) << 24) | ((src[1] & 0xFF00)>>8) | (src2[1] << 8);
dst[2] = ((src[2] & 0xFF) << 24) | ((src[2] & 0xFF00)>>8) | (src2[2] << 8);
dst[3] = ((src[3] & 0xFF) << 24) | ((src[3] & 0xFF00)>>8) | (src2[3] << 8);
#endif
}
inline u32 makecol(int r, int g, int b, int a)
@ -919,7 +927,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
{
switch (texformat)
{
@ -966,9 +974,52 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig
{
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 8)
for (int iy = 0; iy < 4; iy++, src += 8)
for (int ix = 0; ix < 8; ix++)
memset(dst + (y + iy)*width+x+ ix, (src + ix)[0], 4);
#if _M_SSE >= 0x401
for (int iy = 0; iy < 4; ++iy, src += 8)
{
__m128i *quaddst = (__m128i *)(dst + (y + iy)*width + x);
const __m128i m0 = _mm_or_si128(
_mm_or_si128(
_mm_and_si128(_mm_set1_epi8(src[0]), _mm_set_epi32(0, 0, 0, (int)0xffffffffU)),
_mm_and_si128(_mm_set1_epi8(src[1]), _mm_set_epi32(0, 0, (int)0xffffffffU, 0))
),
_mm_or_si128(
_mm_and_si128(_mm_set1_epi8(src[2]), _mm_set_epi32(0, (int)0xffffffffU, 0, 0)),
_mm_and_si128(_mm_set1_epi8(src[3]), _mm_set_epi32((int)0xffffffffU, 0, 0, 0))
)
);
_mm_store_si128(quaddst, m0);
const __m128i m1 = _mm_or_si128(
_mm_or_si128(
_mm_and_si128(_mm_set1_epi8(src[4]), _mm_set_epi32(0, 0, 0, (int)0xffffffffU)),
_mm_and_si128(_mm_set1_epi8(src[5]), _mm_set_epi32(0, 0, (int)0xffffffffU, 0))
),
_mm_or_si128(
_mm_and_si128(_mm_set1_epi8(src[6]), _mm_set_epi32(0, (int)0xffffffffU, 0, 0)),
_mm_and_si128(_mm_set1_epi8(src[7]), _mm_set_epi32((int)0xffffffffU, 0, 0, 0))
)
);
_mm_store_si128(quaddst+1, m1);
}
#else
for (int iy = 0; iy < 4; ++iy, src += 8)
{
u32 * newdst = dst + (y + iy)*width+x;
const u8 * newsrc = src;
u8 srcval;
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
srcval = (newsrc++)[0]; (newdst++)[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
srcval = newsrc[0]; newdst[0] = srcval | (srcval << 8) | (srcval << 16) | (srcval << 24);
}
#endif
}
break;
case GX_TF_C8:
@ -1014,8 +1065,10 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig
{
u32 *ptr = dst + (y + iy) * width + x;
u16 *s = (u16 *)src;
for(int j = 0; j < 4; j++)
*ptr++ = decodeIA8Swapped(*s++);
ptr[0] = decodeIA8Swapped(s[0]);
ptr[1] = decodeIA8Swapped(s[1]);
ptr[2] = decodeIA8Swapped(s[2]);
ptr[3] = decodeIA8Swapped(s[3]);
}
}
@ -1058,7 +1111,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 *dst, const u8 *src, int width, int heig
}
break;
case GX_TF_RGB5A3:
{
{ // JSD: speed critical for Mario Kart Wii intro movie (at least)
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)

View File

@ -387,6 +387,9 @@
/>
<Tool
Name="VCCLCompilerTool"
Optimization="2"
InlineFunctionExpansion="1"
EnableIntrinsicFunctions="true"
AdditionalIncludeDirectories="../../PluginSpecs;../../../Externals/CLRun/include;../../../Externals/SOIL;../Common/Src;../Core/Src"
PreprocessorDefinitions="WIN32;NDEBUG;_LIB;_SECURE_SCL=0;__WXMSW__;wxUSE_BASE=0;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE"
RuntimeLibrary="0"