SSSE3 implementation of IA8 texture decode. It is roughly 50% faster than the SSE2 version on my computer (measured against the reference C implementation on a Core2 Duo: SSSE3 77%, SSE2 57%) and takes about half as many cycles.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6770 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
xsacha 2011-01-07 14:55:05 +00:00
parent a6acc99a89
commit dcbfd4ea4c
1 changed file with 59 additions and 40 deletions

View File

@ -1230,6 +1230,24 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
break; break;
case GX_TF_IA8: case GX_TF_IA8:
{ {
#if _M_SSE >= 0x301
// SSSE3 implementation is approximately 50% faster than SSE2 version.
if (cpu_info.bSSSE3)
{
for (int y = 0; y < height; y += 4)
for (int x = 0; x < width; x += 4)
for (int iy = 0; iy < 4; iy++, src += 8)
{
const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1);
// Load 4x 16-bit IA8 samples from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
const __m128i r0 = _mm_loadl_epi64((const __m128i *)src);
// Shuffle to (ghhh efff cddd abbb)
const __m128i r1 = _mm_shuffle_epi8(r0, mask);
_mm_storeu_si128( (__m128i*)(dst + (y + iy) * width + x), r1 );
}
} else
#endif
{
// JSD optimized with SSE2 intrinsics. // JSD optimized with SSE2 intrinsics.
// Produces an ~80% speed improvement over reference C implementation. // Produces an ~80% speed improvement over reference C implementation.
const __m128i kMask_xf0 = _mm_set_epi32(0x00000000L, 0x00000000L, 0xff00ff00L, 0xff00ff00L); const __m128i kMask_xf0 = _mm_set_epi32(0x00000000L, 0x00000000L, 0xff00ff00L, 0xff00ff00L);
@ -1279,6 +1297,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
// write out the 128-bit result: // write out the 128-bit result:
_mm_storeu_si128( (__m128i*)(dst + (y + iy) * width + x), r1 ); _mm_storeu_si128( (__m128i*)(dst + (y + iy) * width + x), r1 );
} }
}
#if 0 #if 0
// Reference C implementation: // Reference C implementation:
for (int y = 0; y < height; y += 4) for (int y = 0; y < height; y += 4)