Merge pull request #4642 from stenzek/split-x64-texture-decoders

TextureDecoder: Separate each format into its own function
This commit is contained in:
Markus Wick 2017-01-10 12:06:47 +01:00 committed by GitHub
commit 52ec186f0a
1 changed files with 1213 additions and 1069 deletions

View File

@ -10,6 +10,7 @@
#include "Common/CommonFuncs.h"
#include "Common/CommonTypes.h"
#include "Common/Intrinsics.h"
#include "Common/MsgHandler.h"
#include "VideoCommon/LookUpTables.h"
#include "VideoCommon/TextureDecoder.h"
@ -221,39 +222,35 @@ static void DecodeDXTBlock(u32* dst, const DXTBlock* src, int pitch)
// JSD 01/06/11:
// TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte
// boundaries to
// squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than
// _mm_load_si128/_mm_store_si128
// because they work on unaligned addresses. The processor is free to make the assumption that
// addresses are multiples
// of 16 in the aligned case.
// boundaries to squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower
// than _mm_load_si128/_mm_store_si128 because they work on unaligned addresses. The processor is
// free to make the assumption that addresses are multiples of 16 in the aligned case.
// TODO: complete SSE2 optimization of less often used texture formats.
// TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt)
static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
{
const int Wsteps4 = (width + 3) / 4;
const int Wsteps8 = (width + 7) / 8;
switch (texformat)
switch (tlutfmt)
{
case GX_TF_C4:
if (tlutfmt == GX_TL_RGB5A3)
case GX_TL_RGB5A3:
{
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++, xStep++)
DecodeBytes_C4_RGB5A3(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
}
else if (tlutfmt == GX_TL_IA8)
break;
case GX_TL_IA8:
{
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++, xStep++)
DecodeBytes_C4_IA8(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
}
else if (tlutfmt == GX_TL_RGB565)
break;
case GX_TL_RGB565:
{
for (int y = 0; y < height; y += 8)
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
@ -261,21 +258,30 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
DecodeBytes_C4_RGB565(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
}
break;
case GX_TF_I4:
{
default:
break;
}
}
static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
#if _M_SSE >= 0x301
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
#if _M_SSE >= 0x301
// xsacha optimized with SSSE3 intrinsics
// Produces a ~40% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
const __m128i mask9180 = _mm_set_epi8(9, 9, 9, 9, 1, 1, 1, 1, 8, 8, 8, 8, 0, 0, 0, 0);
const __m128i maskB3A2 = _mm_set_epi8(11, 11, 11, 11, 3, 3, 3, 3, 10, 10, 10, 10, 2, 2, 2, 2);
const __m128i maskD5C4 = _mm_set_epi8(13, 13, 13, 13, 5, 5, 5, 5, 12, 12, 12, 12, 4, 4, 4, 4);
const __m128i maskF7E6 = _mm_set_epi8(15, 15, 15, 15, 7, 7, 7, 7, 14, 14, 14, 14, 6, 6, 6, 6);
for (int y = 0; y < height; y += 8)
{
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
{
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
@ -304,13 +310,22 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
_mm_storeu_si128((__m128i*)(dst + (y + iy + 1) * width + x + 4), o4);
}
}
else
}
#endif
}
static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
{
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
// JSD optimized with SSE2 intrinsics.
// Produces a ~76% speed improvement over reference C implementation.
{
for (int y = 0; y < height; y += 8)
{
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
{
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
@ -387,24 +402,26 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
}
}
}
break;
case GX_TF_I8: // speed critical
{
}
static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
#if _M_SSE >= 0x301
// xsacha optimized with SSSE3 intrinsics
// Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
{
const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);
__m128i *quaddst, r, rgba0, rgba1;
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
// dcba)
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
r = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
// Shuffle select bytes to expand from (0000 0000 hgfe dcba) to:
rgba0 = _mm_shuffle_epi8(r, mask3210); // (dddd cccc bbbb aaaa)
@ -415,27 +432,28 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
_mm_storeu_si128(quaddst + 1, rgba1);
}
}
else
}
#endif
}
static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
{
// JSD optimized with SSE2 intrinsics.
// Produces an ~86% speed improvement over reference C implementation.
{
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
{
// Each loop iteration processes 4 rows from 4 64-bit reads.
const u8* src2 = src + 32 * yStep;
// TODO: is it more efficient to group the loads together sequentially and also the stores
// at the end?
// _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance significantly
// WORSE, so I
// went with _mm_stores. Perhaps there is some edge case here creating the terrible
// performance or we're
// not aligned to 16-byte boundaries. I don't know.
// at the end? _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance
// significantly WORSE, so I went with _mm_stores. Perhaps there is some edge case here
// creating the terrible performance or we're not aligned to 16-byte boundaries. I don't know.
__m128i* quaddst;
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
// dcba)
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
const __m128i r0 = _mm_loadl_epi64((const __m128i*)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
// bbaa)
@ -454,8 +472,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// Store (hhhh gggg ffff eeee) out:
_mm_storeu_si128(quaddst + 1, rgba1);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
// dcba)
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src2 += 8;
const __m128i r2 = _mm_loadl_epi64((const __m128i*)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
@ -475,8 +492,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// Store (hhhh gggg ffff eeee) out:
_mm_storeu_si128(quaddst + 1, rgba3);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
// dcba)
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src2 += 8;
const __m128i r4 = _mm_loadl_epi64((const __m128i*)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
@ -496,8 +512,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// Store (hhhh gggg ffff eeee) out:
_mm_storeu_si128(quaddst + 1, rgba5);
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
// dcba)
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
src2 += 8;
const __m128i r6 = _mm_loadl_epi64((const __m128i*)src2);
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
@ -518,24 +533,32 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
_mm_storeu_si128(quaddst + 1, rgba7);
}
}
}
break;
case GX_TF_C8:
if (tlutfmt == GX_TL_RGB5A3)
}
static void TexDecoder_DecodeImpl_C8(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
{
switch (tlutfmt)
{
case GX_TL_RGB5A3:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
DecodeBytes_C8_RGB5A3((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlut);
}
else if (tlutfmt == GX_TL_IA8)
break;
case GX_TL_IA8:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
DecodeBytes_C8_IA8(dst + (y + iy) * width + x, src + 8 * xStep, tlut);
}
else if (tlutfmt == GX_TL_RGB565)
break;
case GX_TL_RGB565:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
@ -543,23 +566,38 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
DecodeBytes_C8_RGB565(dst + (y + iy) * width + x, src + 8 * xStep, tlut);
}
break;
case GX_TF_IA4:
{
default:
break;
}
}
static void TexDecoder_DecodeImpl_IA4(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
{
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
DecodeBytes_IA4(dst + (y + iy) * width + x, src + 8 * xStep);
}
break;
case GX_TF_IA8:
{
}
}
}
static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
#if _M_SSE >= 0x301
// xsacha optimized with SSSE3 intrinsics.
// Produces an ~50% speed improvement over SSE2 implementation.
if (cpu_info.bSSSE3)
{
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1);
@ -571,17 +609,23 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
_mm_storeu_si128((__m128i*)(dst + (y + iy) * width + x), r1);
}
}
else
}
#endif
}
static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int height, int texformat,
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
{
// JSD optimized with SSE2 intrinsics.
// Produces an ~80% speed improvement over reference C implementation.
{
const __m128i kMask_xf0 = _mm_set_epi32(0x00000000L, 0x00000000L, 0xff00ff00L, 0xff00ff00L);
const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
// Expands a 16-bit "IA" to a 32-bit "AIII". Each char is an 8-bit value.
@ -591,8 +635,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
// Logical shift all 16-bit words right by 8 bits (0000 0000 hgfe dcba) to (0000 0000
// 0h0f 0d0b)
// This gets us only the I components.
// 0h0f 0d0b). This gets us only the I components.
const __m128i i0 = _mm_srli_epi16(r0, 8);
// Now join up the I components from their original positions but mask out the A
@ -608,8 +651,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
const __m128i i3 = _mm_and_si128(i2, kMask_x0fff);
// Now that we have the I components in 32-bit word form, time work out the A components
// into
// their final positions.
// into their final positions.
// (0000 0000 hgfe dcba) & kMask_x00FF -> (0000 0000 0g0e 0c0a)
const __m128i a0 = _mm_and_si128(r0, kMask_x0f);
@ -629,23 +671,33 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
}
}
}
break;
case GX_TF_C14X2:
if (tlutfmt == GX_TL_RGB5A3)
}
static void TexDecoder_DecodeImpl_C14X2(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
switch (tlutfmt)
{
case GX_TL_RGB5A3:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
DecodeBytes_C14X2_RGB5A3(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
}
else if (tlutfmt == GX_TL_IA8)
break;
case GX_TL_IA8:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
DecodeBytes_C14X2_IA8(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
}
else if (tlutfmt == GX_TL_RGB565)
break;
case GX_TL_RGB565:
{
for (int y = 0; y < height; y += 4)
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
@ -653,8 +705,16 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
DecodeBytes_C14X2_RGB565(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
}
break;
case GX_TF_RGB565:
{
default:
break;
}
}
static void TexDecoder_DecodeImpl_RGB565(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
// JSD optimized with SSE2 intrinsics.
// Produces an ~78% speed improvement over reference C implementation.
const __m128i kMaskR0 = _mm_set1_epi32(0x000000F8);
@ -663,7 +723,9 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
__m128i* dxtsrc = (__m128i*)(src + 8 * xStep);
@ -672,9 +734,8 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
const __m128i rgb565x4 = _mm_loadl_epi64(dxtsrc);
// The big-endian 16-bit colors `ba` and `dc` look like 0b_gggBBBbb_RRRrrGGg in a little
// endian xmm register
// Unpack `hgfe dcba` to `hhgg ffee ddcc bbaa`, where each 32-bit word is now
// 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
// endian xmm register. Unpack `hgfe dcba` to `hhgg ffee ddcc bbaa`, where each 32-bit word
// is now 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
const __m128i c0 = _mm_unpacklo_epi16(rgb565x4, rgb565x4);
// swizzle 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
@ -718,9 +779,14 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
_mm_storeu_si128(ptr, abgr888x4);
}
}
break;
case GX_TF_RGB5A3:
{
}
}
static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
#if _M_SSE >= 0x301
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
@ -728,22 +794,20 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// for the RGB555 case when (s[x] & 0x8000) is true for all pixels.
const __m128i aVxff00 = _mm_set1_epi32(0xFF000000L);
#if _M_SSE >= 0x301
// xsacha optimized with SSSE3 intrinsics (2 in 4 cases)
// Produces a ~10% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
u32* newdst = dst + (y + iy) * width + x;
const __m128i mask = _mm_set_epi8(-128, -128, 6, 7, -128, -128, 4, 5, -128, -128, 2, 3,
-128, -128, 0, 1);
const __m128i mask =
_mm_set_epi8(-128, -128, 6, 7, -128, -128, 4, 5, -128, -128, 2, 3, -128, -128, 0, 1);
const __m128i valV =
_mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)(src + 8 * xStep)), mask);
int cmp =
_mm_movemask_epi8(valV); // MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
int cmp = _mm_movemask_epi8(valV); // MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
if ((cmp & 0x2222) ==
0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
{
@ -781,8 +845,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
const __m128i bV = _mm_or_si128(_mm_slli_epi16(tmpbV, 4), tmpbV);
// a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >>
// 1);
// a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
const __m128i aV =
_mm_or_si128(_mm_slli_epi16(tmpaV, 5),
@ -823,13 +886,27 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
}
}
}
else
}
#endif
}
static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
// This is the hard-coded 0xFF alpha constant that is ORed in place after the RGB are calculated
// for the RGB555 case when (s[x] & 0x8000) is true for all pixels.
const __m128i aVxff00 = _mm_set1_epi32(0xFF000000L);
// JSD optimized with SSE2 intrinsics (2 in 4 cases)
// Produces a ~25% speed improvement over reference C implementation.
{
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
{
u32* newdst = dst + (y + iy) * width + x;
@ -869,8 +946,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// write the final result:
_mm_storeu_si128((__m128i*)newdst, final);
}
else if (((val0 & 0x8000) | (val1 & 0x8000) | (val2 & 0x8000) | (val3 & 0x8000)) ==
0x0000)
else if (((val0 & 0x8000) | (val1 & 0x8000) | (val2 & 0x8000) | (val3 & 0x8000)) == 0x0000)
{
// SSE2 case #2: all 4 pixels are in RGBA4443.
@ -888,8 +964,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
const __m128i bV = _mm_or_si128(_mm_slli_epi16(tmpbV, 4), tmpbV);
// a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >>
// 1);
// a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
const __m128i aV =
_mm_or_si128(_mm_slli_epi16(tmpaV, 5),
@ -933,20 +1008,21 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
}
}
}
break;
case GX_TF_RGBA8: // speed critical
{
}
static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
#if _M_SSE >= 0x301
// xsacha optimized with SSSE3 intrinsics
// Produces a ~30% speed improvement over SSE2 implementation
if (cpu_info.bSSSE3)
{
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
const u8* src2 = src + 64 * yStep;
const __m128i mask0312 =
_mm_set_epi8(12, 15, 13, 14, 8, 11, 9, 10, 4, 7, 5, 6, 0, 3, 1, 2);
const __m128i mask0312 = _mm_set_epi8(12, 15, 13, 14, 8, 11, 9, 10, 4, 7, 5, 6, 0, 3, 1, 2);
const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
const __m128i ar1 = _mm_loadu_si128((__m128i*)src2 + 1);
const __m128i gb0 = _mm_loadu_si128((__m128i*)src2 + 2);
@ -967,34 +1043,36 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
_mm_storeu_si128(dst128, rgba11);
}
}
else
#endif
}
static void TexDecoder_DecodeImpl_RGBA8(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
// JSD optimized with SSE2 intrinsics
// Produces a ~68% speed improvement over reference C implementation.
{
for (int y = 0; y < height; y += 4)
{
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
{
// Input is divided up into 16-bit words. The texels are split up into AR and GB
// components where all
// AR components come grouped up first in 32 bytes followed by the GB components in 32
// bytes. We are
// processing 16 texels per each loop iteration, numbered from 0-f.
// components where all AR components come grouped up first in 32 bytes followed by the GB
// components in 32 bytes. We are processing 16 texels per each loop iteration, numbered from
// 0-f.
//
// Convention is:
// one byte is [component-name texel-number]
// __m128i is (4-bytes 4-bytes 4-bytes 4-bytes)
//
// Input is ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A
// 0][R 0])
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A
// 8][R 8])
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G
// 0][B 0])
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G
// 8][B 8])
// Input is:
// ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
//
// Output is (RGBA3 RGBA2 RGBA1 RGBA0)
// Output is:
// (RGBA3 RGBA2 RGBA1 RGBA0)
// (RGBA7 RGBA6 RGBA5 RGBA4)
// (RGBAb RGBAa RGBA9 RGBA8)
// (RGBAf RGBAe RGBAd RGBAc)
@ -1012,28 +1090,21 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// b][G a][B a] [G 9][B 9][G 8][B 8])
const __m128i gb1 = _mm_loadu_si128((__m128i*)src2 + 3);
__m128i rgba00, rgba01, rgba10, rgba11;
const __m128i kMask_x000f =
_mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
const __m128i kMask_xf000 =
_mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
const __m128i kMask_x0ff0 =
_mm_set_epi32(0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L);
const __m128i kMask_x000f = _mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
const __m128i kMask_xf000 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
const __m128i kMask_x0ff0 = _mm_set_epi32(0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L);
// Expand the AR components to fill out 32-bit words:
// ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
// -> ([A 3][A 3][R 3][R 3] [A 2][A 2][R 2][R 2] [A 1][A 1][R 1][R 1] [A 0][A 0][R 0][R
// 0])
// -> ([A 3][A 3][R 3][R 3] [A 2][A 2][R 2][R 2] [A 1][A 1][R 1][R 1] [A 0][A 0][R 0][R 0])
const __m128i aarr00 = _mm_unpacklo_epi8(ar0, ar0);
// ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
// -> ([A 7][A 7][R 7][R 7] [A 6][A 6][R 6][R 6] [A 5][A 5][R 5][R 5] [A 4][A 4][R 4][R
// 4])
// -> ([A 7][A 7][R 7][R 7] [A 6][A 6][R 6][R 6] [A 5][A 5][R 5][R 5] [A 4][A 4][R 4][R 4])
const __m128i aarr01 = _mm_unpackhi_epi8(ar0, ar0);
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
// -> ([A b][A b][R b][R b] [A a][A a][R a][R a] [A 9][A 9][R 9][R 9] [A 8][A 8][R 8][R
// 8])
// -> ([A b][A b][R b][R b] [A a][A a][R a][R a] [A 9][A 9][R 9][R 9] [A 8][A 8][R 8][R 8])
const __m128i aarr10 = _mm_unpacklo_epi8(ar1, ar1);
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
// -> ([A f][A f][R f][R f] [A e][A e][R e][R e] [A d][A d][R d][R d] [A c][A c][R c][R
// c])
// -> ([A f][A f][R f][R f] [A e][A e][R e][R e] [A d][A d][R d][R d] [A c][A c][R c][R c])
const __m128i aarr11 = _mm_unpackhi_epi8(ar1, ar1);
// Move A right 16 bits and mask off everything but the lowest 8 bits to get A in its
@ -1059,20 +1130,16 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// Expand the GB components to fill out 32-bit words:
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
// -> ([G 3][G 3][B 3][B 3] [G 2][G 2][B 2][B 2] [G 1][G 1][B 1][B 1] [G 0][G 0][B 0][B
// 0])
// -> ([G 3][G 3][B 3][B 3] [G 2][G 2][B 2][B 2] [G 1][G 1][B 1][B 1] [G 0][G 0][B 0][B 0])
const __m128i ggbb00 = _mm_unpacklo_epi8(gb0, gb0);
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
// -> ([G 7][G 7][B 7][B 7] [G 6][G 6][B 6][B 6] [G 5][G 5][B 5][B 5] [G 4][G 4][B 4][B
// 4])
// -> ([G 7][G 7][B 7][B 7] [G 6][G 6][B 6][B 6] [G 5][G 5][B 5][B 5] [G 4][G 4][B 4][B 4])
const __m128i ggbb01 = _mm_unpackhi_epi8(gb0, gb0);
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
// -> ([G b][G b][B b][B b] [G a][G a][B a][B a] [G 9][G 9][B 9][B 9] [G 8][G 8][B 8][B
// 8])
// -> ([G b][G b][B b][B b] [G a][G a][B a][B a] [G 9][G 9][B 9][B 9] [G 8][G 8][B 8][B 8])
const __m128i ggbb10 = _mm_unpacklo_epi8(gb1, gb1);
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
// -> ([G f][G f][B f][B f] [G e][G e][B e][B e] [G d][G d][B d][B d] [G c][G c][B c][B
// c])
// -> ([G f][G f][B f][B f] [G e][G e][B e][B e] [G d][G d][B d][B d] [G c][G c][B c][B c])
const __m128i ggbb11 = _mm_unpackhi_epi8(gb1, gb1);
// G and B are already in perfect spots in the center, just remove the extra copies in the
@ -1098,28 +1165,25 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
_mm_storeu_si128(dst128, rgba11);
}
}
}
break;
case GX_TF_CMPR: // speed critical
}
static void TexDecoder_DecodeImpl_CMPR(u32* dst, const u8* src, int width, int height,
int texformat, const u8* tlut, TlutFormat tlutfmt,
int Wsteps4, int Wsteps8)
{
// The metroid games use this format almost exclusively.
{
// JSD optimized with SSE2 intrinsics.
// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference
// C implementation.
// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the
// SSE2 is
// faster than both.
// C implementation. The x64 compiled reference C code is faster than the x86 compiled reference
// C code, but the SSE2 is faster than both.
for (int y = 0; y < height; y += 8)
{
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
{
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit
// registers.
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their
// 16-bit.
// Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is
// parallelizable
// at this level, so we do.
// 16-bit. Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is
// parallelizable at this level, so we do.
for (int z = 0, xStep = 2 * yStep; z < 2; ++z, xStep++)
{
// JSD NOTE: You may see many strange patterns of behavior in the below code, but they
@ -1127,17 +1191,13 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// constants is faster than loading their values from memory. Unfortunately, there is no
// way to inline 128-bit constants from opcodes so they must be loaded from memory. This
// seems a little ridiculous to me in that you can't even generate a constant value of 1
// without
// having to load it from memory. So, I stored the minimal constant I could, 128-bits
// worth
// of 1s :). Then I use sequences of shifts to squash it to the appropriate size and bit
// positions that I need.
// without having to load it from memory. So, I stored the minimal constant I could,
// 128-bits worth of 1s :). Then I use sequences of shifts to squash it to the appropriate
// size and bit positions that I need.
const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
// Load 128 bits, i.e. two DXTBlocks (64-bits each)
const __m128i dxt =
_mm_loadu_si128((__m128i*)(src + sizeof(struct DXTBlock) * 2 * xStep));
const __m128i dxt = _mm_loadu_si128((__m128i*)(src + sizeof(struct DXTBlock) * 2 * xStep));
// Copy the 2-bit indices from each DXT block:
alignas(16) u32 dxttmp[4];
@ -1149,8 +1209,8 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
__m128i argb888x4;
__m128i c1 = _mm_unpackhi_epi16(dxt, dxt);
c1 = _mm_slli_si128(c1, 8);
const __m128i c0 = _mm_or_si128(
c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
const __m128i c0 =
_mm_or_si128(c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
// Compare rgb0 to rgb1:
// Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
@ -1163,16 +1223,14 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// green:
// NOTE: We start with the larger number of bits (6) first for G and shift the mask down
// 1 bit to get a 5-bit mask
// later for R and B components.
// 1 bit to get a 5-bit mask later for R and B components.
// low6mask == _mm_set_epi32(0x0000FC00, 0x0000FC00, 0x0000FC00, 0x0000FC00)
const __m128i low6mask = _mm_slli_epi32(_mm_srli_epi32(allFFs128, 24 + 2), 8 + 2);
const __m128i gtmp = _mm_srli_epi32(c0, 3);
const __m128i g0 = _mm_and_si128(gtmp, low6mask);
// low3mask == _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300)
const __m128i g1 =
_mm_and_si128(_mm_srli_epi32(gtmp, 6),
_mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
const __m128i g1 = _mm_and_si128(
_mm_srli_epi32(gtmp, 6), _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
argb888x4 = _mm_or_si128(g0, g1);
// red:
// low5mask == _mm_set_epi32(0x000000F8, 0x000000F8, 0x000000F8, 0x000000F8)
@ -1263,8 +1321,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
// _mm_srli_epi32( allFFs128, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
// 0x00FFFFFF)
// Make this color fully transparent:
rgb3 = _mm_or_si128(rgb3,
_mm_and_si128(_mm_and_si128(rgb2, _mm_srli_epi32(allFFs128, 8)),
rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_and_si128(rgb2, _mm_srli_epi32(allFFs128, 8)),
_mm_slli_si128(allFFs128, 8)));
}
@ -1287,11 +1344,10 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
u32 tmp0[4][4], tmp1[4][4];
DecodeDXTBlock(&(tmp0[0][0]),
reinterpret_cast<const DXTBlock*>(src + sizeof(DXTBlock) * 2 * xStep),
reinterpret_cast<const DXTBlock*>(src + sizeof(DXTBlock) * 2 * xStep), 4);
DecodeDXTBlock(&(tmp1[0][0]),
reinterpret_cast<const DXTBlock*>((src + sizeof(DXTBlock) * 2 * xStep) + 8),
4);
DecodeDXTBlock(
&(tmp1[0][0]),
reinterpret_cast<const DXTBlock*>((src + sizeof(DXTBlock) * 2 * xStep) + 8), 4);
#endif
u32* dst32 = (dst + (y + z * 4) * width + x);
@ -1357,7 +1413,95 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
}
}
}
}
// Entry point for texture decoding: selects the decoder that matches `texformat` and runs it
// once over the whole image.
//
// Formats that have an SSSE3 variant use it only when this binary was compiled with SSSE3
// support (_M_SSE >= 0x301) AND cpu_info.bSSSE3 is set. In builds without that support the
// *_SSSE3 functions have empty bodies, so the runtime CPU flag alone must never select them.
//
// An unrecognised `texformat` raises a PanicAlert and nothing is decoded.
void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int texformat,
                            const u8* tlut, TlutFormat tlutfmt)
{
  // Signature shared by every per-format decoder in this file.
  using DecodeFunc = void (*)(u32*, const u8*, int, int, int, const u8*, TlutFormat, int, int);

#if _M_SSE >= 0x301
  const bool use_ssse3 = cpu_info.bSSSE3;
#else
  const bool use_ssse3 = false;
#endif

  DecodeFunc decoder = nullptr;
  switch (texformat)
  {
  case GX_TF_C4:
    decoder = TexDecoder_DecodeImpl_C4;
    break;
  case GX_TF_I4:
    decoder = use_ssse3 ? TexDecoder_DecodeImpl_I4_SSSE3 : TexDecoder_DecodeImpl_I4;
    break;
  case GX_TF_I8:
    decoder = use_ssse3 ? TexDecoder_DecodeImpl_I8_SSSE3 : TexDecoder_DecodeImpl_I8;
    break;
  case GX_TF_C8:
    decoder = TexDecoder_DecodeImpl_C8;
    break;
  case GX_TF_IA4:
    decoder = TexDecoder_DecodeImpl_IA4;
    break;
  case GX_TF_IA8:
    decoder = use_ssse3 ? TexDecoder_DecodeImpl_IA8_SSSE3 : TexDecoder_DecodeImpl_IA8;
    break;
  case GX_TF_C14X2:
    decoder = TexDecoder_DecodeImpl_C14X2;
    break;
  case GX_TF_RGB565:
    decoder = TexDecoder_DecodeImpl_RGB565;
    break;
  case GX_TF_RGB5A3:
    decoder = use_ssse3 ? TexDecoder_DecodeImpl_RGB5A3_SSSE3 : TexDecoder_DecodeImpl_RGB5A3;
    break;
  case GX_TF_RGBA8:
    decoder = use_ssse3 ? TexDecoder_DecodeImpl_RGBA8_SSSE3 : TexDecoder_DecodeImpl_RGBA8;
    break;
  case GX_TF_CMPR:
    decoder = TexDecoder_DecodeImpl_CMPR;
    break;
  default:
    break;
  }

  if (decoder == nullptr)
  {
    PanicAlert("Unhandled texture format %d", texformat);
    return;
  }

  // Number of 4-texel-wide and 8-texel-wide column steps per row, rounded up.
  const int steps_of_4 = (width + 3) / 4;
  const int steps_of_8 = (width + 7) / 8;

  decoder(dst, src, width, height, texformat, tlut, tlutfmt, steps_of_4, steps_of_8);
}
}