Merge pull request #4642 from stenzek/split-x64-texture-decoders
TextureDecoder: Separate each format into its own function
This commit is contained in:
commit
52ec186f0a
|
@ -10,6 +10,7 @@
|
||||||
#include "Common/CommonFuncs.h"
|
#include "Common/CommonFuncs.h"
|
||||||
#include "Common/CommonTypes.h"
|
#include "Common/CommonTypes.h"
|
||||||
#include "Common/Intrinsics.h"
|
#include "Common/Intrinsics.h"
|
||||||
|
#include "Common/MsgHandler.h"
|
||||||
|
|
||||||
#include "VideoCommon/LookUpTables.h"
|
#include "VideoCommon/LookUpTables.h"
|
||||||
#include "VideoCommon/TextureDecoder.h"
|
#include "VideoCommon/TextureDecoder.h"
|
||||||
|
@ -221,39 +222,35 @@ static void DecodeDXTBlock(u32* dst, const DXTBlock* src, int pitch)
|
||||||
|
|
||||||
// JSD 01/06/11:
|
// JSD 01/06/11:
|
||||||
// TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte
|
// TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte
|
||||||
// boundaries to
|
// boundaries to squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower
|
||||||
// squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than
|
// than _mm_load_si128/_mm_store_si128 because they work on unaligned addresses. The processor is
|
||||||
// _mm_load_si128/_mm_store_si128
|
// free to make the assumption that addresses are multiples of 16 in the aligned case.
|
||||||
// because they work on unaligned addresses. The processor is free to make the assumption that
|
|
||||||
// addresses are multiples
|
|
||||||
// of 16 in the aligned case.
|
|
||||||
// TODO: complete SSE2 optimization of less often used texture formats.
|
// TODO: complete SSE2 optimization of less often used texture formats.
|
||||||
// TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
|
// TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
|
||||||
|
static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int texformat,
|
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
|
||||||
const u8* tlut, TlutFormat tlutfmt)
|
|
||||||
{
|
{
|
||||||
const int Wsteps4 = (width + 3) / 4;
|
switch (tlutfmt)
|
||||||
const int Wsteps8 = (width + 7) / 8;
|
|
||||||
|
|
||||||
switch (texformat)
|
|
||||||
{
|
{
|
||||||
case GX_TF_C4:
|
case GX_TL_RGB5A3:
|
||||||
if (tlutfmt == GX_TL_RGB5A3)
|
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 8)
|
for (int y = 0; y < height; y += 8)
|
||||||
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++, xStep++)
|
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++, xStep++)
|
||||||
DecodeBytes_C4_RGB5A3(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
|
DecodeBytes_C4_RGB5A3(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
|
||||||
}
|
}
|
||||||
else if (tlutfmt == GX_TL_IA8)
|
break;
|
||||||
|
|
||||||
|
case GX_TL_IA8:
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 8)
|
for (int y = 0; y < height; y += 8)
|
||||||
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++, xStep++)
|
for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++, xStep++)
|
||||||
DecodeBytes_C4_IA8(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
|
DecodeBytes_C4_IA8(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
|
||||||
}
|
}
|
||||||
else if (tlutfmt == GX_TL_RGB565)
|
break;
|
||||||
|
|
||||||
|
case GX_TL_RGB565:
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 8)
|
for (int y = 0; y < height; y += 8)
|
||||||
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
|
@ -261,21 +258,30 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
DecodeBytes_C4_RGB565(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
|
DecodeBytes_C4_RGB565(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case GX_TF_I4:
|
|
||||||
{
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
|
#if _M_SSE >= 0x301
|
||||||
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
|
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
|
||||||
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
|
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
|
||||||
#if _M_SSE >= 0x301
|
|
||||||
// xsacha optimized with SSSE3 intrinsics
|
// xsacha optimized with SSSE3 intrinsics
|
||||||
// Produces a ~40% speed improvement over SSE2 implementation
|
// Produces a ~40% speed improvement over SSE2 implementation
|
||||||
if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
const __m128i mask9180 = _mm_set_epi8(9, 9, 9, 9, 1, 1, 1, 1, 8, 8, 8, 8, 0, 0, 0, 0);
|
const __m128i mask9180 = _mm_set_epi8(9, 9, 9, 9, 1, 1, 1, 1, 8, 8, 8, 8, 0, 0, 0, 0);
|
||||||
const __m128i maskB3A2 = _mm_set_epi8(11, 11, 11, 11, 3, 3, 3, 3, 10, 10, 10, 10, 2, 2, 2, 2);
|
const __m128i maskB3A2 = _mm_set_epi8(11, 11, 11, 11, 3, 3, 3, 3, 10, 10, 10, 10, 2, 2, 2, 2);
|
||||||
const __m128i maskD5C4 = _mm_set_epi8(13, 13, 13, 13, 5, 5, 5, 5, 12, 12, 12, 12, 4, 4, 4, 4);
|
const __m128i maskD5C4 = _mm_set_epi8(13, 13, 13, 13, 5, 5, 5, 5, 12, 12, 12, 12, 4, 4, 4, 4);
|
||||||
const __m128i maskF7E6 = _mm_set_epi8(15, 15, 15, 15, 7, 7, 7, 7, 14, 14, 14, 14, 6, 6, 6, 6);
|
const __m128i maskF7E6 = _mm_set_epi8(15, 15, 15, 15, 7, 7, 7, 7, 14, 14, 14, 14, 6, 6, 6, 6);
|
||||||
for (int y = 0; y < height; y += 8)
|
for (int y = 0; y < height; y += 8)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
|
||||||
{
|
{
|
||||||
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
|
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
|
||||||
|
@ -304,13 +310,22 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
_mm_storeu_si128((__m128i*)(dst + (y + iy + 1) * width + x + 4), o4);
|
_mm_storeu_si128((__m128i*)(dst + (y + iy + 1) * width + x + 4), o4);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
|
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
|
const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
|
||||||
|
const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
|
||||||
|
|
||||||
// JSD optimized with SSE2 intrinsics.
|
// JSD optimized with SSE2 intrinsics.
|
||||||
// Produces a ~76% speed improvement over reference C implementation.
|
// Produces a ~76% speed improvement over reference C implementation.
|
||||||
{
|
|
||||||
for (int y = 0; y < height; y += 8)
|
for (int y = 0; y < height; y += 8)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
|
||||||
{
|
{
|
||||||
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
|
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
|
||||||
|
@ -387,24 +402,26 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case GX_TF_I8: // speed critical
|
|
||||||
{
|
static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
#if _M_SSE >= 0x301
|
#if _M_SSE >= 0x301
|
||||||
// xsacha optimized with SSSE3 intrinsics
|
// xsacha optimized with SSSE3 intrinsics
|
||||||
// Produces a ~10% speed improvement over SSE2 implementation
|
// Produces a ~10% speed improvement over SSE2 implementation
|
||||||
if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
|
||||||
{
|
{
|
||||||
const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
|
const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
|
||||||
|
|
||||||
const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);
|
const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);
|
||||||
__m128i *quaddst, r, rgba0, rgba1;
|
__m128i *quaddst, r, rgba0, rgba1;
|
||||||
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
|
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
|
||||||
// dcba)
|
|
||||||
r = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
|
r = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
|
||||||
// Shuffle select bytes to expand from (0000 0000 hgfe dcba) to:
|
// Shuffle select bytes to expand from (0000 0000 hgfe dcba) to:
|
||||||
rgba0 = _mm_shuffle_epi8(r, mask3210); // (dddd cccc bbbb aaaa)
|
rgba0 = _mm_shuffle_epi8(r, mask3210); // (dddd cccc bbbb aaaa)
|
||||||
|
@ -415,27 +432,28 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
_mm_storeu_si128(quaddst + 1, rgba1);
|
_mm_storeu_si128(quaddst + 1, rgba1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
|
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
// JSD optimized with SSE2 intrinsics.
|
// JSD optimized with SSE2 intrinsics.
|
||||||
// Produces an ~86% speed improvement over reference C implementation.
|
// Produces an ~86% speed improvement over reference C implementation.
|
||||||
{
|
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
{
|
{
|
||||||
// Each loop iteration processes 4 rows from 4 64-bit reads.
|
// Each loop iteration processes 4 rows from 4 64-bit reads.
|
||||||
const u8* src2 = src + 32 * yStep;
|
const u8* src2 = src + 32 * yStep;
|
||||||
// TODO: is it more efficient to group the loads together sequentially and also the stores
|
// TODO: is it more efficient to group the loads together sequentially and also the stores
|
||||||
// at the end?
|
// at the end? _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance
|
||||||
// _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance significantly
|
// significantly WORSE, so I went with _mm_stores. Perhaps there is some edge case here
|
||||||
// WORSE, so I
|
// creating the terrible performance or we're not aligned to 16-byte boundaries. I don't know.
|
||||||
// went with _mm_stores. Perhaps there is some edge case here creating the terrible
|
|
||||||
// performance or we're
|
|
||||||
// not aligned to 16-byte boundaries. I don't know.
|
|
||||||
__m128i* quaddst;
|
__m128i* quaddst;
|
||||||
|
|
||||||
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
|
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
|
||||||
// dcba)
|
|
||||||
const __m128i r0 = _mm_loadl_epi64((const __m128i*)src2);
|
const __m128i r0 = _mm_loadl_epi64((const __m128i*)src2);
|
||||||
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
|
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
|
||||||
// bbaa)
|
// bbaa)
|
||||||
|
@ -454,8 +472,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// Store (hhhh gggg ffff eeee) out:
|
// Store (hhhh gggg ffff eeee) out:
|
||||||
_mm_storeu_si128(quaddst + 1, rgba1);
|
_mm_storeu_si128(quaddst + 1, rgba1);
|
||||||
|
|
||||||
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
|
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
|
||||||
// dcba)
|
|
||||||
src2 += 8;
|
src2 += 8;
|
||||||
const __m128i r2 = _mm_loadl_epi64((const __m128i*)src2);
|
const __m128i r2 = _mm_loadl_epi64((const __m128i*)src2);
|
||||||
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
|
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
|
||||||
|
@ -475,8 +492,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// Store (hhhh gggg ffff eeee) out:
|
// Store (hhhh gggg ffff eeee) out:
|
||||||
_mm_storeu_si128(quaddst + 1, rgba3);
|
_mm_storeu_si128(quaddst + 1, rgba3);
|
||||||
|
|
||||||
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
|
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
|
||||||
// dcba)
|
|
||||||
src2 += 8;
|
src2 += 8;
|
||||||
const __m128i r4 = _mm_loadl_epi64((const __m128i*)src2);
|
const __m128i r4 = _mm_loadl_epi64((const __m128i*)src2);
|
||||||
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
|
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
|
||||||
|
@ -496,8 +512,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// Store (hhhh gggg ffff eeee) out:
|
// Store (hhhh gggg ffff eeee) out:
|
||||||
_mm_storeu_si128(quaddst + 1, rgba5);
|
_mm_storeu_si128(quaddst + 1, rgba5);
|
||||||
|
|
||||||
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
|
// Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
|
||||||
// dcba)
|
|
||||||
src2 += 8;
|
src2 += 8;
|
||||||
const __m128i r6 = _mm_loadl_epi64((const __m128i*)src2);
|
const __m128i r6 = _mm_loadl_epi64((const __m128i*)src2);
|
||||||
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
|
// Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
|
||||||
|
@ -518,24 +533,32 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
_mm_storeu_si128(quaddst + 1, rgba7);
|
_mm_storeu_si128(quaddst + 1, rgba7);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
case GX_TF_C8:
|
static void TexDecoder_DecodeImpl_C8(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
if (tlutfmt == GX_TL_RGB5A3)
|
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
|
switch (tlutfmt)
|
||||||
|
{
|
||||||
|
case GX_TL_RGB5A3:
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
DecodeBytes_C8_RGB5A3((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlut);
|
DecodeBytes_C8_RGB5A3((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlut);
|
||||||
}
|
}
|
||||||
else if (tlutfmt == GX_TL_IA8)
|
break;
|
||||||
|
|
||||||
|
case GX_TL_IA8:
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
DecodeBytes_C8_IA8(dst + (y + iy) * width + x, src + 8 * xStep, tlut);
|
DecodeBytes_C8_IA8(dst + (y + iy) * width + x, src + 8 * xStep, tlut);
|
||||||
}
|
}
|
||||||
else if (tlutfmt == GX_TL_RGB565)
|
break;
|
||||||
|
|
||||||
|
case GX_TL_RGB565:
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
|
@ -543,23 +566,38 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
DecodeBytes_C8_RGB565(dst + (y + iy) * width + x, src + 8 * xStep, tlut);
|
DecodeBytes_C8_RGB565(dst + (y + iy) * width + x, src + 8 * xStep, tlut);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case GX_TF_IA4:
|
|
||||||
{
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_IA4(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
|
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
|
{
|
||||||
DecodeBytes_IA4(dst + (y + iy) * width + x, src + 8 * xStep);
|
DecodeBytes_IA4(dst + (y + iy) * width + x, src + 8 * xStep);
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case GX_TF_IA8:
|
}
|
||||||
{
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
#if _M_SSE >= 0x301
|
#if _M_SSE >= 0x301
|
||||||
// xsacha optimized with SSSE3 intrinsics.
|
// xsacha optimized with SSSE3 intrinsics.
|
||||||
// Produces an ~50% speed improvement over SSE2 implementation.
|
// Produces an ~50% speed improvement over SSE2 implementation.
|
||||||
if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
{
|
{
|
||||||
const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1);
|
const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1);
|
||||||
|
@ -571,17 +609,23 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
_mm_storeu_si128((__m128i*)(dst + (y + iy) * width + x), r1);
|
_mm_storeu_si128((__m128i*)(dst + (y + iy) * width + x), r1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
|
const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
// JSD optimized with SSE2 intrinsics.
|
// JSD optimized with SSE2 intrinsics.
|
||||||
// Produces an ~80% speed improvement over reference C implementation.
|
// Produces an ~80% speed improvement over reference C implementation.
|
||||||
{
|
|
||||||
const __m128i kMask_xf0 = _mm_set_epi32(0x00000000L, 0x00000000L, 0xff00ff00L, 0xff00ff00L);
|
const __m128i kMask_xf0 = _mm_set_epi32(0x00000000L, 0x00000000L, 0xff00ff00L, 0xff00ff00L);
|
||||||
const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
|
const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
|
||||||
const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
|
const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
|
||||||
const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
|
const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
{
|
{
|
||||||
// Expands a 16-bit "IA" to a 32-bit "AIII". Each char is an 8-bit value.
|
// Expands a 16-bit "IA" to a 32-bit "AIII". Each char is an 8-bit value.
|
||||||
|
@ -591,8 +635,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
|
const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
|
||||||
|
|
||||||
// Logical shift all 16-bit words right by 8 bits (0000 0000 hgfe dcba) to (0000 0000
|
// Logical shift all 16-bit words right by 8 bits (0000 0000 hgfe dcba) to (0000 0000
|
||||||
// 0h0f 0d0b)
|
// 0h0f 0d0b). This gets us only the I components.
|
||||||
// This gets us only the I components.
|
|
||||||
const __m128i i0 = _mm_srli_epi16(r0, 8);
|
const __m128i i0 = _mm_srli_epi16(r0, 8);
|
||||||
|
|
||||||
// Now join up the I components from their original positions but mask out the A
|
// Now join up the I components from their original positions but mask out the A
|
||||||
|
@ -608,8 +651,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
const __m128i i3 = _mm_and_si128(i2, kMask_x0fff);
|
const __m128i i3 = _mm_and_si128(i2, kMask_x0fff);
|
||||||
|
|
||||||
// Now that we have the I components in 32-bit word form, time to work out the A components
|
// Now that we have the I components in 32-bit word form, time to work out the A components
|
||||||
// into
|
// into their final positions.
|
||||||
// their final positions.
|
|
||||||
|
|
||||||
// (0000 0000 hgfe dcba) & kMask_x0f -> (0000 0000 0g0e 0c0a)
|
// (0000 0000 hgfe dcba) & kMask_x0f -> (0000 0000 0g0e 0c0a)
|
||||||
const __m128i a0 = _mm_and_si128(r0, kMask_x0f);
|
const __m128i a0 = _mm_and_si128(r0, kMask_x0f);
|
||||||
|
@ -629,23 +671,33 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case GX_TF_C14X2:
|
|
||||||
if (tlutfmt == GX_TL_RGB5A3)
|
static void TexDecoder_DecodeImpl_C14X2(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
|
switch (tlutfmt)
|
||||||
|
{
|
||||||
|
case GX_TL_RGB5A3:
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
DecodeBytes_C14X2_RGB5A3(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
|
DecodeBytes_C14X2_RGB5A3(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
|
||||||
}
|
}
|
||||||
else if (tlutfmt == GX_TL_IA8)
|
break;
|
||||||
|
|
||||||
|
case GX_TL_IA8:
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
DecodeBytes_C14X2_IA8(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
|
DecodeBytes_C14X2_IA8(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
|
||||||
}
|
}
|
||||||
else if (tlutfmt == GX_TL_RGB565)
|
break;
|
||||||
|
|
||||||
|
case GX_TL_RGB565:
|
||||||
{
|
{
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
|
@ -653,8 +705,16 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
DecodeBytes_C14X2_RGB565(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
|
DecodeBytes_C14X2_RGB565(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case GX_TF_RGB565:
|
|
||||||
{
|
default:
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_RGB565(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
// JSD optimized with SSE2 intrinsics.
|
// JSD optimized with SSE2 intrinsics.
|
||||||
// Produces an ~78% speed improvement over reference C implementation.
|
// Produces an ~78% speed improvement over reference C implementation.
|
||||||
const __m128i kMaskR0 = _mm_set1_epi32(0x000000F8);
|
const __m128i kMaskR0 = _mm_set1_epi32(0x000000F8);
|
||||||
|
@ -663,7 +723,9 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
|
const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
|
||||||
const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
|
const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
{
|
{
|
||||||
__m128i* dxtsrc = (__m128i*)(src + 8 * xStep);
|
__m128i* dxtsrc = (__m128i*)(src + 8 * xStep);
|
||||||
|
@ -672,9 +734,8 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
const __m128i rgb565x4 = _mm_loadl_epi64(dxtsrc);
|
const __m128i rgb565x4 = _mm_loadl_epi64(dxtsrc);
|
||||||
|
|
||||||
// The big-endian 16-bit colors `ba` and `dc` look like 0b_gggBBBbb_RRRrrGGg in a little
|
// The big-endian 16-bit colors `ba` and `dc` look like 0b_gggBBBbb_RRRrrGGg in a little
|
||||||
// endian xmm register
|
// endian xmm register. Unpack `hgfe dcba` to `hhgg ffee ddcc bbaa`, where each 32-bit word
|
||||||
// Unpack `hgfe dcba` to `hhgg ffee ddcc bbaa`, where each 32-bit word is now
|
// is now 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
|
||||||
// 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
|
|
||||||
const __m128i c0 = _mm_unpacklo_epi16(rgb565x4, rgb565x4);
|
const __m128i c0 = _mm_unpacklo_epi16(rgb565x4, rgb565x4);
|
||||||
|
|
||||||
// swizzle 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
|
// swizzle 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
|
||||||
|
@ -718,9 +779,14 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
_mm_storeu_si128(ptr, abgr888x4);
|
_mm_storeu_si128(ptr, abgr888x4);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case GX_TF_RGB5A3:
|
}
|
||||||
{
|
|
||||||
|
static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
|
#if _M_SSE >= 0x301
|
||||||
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
|
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
|
||||||
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
|
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
|
||||||
const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
|
const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
|
||||||
|
@ -728,22 +794,20 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// for the RGB555 case when (s[x] & 0x8000) is true for all pixels.
|
// for the RGB555 case when (s[x] & 0x8000) is true for all pixels.
|
||||||
const __m128i aVxff00 = _mm_set1_epi32(0xFF000000L);
|
const __m128i aVxff00 = _mm_set1_epi32(0xFF000000L);
|
||||||
|
|
||||||
#if _M_SSE >= 0x301
|
|
||||||
// xsacha optimized with SSSE3 intrinsics (2 in 4 cases)
|
// xsacha optimized with SSSE3 intrinsics (2 in 4 cases)
|
||||||
// Produces a ~10% speed improvement over SSE2 implementation
|
// Produces a ~10% speed improvement over SSE2 implementation
|
||||||
if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
{
|
{
|
||||||
u32* newdst = dst + (y + iy) * width + x;
|
u32* newdst = dst + (y + iy) * width + x;
|
||||||
const __m128i mask = _mm_set_epi8(-128, -128, 6, 7, -128, -128, 4, 5, -128, -128, 2, 3,
|
const __m128i mask =
|
||||||
-128, -128, 0, 1);
|
_mm_set_epi8(-128, -128, 6, 7, -128, -128, 4, 5, -128, -128, 2, 3, -128, -128, 0, 1);
|
||||||
const __m128i valV =
|
const __m128i valV =
|
||||||
_mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)(src + 8 * xStep)), mask);
|
_mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)(src + 8 * xStep)), mask);
|
||||||
int cmp =
|
int cmp = _mm_movemask_epi8(valV); // MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
|
||||||
_mm_movemask_epi8(valV); // MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
|
|
||||||
if ((cmp & 0x2222) ==
|
if ((cmp & 0x2222) ==
|
||||||
0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
|
0x2222) // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
|
||||||
{
|
{
|
||||||
|
@ -781,8 +845,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
|
// b0 = (((val0 ) & 0xf) << 4) | ((val0 ) & 0xf);
|
||||||
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
|
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
|
||||||
const __m128i bV = _mm_or_si128(_mm_slli_epi16(tmpbV, 4), tmpbV);
|
const __m128i bV = _mm_or_si128(_mm_slli_epi16(tmpbV, 4), tmpbV);
|
||||||
// a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >>
|
// a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
|
||||||
// 1);
|
|
||||||
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
|
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
|
||||||
const __m128i aV =
|
const __m128i aV =
|
||||||
_mm_or_si128(_mm_slli_epi16(tmpaV, 5),
|
_mm_or_si128(_mm_slli_epi16(tmpaV, 5),
|
||||||
|
@ -823,13 +886,27 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
}
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
|
const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
|
||||||
|
const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
|
||||||
|
const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
|
||||||
|
// This is the hard-coded 0xFF alpha constant that is ORed in place after the RGB are calculated
|
||||||
|
// for the RGB555 case when (s[x] & 0x8000) is true for all pixels.
|
||||||
|
const __m128i aVxff00 = _mm_set1_epi32(0xFF000000L);
|
||||||
|
|
||||||
// JSD optimized with SSE2 intrinsics (2 in 4 cases)
|
// JSD optimized with SSE2 intrinsics (2 in 4 cases)
|
||||||
// Produces a ~25% speed improvement over reference C implementation.
|
// Produces a ~25% speed improvement over reference C implementation.
|
||||||
{
|
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
|
{
|
||||||
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
|
||||||
{
|
{
|
||||||
u32* newdst = dst + (y + iy) * width + x;
|
u32* newdst = dst + (y + iy) * width + x;
|
||||||
|
@ -869,8 +946,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// write the final result:
|
// write the final result:
|
||||||
_mm_storeu_si128((__m128i*)newdst, final);
|
_mm_storeu_si128((__m128i*)newdst, final);
|
||||||
}
|
}
|
||||||
else if (((val0 & 0x8000) | (val1 & 0x8000) | (val2 & 0x8000) | (val3 & 0x8000)) ==
|
else if (((val0 & 0x8000) | (val1 & 0x8000) | (val2 & 0x8000) | (val3 & 0x8000)) == 0x0000)
|
||||||
0x0000)
|
|
||||||
{
|
{
|
||||||
// SSE2 case #2: all 4 pixels are in RGBA4443.
|
// SSE2 case #2: all 4 pixels are in RGBA4443.
|
||||||
|
|
||||||
|
@ -888,8 +964,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
|
const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
|
||||||
const __m128i bV = _mm_or_si128(_mm_slli_epi16(tmpbV, 4), tmpbV);
|
const __m128i bV = _mm_or_si128(_mm_slli_epi16(tmpbV, 4), tmpbV);
|
||||||
|
|
||||||
// a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >>
|
// a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
|
||||||
// 1);
|
|
||||||
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
|
const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
|
||||||
const __m128i aV =
|
const __m128i aV =
|
||||||
_mm_or_si128(_mm_slli_epi16(tmpaV, 5),
|
_mm_or_si128(_mm_slli_epi16(tmpaV, 5),
|
||||||
|
@ -933,20 +1008,21 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
}
|
||||||
case GX_TF_RGBA8: // speed critical
|
|
||||||
{
|
static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
#if _M_SSE >= 0x301
|
#if _M_SSE >= 0x301
|
||||||
// xsacha optimized with SSSE3 intrinsics
|
// xsacha optimized with SSSE3 intrinsics
|
||||||
// Produces a ~30% speed improvement over SSE2 implementation
|
// Produces a ~30% speed improvement over SSE2 implementation
|
||||||
if (cpu_info.bSSSE3)
|
|
||||||
{
|
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
{
|
{
|
||||||
const u8* src2 = src + 64 * yStep;
|
const u8* src2 = src + 64 * yStep;
|
||||||
const __m128i mask0312 =
|
const __m128i mask0312 = _mm_set_epi8(12, 15, 13, 14, 8, 11, 9, 10, 4, 7, 5, 6, 0, 3, 1, 2);
|
||||||
_mm_set_epi8(12, 15, 13, 14, 8, 11, 9, 10, 4, 7, 5, 6, 0, 3, 1, 2);
|
|
||||||
const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
|
const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
|
||||||
const __m128i ar1 = _mm_loadu_si128((__m128i*)src2 + 1);
|
const __m128i ar1 = _mm_loadu_si128((__m128i*)src2 + 1);
|
||||||
const __m128i gb0 = _mm_loadu_si128((__m128i*)src2 + 2);
|
const __m128i gb0 = _mm_loadu_si128((__m128i*)src2 + 2);
|
||||||
|
@ -967,34 +1043,36 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
_mm_storeu_si128(dst128, rgba11);
|
_mm_storeu_si128(dst128, rgba11);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else
|
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
static void TexDecoder_DecodeImpl_RGBA8(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
// JSD optimized with SSE2 intrinsics
|
// JSD optimized with SSE2 intrinsics
|
||||||
// Produces a ~68% speed improvement over reference C implementation.
|
// Produces a ~68% speed improvement over reference C implementation.
|
||||||
{
|
|
||||||
for (int y = 0; y < height; y += 4)
|
for (int y = 0; y < height; y += 4)
|
||||||
|
{
|
||||||
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
|
||||||
{
|
{
|
||||||
// Input is divided up into 16-bit words. The texels are split up into AR and GB
|
// Input is divided up into 16-bit words. The texels are split up into AR and GB
|
||||||
// components where all
|
// components where all AR components come grouped up first in 32 bytes followed by the GB
|
||||||
// AR components come grouped up first in 32 bytes followed by the GB components in 32
|
// components in 32 bytes. We are processing 16 texels per each loop iteration, numbered from
|
||||||
// bytes. We are
|
// 0-f.
|
||||||
// processing 16 texels per each loop iteration, numbered from 0-f.
|
|
||||||
//
|
//
|
||||||
// Convention is:
|
// Convention is:
|
||||||
// one byte is [component-name texel-number]
|
// one byte is [component-name texel-number]
|
||||||
// __m128i is (4-bytes 4-bytes 4-bytes 4-bytes)
|
// __m128i is (4-bytes 4-bytes 4-bytes 4-bytes)
|
||||||
//
|
//
|
||||||
// Input is ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A
|
// Input is:
|
||||||
// 0][R 0])
|
// ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
|
||||||
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A
|
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
|
||||||
// 8][R 8])
|
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
|
||||||
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G
|
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
|
||||||
// 0][B 0])
|
|
||||||
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G
|
|
||||||
// 8][B 8])
|
|
||||||
//
|
//
|
||||||
// Output is (RGBA3 RGBA2 RGBA1 RGBA0)
|
// Output is:
|
||||||
|
// (RGBA3 RGBA2 RGBA1 RGBA0)
|
||||||
// (RGBA7 RGBA6 RGBA5 RGBA4)
|
// (RGBA7 RGBA6 RGBA5 RGBA4)
|
||||||
// (RGBAb RGBAa RGBA9 RGBA8)
|
// (RGBAb RGBAa RGBA9 RGBA8)
|
||||||
// (RGBAf RGBAe RGBAd RGBAc)
|
// (RGBAf RGBAe RGBAd RGBAc)
|
||||||
|
@ -1012,28 +1090,21 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// b][G a][B a] [G 9][B 9][G 8][B 8])
|
// b][G a][B a] [G 9][B 9][G 8][B 8])
|
||||||
const __m128i gb1 = _mm_loadu_si128((__m128i*)src2 + 3);
|
const __m128i gb1 = _mm_loadu_si128((__m128i*)src2 + 3);
|
||||||
__m128i rgba00, rgba01, rgba10, rgba11;
|
__m128i rgba00, rgba01, rgba10, rgba11;
|
||||||
const __m128i kMask_x000f =
|
const __m128i kMask_x000f = _mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
|
||||||
_mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
|
const __m128i kMask_xf000 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
|
||||||
const __m128i kMask_xf000 =
|
const __m128i kMask_x0ff0 = _mm_set_epi32(0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L);
|
||||||
_mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
|
|
||||||
const __m128i kMask_x0ff0 =
|
|
||||||
_mm_set_epi32(0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L);
|
|
||||||
// Expand the AR components to fill out 32-bit words:
|
// Expand the AR components to fill out 32-bit words:
|
||||||
// ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
|
// ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
|
||||||
// -> ([A 3][A 3][R 3][R 3] [A 2][A 2][R 2][R 2] [A 1][A 1][R 1][R 1] [A 0][A 0][R 0][R
|
// -> ([A 3][A 3][R 3][R 3] [A 2][A 2][R 2][R 2] [A 1][A 1][R 1][R 1] [A 0][A 0][R 0][R 0])
|
||||||
// 0])
|
|
||||||
const __m128i aarr00 = _mm_unpacklo_epi8(ar0, ar0);
|
const __m128i aarr00 = _mm_unpacklo_epi8(ar0, ar0);
|
||||||
// ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
|
// ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
|
||||||
// -> ([A 7][A 7][R 7][R 7] [A 6][A 6][R 6][R 6] [A 5][A 5][R 5][R 5] [A 4][A 4][R 4][R
|
// -> ([A 7][A 7][R 7][R 7] [A 6][A 6][R 6][R 6] [A 5][A 5][R 5][R 5] [A 4][A 4][R 4][R 4])
|
||||||
// 4])
|
|
||||||
const __m128i aarr01 = _mm_unpackhi_epi8(ar0, ar0);
|
const __m128i aarr01 = _mm_unpackhi_epi8(ar0, ar0);
|
||||||
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
|
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
|
||||||
// -> ([A b][A b][R b][R b] [A a][A a][R a][R a] [A 9][A 9][R 9][R 9] [A 8][A 8][R 8][R
|
// -> ([A b][A b][R b][R b] [A a][A a][R a][R a] [A 9][A 9][R 9][R 9] [A 8][A 8][R 8][R 8])
|
||||||
// 8])
|
|
||||||
const __m128i aarr10 = _mm_unpacklo_epi8(ar1, ar1);
|
const __m128i aarr10 = _mm_unpacklo_epi8(ar1, ar1);
|
||||||
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
|
// ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
|
||||||
// -> ([A f][A f][R f][R f] [A e][A e][R e][R e] [A d][A d][R d][R d] [A c][A c][R c][R
|
// -> ([A f][A f][R f][R f] [A e][A e][R e][R e] [A d][A d][R d][R d] [A c][A c][R c][R c])
|
||||||
// c])
|
|
||||||
const __m128i aarr11 = _mm_unpackhi_epi8(ar1, ar1);
|
const __m128i aarr11 = _mm_unpackhi_epi8(ar1, ar1);
|
||||||
|
|
||||||
// Move A right 16 bits and mask off everything but the lowest 8 bits to get A in its
|
// Move A right 16 bits and mask off everything but the lowest 8 bits to get A in its
|
||||||
|
@ -1059,20 +1130,16 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
|
|
||||||
// Expand the GB components to fill out 32-bit words:
|
// Expand the GB components to fill out 32-bit words:
|
||||||
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
|
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
|
||||||
// -> ([G 3][G 3][B 3][B 3] [G 2][G 2][B 2][B 2] [G 1][G 1][B 1][B 1] [G 0][G 0][B 0][B
|
// -> ([G 3][G 3][B 3][B 3] [G 2][G 2][B 2][B 2] [G 1][G 1][B 1][B 1] [G 0][G 0][B 0][B 0])
|
||||||
// 0])
|
|
||||||
const __m128i ggbb00 = _mm_unpacklo_epi8(gb0, gb0);
|
const __m128i ggbb00 = _mm_unpacklo_epi8(gb0, gb0);
|
||||||
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
|
// ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
|
||||||
// -> ([G 7][G 7][B 7][B 7] [G 6][G 6][B 6][B 6] [G 5][G 5][B 5][B 5] [G 4][G 4][B 4][B
|
// -> ([G 7][G 7][B 7][B 7] [G 6][G 6][B 6][B 6] [G 5][G 5][B 5][B 5] [G 4][G 4][B 4][B 4])
|
||||||
// 4])
|
|
||||||
const __m128i ggbb01 = _mm_unpackhi_epi8(gb0, gb0);
|
const __m128i ggbb01 = _mm_unpackhi_epi8(gb0, gb0);
|
||||||
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
|
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
|
||||||
// -> ([G b][G b][B b][B b] [G a][G a][B a][B a] [G 9][G 9][B 9][B 9] [G 8][G 8][B 8][B
|
// -> ([G b][G b][B b][B b] [G a][G a][B a][B a] [G 9][G 9][B 9][B 9] [G 8][G 8][B 8][B 8])
|
||||||
// 8])
|
|
||||||
const __m128i ggbb10 = _mm_unpacklo_epi8(gb1, gb1);
|
const __m128i ggbb10 = _mm_unpacklo_epi8(gb1, gb1);
|
||||||
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
|
// ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
|
||||||
// -> ([G f][G f][B f][B f] [G e][G e][B e][B e] [G d][G d][B d][B d] [G c][G c][B c][B
|
// -> ([G f][G f][B f][B f] [G e][G e][B e][B e] [G d][G d][B d][B d] [G c][G c][B c][B c])
|
||||||
// c])
|
|
||||||
const __m128i ggbb11 = _mm_unpackhi_epi8(gb1, gb1);
|
const __m128i ggbb11 = _mm_unpackhi_epi8(gb1, gb1);
|
||||||
|
|
||||||
// G and B are already in perfect spots in the center, just remove the extra copies in the
|
// G and B are already in perfect spots in the center, just remove the extra copies in the
|
||||||
|
@ -1098,28 +1165,25 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
_mm_storeu_si128(dst128, rgba11);
|
_mm_storeu_si128(dst128, rgba11);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
case GX_TF_CMPR: // speed critical
|
static void TexDecoder_DecodeImpl_CMPR(u32* dst, const u8* src, int width, int height,
|
||||||
|
int texformat, const u8* tlut, TlutFormat tlutfmt,
|
||||||
|
int Wsteps4, int Wsteps8)
|
||||||
|
{
|
||||||
// The metroid games use this format almost exclusively.
|
// The metroid games use this format almost exclusively.
|
||||||
{
|
|
||||||
// JSD optimized with SSE2 intrinsics.
|
// JSD optimized with SSE2 intrinsics.
|
||||||
// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference
|
// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference
|
||||||
// C implementation.
|
// C implementation. The x64 compiled reference C code is faster than the x86 compiled reference
|
||||||
// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the
|
// C code, but the SSE2 is faster than both.
|
||||||
// SSE2 is
|
|
||||||
// faster than both.
|
|
||||||
for (int y = 0; y < height; y += 8)
|
for (int y = 0; y < height; y += 8)
|
||||||
{
|
{
|
||||||
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
|
||||||
{
|
{
|
||||||
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit
|
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
|
||||||
// registers.
|
|
||||||
// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their
|
// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their
|
||||||
// 16-bit.
|
// 16-bit. Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is
|
||||||
// Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is
|
// parallelizable at this level, so we do.
|
||||||
// parallelizable
|
|
||||||
// at this level, so we do.
|
|
||||||
for (int z = 0, xStep = 2 * yStep; z < 2; ++z, xStep++)
|
for (int z = 0, xStep = 2 * yStep; z < 2; ++z, xStep++)
|
||||||
{
|
{
|
||||||
// JSD NOTE: You may see many strange patterns of behavior in the below code, but they
|
// JSD NOTE: You may see many strange patterns of behavior in the below code, but they
|
||||||
|
@ -1127,17 +1191,13 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// constants is faster than loading their values from memory. Unfortunately, there is no
|
// constants is faster than loading their values from memory. Unfortunately, there is no
|
||||||
// way to inline 128-bit constants from opcodes so they must be loaded from memory. This
|
// way to inline 128-bit constants from opcodes so they must be loaded from memory. This
|
||||||
// seems a little ridiculous to me in that you can't even generate a constant value of 1
|
// seems a little ridiculous to me in that you can't even generate a constant value of 1
|
||||||
// without
|
// without having to load it from memory. So, I stored the minimal constant I could,
|
||||||
// having to load it from memory. So, I stored the minimal constant I could, 128-bits
|
// 128-bits worth of 1s :). Then I use sequences of shifts to squash it to the appropriate
|
||||||
// worth
|
// size and bit positions that I need.
|
||||||
// of 1s :). Then I use sequences of shifts to squash it to the appropriate size and bit
|
|
||||||
// positions that I need.
|
|
||||||
|
|
||||||
const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
|
const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
|
||||||
|
|
||||||
// Load 128 bits, i.e. two DXTBlocks (64-bits each)
|
// Load 128 bits, i.e. two DXTBlocks (64-bits each)
|
||||||
const __m128i dxt =
|
const __m128i dxt = _mm_loadu_si128((__m128i*)(src + sizeof(struct DXTBlock) * 2 * xStep));
|
||||||
_mm_loadu_si128((__m128i*)(src + sizeof(struct DXTBlock) * 2 * xStep));
|
|
||||||
|
|
||||||
// Copy the 2-bit indices from each DXT block:
|
// Copy the 2-bit indices from each DXT block:
|
||||||
alignas(16) u32 dxttmp[4];
|
alignas(16) u32 dxttmp[4];
|
||||||
|
@ -1149,8 +1209,8 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
__m128i argb888x4;
|
__m128i argb888x4;
|
||||||
__m128i c1 = _mm_unpackhi_epi16(dxt, dxt);
|
__m128i c1 = _mm_unpackhi_epi16(dxt, dxt);
|
||||||
c1 = _mm_slli_si128(c1, 8);
|
c1 = _mm_slli_si128(c1, 8);
|
||||||
const __m128i c0 = _mm_or_si128(
|
const __m128i c0 =
|
||||||
c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
|
_mm_or_si128(c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
|
||||||
|
|
||||||
// Compare rgb0 to rgb1:
|
// Compare rgb0 to rgb1:
|
||||||
// Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
|
// Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
|
||||||
|
@ -1163,16 +1223,14 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
|
|
||||||
// green:
|
// green:
|
||||||
// NOTE: We start with the larger number of bits (6) first for G and shift the mask down
|
// NOTE: We start with the larger number of bits (6) first for G and shift the mask down
|
||||||
// 1 bit to get a 5-bit mask
|
// 1 bit to get a 5-bit mask later for R and B components.
|
||||||
// later for R and B components.
|
|
||||||
// low6mask == _mm_set_epi32(0x0000FC00, 0x0000FC00, 0x0000FC00, 0x0000FC00)
|
// low6mask == _mm_set_epi32(0x0000FC00, 0x0000FC00, 0x0000FC00, 0x0000FC00)
|
||||||
const __m128i low6mask = _mm_slli_epi32(_mm_srli_epi32(allFFs128, 24 + 2), 8 + 2);
|
const __m128i low6mask = _mm_slli_epi32(_mm_srli_epi32(allFFs128, 24 + 2), 8 + 2);
|
||||||
const __m128i gtmp = _mm_srli_epi32(c0, 3);
|
const __m128i gtmp = _mm_srli_epi32(c0, 3);
|
||||||
const __m128i g0 = _mm_and_si128(gtmp, low6mask);
|
const __m128i g0 = _mm_and_si128(gtmp, low6mask);
|
||||||
// low3mask == _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300)
|
// low3mask == _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300)
|
||||||
const __m128i g1 =
|
const __m128i g1 = _mm_and_si128(
|
||||||
_mm_and_si128(_mm_srli_epi32(gtmp, 6),
|
_mm_srli_epi32(gtmp, 6), _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
|
||||||
_mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
|
|
||||||
argb888x4 = _mm_or_si128(g0, g1);
|
argb888x4 = _mm_or_si128(g0, g1);
|
||||||
// red:
|
// red:
|
||||||
// low5mask == _mm_set_epi32(0x000000F8, 0x000000F8, 0x000000F8, 0x000000F8)
|
// low5mask == _mm_set_epi32(0x000000F8, 0x000000F8, 0x000000F8, 0x000000F8)
|
||||||
|
@ -1263,8 +1321,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
// _mm_srli_epi32( allFFs128, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
|
// _mm_srli_epi32( allFFs128, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
|
||||||
// 0x00FFFFFF)
|
// 0x00FFFFFF)
|
||||||
// Make this color fully transparent:
|
// Make this color fully transparent:
|
||||||
rgb3 = _mm_or_si128(rgb3,
|
rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_and_si128(rgb2, _mm_srli_epi32(allFFs128, 8)),
|
||||||
_mm_and_si128(_mm_and_si128(rgb2, _mm_srli_epi32(allFFs128, 8)),
|
|
||||||
_mm_slli_si128(allFFs128, 8)));
|
_mm_slli_si128(allFFs128, 8)));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1287,11 +1344,10 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
u32 tmp0[4][4], tmp1[4][4];
|
u32 tmp0[4][4], tmp1[4][4];
|
||||||
|
|
||||||
DecodeDXTBlock(&(tmp0[0][0]),
|
DecodeDXTBlock(&(tmp0[0][0]),
|
||||||
reinterpret_cast<const DXTBlock*>(src + sizeof(DXTBlock) * 2 * xStep),
|
reinterpret_cast<const DXTBlock*>(src + sizeof(DXTBlock) * 2 * xStep), 4);
|
||||||
|
DecodeDXTBlock(&(tmp1[0][0]),
|
||||||
|
reinterpret_cast<const DXTBlock*>((src + sizeof(DXTBlock) * 2 * xStep) + 8),
|
||||||
4);
|
4);
|
||||||
DecodeDXTBlock(
|
|
||||||
&(tmp1[0][0]),
|
|
||||||
reinterpret_cast<const DXTBlock*>((src + sizeof(DXTBlock) * 2 * xStep) + 8), 4);
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
u32* dst32 = (dst + (y + z * 4) * width + x);
|
u32* dst32 = (dst + (y + z * 4) * width + x);
|
||||||
|
@ -1357,7 +1413,95 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int texformat,
|
||||||
|
const u8* tlut, TlutFormat tlutfmt)
|
||||||
|
{
|
||||||
|
int Wsteps4 = (width + 3) / 4;
|
||||||
|
int Wsteps8 = (width + 7) / 8;
|
||||||
|
|
||||||
|
// If the binary was not compiled with SSSE3 support, the functions turn into no-ops.
|
||||||
|
// Therefore, we shouldn't call them based on what the CPU reports at runtime alone.
|
||||||
|
#if _M_SSE >= 0x301
|
||||||
|
bool has_SSSE3 = cpu_info.bSSSE3;
|
||||||
|
#else
|
||||||
|
bool has_SSSE3 = false;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
switch (texformat)
|
||||||
|
{
|
||||||
|
case GX_TF_C4:
|
||||||
|
TexDecoder_DecodeImpl_C4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_I4:
|
||||||
|
if (has_SSSE3)
|
||||||
|
TexDecoder_DecodeImpl_I4_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
else
|
||||||
|
TexDecoder_DecodeImpl_I4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_I8:
|
||||||
|
if (has_SSSE3)
|
||||||
|
TexDecoder_DecodeImpl_I8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
else
|
||||||
|
TexDecoder_DecodeImpl_I8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_C8:
|
||||||
|
TexDecoder_DecodeImpl_C8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_IA4:
|
||||||
|
TexDecoder_DecodeImpl_IA4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_IA8:
|
||||||
|
if (has_SSSE3)
|
||||||
|
TexDecoder_DecodeImpl_IA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
else
|
||||||
|
TexDecoder_DecodeImpl_IA8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_C14X2:
|
||||||
|
TexDecoder_DecodeImpl_C14X2(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_RGB565:
|
||||||
|
TexDecoder_DecodeImpl_RGB565(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_RGB5A3:
|
||||||
|
if (has_SSSE3)
|
||||||
|
TexDecoder_DecodeImpl_RGB5A3_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
else
|
||||||
|
TexDecoder_DecodeImpl_RGB5A3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_RGBA8:
|
||||||
|
if (has_SSSE3)
|
||||||
|
TexDecoder_DecodeImpl_RGBA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
else
|
||||||
|
TexDecoder_DecodeImpl_RGBA8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
|
||||||
|
Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
case GX_TF_CMPR:
|
||||||
|
TexDecoder_DecodeImpl_CMPR(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
|
||||||
|
break;
|
||||||
|
|
||||||
|
default:
|
||||||
|
PanicAlert("Unhandled texture format %d", texformat);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue