Enabled the SSE2 optimization of the GX_TF_CMPR decoder for x86 builds only. On x64 builds the SSE2 path can't compete with the optimized reference C code.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6755 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
809670611b
commit
5ca3adde3c
|
@ -1707,20 +1707,18 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
||||||
case GX_TF_CMPR: // speed critical
|
case GX_TF_CMPR: // speed critical
|
||||||
// The metroid games use this format almost exclusively.
|
// The metroid games use this format almost exclusively.
|
||||||
{
|
{
|
||||||
|
#if !defined(_M_X64)
|
||||||
// JSD optimized with SSE2 intrinsics.
|
// JSD optimized with SSE2 intrinsics.
|
||||||
// Produces a ~40% improvement in speed over reference C implementation on an x86 Intel Core 2 Duo
|
// Produces a 30% improvement for x86 code only.
|
||||||
// but also a -10% to 0% "improvement" on an x64 AMD Phenom II. Further optimization is required to
|
|
||||||
// ensure that all architectures and CPUs gain benefits.
|
|
||||||
for (int y = 0; y < height; y += 8)
|
for (int y = 0; y < height; y += 8)
|
||||||
{
|
{
|
||||||
for (int x = 0; x < width; x += 8)
|
for (int x = 0; x < width; x += 8)
|
||||||
{
|
{
|
||||||
u32 *dst32 = dst + y * width + x;
|
|
||||||
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
|
// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
|
||||||
// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit encoding.
|
// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit encoding.
|
||||||
// Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable
|
// Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable
|
||||||
// at this level, so we do.
|
// at this level, so we do.
|
||||||
for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2, dst32 += (width * 4))
|
for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2)
|
||||||
{
|
{
|
||||||
// JSD NOTE: You may see many strange patterns of behavior in the below code, but they
|
// JSD NOTE: You may see many strange patterns of behavior in the below code, but they
|
||||||
// are for performance reasons. Sometimes, calculating what should be obvious hard-coded
|
// are for performance reasons. Sometimes, calculating what should be obvious hard-coded
|
||||||
|
@ -1820,12 +1818,6 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
||||||
_mm_slli_si128(_mm_srli_si128(rgb31, 4), 8 + 4)
|
_mm_slli_si128(_mm_srli_si128(rgb31, 4), 8 + 4)
|
||||||
);
|
);
|
||||||
|
|
||||||
// Copy the color arrays from the XMM registers to local variables in RAM:
|
|
||||||
GC_ALIGNED16(u32 colors0A[4]);
|
|
||||||
GC_ALIGNED16(u32 colors0B[4]);
|
|
||||||
_mm_store_si128((__m128i *)colors0A, rgb01230);
|
|
||||||
_mm_store_si128((__m128i *)colors0B, rgb01450);
|
|
||||||
|
|
||||||
// Create an array for color lookups for DXT1 so we can use the 2-bit indices:
|
// Create an array for color lookups for DXT1 so we can use the 2-bit indices:
|
||||||
const __m128i rgb01231 = _mm_or_si128(
|
const __m128i rgb01231 = _mm_or_si128(
|
||||||
_mm_or_si128(
|
_mm_or_si128(
|
||||||
|
@ -1842,12 +1834,6 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
||||||
_mm_slli_si128(_mm_srli_si128(rgb31, 8 + 4), 8 + 4)
|
_mm_slli_si128(_mm_srli_si128(rgb31, 8 + 4), 8 + 4)
|
||||||
);
|
);
|
||||||
|
|
||||||
// Copy the color arrays from the XMM registers to aligned local variables in RAM:
|
|
||||||
GC_ALIGNED16(u32 colors1A[4]);
|
|
||||||
GC_ALIGNED16(u32 colors1B[4]);
|
|
||||||
_mm_store_si128((__m128i *)colors1A, rgb01231);
|
|
||||||
_mm_store_si128((__m128i *)colors1B, rgb01451);
|
|
||||||
|
|
||||||
// The #ifdef CHECKs here and below are to compare correctness of output against the reference code.
|
// The #ifdef CHECKs here and below are to compare correctness of output against the reference code.
|
||||||
// Don't use them in a normal build.
|
// Don't use them in a normal build.
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
|
@ -1867,11 +1853,48 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
||||||
// the assembly generated was swapping a lot of temporaries to RAM.
|
// the assembly generated was swapping a lot of temporaries to RAM.
|
||||||
|
|
||||||
// Compare rgb0 to rgb1:
|
// Compare rgb0 to rgb1:
|
||||||
// Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
|
|
||||||
//const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(rgb0, rgb1);
|
// rgb1rgb1 rgb0rgb0 rgb1rgb1 rgb0rgb0
|
||||||
const __m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16);
|
// c0 example: {0x 5ef05ef0 c003c003 3ff83ff8 e003e003} in BIG ENDIAN
|
||||||
|
// c0cmp contains rgb0 values in 64-bit words in LITTLE ENDIAN
|
||||||
|
// c0cmp: {0x 00000000 000003c0 00000000 000003e0}
|
||||||
|
// c0shr contains rgb1 values in 64-bit words in LITTLE ENDIAN
|
||||||
|
// c0shr: {0x 00000000 0000f05e 00000000 0000f83f}
|
||||||
|
|
||||||
|
__m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16);
|
||||||
const __m128i c0shr = _mm_srli_epi64(c0cmp, 32);
|
const __m128i c0shr = _mm_srli_epi64(c0cmp, 32);
|
||||||
const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr);
|
c0cmp = _mm_srli_epi64(_mm_slli_epi64(c0cmp, 48), 48);
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
// SSE4 only.
|
||||||
|
const __m128i cmprgb0rgb1 = _mm_cmpgt_epi64(c0cmp, c0shr);
|
||||||
|
#else
|
||||||
|
// We need 64-bit words full of 0xFF bytes for comparison masks. _mm_cmpgt_epi64 would do that but it's only SSE4.2
|
||||||
|
// so we fake it on SSE2 by shuffling the 32-bit comparison masks to copy the 0xFF bytes to the high 32-bits of
|
||||||
|
// each 64-bit word.
|
||||||
|
__m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr);
|
||||||
|
cmprgb0rgb1 = _mm_shuffle_epi32(cmprgb0rgb1, _MM_SHUFFLE(2, 2, 0, 0));
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Now use the comparison mask from rgb0 > rgb1 to conditionally move colors from
|
||||||
|
// either rgb0123X or rgb0145X:
|
||||||
|
const __m128i cmp0 = _mm_unpacklo_epi64(cmprgb0rgb1, cmprgb0rgb1);
|
||||||
|
const __m128i cmp1 = _mm_unpackhi_epi64(cmprgb0rgb1, cmprgb0rgb1);
|
||||||
|
const __m128i mmcolors0 = _mm_or_si128(
|
||||||
|
_mm_and_si128(cmp0, rgb01230),
|
||||||
|
_mm_andnot_si128(cmp0, rgb01450)
|
||||||
|
);
|
||||||
|
const __m128i mmcolors1 = _mm_or_si128(
|
||||||
|
_mm_and_si128(cmp1, rgb01231),
|
||||||
|
_mm_andnot_si128(cmp1, rgb01451)
|
||||||
|
);
|
||||||
|
|
||||||
|
// Copy the color arrays from the XMM registers to local variables in RAM:
|
||||||
|
GC_ALIGNED16(u32 colors0[4]);
|
||||||
|
GC_ALIGNED16(u32 colors1[4]);
|
||||||
|
|
||||||
|
_mm_store_si128((__m128i *)colors0, mmcolors0);
|
||||||
|
_mm_store_si128((__m128i *)colors1, mmcolors1);
|
||||||
|
|
||||||
// Copy the 2-bit indices from each DXT block:
|
// Copy the 2-bit indices from each DXT block:
|
||||||
GC_ALIGNED16(u32 dxttmp[4]);
|
GC_ALIGNED16(u32 dxttmp[4]);
|
||||||
|
@ -1880,244 +1903,97 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
||||||
u32 dxt0sel = dxttmp[1];
|
u32 dxt0sel = dxttmp[1];
|
||||||
u32 dxt1sel = dxttmp[3];
|
u32 dxt1sel = dxttmp[3];
|
||||||
|
|
||||||
// Copy the comparison results:
|
|
||||||
GC_ALIGNED16(u16 cmptmp[8]);
|
|
||||||
_mm_store_si128((__m128i *)cmptmp, cmprgb0rgb1);
|
|
||||||
|
|
||||||
// Per each row written we alternate storing RGBA values from DXT0 and DXT1.
|
// Per each row written we alternate storing RGBA values from DXT0 and DXT1.
|
||||||
|
|
||||||
__m128i *dst128 = (__m128i *)( dst + (y + z*4) * width + x );
|
__m128i *dst128 = (__m128i *)( dst + (y + z*4) * width + x );
|
||||||
|
|
||||||
// Row 0:
|
// Row 0:
|
||||||
__m128i col0, col1;
|
__m128i col0, col1;
|
||||||
if (cmptmp[0])
|
|
||||||
{
|
|
||||||
// rgb0a > rgb1a
|
|
||||||
col0 = _mm_set_epi32(
|
col0 = _mm_set_epi32(
|
||||||
colors0A[(dxt0sel >> ((0*8)+0)) & 3],
|
colors0[(dxt0sel >> ((0*8)+0)) & 3],
|
||||||
colors0A[(dxt0sel >> ((0*8)+2)) & 3],
|
colors0[(dxt0sel >> ((0*8)+2)) & 3],
|
||||||
colors0A[(dxt0sel >> ((0*8)+4)) & 3],
|
colors0[(dxt0sel >> ((0*8)+4)) & 3],
|
||||||
colors0A[(dxt0sel >> ((0*8)+6)) & 3]
|
colors0[(dxt0sel >> ((0*8)+6)) & 3]
|
||||||
);
|
);
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 0), col0);
|
_mm_store_si128(dst128 + ((width / 4) * 0), col0);
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
|
assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// rgb0a <= rgb1a
|
|
||||||
col0 = _mm_set_epi32(
|
|
||||||
colors0B[(dxt0sel >> ((0*8)+0)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((0*8)+2)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((0*8)+4)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((0*8)+6)) & 3]
|
|
||||||
);
|
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 0), col0);
|
|
||||||
#ifdef CHECK
|
|
||||||
assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cmptmp[4])
|
|
||||||
{
|
|
||||||
// (rgb0b > rgb1b)
|
|
||||||
col1 = _mm_set_epi32(
|
col1 = _mm_set_epi32(
|
||||||
colors1A[(dxt1sel >> ((0*8)+0)) & 3],
|
colors1[(dxt1sel >> ((0*8)+0)) & 3],
|
||||||
colors1A[(dxt1sel >> ((0*8)+2)) & 3],
|
colors1[(dxt1sel >> ((0*8)+2)) & 3],
|
||||||
colors1A[(dxt1sel >> ((0*8)+4)) & 3],
|
colors1[(dxt1sel >> ((0*8)+4)) & 3],
|
||||||
colors1A[(dxt1sel >> ((0*8)+6)) & 3]
|
colors1[(dxt1sel >> ((0*8)+6)) & 3]
|
||||||
);
|
);
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
|
_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
|
assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// (rgb0b <= rgb1b)
|
|
||||||
col1 = _mm_set_epi32(
|
|
||||||
colors1B[(dxt1sel >> ((0*8)+0)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((0*8)+2)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((0*8)+4)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((0*8)+6)) & 3]
|
|
||||||
);
|
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
|
|
||||||
#ifdef CHECK
|
|
||||||
assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// Row 1:
|
// Row 1:
|
||||||
if (cmptmp[0])
|
|
||||||
{
|
|
||||||
// rgb0a > rgb1a
|
|
||||||
col0 = _mm_set_epi32(
|
col0 = _mm_set_epi32(
|
||||||
colors0A[(dxt0sel >> ((1*8)+0)) & 3],
|
colors0[(dxt0sel >> ((1*8)+0)) & 3],
|
||||||
colors0A[(dxt0sel >> ((1*8)+2)) & 3],
|
colors0[(dxt0sel >> ((1*8)+2)) & 3],
|
||||||
colors0A[(dxt0sel >> ((1*8)+4)) & 3],
|
colors0[(dxt0sel >> ((1*8)+4)) & 3],
|
||||||
colors0A[(dxt0sel >> ((1*8)+6)) & 3]
|
colors0[(dxt0sel >> ((1*8)+6)) & 3]
|
||||||
);
|
);
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 1), col0);
|
_mm_store_si128(dst128 + ((width / 4) * 1), col0);
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
|
assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// rgb0a <= rgb1a
|
|
||||||
col0 = _mm_set_epi32(
|
|
||||||
colors0B[(dxt0sel >> ((1*8)+0)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((1*8)+2)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((1*8)+4)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((1*8)+6)) & 3]
|
|
||||||
);
|
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 1), col0);
|
|
||||||
#ifdef CHECK
|
|
||||||
assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cmptmp[4])
|
|
||||||
{
|
|
||||||
// (rgb0b > rgb1b)
|
|
||||||
col1 = _mm_set_epi32(
|
col1 = _mm_set_epi32(
|
||||||
colors1A[(dxt1sel >> ((1*8)+0)) & 3],
|
colors1[(dxt1sel >> ((1*8)+0)) & 3],
|
||||||
colors1A[(dxt1sel >> ((1*8)+2)) & 3],
|
colors1[(dxt1sel >> ((1*8)+2)) & 3],
|
||||||
colors1A[(dxt1sel >> ((1*8)+4)) & 3],
|
colors1[(dxt1sel >> ((1*8)+4)) & 3],
|
||||||
colors1A[(dxt1sel >> ((1*8)+6)) & 3]
|
colors1[(dxt1sel >> ((1*8)+6)) & 3]
|
||||||
);
|
);
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
|
_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
|
assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// (rgb0b <= rgb1b)
|
|
||||||
col1 = _mm_set_epi32(
|
|
||||||
colors1B[(dxt1sel >> ((1*8)+0)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((1*8)+2)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((1*8)+4)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((1*8)+6)) & 3]
|
|
||||||
);
|
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
|
|
||||||
#ifdef CHECK
|
|
||||||
assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// Row 2:
|
// Row 2:
|
||||||
if (cmptmp[0])
|
|
||||||
{
|
|
||||||
// rgb0a > rgb1a
|
|
||||||
col0 = _mm_set_epi32(
|
col0 = _mm_set_epi32(
|
||||||
colors0A[(dxt0sel >> ((2*8)+0)) & 3],
|
colors0[(dxt0sel >> ((2*8)+0)) & 3],
|
||||||
colors0A[(dxt0sel >> ((2*8)+2)) & 3],
|
colors0[(dxt0sel >> ((2*8)+2)) & 3],
|
||||||
colors0A[(dxt0sel >> ((2*8)+4)) & 3],
|
colors0[(dxt0sel >> ((2*8)+4)) & 3],
|
||||||
colors0A[(dxt0sel >> ((2*8)+6)) & 3]
|
colors0[(dxt0sel >> ((2*8)+6)) & 3]
|
||||||
);
|
);
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 2), col0);
|
_mm_store_si128(dst128 + ((width / 4) * 2), col0);
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
|
assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// rgb0a <= rgb1a
|
|
||||||
col0 = _mm_set_epi32(
|
|
||||||
colors0B[(dxt0sel >> ((2*8)+0)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((2*8)+2)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((2*8)+4)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((2*8)+6)) & 3]
|
|
||||||
);
|
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 2), col0);
|
|
||||||
#ifdef CHECK
|
|
||||||
assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cmptmp[4])
|
|
||||||
{
|
|
||||||
// (rgb0b > rgb1b)
|
|
||||||
col1 = _mm_set_epi32(
|
col1 = _mm_set_epi32(
|
||||||
colors1A[(dxt1sel >> ((2*8)+0)) & 3],
|
colors1[(dxt1sel >> ((2*8)+0)) & 3],
|
||||||
colors1A[(dxt1sel >> ((2*8)+2)) & 3],
|
colors1[(dxt1sel >> ((2*8)+2)) & 3],
|
||||||
colors1A[(dxt1sel >> ((2*8)+4)) & 3],
|
colors1[(dxt1sel >> ((2*8)+4)) & 3],
|
||||||
colors1A[(dxt1sel >> ((2*8)+6)) & 3]
|
colors1[(dxt1sel >> ((2*8)+6)) & 3]
|
||||||
);
|
);
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
|
_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
|
assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// (rgb0b <= rgb1b)
|
|
||||||
col1 = _mm_set_epi32(
|
|
||||||
colors1B[(dxt1sel >> ((2*8)+0)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((2*8)+2)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((2*8)+4)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((2*8)+6)) & 3]
|
|
||||||
);
|
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
|
|
||||||
#ifdef CHECK
|
|
||||||
assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
// Row 3:
|
// Row 3:
|
||||||
if (cmptmp[0])
|
|
||||||
{
|
|
||||||
// rgb0a > rgb1a
|
|
||||||
col0 = _mm_set_epi32(
|
col0 = _mm_set_epi32(
|
||||||
colors0A[(dxt0sel >> ((3*8)+0)) & 3],
|
colors0[(dxt0sel >> ((3*8)+0)) & 3],
|
||||||
colors0A[(dxt0sel >> ((3*8)+2)) & 3],
|
colors0[(dxt0sel >> ((3*8)+2)) & 3],
|
||||||
colors0A[(dxt0sel >> ((3*8)+4)) & 3],
|
colors0[(dxt0sel >> ((3*8)+4)) & 3],
|
||||||
colors0A[(dxt0sel >> ((3*8)+6)) & 3]
|
colors0[(dxt0sel >> ((3*8)+6)) & 3]
|
||||||
);
|
);
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 3), col0);
|
_mm_store_si128(dst128 + ((width / 4) * 3), col0);
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
|
assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
|
||||||
#endif
|
#endif
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// rgb0a <= rgb1a
|
|
||||||
col0 = _mm_set_epi32(
|
|
||||||
colors0B[(dxt0sel >> ((3*8)+0)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((3*8)+2)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((3*8)+4)) & 3],
|
|
||||||
colors0B[(dxt0sel >> ((3*8)+6)) & 3]
|
|
||||||
);
|
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 3), col0);
|
|
||||||
#ifdef CHECK
|
|
||||||
assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
if (cmptmp[4])
|
|
||||||
{
|
|
||||||
// (rgb0b > rgb1b)
|
|
||||||
col1 = _mm_set_epi32(
|
col1 = _mm_set_epi32(
|
||||||
colors1A[(dxt1sel >> ((3*8)+0)) & 3],
|
colors1[(dxt1sel >> ((3*8)+0)) & 3],
|
||||||
colors1A[(dxt1sel >> ((3*8)+2)) & 3],
|
colors1[(dxt1sel >> ((3*8)+2)) & 3],
|
||||||
colors1A[(dxt1sel >> ((3*8)+4)) & 3],
|
colors1[(dxt1sel >> ((3*8)+4)) & 3],
|
||||||
colors1A[(dxt1sel >> ((3*8)+6)) & 3]
|
colors1[(dxt1sel >> ((3*8)+6)) & 3]
|
||||||
);
|
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
|
|
||||||
#ifdef CHECK
|
|
||||||
assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 );
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
// (rgb0b <= rgb1b)
|
|
||||||
col1 = _mm_set_epi32(
|
|
||||||
colors1B[(dxt1sel >> ((3*8)+0)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((3*8)+2)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((3*8)+4)) & 3],
|
|
||||||
colors1B[(dxt1sel >> ((3*8)+6)) & 3]
|
|
||||||
);
|
);
|
||||||
_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
|
_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
|
||||||
#ifdef CHECK
|
#ifdef CHECK
|
||||||
|
@ -2126,8 +2002,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
#else
|
||||||
#if 0
|
|
||||||
for (int y = 0; y < height; y += 8)
|
for (int y = 0; y < height; y += 8)
|
||||||
{
|
{
|
||||||
for (int x = 0; x < width; x += 8)
|
for (int x = 0; x < width; x += 8)
|
||||||
|
|
Loading…
Reference in New Issue