Faster SSE2 optimized GX_TF_CMPR texture decoder which gets ~40% speed improvement on x64 and ~50% improvement on x86 as compared to reference C code.
The code now uses direct pointer access from C code to write the colors to the destination texture instead of trying to force them back up into an __m128i and a single write call. This is what produces the major speed-up. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6761 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
ca2628f896
commit
2841d67ce3
|
@ -919,8 +919,13 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
|
|||
|
||||
|
||||
|
||||
|
||||
|
||||
// JSD 01/06/11:
|
||||
// TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to
|
||||
// squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 are slower than _mm_load_si128/_mm_store_si128
|
||||
// because they work on unaligned addresses. The processor is free to make the assumption that addresses are multiples
|
||||
// of 16 in the aligned case.
|
||||
// TODO: complete SSE2 optimization of less often used texture formats.
|
||||
// TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
|
||||
|
||||
PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
|
||||
{
|
||||
|
@ -1705,9 +1710,10 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
case GX_TF_CMPR: // speed critical
|
||||
// The metroid games use this format almost exclusively.
|
||||
{
|
||||
#if !defined(_M_X64)
|
||||
// JSD optimized with SSE2 intrinsics.
|
||||
// Produces a 30% improvement for x86 code only.
|
||||
// Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation.
|
||||
// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is
|
||||
// faster than both.
|
||||
for (int y = 0; y < height; y += 8)
|
||||
{
|
||||
for (int x = 0; x < width; x += 8)
|
||||
|
@ -1727,20 +1733,42 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
// of 1s :). Then I use sequences of shifts to squash it to the appropriate size and bit
|
||||
// positions that I need.
|
||||
|
||||
const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
|
||||
|
||||
// Load 128 bits, i.e. two DXTBlocks (64-bits each)
|
||||
const __m128i dxt = _mm_loadu_si128((__m128i *)(src + sizeof(struct DXTBlock) * 0));
|
||||
const __m128i dxt = _mm_loadu_si128((__m128i *)src);
|
||||
|
||||
// Copy the 2-bit indices from each DXT block:
|
||||
__declspec(align(16)) u32 dxttmp[4];
|
||||
_mm_store_si128((__m128i *)dxttmp, dxt);
|
||||
|
||||
u32 dxt0sel = dxttmp[1];
|
||||
u32 dxt1sel = dxttmp[3];
|
||||
|
||||
__m128i argb888x4;
|
||||
const __m128i allFF = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL);
|
||||
const __m128i lowMask = _mm_srli_si128( allFFs128, 8 );
|
||||
__m128i c1 = _mm_unpackhi_epi16(dxt, dxt);
|
||||
c1 = _mm_slli_si128(c1, 8);
|
||||
const __m128i c0 = _mm_or_si128(c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
|
||||
|
||||
// Compare rgb0 to rgb1:
|
||||
// Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
|
||||
const __m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16);
|
||||
const __m128i c0shr = _mm_srli_epi64(c0cmp, 32);
|
||||
const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr);
|
||||
|
||||
int cmp0 = _mm_extract_epi16(cmprgb0rgb1, 0);
|
||||
int cmp1 = _mm_extract_epi16(cmprgb0rgb1, 4);
|
||||
|
||||
// green:
|
||||
// NOTE: We start with the larger number of bits (6) first for G and shift the mask down 1 bit to get a 5-bit mask
|
||||
// later for R and B components.
|
||||
// low6mask == _mm_set_epi32(0x0000FC00, 0x0000FC00, 0x0000FC00, 0x0000FC00)
|
||||
const __m128i low6mask = _mm_slli_epi32( _mm_srli_epi32(allFF, 24 + 2), 8 + 2);
|
||||
const __m128i low6mask = _mm_slli_epi32( _mm_srli_epi32(allFFs128, 24 + 2), 8 + 2);
|
||||
const __m128i gtmp = _mm_srli_epi32(c0, 3);
|
||||
const __m128i g0 = _mm_and_si128(gtmp, low6mask);
|
||||
// low3mask == _mm_set_epi32(0x00000700, 0x00000700, 0x00000700, 0x00000700) (three set bits: (0xFFFFFFFF >> 29) << 8)
|
||||
const __m128i low3mask = _mm_slli_epi32(_mm_srli_epi32(allFFs128, 32 - 3), 8);
|
||||
const __m128i g1 = _mm_and_si128(_mm_srli_epi32(gtmp, 6), _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
|
||||
argb888x4 = _mm_or_si128(g0, g1);
|
||||
// red:
|
||||
|
@ -1754,81 +1782,106 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
const __m128i b0 = _mm_and_si128(_mm_srli_epi32(c0, 5), _mm_slli_epi32(low5mask, 16));
|
||||
const __m128i b1 = _mm_srli_epi16(b0, 5);
|
||||
// OR in the fixed alpha component
|
||||
// _mm_slli_epi32( allFF, 24 ) == _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)
|
||||
argb888x4 = _mm_or_si128(_mm_or_si128(argb888x4, _mm_slli_epi32( allFF, 24 ) ), _mm_or_si128(b0, b1));
|
||||
// calculate RGB2 and RGB3:
|
||||
const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
|
||||
const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFF, 8 ));
|
||||
const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFF, 8 ));
|
||||
const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFF, 8 ));
|
||||
const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFF, 8 ));
|
||||
const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0);
|
||||
const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01);
|
||||
#if 0
|
||||
// RGB2b = (RGB0 + RGB1 + 1) / 2
|
||||
const __m128i one16 = _mm_srli_epi16( allFF, 15 );
|
||||
const __m128i rrggbb21 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(rrggbb0, rrggbb1), one16), 1);
|
||||
const __m128i rrggbb211 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(rrggbb01, rrggbb11), one16), 1);
|
||||
#else
|
||||
// RGB2b = avg(RGB0, RGB1)
|
||||
const __m128i rrggbb21 = _mm_avg_epu16(rrggbb0, rrggbb1);
|
||||
const __m128i rrggbb211 = _mm_avg_epu16(rrggbb01, rrggbb11);
|
||||
#endif
|
||||
const __m128i rgb210 = _mm_srli_si128(_mm_packus_epi16(rrggbb21, rrggbb21), 8);
|
||||
const __m128i rgb211 = _mm_slli_si128(_mm_packus_epi16(rrggbb211, rrggbb211), 8);
|
||||
const __m128i rgb21 = _mm_or_si128(rgb210, rgb211);
|
||||
// RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3) using arithmetic shifts to extend sign (not logical shifts)
|
||||
const __m128i rrggbbsubshr1 = _mm_srai_epi16(rrggbbsub, 1);
|
||||
const __m128i rrggbbsubshr3 = _mm_srai_epi16(rrggbbsub, 3);
|
||||
const __m128i rrggbbsubshr11 = _mm_srai_epi16(rrggbbsub1, 1);
|
||||
const __m128i rrggbbsubshr31 = _mm_srai_epi16(rrggbbsub1, 3);
|
||||
const __m128i shr1subshr3 = _mm_sub_epi16(rrggbbsubshr1, rrggbbsubshr3);
|
||||
const __m128i shr1subshr31 = _mm_sub_epi16(rrggbbsubshr11, rrggbbsubshr31);
|
||||
// low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff)
|
||||
const __m128i low8mask16 = _mm_srli_epi16( allFF, 8 );
|
||||
const __m128i rrggbbdelta = _mm_and_si128(shr1subshr3, low8mask16);
|
||||
const __m128i rrggbbdelta1 = _mm_and_si128(shr1subshr31, low8mask16);
|
||||
const __m128i rgbdelta0 = _mm_packus_epi16(rrggbbdelta, rrggbbdelta);
|
||||
__m128i rgbdelta1 = _mm_packus_epi16(rrggbbdelta1, rrggbbdelta1);
|
||||
rgbdelta1 = _mm_slli_si128(rgbdelta1, 8);
|
||||
const __m128i rgbdelta = _mm_or_si128(_mm_srli_si128(_mm_slli_si128(rgbdelta0, 8), 8), rgbdelta1);
|
||||
const __m128i rgb20 = _mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta);
|
||||
const __m128i rgb30 = _mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta);
|
||||
// _mm_srli_epi32( allFF, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF)
|
||||
// Make this color fully transparent:
|
||||
const __m128i rgb31 = _mm_and_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), _mm_srli_epi32( allFF, 8 ) );
|
||||
// _mm_slli_epi32( allFFs128, 24 ) == _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)
|
||||
argb888x4 = _mm_or_si128(_mm_or_si128(argb888x4, _mm_slli_epi32( allFFs128, 24 ) ), _mm_or_si128(b0, b1));
|
||||
|
||||
__m128i rgb2, rgb3;
|
||||
|
||||
// if (rgb0 > rgb1):
|
||||
if (cmp0 != 0)
|
||||
{
|
||||
// calculate RGB2 and RGB3:
|
||||
const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
|
||||
const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 ));
|
||||
const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 ));
|
||||
const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0);
|
||||
|
||||
// RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3) using arithmetic shifts to extend sign (not logical shifts)
|
||||
const __m128i rrggbbsubshr1 = _mm_srai_epi16(rrggbbsub, 1);
|
||||
const __m128i rrggbbsubshr3 = _mm_srai_epi16(rrggbbsub, 3);
|
||||
const __m128i shr1subshr3 = _mm_sub_epi16(rrggbbsubshr1, rrggbbsubshr3);
|
||||
// low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff)
|
||||
const __m128i low8mask16 = _mm_srli_epi16( allFFs128, 8 );
|
||||
const __m128i rrggbbdelta = _mm_and_si128(shr1subshr3, low8mask16);
|
||||
const __m128i rgbdeltadup = _mm_packus_epi16(rrggbbdelta, rrggbbdelta);
|
||||
const __m128i rgbdelta = _mm_srli_si128(_mm_slli_si128(rgbdeltadup, 8), 8);
|
||||
|
||||
rgb2 = _mm_and_si128(_mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta), _mm_srli_si128(allFFs128, 8));
|
||||
rgb3 = _mm_and_si128(_mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta), _mm_srli_si128(allFFs128, 8));
|
||||
}
|
||||
else
|
||||
{
|
||||
// calculate RGB2 and RGB3:
|
||||
const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
|
||||
const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 ));
|
||||
const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 ));
|
||||
const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0);
|
||||
// RGB2b = avg(RGB0, RGB1)
|
||||
const __m128i rrggbb21 = _mm_avg_epu16(rrggbb0, rrggbb1);
|
||||
const __m128i rgb210 = _mm_srli_si128(_mm_packus_epi16(rrggbb21, rrggbb21), 8);
|
||||
rgb2 = rgb210;
|
||||
rgb3 = _mm_and_si128(_mm_srli_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(1, 1, 1, 1)), 8), _mm_srli_epi32( allFFs128, 8 ));
|
||||
}
|
||||
|
||||
// if (rgb0 > rgb1):
|
||||
if (cmp1 != 0)
|
||||
{
|
||||
// calculate RGB2 and RGB3:
|
||||
const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
|
||||
const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 ));
|
||||
const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 ));
|
||||
const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01);
|
||||
|
||||
// RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3) using arithmetic shifts to extend sign (not logical shifts)
|
||||
const __m128i rrggbbsubshr11 = _mm_srai_epi16(rrggbbsub1, 1);
|
||||
const __m128i rrggbbsubshr31 = _mm_srai_epi16(rrggbbsub1, 3);
|
||||
const __m128i shr1subshr31 = _mm_sub_epi16(rrggbbsubshr11, rrggbbsubshr31);
|
||||
// low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff)
|
||||
const __m128i low8mask16 = _mm_srli_epi16( allFFs128, 8 );
|
||||
const __m128i rrggbbdelta1 = _mm_and_si128(shr1subshr31, low8mask16);
|
||||
__m128i rgbdelta1 = _mm_packus_epi16(rrggbbdelta1, rrggbbdelta1);
|
||||
rgbdelta1 = _mm_slli_si128(rgbdelta1, 8);
|
||||
|
||||
rgb2 = _mm_or_si128(rgb2, _mm_and_si128(_mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta1), _mm_slli_si128(allFFs128, 8)));
|
||||
rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta1), _mm_slli_si128(allFFs128, 8)));
|
||||
}
|
||||
else
|
||||
{
|
||||
// calculate RGB2 and RGB3:
|
||||
const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
|
||||
const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 ));
|
||||
const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 ));
|
||||
const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01);
|
||||
// RGB2b = avg(RGB0, RGB1)
|
||||
const __m128i rrggbb211 = _mm_avg_epu16(rrggbb01, rrggbb11);
|
||||
const __m128i rgb211 = _mm_slli_si128(_mm_packus_epi16(rrggbb211, rrggbb211), 8);
|
||||
rgb2 = _mm_or_si128(rgb2, rgb211);
|
||||
|
||||
// _mm_srli_epi32( allFFs128, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF)
|
||||
// Make this color fully transparent:
|
||||
rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_and_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), _mm_srli_epi32( allFFs128, 8 ) ), _mm_slli_si128(allFFs128, 8)));
|
||||
}
|
||||
|
||||
// Create an array for color lookups for DXT0 so we can use the 2-bit indices:
|
||||
const __m128i rgb01230 = _mm_or_si128(
|
||||
const __m128i mmcolors0 = _mm_or_si128(
|
||||
_mm_or_si128(
|
||||
_mm_srli_si128(_mm_slli_si128(argb888x4, 8), 8),
|
||||
_mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb20, 8), 8 + 4), 8)
|
||||
_mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb2, 8), 8 + 4), 8)
|
||||
),
|
||||
_mm_slli_si128(_mm_srli_si128(rgb30, 4), 8 + 4)
|
||||
);
|
||||
const __m128i rgb01450 = _mm_or_si128(
|
||||
_mm_or_si128(
|
||||
_mm_srli_si128(_mm_slli_si128(argb888x4, 8), 8),
|
||||
_mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb21, 8), 8 + 4), 8)
|
||||
),
|
||||
_mm_slli_si128(_mm_srli_si128(rgb31, 4), 8 + 4)
|
||||
_mm_slli_si128(_mm_srli_si128(rgb3, 4), 8 + 4)
|
||||
);
|
||||
|
||||
// Create an array for color lookups for DXT1 so we can use the 2-bit indices:
|
||||
const __m128i rgb01231 = _mm_or_si128(
|
||||
const __m128i mmcolors1 = _mm_or_si128(
|
||||
_mm_or_si128(
|
||||
_mm_srli_si128(argb888x4, 8),
|
||||
_mm_slli_si128(_mm_srli_si128(rgb20, 8 + 4), 8)
|
||||
_mm_slli_si128(_mm_srli_si128(rgb2, 8 + 4), 8)
|
||||
),
|
||||
_mm_slli_si128(_mm_srli_si128(rgb30, 8 + 4), 8 + 4)
|
||||
);
|
||||
const __m128i rgb01451 = _mm_or_si128(
|
||||
_mm_or_si128(
|
||||
_mm_srli_si128(argb888x4, 8),
|
||||
_mm_slli_si128(_mm_srli_si128(rgb21, 8 + 4), 8)
|
||||
),
|
||||
_mm_slli_si128(_mm_srli_si128(rgb31, 8 + 4), 8 + 4)
|
||||
_mm_slli_si128(_mm_srli_si128(rgb3, 8 + 4), 8 + 4)
|
||||
);
|
||||
|
||||
// The #ifdef CHECKs here and below are to compare correctness of output against the reference code.
|
||||
|
@ -1841,165 +1894,70 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
|
|||
decodeDXTBlockRGBA(&(tmp1[0][0]), (const DXTBlock *)(src + 8), 4);
|
||||
#endif
|
||||
|
||||
// JSD NOTE: I attempted with an earlier solution to use only SSE2 intrinsics and no branching
|
||||
// or array indexing. That solution was consistently worse than the reference C code so I ditched
|
||||
// it in favor of having SSE2 calculate the vector-intensive stuff, then have normal C code take
|
||||
// care of color selection and SSE2 for writes back to memory. This separation of concerns seems
|
||||
// to work out well. I don't know exactly why the SSE2-only code did not perform very well. I
|
||||
// suspect it was too many XMM temporaries and not enough registers to balance them all and so
|
||||
// the assembly generated was swapping a lot of temporaries to RAM.
|
||||
|
||||
// Compare rgb0 to rgb1:
|
||||
|
||||
// rgb1rgb1 rgb0rgb0 rgb1rgb1 rgb0rgb0
|
||||
// c0 example: {0x 5ef05ef0 c003c003 3ff83ff8 e003e003} in BIG ENDIAN
|
||||
// c0cmp contains rgb0 values in 64-bit words in LITTLE ENDIAN
|
||||
// c0cmp: {0x 00000000 000003c0 00000000 000003e0}
|
||||
// c0shr contains rgb1 values in 64-bit words in LITTLE ENDIAN
|
||||
// c0shr: {0x 00000000 0000f05e 00000000 0000f83f}
|
||||
|
||||
__m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16);
|
||||
const __m128i c0shr = _mm_srli_epi64(c0cmp, 32);
|
||||
c0cmp = _mm_srli_epi64(_mm_slli_epi64(c0cmp, 48), 48);
|
||||
|
||||
#if 0
|
||||
// SSE4 only.
|
||||
const __m128i cmprgb0rgb1 = _mm_cmpgt_epi64(c0cmp, c0shr);
|
||||
#else
|
||||
// We need 64-bit words full of 0xFF bytes for comparison masks. _mm_cmpgt_epi64 would do that but it's only available in SSE4.2
|
||||
// so we fake it on SSE2 by shuffling the 32-bit comparison masks to copy the 0xFF bytes to the high 32-bits of
|
||||
// each 64-bit word.
|
||||
__m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr);
|
||||
cmprgb0rgb1 = _mm_shuffle_epi32(cmprgb0rgb1, _MM_SHUFFLE(2, 2, 0, 0));
|
||||
#endif
|
||||
|
||||
// Now use the comparison mask from rgb0 > rgb1 to conditionally move colors from
|
||||
// either rgb0123X or rgb0145X:
|
||||
const __m128i cmp0 = _mm_unpacklo_epi64(cmprgb0rgb1, cmprgb0rgb1);
|
||||
const __m128i cmp1 = _mm_unpackhi_epi64(cmprgb0rgb1, cmprgb0rgb1);
|
||||
const __m128i mmcolors0 = _mm_or_si128(
|
||||
_mm_and_si128(cmp0, rgb01230),
|
||||
_mm_andnot_si128(cmp0, rgb01450)
|
||||
);
|
||||
const __m128i mmcolors1 = _mm_or_si128(
|
||||
_mm_and_si128(cmp1, rgb01231),
|
||||
_mm_andnot_si128(cmp1, rgb01451)
|
||||
);
|
||||
|
||||
// Copy the color arrays from the XMM registers to local variables in RAM:
|
||||
GC_ALIGNED16(u32 colors0[4]);
|
||||
GC_ALIGNED16(u32 colors1[4]);
|
||||
u32 *dst32 = ( dst + (y + z*4) * width + x );
|
||||
|
||||
// Copy the colors here:
|
||||
GC_ALIGNED16( u32 colors0[4] );
|
||||
GC_ALIGNED16( u32 colors1[4] );
|
||||
_mm_store_si128((__m128i *)colors0, mmcolors0);
|
||||
_mm_store_si128((__m128i *)colors1, mmcolors1);
|
||||
|
||||
// Copy the 2-bit indices from each DXT block:
|
||||
GC_ALIGNED16(u32 dxttmp[4]);
|
||||
_mm_store_si128((__m128i *)dxttmp, dxt);
|
||||
|
||||
u32 dxt0sel = dxttmp[1];
|
||||
u32 dxt1sel = dxttmp[3];
|
||||
|
||||
// For each row written we alternate storing RGBA values from DXT0 and DXT1.
|
||||
|
||||
__m128i *dst128 = (__m128i *)( dst + (y + z*4) * width + x );
|
||||
|
||||
// Row 0:
|
||||
__m128i col0, col1;
|
||||
col0 = _mm_set_epi32(
|
||||
colors0[(dxt0sel >> ((0*8)+0)) & 3],
|
||||
colors0[(dxt0sel >> ((0*8)+2)) & 3],
|
||||
colors0[(dxt0sel >> ((0*8)+4)) & 3],
|
||||
colors0[(dxt0sel >> ((0*8)+6)) & 3]
|
||||
);
|
||||
_mm_store_si128(dst128 + ((width / 4) * 0), col0);
|
||||
dst32[(width * 0) + 0] = colors0[(dxt0sel >> ((0*8)+6)) & 3];
|
||||
dst32[(width * 0) + 1] = colors0[(dxt0sel >> ((0*8)+4)) & 3];
|
||||
dst32[(width * 0) + 2] = colors0[(dxt0sel >> ((0*8)+2)) & 3];
|
||||
dst32[(width * 0) + 3] = colors0[(dxt0sel >> ((0*8)+0)) & 3];
|
||||
dst32[(width * 0) + 4] = colors1[(dxt1sel >> ((0*8)+6)) & 3];
|
||||
dst32[(width * 0) + 5] = colors1[(dxt1sel >> ((0*8)+4)) & 3];
|
||||
dst32[(width * 0) + 6] = colors1[(dxt1sel >> ((0*8)+2)) & 3];
|
||||
dst32[(width * 0) + 7] = colors1[(dxt1sel >> ((0*8)+0)) & 3];
|
||||
#ifdef CHECK
|
||||
assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
|
||||
assert( memcmp(&(tmp0[0]), &dst32[(width * 0)], 16) == 0 );
|
||||
assert( memcmp(&(tmp1[0]), &dst32[(width * 0) + 4], 16) == 0 );
|
||||
#endif
|
||||
|
||||
col1 = _mm_set_epi32(
|
||||
colors1[(dxt1sel >> ((0*8)+0)) & 3],
|
||||
colors1[(dxt1sel >> ((0*8)+2)) & 3],
|
||||
colors1[(dxt1sel >> ((0*8)+4)) & 3],
|
||||
colors1[(dxt1sel >> ((0*8)+6)) & 3]
|
||||
);
|
||||
_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
|
||||
#ifdef CHECK
|
||||
assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
|
||||
#endif
|
||||
|
||||
// Row 1:
|
||||
col0 = _mm_set_epi32(
|
||||
colors0[(dxt0sel >> ((1*8)+0)) & 3],
|
||||
colors0[(dxt0sel >> ((1*8)+2)) & 3],
|
||||
colors0[(dxt0sel >> ((1*8)+4)) & 3],
|
||||
colors0[(dxt0sel >> ((1*8)+6)) & 3]
|
||||
);
|
||||
_mm_store_si128(dst128 + ((width / 4) * 1), col0);
|
||||
dst32[(width * 1) + 0] = colors0[(dxt0sel >> ((1*8)+6)) & 3];
|
||||
dst32[(width * 1) + 1] = colors0[(dxt0sel >> ((1*8)+4)) & 3];
|
||||
dst32[(width * 1) + 2] = colors0[(dxt0sel >> ((1*8)+2)) & 3];
|
||||
dst32[(width * 1) + 3] = colors0[(dxt0sel >> ((1*8)+0)) & 3];
|
||||
dst32[(width * 1) + 4] = colors1[(dxt1sel >> ((1*8)+6)) & 3];
|
||||
dst32[(width * 1) + 5] = colors1[(dxt1sel >> ((1*8)+4)) & 3];
|
||||
dst32[(width * 1) + 6] = colors1[(dxt1sel >> ((1*8)+2)) & 3];
|
||||
dst32[(width * 1) + 7] = colors1[(dxt1sel >> ((1*8)+0)) & 3];
|
||||
#ifdef CHECK
|
||||
assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
|
||||
assert( memcmp(&(tmp0[1]), &dst32[(width * 1)], 16) == 0 );
|
||||
assert( memcmp(&(tmp1[1]), &dst32[(width * 1) + 4], 16) == 0 );
|
||||
#endif
|
||||
|
||||
col1 = _mm_set_epi32(
|
||||
colors1[(dxt1sel >> ((1*8)+0)) & 3],
|
||||
colors1[(dxt1sel >> ((1*8)+2)) & 3],
|
||||
colors1[(dxt1sel >> ((1*8)+4)) & 3],
|
||||
colors1[(dxt1sel >> ((1*8)+6)) & 3]
|
||||
);
|
||||
_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
|
||||
#ifdef CHECK
|
||||
assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
|
||||
#endif
|
||||
|
||||
// Row 2:
|
||||
col0 = _mm_set_epi32(
|
||||
colors0[(dxt0sel >> ((2*8)+0)) & 3],
|
||||
colors0[(dxt0sel >> ((2*8)+2)) & 3],
|
||||
colors0[(dxt0sel >> ((2*8)+4)) & 3],
|
||||
colors0[(dxt0sel >> ((2*8)+6)) & 3]
|
||||
);
|
||||
_mm_store_si128(dst128 + ((width / 4) * 2), col0);
|
||||
dst32[(width * 2) + 0] = colors0[(dxt0sel >> ((2*8)+6)) & 3];
|
||||
dst32[(width * 2) + 1] = colors0[(dxt0sel >> ((2*8)+4)) & 3];
|
||||
dst32[(width * 2) + 2] = colors0[(dxt0sel >> ((2*8)+2)) & 3];
|
||||
dst32[(width * 2) + 3] = colors0[(dxt0sel >> ((2*8)+0)) & 3];
|
||||
dst32[(width * 2) + 4] = colors1[(dxt1sel >> ((2*8)+6)) & 3];
|
||||
dst32[(width * 2) + 5] = colors1[(dxt1sel >> ((2*8)+4)) & 3];
|
||||
dst32[(width * 2) + 6] = colors1[(dxt1sel >> ((2*8)+2)) & 3];
|
||||
dst32[(width * 2) + 7] = colors1[(dxt1sel >> ((2*8)+0)) & 3];
|
||||
#ifdef CHECK
|
||||
assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
|
||||
assert( memcmp(&(tmp0[2]), &dst32[(width * 2)], 16) == 0 );
|
||||
assert( memcmp(&(tmp1[2]), &dst32[(width * 2) + 4], 16) == 0 );
|
||||
#endif
|
||||
|
||||
col1 = _mm_set_epi32(
|
||||
colors1[(dxt1sel >> ((2*8)+0)) & 3],
|
||||
colors1[(dxt1sel >> ((2*8)+2)) & 3],
|
||||
colors1[(dxt1sel >> ((2*8)+4)) & 3],
|
||||
colors1[(dxt1sel >> ((2*8)+6)) & 3]
|
||||
);
|
||||
_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
|
||||
#ifdef CHECK
|
||||
assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
|
||||
#endif
|
||||
|
||||
// Row 3:
|
||||
col0 = _mm_set_epi32(
|
||||
colors0[(dxt0sel >> ((3*8)+0)) & 3],
|
||||
colors0[(dxt0sel >> ((3*8)+2)) & 3],
|
||||
colors0[(dxt0sel >> ((3*8)+4)) & 3],
|
||||
colors0[(dxt0sel >> ((3*8)+6)) & 3]
|
||||
);
|
||||
_mm_store_si128(dst128 + ((width / 4) * 3), col0);
|
||||
dst32[(width * 3) + 0] = colors0[(dxt0sel >> ((3*8)+6)) & 3];
|
||||
dst32[(width * 3) + 1] = colors0[(dxt0sel >> ((3*8)+4)) & 3];
|
||||
dst32[(width * 3) + 2] = colors0[(dxt0sel >> ((3*8)+2)) & 3];
|
||||
dst32[(width * 3) + 3] = colors0[(dxt0sel >> ((3*8)+0)) & 3];
|
||||
dst32[(width * 3) + 4] = colors1[(dxt1sel >> ((3*8)+6)) & 3];
|
||||
dst32[(width * 3) + 5] = colors1[(dxt1sel >> ((3*8)+4)) & 3];
|
||||
dst32[(width * 3) + 6] = colors1[(dxt1sel >> ((3*8)+2)) & 3];
|
||||
dst32[(width * 3) + 7] = colors1[(dxt1sel >> ((3*8)+0)) & 3];
|
||||
#ifdef CHECK
|
||||
assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
|
||||
#endif
|
||||
|
||||
col1 = _mm_set_epi32(
|
||||
colors1[(dxt1sel >> ((3*8)+0)) & 3],
|
||||
colors1[(dxt1sel >> ((3*8)+2)) & 3],
|
||||
colors1[(dxt1sel >> ((3*8)+4)) & 3],
|
||||
colors1[(dxt1sel >> ((3*8)+6)) & 3]
|
||||
);
|
||||
_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
|
||||
#ifdef CHECK
|
||||
assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 );
|
||||
assert( memcmp(&(tmp0[3]), &dst32[(width * 3)], 16) == 0 );
|
||||
assert( memcmp(&(tmp1[3]), &dst32[(width * 3) + 4], 16) == 0 );
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
#if 0
|
||||
for (int y = 0; y < height; y += 8)
|
||||
{
|
||||
for (int x = 0; x < width; x += 8)
|
||||
|
|
Loading…
Reference in New Issue