From 2841d67ce314ae6ddc1f92828c928470d146c871 Mon Sep 17 00:00:00 2001 From: "james.jdunne" Date: Thu, 6 Jan 2011 16:41:20 +0000 Subject: [PATCH] Faster SSE2 optimized GX_TF_CMPR texture decoder which gets ~40% speed improvement on x64 and ~50% improvement on x86 as compared to reference C code. The code now uses direct pointer access from C code to write the colors to the destination texture instead of trying to force them back up into an __m128i and a single write call. This is what produces the major speed-up. git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6761 8ced0084-cf51-0410-be5f-012b33b47a6e --- .../Core/VideoCommon/Src/TextureDecoder.cpp | 380 ++++++++---------- 1 file changed, 169 insertions(+), 211 deletions(-) diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp index 150f2b7ae2..f3f647ef3d 100644 --- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp @@ -919,8 +919,13 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh - - +// JSD 01/06/11: +// TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to +// squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128 +// because they work on unaligned addresses. The processor is free to make the assumption that addresses are multiples +// of 16 in the aligned case. +// TODO: complete SSE2 optimization of less often used texture formats. +// TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads. 
PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { @@ -1705,9 +1710,10 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he case GX_TF_CMPR: // speed critical // The metroid games use this format almost exclusively. { -#if !defined(_M_X64) // JSD optimized with SSE2 intrinsics. - // Produces a 30% improvement for x86 code only. + // Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference C implementation. + // The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is + // faster than both. for (int y = 0; y < height; y += 8) { for (int x = 0; x < width; x += 8) @@ -1727,20 +1733,42 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he // of 1s :). Then I use sequences of shifts to squash it to the appropriate size and bit // positions that I need. + const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()); + // Load 128 bits, i.e. two DXTBlocks (64-bits each) - const __m128i dxt = _mm_loadu_si128((__m128i *)(src + sizeof(struct DXTBlock) * 0)); + const __m128i dxt = _mm_loadu_si128((__m128i *)src); + + // Copy the 2-bit indices from each DXT block: + __declspec(align(16)) u32 dxttmp[4]; + _mm_store_si128((__m128i *)dxttmp, dxt); + + u32 dxt0sel = dxttmp[1]; + u32 dxt1sel = dxttmp[3]; + __m128i argb888x4; - const __m128i allFF = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL); + const __m128i lowMask = _mm_srli_si128( allFFs128, 8 ); __m128i c1 = _mm_unpackhi_epi16(dxt, dxt); c1 = _mm_slli_si128(c1, 8); const __m128i c0 = _mm_or_si128(c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8)); + + // Compare rgb0 to rgb1: + // Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false. 
+ const __m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16); + const __m128i c0shr = _mm_srli_epi64(c0cmp, 32); + const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr); + + int cmp0 = _mm_extract_epi16(cmprgb0rgb1, 0); + int cmp1 = _mm_extract_epi16(cmprgb0rgb1, 4); + // green: // NOTE: We start with the larger number of bits (6) firts for G and shift the mask down 1 bit to get a 5-bit mask // later for R and B components. // low6mask == _mm_set_epi32(0x0000FC00, 0x0000FC00, 0x0000FC00, 0x0000FC00) - const __m128i low6mask = _mm_slli_epi32( _mm_srli_epi32(allFF, 24 + 2), 8 + 2); + const __m128i low6mask = _mm_slli_epi32( _mm_srli_epi32(allFFs128, 24 + 2), 8 + 2); const __m128i gtmp = _mm_srli_epi32(c0, 3); const __m128i g0 = _mm_and_si128(gtmp, low6mask); + // low3mask == _mm_set_epi32(0x00000700, 0x00000700, 0x00000700, 0x00000700) as written (3 bits); it is unused here, g1 masks with the 2-bit literal 0x00000300 instead + const __m128i low3mask = _mm_slli_epi32(_mm_srli_epi32(allFFs128, 32 - 3), 8); const __m128i g1 = _mm_and_si128(_mm_srli_epi32(gtmp, 6), _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300)); argb888x4 = _mm_or_si128(g0, g1); // red: @@ -1754,81 +1782,106 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he const __m128i b0 = _mm_and_si128(_mm_srli_epi32(c0, 5), _mm_slli_epi32(low5mask, 16)); const __m128i b1 = _mm_srli_epi16(b0, 5); // OR in the fixed alpha component - // _mm_slli_epi32( allFF, 24 ) == _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000) - argb888x4 = _mm_or_si128(_mm_or_si128(argb888x4, _mm_slli_epi32( allFF, 24 ) ), _mm_or_si128(b0, b1)); - // calculate RGB2 and RGB3: - const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)); - const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)); - const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFF, 8 )); - const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFF, 8 )); - const __m128i
rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFF, 8 )); - const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFF, 8 )); - const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0); - const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01); -#if 0 - // RGB2b = (RGB0 + RGB1 + 1) / 2 - const __m128i one16 = _mm_srli_epi16( allFF, 15 ); - const __m128i rrggbb21 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(rrggbb0, rrggbb1), one16), 1); - const __m128i rrggbb211 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(rrggbb01, rrggbb11), one16), 1); -#else - // RGB2b = avg(RGB0, RGB1) - const __m128i rrggbb21 = _mm_avg_epu16(rrggbb0, rrggbb1); - const __m128i rrggbb211 = _mm_avg_epu16(rrggbb01, rrggbb11); -#endif - const __m128i rgb210 = _mm_srli_si128(_mm_packus_epi16(rrggbb21, rrggbb21), 8); - const __m128i rgb211 = _mm_slli_si128(_mm_packus_epi16(rrggbb211, rrggbb211), 8); - const __m128i rgb21 = _mm_or_si128(rgb210, rgb211); - // RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3) using arithmetic shifts to extend sign (not logical shifts) - const __m128i rrggbbsubshr1 = _mm_srai_epi16(rrggbbsub, 1); - const __m128i rrggbbsubshr3 = _mm_srai_epi16(rrggbbsub, 3); - const __m128i rrggbbsubshr11 = _mm_srai_epi16(rrggbbsub1, 1); - const __m128i rrggbbsubshr31 = _mm_srai_epi16(rrggbbsub1, 3); - const __m128i shr1subshr3 = _mm_sub_epi16(rrggbbsubshr1, rrggbbsubshr3); - const __m128i shr1subshr31 = _mm_sub_epi16(rrggbbsubshr11, rrggbbsubshr31); - // low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff) - const __m128i low8mask16 = _mm_srli_epi16( allFF, 8 ); - const __m128i rrggbbdelta = _mm_and_si128(shr1subshr3, low8mask16); - const __m128i rrggbbdelta1 = _mm_and_si128(shr1subshr31, low8mask16); - const __m128i rgbdelta0 = _mm_packus_epi16(rrggbbdelta, rrggbbdelta); - __m128i rgbdelta1 = _mm_packus_epi16(rrggbbdelta1, rrggbbdelta1); - rgbdelta1 = _mm_slli_si128(rgbdelta1, 
8); - const __m128i rgbdelta = _mm_or_si128(_mm_srli_si128(_mm_slli_si128(rgbdelta0, 8), 8), rgbdelta1); - const __m128i rgb20 = _mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta); - const __m128i rgb30 = _mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta); - // _mm_srli_epi32( allFF, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF) - // Make this color fully transparent: - const __m128i rgb31 = _mm_and_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), _mm_srli_epi32( allFF, 8 ) ); + // _mm_slli_epi32( allFFs128, 24 ) == _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000) + argb888x4 = _mm_or_si128(_mm_or_si128(argb888x4, _mm_slli_epi32( allFFs128, 24 ) ), _mm_or_si128(b0, b1)); + + __m128i rgb2, rgb3; + + // if (rgb0 > rgb1): + if (cmp0 != 0) + { + // calculate RGB2 and RGB3: + const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)); + const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)); + const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 )); + const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 )); + const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0); + + // RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3) using arithmetic shifts to extend sign (not logical shifts) + const __m128i rrggbbsubshr1 = _mm_srai_epi16(rrggbbsub, 1); + const __m128i rrggbbsubshr3 = _mm_srai_epi16(rrggbbsub, 3); + const __m128i shr1subshr3 = _mm_sub_epi16(rrggbbsubshr1, rrggbbsubshr3); + // low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff) + const __m128i low8mask16 = _mm_srli_epi16( allFFs128, 8 ); + const __m128i rrggbbdelta = _mm_and_si128(shr1subshr3, low8mask16); + const __m128i rgbdeltadup = _mm_packus_epi16(rrggbbdelta, rrggbbdelta); + const __m128i rgbdelta = _mm_srli_si128(_mm_slli_si128(rgbdeltadup, 
8), 8); + + rgb2 = _mm_and_si128(_mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta), _mm_srli_si128(allFFs128, 8)); + rgb3 = _mm_and_si128(_mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta), _mm_srli_si128(allFFs128, 8)); + } + else + { + // calculate RGB2 and RGB3: + const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)); + const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)); + const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 )); + const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 )); + const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0); + // RGB2b = avg(RGB0, RGB1) + const __m128i rrggbb21 = _mm_avg_epu16(rrggbb0, rrggbb1); + const __m128i rgb210 = _mm_srli_si128(_mm_packus_epi16(rrggbb21, rrggbb21), 8); + rgb2 = rgb210; + rgb3 = _mm_and_si128(_mm_srli_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(1, 1, 1, 1)), 8), _mm_srli_epi32( allFFs128, 8 )); + } + + // if (rgb0 > rgb1): + if (cmp1 != 0) + { + // calculate RGB2 and RGB3: + const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)); + const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)); + const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 )); + const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 )); + const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01); + + // RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3) using arithmetic shifts to extend sign (not logical shifts) + const __m128i rrggbbsubshr11 = _mm_srai_epi16(rrggbbsub1, 1); + const __m128i rrggbbsubshr31 = _mm_srai_epi16(rrggbbsub1, 3); + const __m128i shr1subshr31 = _mm_sub_epi16(rrggbbsubshr11, rrggbbsubshr31); + // low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff) + 
const __m128i low8mask16 = _mm_srli_epi16( allFFs128, 8 ); + const __m128i rrggbbdelta1 = _mm_and_si128(shr1subshr31, low8mask16); + __m128i rgbdelta1 = _mm_packus_epi16(rrggbbdelta1, rrggbbdelta1); + rgbdelta1 = _mm_slli_si128(rgbdelta1, 8); + + rgb2 = _mm_or_si128(rgb2, _mm_and_si128(_mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta1), _mm_slli_si128(allFFs128, 8))); + rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta1), _mm_slli_si128(allFFs128, 8))); + } + else + { + // calculate RGB2 and RGB3: + const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)); + const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)); + const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 )); + const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 )); + const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01); + // RGB2b = avg(RGB0, RGB1) + const __m128i rrggbb211 = _mm_avg_epu16(rrggbb01, rrggbb11); + const __m128i rgb211 = _mm_slli_si128(_mm_packus_epi16(rrggbb211, rrggbb211), 8); + rgb2 = _mm_or_si128(rgb2, rgb211); + + // _mm_srli_epi32( allFFs128, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF) + // Make this color fully transparent: + rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_and_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), _mm_srli_epi32( allFFs128, 8 ) ), _mm_slli_si128(allFFs128, 8))); + } // Create an array for color lookups for DXT0 so we can use the 2-bit indices: - const __m128i rgb01230 = _mm_or_si128( + const __m128i mmcolors0 = _mm_or_si128( _mm_or_si128( _mm_srli_si128(_mm_slli_si128(argb888x4, 8), 8), - _mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb20, 8), 8 + 4), 8) + _mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb2, 8), 8 + 4), 8) ), - _mm_slli_si128(_mm_srli_si128(rgb30, 4), 8 + 4) - ); - const 
__m128i rgb01450 = _mm_or_si128( - _mm_or_si128( - _mm_srli_si128(_mm_slli_si128(argb888x4, 8), 8), - _mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb21, 8), 8 + 4), 8) - ), - _mm_slli_si128(_mm_srli_si128(rgb31, 4), 8 + 4) + _mm_slli_si128(_mm_srli_si128(rgb3, 4), 8 + 4) ); // Create an array for color lookups for DXT1 so we can use the 2-bit indices: - const __m128i rgb01231 = _mm_or_si128( + const __m128i mmcolors1 = _mm_or_si128( _mm_or_si128( _mm_srli_si128(argb888x4, 8), - _mm_slli_si128(_mm_srli_si128(rgb20, 8 + 4), 8) + _mm_slli_si128(_mm_srli_si128(rgb2, 8 + 4), 8) ), - _mm_slli_si128(_mm_srli_si128(rgb30, 8 + 4), 8 + 4) - ); - const __m128i rgb01451 = _mm_or_si128( - _mm_or_si128( - _mm_srli_si128(argb888x4, 8), - _mm_slli_si128(_mm_srli_si128(rgb21, 8 + 4), 8) - ), - _mm_slli_si128(_mm_srli_si128(rgb31, 8 + 4), 8 + 4) + _mm_slli_si128(_mm_srli_si128(rgb3, 8 + 4), 8 + 4) ); // The #ifdef CHECKs here and below are to compare correctness of output against the reference code. @@ -1841,165 +1894,70 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he decodeDXTBlockRGBA(&(tmp1[0][0]), (const DXTBlock *)(src + 8), 4); #endif - // JSD NOTE: I attempted with an earlier solution to use only SSE2 intrinsics and no branching - // or array indexing. That solution was consistently worse than the reference C code so I ditched - // it in favor of having SSE2 calculate the vector-intensive stuff, then have normal C code take - // care of color selection and SSE2 for writes back to memory. This separation of concerns seems - // to work out well. I don't know exactly why the SSE2-only code did not perform very well. I - // suspect it was too many XMM temporaries and not enough registers to balance them all and so - // the assembly generated was swapping a lot of temporaries to RAM. 
- - // Compare rgb0 to rgb1: - - // rgb1rgb1 rgb0rgb0 rgb1rgb1 rgb0rgb0 - // c0 example: {0x 5ef05ef0 c003c003 3ff83ff8 e003e003} in BIG ENDIAN - // c0cmp contains rgb0 values in 64-bit words in LITTLE ENDIAN - // c0cmp: {0x 00000000 000003c0 00000000 000003e0} - // c0shr contains rgb1 values in 64-bit words in LITTLE ENDIAN - // c0cmp: {0x 00000000 0000f05e 00000000 0000f83f} - - __m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16); - const __m128i c0shr = _mm_srli_epi64(c0cmp, 32); - c0cmp = _mm_srli_epi64(_mm_slli_epi64(c0cmp, 48), 48); - -#if 0 - // SSE4 only. - const __m128i cmprgb0rgb1 = _mm_cmpgt_epi64(c0cmp, c0shr); -#else - // We need 64-bit words full of 0xFF bytes for comparison masks. cmgt_epi64 would do that but it's only SSE4 - // so we fake it on SSE2 by shuffling the 32-bit comparison masks to copy the 0xFF bytes to the high 32-bits of - // each 64-bit word. - __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr); - cmprgb0rgb1 = _mm_shuffle_epi32(cmprgb0rgb1, _MM_SHUFFLE(2, 2, 0, 0)); -#endif - - // Now use the comparison mask from rgb0 > rgb1 to conditionally move colors from - // either rgb0123X or rgb0145X: - const __m128i cmp0 = _mm_unpacklo_epi64(cmprgb0rgb1, cmprgb0rgb1); - const __m128i cmp1 = _mm_unpackhi_epi64(cmprgb0rgb1, cmprgb0rgb1); - const __m128i mmcolors0 = _mm_or_si128( - _mm_and_si128(cmp0, rgb01230), - _mm_andnot_si128(cmp0, rgb01450) - ); - const __m128i mmcolors1 = _mm_or_si128( - _mm_and_si128(cmp1, rgb01231), - _mm_andnot_si128(cmp1, rgb01451) - ); - - // Copy the color arrays from the XMM registers to local variables in RAM: - GC_ALIGNED16(u32 colors0[4]); - GC_ALIGNED16(u32 colors1[4]); + u32 *dst32 = ( dst + (y + z*4) * width + x ); + // Copy the colors here: + GC_ALIGNED16( u32 colors0[4] ); + GC_ALIGNED16( u32 colors1[4] ); _mm_store_si128((__m128i *)colors0, mmcolors0); _mm_store_si128((__m128i *)colors1, mmcolors1); - // Copy the 2-bit indices from each DXT block: - GC_ALIGNED16(u32 dxttmp[4]); - 
_mm_store_si128((__m128i *)dxttmp, dxt); - - u32 dxt0sel = dxttmp[1]; - u32 dxt1sel = dxttmp[3]; - - // Per each row written we alternate storing RGBA values from DXT0 and DXT1. - - __m128i *dst128 = (__m128i *)( dst + (y + z*4) * width + x ); - // Row 0: - __m128i col0, col1; - col0 = _mm_set_epi32( - colors0[(dxt0sel >> ((0*8)+0)) & 3], - colors0[(dxt0sel >> ((0*8)+2)) & 3], - colors0[(dxt0sel >> ((0*8)+4)) & 3], - colors0[(dxt0sel >> ((0*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 0), col0); + dst32[(width * 0) + 0] = colors0[(dxt0sel >> ((0*8)+6)) & 3]; + dst32[(width * 0) + 1] = colors0[(dxt0sel >> ((0*8)+4)) & 3]; + dst32[(width * 0) + 2] = colors0[(dxt0sel >> ((0*8)+2)) & 3]; + dst32[(width * 0) + 3] = colors0[(dxt0sel >> ((0*8)+0)) & 3]; + dst32[(width * 0) + 4] = colors1[(dxt1sel >> ((0*8)+6)) & 3]; + dst32[(width * 0) + 5] = colors1[(dxt1sel >> ((0*8)+4)) & 3]; + dst32[(width * 0) + 6] = colors1[(dxt1sel >> ((0*8)+2)) & 3]; + dst32[(width * 0) + 7] = colors1[(dxt1sel >> ((0*8)+0)) & 3]; #ifdef CHECK - assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 ); + assert( memcmp(&(tmp0[0]), &dst32[(width * 0)], 16) == 0 ); + assert( memcmp(&(tmp1[0]), &dst32[(width * 0) + 4], 16) == 0 ); #endif - - col1 = _mm_set_epi32( - colors1[(dxt1sel >> ((0*8)+0)) & 3], - colors1[(dxt1sel >> ((0*8)+2)) & 3], - colors1[(dxt1sel >> ((0*8)+4)) & 3], - colors1[(dxt1sel >> ((0*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1); -#ifdef CHECK - assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 ); -#endif - // Row 1: - col0 = _mm_set_epi32( - colors0[(dxt0sel >> ((1*8)+0)) & 3], - colors0[(dxt0sel >> ((1*8)+2)) & 3], - colors0[(dxt0sel >> ((1*8)+4)) & 3], - colors0[(dxt0sel >> ((1*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 1), col0); + dst32[(width * 1) + 0] = colors0[(dxt0sel >> ((1*8)+6)) & 3]; + dst32[(width * 1) + 1] = colors0[(dxt0sel >> ((1*8)+4)) & 3]; + dst32[(width * 1) + 2] = 
colors0[(dxt0sel >> ((1*8)+2)) & 3]; + dst32[(width * 1) + 3] = colors0[(dxt0sel >> ((1*8)+0)) & 3]; + dst32[(width * 1) + 4] = colors1[(dxt1sel >> ((1*8)+6)) & 3]; + dst32[(width * 1) + 5] = colors1[(dxt1sel >> ((1*8)+4)) & 3]; + dst32[(width * 1) + 6] = colors1[(dxt1sel >> ((1*8)+2)) & 3]; + dst32[(width * 1) + 7] = colors1[(dxt1sel >> ((1*8)+0)) & 3]; #ifdef CHECK - assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 ); + assert( memcmp(&(tmp0[1]), &dst32[(width * 1)], 16) == 0 ); + assert( memcmp(&(tmp1[1]), &dst32[(width * 1) + 4], 16) == 0 ); #endif - - col1 = _mm_set_epi32( - colors1[(dxt1sel >> ((1*8)+0)) & 3], - colors1[(dxt1sel >> ((1*8)+2)) & 3], - colors1[(dxt1sel >> ((1*8)+4)) & 3], - colors1[(dxt1sel >> ((1*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1); -#ifdef CHECK - assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 ); -#endif - // Row 2: - col0 = _mm_set_epi32( - colors0[(dxt0sel >> ((2*8)+0)) & 3], - colors0[(dxt0sel >> ((2*8)+2)) & 3], - colors0[(dxt0sel >> ((2*8)+4)) & 3], - colors0[(dxt0sel >> ((2*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 2), col0); + dst32[(width * 2) + 0] = colors0[(dxt0sel >> ((2*8)+6)) & 3]; + dst32[(width * 2) + 1] = colors0[(dxt0sel >> ((2*8)+4)) & 3]; + dst32[(width * 2) + 2] = colors0[(dxt0sel >> ((2*8)+2)) & 3]; + dst32[(width * 2) + 3] = colors0[(dxt0sel >> ((2*8)+0)) & 3]; + dst32[(width * 2) + 4] = colors1[(dxt1sel >> ((2*8)+6)) & 3]; + dst32[(width * 2) + 5] = colors1[(dxt1sel >> ((2*8)+4)) & 3]; + dst32[(width * 2) + 6] = colors1[(dxt1sel >> ((2*8)+2)) & 3]; + dst32[(width * 2) + 7] = colors1[(dxt1sel >> ((2*8)+0)) & 3]; #ifdef CHECK - assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 ); + assert( memcmp(&(tmp0[2]), &dst32[(width * 2)], 16) == 0 ); + assert( memcmp(&(tmp1[2]), &dst32[(width * 2) + 4], 16) == 0 ); #endif - - col1 = _mm_set_epi32( - colors1[(dxt1sel >> ((2*8)+0)) & 3], - colors1[(dxt1sel >> ((2*8)+2)) & 3], - 
colors1[(dxt1sel >> ((2*8)+4)) & 3], - colors1[(dxt1sel >> ((2*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1); -#ifdef CHECK - assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 ); -#endif - // Row 3: - col0 = _mm_set_epi32( - colors0[(dxt0sel >> ((3*8)+0)) & 3], - colors0[(dxt0sel >> ((3*8)+2)) & 3], - colors0[(dxt0sel >> ((3*8)+4)) & 3], - colors0[(dxt0sel >> ((3*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 3), col0); + dst32[(width * 3) + 0] = colors0[(dxt0sel >> ((3*8)+6)) & 3]; + dst32[(width * 3) + 1] = colors0[(dxt0sel >> ((3*8)+4)) & 3]; + dst32[(width * 3) + 2] = colors0[(dxt0sel >> ((3*8)+2)) & 3]; + dst32[(width * 3) + 3] = colors0[(dxt0sel >> ((3*8)+0)) & 3]; + dst32[(width * 3) + 4] = colors1[(dxt1sel >> ((3*8)+6)) & 3]; + dst32[(width * 3) + 5] = colors1[(dxt1sel >> ((3*8)+4)) & 3]; + dst32[(width * 3) + 6] = colors1[(dxt1sel >> ((3*8)+2)) & 3]; + dst32[(width * 3) + 7] = colors1[(dxt1sel >> ((3*8)+0)) & 3]; #ifdef CHECK - assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 ); -#endif - - col1 = _mm_set_epi32( - colors1[(dxt1sel >> ((3*8)+0)) & 3], - colors1[(dxt1sel >> ((3*8)+2)) & 3], - colors1[(dxt1sel >> ((3*8)+4)) & 3], - colors1[(dxt1sel >> ((3*8)+6)) & 3] - ); - _mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1); -#ifdef CHECK - assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 ); + assert( memcmp(&(tmp0[3]), &dst32[(width * 3)], 16) == 0 ); + assert( memcmp(&(tmp1[3]), &dst32[(width * 3) + 4], 16) == 0 ); #endif } } } -#else +#if 0 for (int y = 0; y < height; y += 8) { for (int x = 0; x < width; x += 8)