~40% speed improvement for decoding GX_TF_CMPR (S3TC) textures using SSE2 intrinsics.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6740 8ced0084-cf51-0410-be5f-012b33b47a6e
2011-01-04 19:54:30 +00:00 · 2011-01-04 19:54:30 +00:00 · 99b4f4703d
parent 77346cc254
commit 99b4f4703d
1 changed files with 394 additions and 0 deletions
--- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
@ -1707,6 +1707,399 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 	case GX_TF_CMPR:  // speed critical
 		// The metroid games use this format almost exclusively.
 		{
+			// JSD optimized with SSE2 intrinsics.
+			// Produces a ~40% improvement in speed over reference C implementation.
+			for (int y = 0; y < height; y += 8)
+			{
+				for (int x = 0; x < width; x += 8)
+				{
+					// We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
+					// This is ideal because a single DXT block contains 2 RGBA colors when decoded from their 16-bit.
+					// Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is parallelizable
+					// at this level, so we do.
+					for (int z = 0; z < 2; ++z, src += sizeof(struct DXTBlock) * 2)
+					{
+						// JSD NOTE: You may see many strange patterns of behavior in the below code, but they
+						// are for performance reasons. Sometimes, calculating what should be obvious hard-coded
+						// constants is faster than loading their values from memory. Unfortunately, there is no
+						// way to inline 128-bit constants from opcodes so they must be loaded from memory. This
+						// seems a little ridiculous to me in that you can't even generate a constant value of 1 without
+						// having to load it from memory. So, I stored the minimal constant I could, 128-bits worth
+						// of 1s :). Then I use sequences of shifts to squash it to the appropriate size and bit
+						// positions that I need.
+
+						// Load 128 bits, i.e. two DXTBlocks (64-bits each)
+						const __m128i dxt = _mm_loadu_si128((__m128i *)(src + sizeof(struct DXTBlock) * 0));
+						__m128i *dst128 = (__m128i *)( dst + (y + z*4) * width + x );
+						__m128i argb888x4;
+						const __m128i allFF = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL);
+						const __m128i lowMask = _mm_srli_si128( allFF, 8 );
+						__m128i c1 = _mm_unpackhi_epi16(dxt, dxt);
+						c1 = _mm_slli_si128(c1, 8);
+						const __m128i c0 = _mm_or_si128(c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
+						// green:
+						// NOTE: We start with the larger number of bits (6) firts for G and shift the mask down 1 bit to get a 5-bit mask
+						// later for R and B components.
+						// low6mask == _mm_set_epi32(0x0000FC00, 0x0000FC00, 0x0000FC00, 0x0000FC00)
+						const __m128i low6mask = _mm_slli_epi32( _mm_srli_epi32(allFF, 24 + 2), 8 + 2);
+						const __m128i gtmp = _mm_srli_epi32(c0, 3);
+						const __m128i g0 = _mm_and_si128(gtmp, low6mask);
+						const __m128i g1 = _mm_and_si128(_mm_srli_epi32(gtmp, 6), _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
+						argb888x4 = _mm_or_si128(g0, g1);
+						// red:
+						// low5mask == _mm_set_epi32(0x000000F8, 0x000000F8, 0x000000F8, 0x000000F8)
+						const __m128i low5mask = _mm_slli_epi32( _mm_srli_epi32(low6mask, 8 + 3), 3);
+						const __m128i r0 = _mm_and_si128(c0, low5mask);
+						const __m128i r1 = _mm_srli_epi32(r0, 5);
+						argb888x4 = _mm_or_si128(argb888x4, _mm_or_si128(r0, r1));
+						// blue:
+						// _mm_slli_epi32(low5mask, 16) == _mm_set_epi32(0x00F80000, 0x00F80000, 0x00F80000, 0x00F80000)
+						const __m128i b0 = _mm_and_si128(_mm_srli_epi32(c0, 5), _mm_slli_epi32(low5mask, 16));
+						const __m128i b1 = _mm_srli_epi16(b0, 5);
+						// OR in the fixed alpha component
+						// _mm_slli_epi32( allFF, 24 ) == _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)
+						argb888x4 = _mm_or_si128(_mm_or_si128(argb888x4, _mm_slli_epi32( allFF, 24 ) ), _mm_or_si128(b0, b1));
+						// calculate RGB2 and RGB3:
+						const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
+						const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
+						const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFF, 8 ));
+						const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFF, 8 ));
+						const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFF, 8 ));
+						const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFF, 8 ));
+						const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0);
+						const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01);
+#if 0
+						// RGB2b = (RGB0 + RGB1 + 1) / 2
+						const __m128i one16 = _mm_srli_epi16( allFF, 15 );
+						const __m128i rrggbb21  = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(rrggbb0, rrggbb1), one16), 1);
+						const __m128i rrggbb211 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(rrggbb01, rrggbb11), one16), 1);
+#else
+						// RGB2b = avg(RGB0, RGB1)
+						const __m128i rrggbb21  = _mm_avg_epu16(rrggbb0, rrggbb1);
+						const __m128i rrggbb211 = _mm_avg_epu16(rrggbb01, rrggbb11);
+#endif
+						const __m128i rgb210 = _mm_srli_si128(_mm_packus_epi16(rrggbb21, rrggbb21), 8);
+						const __m128i rgb211 = _mm_slli_si128(_mm_packus_epi16(rrggbb211, rrggbb211), 8);
+						const __m128i rgb21 = _mm_or_si128(rgb210, rgb211);
+						// RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3)  using arithmetic shifts to extend sign (not logical shifts)
+						const __m128i rrggbbsubshr1 = _mm_srai_epi16(rrggbbsub, 1);
+						const __m128i rrggbbsubshr3 = _mm_srai_epi16(rrggbbsub, 3);
+						const __m128i rrggbbsubshr11 = _mm_srai_epi16(rrggbbsub1, 1);
+						const __m128i rrggbbsubshr31 = _mm_srai_epi16(rrggbbsub1, 3);
+						const __m128i shr1subshr3 = _mm_sub_epi16(rrggbbsubshr1, rrggbbsubshr3);
+						const __m128i shr1subshr31 = _mm_sub_epi16(rrggbbsubshr11, rrggbbsubshr31);
+						// low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff)
+						const __m128i low8mask16 = _mm_srli_epi16( allFF, 8 );
+						const __m128i rrggbbdelta = _mm_and_si128(shr1subshr3, low8mask16);
+						const __m128i rrggbbdelta1 = _mm_and_si128(shr1subshr31, low8mask16);
+						const __m128i rgbdelta0 = _mm_packus_epi16(rrggbbdelta, rrggbbdelta);
+						__m128i rgbdelta1 = _mm_packus_epi16(rrggbbdelta1, rrggbbdelta1);
+						rgbdelta1 = _mm_slli_si128(rgbdelta1, 8);
+						const __m128i rgbdelta = _mm_or_si128(_mm_srli_si128(_mm_slli_si128(rgbdelta0, 8), 8), rgbdelta1);
+						const __m128i rgb20 = _mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta);
+						const __m128i rgb30 = _mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta);
+						const __m128i rgb31 = _mm_and_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), _mm_srli_epi32( allFF, 8 ) /*_mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF)*/ );
+
+						// Create an array for color lookups for DXT0 so we can use the 2-bit indices:
+						const __m128i rgb01230 = _mm_or_si128(
+							_mm_or_si128(
+								_mm_srli_si128(_mm_slli_si128(argb888x4, 8), 8),
+								_mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb20, 8), 8 + 4), 8)
+							),
+							_mm_slli_si128(_mm_srli_si128(rgb30, 4), 8 + 4)
+						);
+						const __m128i rgb01450 = _mm_or_si128(
+							_mm_or_si128(
+								_mm_srli_si128(_mm_slli_si128(argb888x4, 8), 8),
+								_mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb21, 8), 8 + 4), 8)
+							),
+							_mm_slli_si128(_mm_srli_si128(rgb31, 4), 8 + 4)
+						);
+
+						// Create an array for color lookups for DXT1 so we can use the 2-bit indices:
+						const __m128i rgb01231 = _mm_or_si128(
+							_mm_or_si128(
+								_mm_srli_si128(argb888x4, 8),
+								_mm_slli_si128(_mm_srli_si128(rgb20, 8 + 4), 8)
+							),
+							_mm_slli_si128(_mm_srli_si128(rgb30, 8 + 4), 8 + 4)
+						);
+						const __m128i rgb01451 = _mm_or_si128(
+							_mm_or_si128(
+								_mm_srli_si128(argb888x4, 8),
+								_mm_slli_si128(_mm_srli_si128(rgb21, 8 + 4), 8)
+							),
+							_mm_slli_si128(_mm_srli_si128(rgb31, 8 + 4), 8 + 4)
+						);
+
+						// The #ifdef CHECKs here and below are to compare correctness of output against the reference code.
+						// Don't use them in a normal build.
+#ifdef CHECK
+						// REFERENCE:
+						u32 tmp0[4][4], tmp1[4][4];
+
+						decodeDXTBlockRGBA(&(tmp0[0][0]), (const DXTBlock *)src, 4);
+						decodeDXTBlockRGBA(&(tmp1[0][0]), (const DXTBlock *)(src + 8), 4);
+#endif
+
+						// JSD NOTE: I attempted with an earlier solution to use only SSE2 intrinsics and no branching
+						// or array indexing. That solution was consistently worse than the reference C code so I ditched
+						// it in favor of having SSE2 calculate the vector-intensive stuff, then have normal C code take
+						// care of color selection and SSE2 for writes back to memory. This separation of concerns seems
+						// to work out well. I don't know exactly why the SSE2-only code did not perform very well. I
+						// suspect it was too many XMM temporaries and not enough registers to balance them all and so
+						// the assembly generated was swapping a lot of temporaries to RAM.
+
+						// Compare rgb0 to rgb1:
+						// Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
+						const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(rgb0, rgb1);
+
+						// The 2-bit indices read from each DXT block:
+						u32 dxt0sel = dxt.m128i_u32[1];
+						u32 dxt1sel = dxt.m128i_u32[3];
+
+						// Per each row written we alternate storing RGBA values from DXT0 and DXT1.
+
+						// Row 0:
+						__m128i col0, col1;
+						if (cmprgb0rgb1.m128i_u32[0])
+						{
+							// rgb0a > rgb1a
+							col0 = _mm_set_epi32(
+								rgb01230.m128i_u32[(dxt0sel >> ((0*8)+0)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((0*8)+2)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((0*8)+4)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((0*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 0), col0);
+#ifdef CHECK
+							assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
+#endif
+						}
+						else
+						{
+							// rgb0a <= rgb1a
+							col0 = _mm_set_epi32(
+								rgb01450.m128i_u32[(dxt0sel >> ((0*8)+0)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((0*8)+2)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((0*8)+4)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((0*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 0), col0);
+#ifdef CHECK
+							assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
+#endif
+						}
+
+						if (cmprgb0rgb1.m128i_u32[2])
+						{
+							// (rgb0b > rgb1b)
+							col1 = _mm_set_epi32(
+								rgb01231.m128i_u32[(dxt1sel >> ((0*8)+0)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((0*8)+2)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((0*8)+4)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((0*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
+#ifdef CHECK
+							assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
+#endif
+						}
+						else
+						{
+							// (rgb0b <= rgb1b)
+							col1 = _mm_set_epi32(
+								rgb01451.m128i_u32[(dxt1sel >> ((0*8)+0)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((0*8)+2)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((0*8)+4)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((0*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
+#ifdef CHECK
+							assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
+#endif
+						}
+
+						// Row 1:
+						if (cmprgb0rgb1.m128i_u32[0])
+						{
+							// rgb0a > rgb1a
+							col0 = _mm_set_epi32(
+								rgb01230.m128i_u32[(dxt0sel >> ((1*8)+0)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((1*8)+2)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((1*8)+4)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((1*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 1), col0);
+#ifdef CHECK
+							assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
+#endif
+						}
+						else
+						{
+							// rgb0a <= rgb1a
+							col0 = _mm_set_epi32(
+								rgb01450.m128i_u32[(dxt0sel >> ((1*8)+0)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((1*8)+2)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((1*8)+4)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((1*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 1), col0);
+#ifdef CHECK
+							assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
+#endif
+						}
+
+						if (cmprgb0rgb1.m128i_u32[2])
+						{
+							// (rgb0b > rgb1b)
+							col1 = _mm_set_epi32(
+								rgb01231.m128i_u32[(dxt1sel >> ((1*8)+0)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((1*8)+2)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((1*8)+4)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((1*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
+#ifdef CHECK
+							assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
+#endif
+						}
+						else
+						{
+							// (rgb0b <= rgb1b)
+							col1 = _mm_set_epi32(
+								rgb01451.m128i_u32[(dxt1sel >> ((1*8)+0)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((1*8)+2)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((1*8)+4)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((1*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
+#ifdef CHECK
+							assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
+#endif
+						}
+
+						// Row 2:
+						if (cmprgb0rgb1.m128i_u32[0])
+						{
+							// rgb0a > rgb1a
+							col0 = _mm_set_epi32(
+								rgb01230.m128i_u32[(dxt0sel >> ((2*8)+0)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((2*8)+2)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((2*8)+4)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((2*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 2), col0);
+#ifdef CHECK
+							assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
+#endif
+						}
+						else
+						{
+							// rgb0a <= rgb1a
+							col0 = _mm_set_epi32(
+								rgb01450.m128i_u32[(dxt0sel >> ((2*8)+0)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((2*8)+2)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((2*8)+4)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((2*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 2), col0);
+#ifdef CHECK
+							assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
+#endif
+						}
+
+						if (cmprgb0rgb1.m128i_u32[2])
+						{
+							// (rgb0b > rgb1b)
+							col1 = _mm_set_epi32(
+								rgb01231.m128i_u32[(dxt1sel >> ((2*8)+0)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((2*8)+2)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((2*8)+4)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((2*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
+#ifdef CHECK
+							assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
+#endif
+						}
+						else
+						{
+							// (rgb0b <= rgb1b)
+							col1 = _mm_set_epi32(
+								rgb01451.m128i_u32[(dxt1sel >> ((2*8)+0)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((2*8)+2)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((2*8)+4)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((2*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
+#ifdef CHECK
+							assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
+#endif
+						}
+
+						// Row 3:
+						if (cmprgb0rgb1.m128i_u32[0])
+						{
+							// rgb0a > rgb1a
+							col0 = _mm_set_epi32(
+								rgb01230.m128i_u32[(dxt0sel >> ((3*8)+0)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((3*8)+2)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((3*8)+4)) & 3],
+								rgb01230.m128i_u32[(dxt0sel >> ((3*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 3), col0);
+#ifdef CHECK
+							assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
+#endif
+						}
+						else
+						{
+							// rgb0a <= rgb1a
+							col0 = _mm_set_epi32(
+								rgb01450.m128i_u32[(dxt0sel >> ((3*8)+0)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((3*8)+2)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((3*8)+4)) & 3],
+								rgb01450.m128i_u32[(dxt0sel >> ((3*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 3), col0);
+#ifdef CHECK
+							assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
+#endif
+						}
+
+						if (cmprgb0rgb1.m128i_u32[2])
+						{
+							// (rgb0b > rgb1b)
+							col1 = _mm_set_epi32(
+								rgb01231.m128i_u32[(dxt1sel >> ((3*8)+0)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((3*8)+2)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((3*8)+4)) & 3],
+								rgb01231.m128i_u32[(dxt1sel >> ((3*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
+#ifdef CHECK
+							assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 );
+#endif
+						}
+						else
+						{
+							// (rgb0b <= rgb1b)
+							col1 = _mm_set_epi32(
+								rgb01451.m128i_u32[(dxt1sel >> ((3*8)+0)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((3*8)+2)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((3*8)+4)) & 3],
+								rgb01451.m128i_u32[(dxt1sel >> ((3*8)+6)) & 3]
+							);
+							_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
+#ifdef CHECK
+							assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 );
+#endif
+						}
+					}
+				}
+			}
+#if 0
 			for (int y = 0; y < height; y += 8)
 			{
 				for (int x = 0; x < width; x += 8)
@ -1721,6 +2114,7 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 										src += sizeof(DXTBlock);
 				}
 			}
+#endif
 			break;
 		}
 	}