From 2841d67ce314ae6ddc1f92828c928470d146c871 Mon Sep 17 00:00:00 2001
From: "james.jdunne" <james.jdunne@gmail.com>
Date: Thu, 6 Jan 2011 16:41:20 +0000
Subject: [PATCH] Faster SSE2 optimized GX_TF_CMPR texture decoder which gets
 ~40% speed improvement on x64 and ~50% improvement on x86 as compared to
 reference C code. The code now uses direct pointer access from C code to
 write the colors to the destination texture instead of trying to force them
 back up into an __m128i and a single write call. This is what produces the
 major speed-up.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@6761 8ced0084-cf51-0410-be5f-012b33b47a6e
---
 .../Core/VideoCommon/Src/TextureDecoder.cpp   | 380 ++++++++----------
 1 file changed, 169 insertions(+), 211 deletions(-)

diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
index 150f2b7ae2..f3f647ef3d 100644
--- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
@@ -919,8 +919,13 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
 
 
 
-
-
+// JSD 01/06/11:
+// TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to
+// squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 are slower than _mm_load_si128/_mm_store_si128
+// because they work on unaligned addresses. The processor is free to make the assumption that addresses are multiples
+// of 16 in the aligned case.
+// TODO: complete SSE2 optimization of less often used texture formats.
+// TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
 
 PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
@@ -1705,9 +1710,10 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 	case GX_TF_CMPR:  // speed critical
 		// The metroid games use this format almost exclusively.
 		{
-#if !defined(_M_X64)
 			// JSD optimized with SSE2 intrinsics.
-			// Produces a 30% improvement for x86 code only.
+			// Produces a ~50% speed improvement for x86 and a ~40% speed improvement for x64 over the reference C implementation.
+			// The x64 compiled reference C code is faster than the x86 compiled reference C code, but the SSE2 is
+			// faster than both.
 			for (int y = 0; y < height; y += 8)
 			{
 				for (int x = 0; x < width; x += 8)
@@ -1727,20 +1733,42 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 						// of 1s :). Then I use sequences of shifts to squash it to the appropriate size and bit
 						// positions that I need.
 
+						const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
+
 						// Load 128 bits, i.e. two DXTBlocks (64-bits each)
-						const __m128i dxt = _mm_loadu_si128((__m128i *)(src + sizeof(struct DXTBlock) * 0));
+						const __m128i dxt = _mm_loadu_si128((__m128i *)src);
+
+						// Copy the 2-bit indices from each DXT block:
+						__declspec(align(16)) u32 dxttmp[4];
+						_mm_store_si128((__m128i *)dxttmp, dxt);
+
+						u32 dxt0sel = dxttmp[1];
+						u32 dxt1sel = dxttmp[3];
+
 						__m128i argb888x4;
-						const __m128i allFF = _mm_set_epi32(0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL, 0xFFFFFFFFL);
+						const __m128i lowMask = _mm_srli_si128( allFFs128, 8 );
 						__m128i c1 = _mm_unpackhi_epi16(dxt, dxt);
 						c1 = _mm_slli_si128(c1, 8);
 						const __m128i c0 = _mm_or_si128(c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
+
+						// Compare rgb0 to rgb1:
+						// Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
+						const __m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16);
+						const __m128i c0shr = _mm_srli_epi64(c0cmp, 32);
+						const __m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr);
+
+						int cmp0 = _mm_extract_epi16(cmprgb0rgb1, 0);
+						int cmp1 = _mm_extract_epi16(cmprgb0rgb1, 4);
+
 						// green:
 						// NOTE: We start with the larger number of bits (6) firts for G and shift the mask down 1 bit to get a 5-bit mask
 						// later for R and B components.
 						// low6mask == _mm_set_epi32(0x0000FC00, 0x0000FC00, 0x0000FC00, 0x0000FC00)
-						const __m128i low6mask = _mm_slli_epi32( _mm_srli_epi32(allFF, 24 + 2), 8 + 2);
+						const __m128i low6mask = _mm_slli_epi32( _mm_srli_epi32(allFFs128, 24 + 2), 8 + 2);
 						const __m128i gtmp = _mm_srli_epi32(c0, 3);
 						const __m128i g0 = _mm_and_si128(gtmp, low6mask);
+						// low3mask == _mm_set_epi32(0x00000700, 0x00000700, 0x00000700, 0x00000700); NOTE: this is NOT the 2-bit 0x00000300 mask that g1 uses below.
+						const __m128i low3mask = _mm_slli_epi32(_mm_srli_epi32(allFFs128, 32 - 3), 8);
 						const __m128i g1 = _mm_and_si128(_mm_srli_epi32(gtmp, 6), _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
 						argb888x4 = _mm_or_si128(g0, g1);
 						// red:
@@ -1754,81 +1782,106 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 						const __m128i b0 = _mm_and_si128(_mm_srli_epi32(c0, 5), _mm_slli_epi32(low5mask, 16));
 						const __m128i b1 = _mm_srli_epi16(b0, 5);
 						// OR in the fixed alpha component
-						// _mm_slli_epi32( allFF, 24 ) == _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)
-						argb888x4 = _mm_or_si128(_mm_or_si128(argb888x4, _mm_slli_epi32( allFF, 24 ) ), _mm_or_si128(b0, b1));
-						// calculate RGB2 and RGB3:
-						const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
-						const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
-						const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFF, 8 ));
-						const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFF, 8 ));
-						const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFF, 8 ));
-						const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFF, 8 ));
-						const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0);
-						const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01);
-#if 0
-						// RGB2b = (RGB0 + RGB1 + 1) / 2
-						const __m128i one16 = _mm_srli_epi16( allFF, 15 );
-						const __m128i rrggbb21  = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(rrggbb0, rrggbb1), one16), 1);
-						const __m128i rrggbb211 = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(rrggbb01, rrggbb11), one16), 1);
-#else
-						// RGB2b = avg(RGB0, RGB1)
-						const __m128i rrggbb21  = _mm_avg_epu16(rrggbb0, rrggbb1);
-						const __m128i rrggbb211 = _mm_avg_epu16(rrggbb01, rrggbb11);
-#endif
-						const __m128i rgb210 = _mm_srli_si128(_mm_packus_epi16(rrggbb21, rrggbb21), 8);
-						const __m128i rgb211 = _mm_slli_si128(_mm_packus_epi16(rrggbb211, rrggbb211), 8);
-						const __m128i rgb21 = _mm_or_si128(rgb210, rgb211);
-						// RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3)  using arithmetic shifts to extend sign (not logical shifts)
-						const __m128i rrggbbsubshr1 = _mm_srai_epi16(rrggbbsub, 1);
-						const __m128i rrggbbsubshr3 = _mm_srai_epi16(rrggbbsub, 3);
-						const __m128i rrggbbsubshr11 = _mm_srai_epi16(rrggbbsub1, 1);
-						const __m128i rrggbbsubshr31 = _mm_srai_epi16(rrggbbsub1, 3);
-						const __m128i shr1subshr3 = _mm_sub_epi16(rrggbbsubshr1, rrggbbsubshr3);
-						const __m128i shr1subshr31 = _mm_sub_epi16(rrggbbsubshr11, rrggbbsubshr31);
-						// low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff)
-						const __m128i low8mask16 = _mm_srli_epi16( allFF, 8 );
-						const __m128i rrggbbdelta = _mm_and_si128(shr1subshr3, low8mask16);
-						const __m128i rrggbbdelta1 = _mm_and_si128(shr1subshr31, low8mask16);
-						const __m128i rgbdelta0 = _mm_packus_epi16(rrggbbdelta, rrggbbdelta);
-						__m128i rgbdelta1 = _mm_packus_epi16(rrggbbdelta1, rrggbbdelta1);
-						rgbdelta1 = _mm_slli_si128(rgbdelta1, 8);
-						const __m128i rgbdelta = _mm_or_si128(_mm_srli_si128(_mm_slli_si128(rgbdelta0, 8), 8), rgbdelta1);
-						const __m128i rgb20 = _mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta);
-						const __m128i rgb30 = _mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta);
-						// _mm_srli_epi32( allFF, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF)
-						// Make this color fully transparent:
-						const __m128i rgb31 = _mm_and_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), _mm_srli_epi32( allFF, 8 ) );
+						// _mm_slli_epi32( allFFs128, 24 ) == _mm_set_epi32(0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000)
+						argb888x4 = _mm_or_si128(_mm_or_si128(argb888x4, _mm_slli_epi32( allFFs128, 24 ) ), _mm_or_si128(b0, b1));
+
+						__m128i rgb2, rgb3;
+
+						// if (rgb0 > rgb1):
+						if (cmp0 != 0)
+						{
+							// calculate RGB2 and RGB3:
+							const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
+							const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
+							const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 ));
+							const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 ));
+							const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0);
+
+							// RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3)  using arithmetic shifts to extend sign (not logical shifts)
+							const __m128i rrggbbsubshr1 = _mm_srai_epi16(rrggbbsub, 1);
+							const __m128i rrggbbsubshr3 = _mm_srai_epi16(rrggbbsub, 3);
+							const __m128i shr1subshr3 = _mm_sub_epi16(rrggbbsubshr1, rrggbbsubshr3);
+							// low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff)
+							const __m128i low8mask16 = _mm_srli_epi16( allFFs128, 8 );
+							const __m128i rrggbbdelta = _mm_and_si128(shr1subshr3, low8mask16);
+							const __m128i rgbdeltadup = _mm_packus_epi16(rrggbbdelta, rrggbbdelta);
+							const __m128i rgbdelta = _mm_srli_si128(_mm_slli_si128(rgbdeltadup, 8), 8);
+
+							rgb2 = _mm_and_si128(_mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta), _mm_srli_si128(allFFs128, 8));
+							rgb3 = _mm_and_si128(_mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta), _mm_srli_si128(allFFs128, 8));
+						}
+						else
+						{
+							// calculate RGB2 and RGB3:
+							const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
+							const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
+							const __m128i rrggbb0 = _mm_and_si128(_mm_unpacklo_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 ));
+							const __m128i rrggbb1 = _mm_and_si128(_mm_unpacklo_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 ));
+							const __m128i rrggbbsub = _mm_subs_epi16(rrggbb1, rrggbb0);
+							// RGB2b = avg(RGB0, RGB1)
+							const __m128i rrggbb21  = _mm_avg_epu16(rrggbb0, rrggbb1);
+							const __m128i rgb210 = _mm_srli_si128(_mm_packus_epi16(rrggbb21, rrggbb21), 8);
+							rgb2 = rgb210;
+							rgb3 = _mm_and_si128(_mm_srli_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(1, 1, 1, 1)), 8), _mm_srli_epi32( allFFs128, 8 ));
+						}
+
+						// if (rgb0 > rgb1):
+						if (cmp1 != 0)
+						{
+							// calculate RGB2 and RGB3:
+							const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
+							const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
+							const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 ));
+							const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 ));
+							const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01);
+
+							// RGB2a = ((RGB1 - RGB0) >> 1) - ((RGB1 - RGB0) >> 3)  using arithmetic shifts to extend sign (not logical shifts)
+							const __m128i rrggbbsubshr11 = _mm_srai_epi16(rrggbbsub1, 1);
+							const __m128i rrggbbsubshr31 = _mm_srai_epi16(rrggbbsub1, 3);
+							const __m128i shr1subshr31 = _mm_sub_epi16(rrggbbsubshr11, rrggbbsubshr31);
+							// low8mask16 == _mm_set_epi16(0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff)
+							const __m128i low8mask16 = _mm_srli_epi16( allFFs128, 8 );
+							const __m128i rrggbbdelta1 = _mm_and_si128(shr1subshr31, low8mask16);
+							__m128i rgbdelta1 = _mm_packus_epi16(rrggbbdelta1, rrggbbdelta1);
+							rgbdelta1 = _mm_slli_si128(rgbdelta1, 8);
+
+							rgb2 = _mm_or_si128(rgb2, _mm_and_si128(_mm_add_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0)), rgbdelta1), _mm_slli_si128(allFFs128, 8)));
+							rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_sub_epi8(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), rgbdelta1), _mm_slli_si128(allFFs128, 8)));
+						}
+						else
+						{
+							// calculate RGB2 and RGB3:
+							const __m128i rgb0 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(2, 2, 0, 0));
+							const __m128i rgb1 = _mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1));
+							const __m128i rrggbb01 = _mm_and_si128(_mm_unpackhi_epi8(rgb0, rgb0), _mm_srli_epi16( allFFs128, 8 ));
+							const __m128i rrggbb11 = _mm_and_si128(_mm_unpackhi_epi8(rgb1, rgb1), _mm_srli_epi16( allFFs128, 8 ));
+							const __m128i rrggbbsub1 = _mm_subs_epi16(rrggbb11, rrggbb01);
+							// RGB2b = avg(RGB0, RGB1)
+							const __m128i rrggbb211 = _mm_avg_epu16(rrggbb01, rrggbb11);
+							const __m128i rgb211 = _mm_slli_si128(_mm_packus_epi16(rrggbb211, rrggbb211), 8);
+							rgb2 = _mm_or_si128(rgb2, rgb211);
+
+							// _mm_srli_epi32( allFFs128, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF)
+							// Make this color fully transparent:
+							rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_and_si128(_mm_shuffle_epi32(argb888x4, _MM_SHUFFLE(3, 3, 1, 1)), _mm_srli_epi32( allFFs128, 8 ) ), _mm_slli_si128(allFFs128, 8)));
+						}
 
 						// Create an array for color lookups for DXT0 so we can use the 2-bit indices:
-						const __m128i rgb01230 = _mm_or_si128(
+						const __m128i mmcolors0 = _mm_or_si128(
 							_mm_or_si128(
 								_mm_srli_si128(_mm_slli_si128(argb888x4, 8), 8),
-								_mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb20, 8), 8 + 4), 8)
+								_mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb2, 8), 8 + 4), 8)
 							),
-							_mm_slli_si128(_mm_srli_si128(rgb30, 4), 8 + 4)
-						);
-						const __m128i rgb01450 = _mm_or_si128(
-							_mm_or_si128(
-								_mm_srli_si128(_mm_slli_si128(argb888x4, 8), 8),
-								_mm_slli_si128(_mm_srli_si128(_mm_slli_si128(rgb21, 8), 8 + 4), 8)
-							),
-							_mm_slli_si128(_mm_srli_si128(rgb31, 4), 8 + 4)
+							_mm_slli_si128(_mm_srli_si128(rgb3, 4), 8 + 4)
 						);
 
 						// Create an array for color lookups for DXT1 so we can use the 2-bit indices:
-						const __m128i rgb01231 = _mm_or_si128(
+						const __m128i mmcolors1 = _mm_or_si128(
 							_mm_or_si128(
 								_mm_srli_si128(argb888x4, 8),
-								_mm_slli_si128(_mm_srli_si128(rgb20, 8 + 4), 8)
+								_mm_slli_si128(_mm_srli_si128(rgb2, 8 + 4), 8)
 							),
-							_mm_slli_si128(_mm_srli_si128(rgb30, 8 + 4), 8 + 4)
-						);
-						const __m128i rgb01451 = _mm_or_si128(
-							_mm_or_si128(
-								_mm_srli_si128(argb888x4, 8),
-								_mm_slli_si128(_mm_srli_si128(rgb21, 8 + 4), 8)
-							),
-							_mm_slli_si128(_mm_srli_si128(rgb31, 8 + 4), 8 + 4)
+							_mm_slli_si128(_mm_srli_si128(rgb3, 8 + 4), 8 + 4)
 						);
 
 						// The #ifdef CHECKs here and below are to compare correctness of output against the reference code.
@@ -1841,165 +1894,70 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 						decodeDXTBlockRGBA(&(tmp1[0][0]), (const DXTBlock *)(src + 8), 4);
 #endif
 
-						// JSD NOTE: I attempted with an earlier solution to use only SSE2 intrinsics and no branching
-						// or array indexing. That solution was consistently worse than the reference C code so I ditched
-						// it in favor of having SSE2 calculate the vector-intensive stuff, then have normal C code take
-						// care of color selection and SSE2 for writes back to memory. This separation of concerns seems
-						// to work out well. I don't know exactly why the SSE2-only code did not perform very well. I
-						// suspect it was too many XMM temporaries and not enough registers to balance them all and so
-						// the assembly generated was swapping a lot of temporaries to RAM.
-
-						// Compare rgb0 to rgb1:
-
-						//                 rgb1rgb1 rgb0rgb0 rgb1rgb1 rgb0rgb0
-						// c0 example: {0x 5ef05ef0 c003c003 3ff83ff8 e003e003} in BIG ENDIAN
-						// c0cmp contains rgb0 values in 64-bit words in LITTLE ENDIAN
-						// c0cmp:      {0x 00000000 000003c0 00000000 000003e0}
-						// c0shr contains rgb1 values in 64-bit words in LITTLE ENDIAN
-						// c0cmp:      {0x 00000000 0000f05e 00000000 0000f83f}
-
-						__m128i c0cmp = _mm_srli_epi32(_mm_slli_epi32(_mm_srli_epi64(c0, 8), 16), 16);
-						const __m128i c0shr = _mm_srli_epi64(c0cmp, 32);
-						c0cmp = _mm_srli_epi64(_mm_slli_epi64(c0cmp, 48), 48);
-
-#if 0
-						// SSE4 only.
-						const __m128i cmprgb0rgb1 = _mm_cmpgt_epi64(c0cmp, c0shr);
-#else
-						// We need 64-bit words full of 0xFF bytes for comparison masks. cmgt_epi64 would do that but it's only SSE4
-						// so we fake it on SSE2 by shuffling the 32-bit comparison masks to copy the 0xFF bytes to the high 32-bits of
-						// each 64-bit word.
-						__m128i cmprgb0rgb1 = _mm_cmpgt_epi32(c0cmp, c0shr);
-						cmprgb0rgb1 = _mm_shuffle_epi32(cmprgb0rgb1, _MM_SHUFFLE(2, 2, 0, 0));
-#endif
-
-						// Now use the comparison mask from rgb0 > rgb1 to conditionally move colors from
-						// either rgb0123X or rgb0145X:
-						const __m128i cmp0 = _mm_unpacklo_epi64(cmprgb0rgb1, cmprgb0rgb1);
-						const __m128i cmp1 = _mm_unpackhi_epi64(cmprgb0rgb1, cmprgb0rgb1);
-						const __m128i mmcolors0 = _mm_or_si128(
-							_mm_and_si128(cmp0, rgb01230),
-							_mm_andnot_si128(cmp0, rgb01450)
-						);
-						const __m128i mmcolors1 = _mm_or_si128(
-							_mm_and_si128(cmp1, rgb01231),
-							_mm_andnot_si128(cmp1, rgb01451)
-						);
-
-						// Copy the color arrays from the XMM registers to local variables in RAM:
-						GC_ALIGNED16(u32 colors0[4]);
-						GC_ALIGNED16(u32 colors1[4]);
+						u32 *dst32 = ( dst + (y + z*4) * width + x );
 
+						// Copy the colors here:
+						GC_ALIGNED16( u32 colors0[4] );
+						GC_ALIGNED16( u32 colors1[4] );
 						_mm_store_si128((__m128i *)colors0, mmcolors0);
 						_mm_store_si128((__m128i *)colors1, mmcolors1);
 
-						// Copy the 2-bit indices from each DXT block:
-						GC_ALIGNED16(u32 dxttmp[4]);
-						_mm_store_si128((__m128i *)dxttmp, dxt);
-
-						u32 dxt0sel = dxttmp[1];
-						u32 dxt1sel = dxttmp[3];
-
-						// Per each row written we alternate storing RGBA values from DXT0 and DXT1.
-
-						__m128i *dst128 = (__m128i *)( dst + (y + z*4) * width + x );
-
 						// Row 0:
-						__m128i col0, col1;
-						col0 = _mm_set_epi32(
-							colors0[(dxt0sel >> ((0*8)+0)) & 3],
-							colors0[(dxt0sel >> ((0*8)+2)) & 3],
-							colors0[(dxt0sel >> ((0*8)+4)) & 3],
-							colors0[(dxt0sel >> ((0*8)+6)) & 3]
-						);
-						_mm_store_si128(dst128 + ((width / 4) * 0), col0);
+						dst32[(width * 0) + 0] = colors0[(dxt0sel >> ((0*8)+6)) & 3];
+						dst32[(width * 0) + 1] = colors0[(dxt0sel >> ((0*8)+4)) & 3];
+						dst32[(width * 0) + 2] = colors0[(dxt0sel >> ((0*8)+2)) & 3];
+						dst32[(width * 0) + 3] = colors0[(dxt0sel >> ((0*8)+0)) & 3];
+						dst32[(width * 0) + 4] = colors1[(dxt1sel >> ((0*8)+6)) & 3];
+						dst32[(width * 0) + 5] = colors1[(dxt1sel >> ((0*8)+4)) & 3];
+						dst32[(width * 0) + 6] = colors1[(dxt1sel >> ((0*8)+2)) & 3];
+						dst32[(width * 0) + 7] = colors1[(dxt1sel >> ((0*8)+0)) & 3];
 #ifdef CHECK
-						assert( memcmp(&(tmp0[0]), dst128 + ((width / 4) * 0), 16) == 0 );
+						assert( memcmp(&(tmp0[0]), &dst32[(width * 0)], 16) == 0 );
+						assert( memcmp(&(tmp1[0]), &dst32[(width * 0) + 4], 16) == 0 );
 #endif
-
-						col1 = _mm_set_epi32(
-							colors1[(dxt1sel >> ((0*8)+0)) & 3],
-							colors1[(dxt1sel >> ((0*8)+2)) & 3],
-							colors1[(dxt1sel >> ((0*8)+4)) & 3],
-							colors1[(dxt1sel >> ((0*8)+6)) & 3]
-						);
-						_mm_store_si128(dst128 + ((width / 4) * 0) + 1, col1);
-#ifdef CHECK
-						assert( memcmp(&(tmp1[0]), dst128 + ((width / 4) * 0) + 1, 16) == 0 );
-#endif
-
 						// Row 1:
-						col0 = _mm_set_epi32(
-							colors0[(dxt0sel >> ((1*8)+0)) & 3],
-							colors0[(dxt0sel >> ((1*8)+2)) & 3],
-							colors0[(dxt0sel >> ((1*8)+4)) & 3],
-							colors0[(dxt0sel >> ((1*8)+6)) & 3]
-						);
-						_mm_store_si128(dst128 + ((width / 4) * 1), col0);
+						dst32[(width * 1) + 0] = colors0[(dxt0sel >> ((1*8)+6)) & 3];
+						dst32[(width * 1) + 1] = colors0[(dxt0sel >> ((1*8)+4)) & 3];
+						dst32[(width * 1) + 2] = colors0[(dxt0sel >> ((1*8)+2)) & 3];
+						dst32[(width * 1) + 3] = colors0[(dxt0sel >> ((1*8)+0)) & 3];
+						dst32[(width * 1) + 4] = colors1[(dxt1sel >> ((1*8)+6)) & 3];
+						dst32[(width * 1) + 5] = colors1[(dxt1sel >> ((1*8)+4)) & 3];
+						dst32[(width * 1) + 6] = colors1[(dxt1sel >> ((1*8)+2)) & 3];
+						dst32[(width * 1) + 7] = colors1[(dxt1sel >> ((1*8)+0)) & 3];
 #ifdef CHECK
-						assert( memcmp(&(tmp0[1]), dst128 + ((width / 4) * 1), 16) == 0 );
+						assert( memcmp(&(tmp0[1]), &dst32[(width * 1)], 16) == 0 );
+						assert( memcmp(&(tmp1[1]), &dst32[(width * 1) + 4], 16) == 0 );
 #endif
-
-						col1 = _mm_set_epi32(
-							colors1[(dxt1sel >> ((1*8)+0)) & 3],
-							colors1[(dxt1sel >> ((1*8)+2)) & 3],
-							colors1[(dxt1sel >> ((1*8)+4)) & 3],
-							colors1[(dxt1sel >> ((1*8)+6)) & 3]
-						);
-						_mm_store_si128(dst128 + ((width / 4) * 1) + 1, col1);
-#ifdef CHECK
-						assert( memcmp(&(tmp1[1]), dst128 + ((width / 4) * 1) + 1, 16) == 0 );
-#endif
-
 						// Row 2:
-						col0 = _mm_set_epi32(
-							colors0[(dxt0sel >> ((2*8)+0)) & 3],
-							colors0[(dxt0sel >> ((2*8)+2)) & 3],
-							colors0[(dxt0sel >> ((2*8)+4)) & 3],
-							colors0[(dxt0sel >> ((2*8)+6)) & 3]
-						);
-						_mm_store_si128(dst128 + ((width / 4) * 2), col0);
+						dst32[(width * 2) + 0] = colors0[(dxt0sel >> ((2*8)+6)) & 3];
+						dst32[(width * 2) + 1] = colors0[(dxt0sel >> ((2*8)+4)) & 3];
+						dst32[(width * 2) + 2] = colors0[(dxt0sel >> ((2*8)+2)) & 3];
+						dst32[(width * 2) + 3] = colors0[(dxt0sel >> ((2*8)+0)) & 3];
+						dst32[(width * 2) + 4] = colors1[(dxt1sel >> ((2*8)+6)) & 3];
+						dst32[(width * 2) + 5] = colors1[(dxt1sel >> ((2*8)+4)) & 3];
+						dst32[(width * 2) + 6] = colors1[(dxt1sel >> ((2*8)+2)) & 3];
+						dst32[(width * 2) + 7] = colors1[(dxt1sel >> ((2*8)+0)) & 3];
 #ifdef CHECK
-						assert( memcmp(&(tmp0[2]), dst128 + ((width / 4) * 2), 16) == 0 );
+						assert( memcmp(&(tmp0[2]), &dst32[(width * 2)], 16) == 0 );
+						assert( memcmp(&(tmp1[2]), &dst32[(width * 2) + 4], 16) == 0 );
 #endif
-
-						col1 = _mm_set_epi32(
-							colors1[(dxt1sel >> ((2*8)+0)) & 3],
-							colors1[(dxt1sel >> ((2*8)+2)) & 3],
-							colors1[(dxt1sel >> ((2*8)+4)) & 3],
-							colors1[(dxt1sel >> ((2*8)+6)) & 3]
-						);
-						_mm_store_si128(dst128 + ((width / 4) * 2) + 1, col1);
-#ifdef CHECK
-						assert( memcmp(&(tmp1[2]), dst128 + ((width / 4) * 2) + 1, 16) == 0 );
-#endif
-
 						// Row 3:
-						col0 = _mm_set_epi32(
-							colors0[(dxt0sel >> ((3*8)+0)) & 3],
-							colors0[(dxt0sel >> ((3*8)+2)) & 3],
-							colors0[(dxt0sel >> ((3*8)+4)) & 3],
-							colors0[(dxt0sel >> ((3*8)+6)) & 3]
-						);
-						_mm_store_si128(dst128 + ((width / 4) * 3), col0);
+						dst32[(width * 3) + 0] = colors0[(dxt0sel >> ((3*8)+6)) & 3];
+						dst32[(width * 3) + 1] = colors0[(dxt0sel >> ((3*8)+4)) & 3];
+						dst32[(width * 3) + 2] = colors0[(dxt0sel >> ((3*8)+2)) & 3];
+						dst32[(width * 3) + 3] = colors0[(dxt0sel >> ((3*8)+0)) & 3];
+						dst32[(width * 3) + 4] = colors1[(dxt1sel >> ((3*8)+6)) & 3];
+						dst32[(width * 3) + 5] = colors1[(dxt1sel >> ((3*8)+4)) & 3];
+						dst32[(width * 3) + 6] = colors1[(dxt1sel >> ((3*8)+2)) & 3];
+						dst32[(width * 3) + 7] = colors1[(dxt1sel >> ((3*8)+0)) & 3];
 #ifdef CHECK
-						assert( memcmp(&(tmp0[3]), dst128 + ((width / 4) * 3), 16) == 0 );
-#endif
-
-						col1 = _mm_set_epi32(
-							colors1[(dxt1sel >> ((3*8)+0)) & 3],
-							colors1[(dxt1sel >> ((3*8)+2)) & 3],
-							colors1[(dxt1sel >> ((3*8)+4)) & 3],
-							colors1[(dxt1sel >> ((3*8)+6)) & 3]
-						);
-						_mm_store_si128(dst128 + ((width / 4) * 3) + 1, col1);
-#ifdef CHECK
-						assert( memcmp(&(tmp1[3]), dst128 + ((width / 4) * 3) + 1, 16) == 0 );
+						assert( memcmp(&(tmp0[3]), &dst32[(width * 3)], 16) == 0 );
+						assert( memcmp(&(tmp1[3]), &dst32[(width * 3) + 4], 16) == 0 );
 #endif
 					}
 				}
 			}
-#else
+#if 0
 			for (int y = 0; y < height; y += 8)
 			{
 				for (int x = 0; x < width; x += 8)