Merge pull request #4642 from stenzek/split-x64-texture-decoders

TextureDecoder: Separate each format into its own function
2017-01-10 12:06:47 +01:00 · 2017-01-10 12:06:47 +01:00 · 52ec186f0a
parent a807db1751 2f223e24dc
commit 52ec186f0a
1 changed files with 1213 additions and 1069 deletions
--- a/Source/Core/VideoCommon/TextureDecoder_x64.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_x64.cpp
@ -10,6 +10,7 @@
 #include "Common/CommonFuncs.h"
 #include "Common/CommonTypes.h"
 #include "Common/Intrinsics.h"
 #include "Common/MsgHandler.h"
 #include "VideoCommon/LookUpTables.h"
 #include "VideoCommon/TextureDecoder.h"
@ -221,39 +222,35 @@ static void DecodeDXTBlock(u32* dst, const DXTBlock* src, int pitch)
 // JSD 01/06/11:
 // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte
-// boundaries to
+// boundaries to squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower
-// squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than
+// than _mm_load_si128/_mm_store_si128 because they work on unaligned addresses. The processor is
-// _mm_load_si128/_mm_store_si128
+// free to make the assumption that addresses are multiples of 16 in the aligned case.
 // because they work on unaligned addresses. The processor is free to make the assumption that
 // addresses are multiples
 // of 16 in the aligned case.
 // TODO: complete SSE2 optimization of less often used texture formats.
 // TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
-
+static void TexDecoder_DecodeImpl_C4(u32* dst, const u8* src, int width, int height, int texformat,
-void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int texformat,
+                                     const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
                            const u8* tlut, TlutFormat tlutfmt)
 {
-  const int Wsteps4 = (width + 3) / 4;
+  switch (tlutfmt)
  const int Wsteps8 = (width + 7) / 8;
  switch (texformat)
  {
-  case GX_TF_C4:
+  case GX_TL_RGB5A3:
    if (tlutfmt == GX_TL_RGB5A3)
  {
    for (int y = 0; y < height; y += 8)
      for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
        for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++, xStep++)
          DecodeBytes_C4_RGB5A3(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
  }
-    else if (tlutfmt == GX_TL_IA8)
+  break;
  case GX_TL_IA8:
  {
    for (int y = 0; y < height; y += 8)
      for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
        for (int iy = 0, xStep = 8 * yStep; iy < 8; iy++, xStep++)
          DecodeBytes_C4_IA8(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
  }
-    else if (tlutfmt == GX_TL_RGB565)
+  break;
  case GX_TL_RGB565:
  {
    for (int y = 0; y < height; y += 8)
      for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
@ -261,21 +258,30 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
          DecodeBytes_C4_RGB565(dst + (y + iy) * width + x, src + 4 * xStep, tlut);
  }
  break;
-  case GX_TF_I4:
+
-  {
+  default:
    break;
  }
 }
 static void TexDecoder_DecodeImpl_I4_SSSE3(u32* dst, const u8* src, int width, int height,
                                           int texformat, const u8* tlut, TlutFormat tlutfmt,
                                           int Wsteps4, int Wsteps8)
 {
 #if _M_SSE >= 0x301
  const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
  const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
-#if _M_SSE >= 0x301
+
  // xsacha optimized with SSSE3 intrinsics
  // Produces a ~40% speed improvement over SSE2 implementation
    if (cpu_info.bSSSE3)
    {
  const __m128i mask9180 = _mm_set_epi8(9, 9, 9, 9, 1, 1, 1, 1, 8, 8, 8, 8, 0, 0, 0, 0);
  const __m128i maskB3A2 = _mm_set_epi8(11, 11, 11, 11, 3, 3, 3, 3, 10, 10, 10, 10, 2, 2, 2, 2);
  const __m128i maskD5C4 = _mm_set_epi8(13, 13, 13, 13, 5, 5, 5, 5, 12, 12, 12, 12, 4, 4, 4, 4);
  const __m128i maskF7E6 = _mm_set_epi8(15, 15, 15, 15, 7, 7, 7, 7, 14, 14, 14, 14, 6, 6, 6, 6);
  for (int y = 0; y < height; y += 8)
  {
    for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
      {
        const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
@ -304,13 +310,22 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        _mm_storeu_si128((__m128i*)(dst + (y + iy + 1) * width + x + 4), o4);
      }
    }
-    else
+  }
 #endif
 }
 static void TexDecoder_DecodeImpl_I4(u32* dst, const u8* src, int width, int height, int texformat,
                                     const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
 {
  const __m128i kMask_x0f = _mm_set1_epi32(0x0f0f0f0fL);
  const __m128i kMask_xf0 = _mm_set1_epi32(0xf0f0f0f0L);
  // JSD optimized with SSE2 intrinsics.
  // Produces a ~76% speed improvement over reference C implementation.
    {
  for (int y = 0; y < height; y += 8)
  {
    for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 8; iy += 2, xStep++)
      {
        const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
@ -387,24 +402,26 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      }
    }
  }
-  break;
+}
-  case GX_TF_I8:  // speed critical
+
-  {
+static void TexDecoder_DecodeImpl_I8_SSSE3(u32* dst, const u8* src, int width, int height,
                                           int texformat, const u8* tlut, TlutFormat tlutfmt,
                                           int Wsteps4, int Wsteps8)
 {
 #if _M_SSE >= 0x301
  // xsacha optimized with SSSE3 intrinsics
  // Produces a ~10% speed improvement over SSE2 implementation
    if (cpu_info.bSSSE3)
    {
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 4; ++iy, xStep++)
      {
        const __m128i mask3210 = _mm_set_epi8(3, 3, 3, 3, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0);
        const __m128i mask7654 = _mm_set_epi8(7, 7, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4);
        __m128i *quaddst, r, rgba0, rgba1;
-            // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
+        // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
            // dcba)
        r = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
        // Shuffle select bytes to expand from (0000 0000 hgfe dcba) to:
        rgba0 = _mm_shuffle_epi8(r, mask3210);  // (dddd cccc bbbb aaaa)
@ -415,27 +432,28 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        _mm_storeu_si128(quaddst + 1, rgba1);
      }
    }
-    else
+  }
 #endif
 }
 static void TexDecoder_DecodeImpl_I8(u32* dst, const u8* src, int width, int height, int texformat,
                                     const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
 {
  // JSD optimized with SSE2 intrinsics.
  // Produces an ~86% speed improvement over reference C implementation.
    {
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
    {
      // Each loop iteration processes 4 rows from 4 64-bit reads.
      const u8* src2 = src + 32 * yStep;
      // TODO: is it more efficient to group the loads together sequentially and also the stores
-          // at the end?
+      // at the end? _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance
-          // _mm_stream instead of _mm_store on my AMD Phenom II x410 made performance significantly
+      // significantly WORSE, so I went with _mm_stores. Perhaps there is some edge case here
-          // WORSE, so I
+      // creating the terrible performance or we're not aligned to 16-byte boundaries. I don't know.
          // went with _mm_stores. Perhaps there is some edge case here creating the terrible
          // performance or we're
          // not aligned to 16-byte boundaries. I don't know.
      __m128i* quaddst;
-          // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
+      // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
          // dcba)
      const __m128i r0 = _mm_loadl_epi64((const __m128i*)src2);
      // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
      // bbaa)
@ -454,8 +472,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      // Store (hhhh gggg ffff eeee) out:
      _mm_storeu_si128(quaddst + 1, rgba1);
-          // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
+      // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
          // dcba)
      src2 += 8;
      const __m128i r2 = _mm_loadl_epi64((const __m128i*)src2);
      // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
@ -475,8 +492,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      // Store (hhhh gggg ffff eeee) out:
      _mm_storeu_si128(quaddst + 1, rgba3);
-          // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
+      // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
          // dcba)
      src2 += 8;
      const __m128i r4 = _mm_loadl_epi64((const __m128i*)src2);
      // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
@ -496,8 +512,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      // Store (hhhh gggg ffff eeee) out:
      _mm_storeu_si128(quaddst + 1, rgba5);
-          // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe
+      // Load 64 bits from `src` into an __m128i with upper 64 bits zeroed: (0000 0000 hgfe dcba)
          // dcba)
      src2 += 8;
      const __m128i r6 = _mm_loadl_epi64((const __m128i*)src2);
      // Shuffle low 64-bits with itself to expand from (0000 0000 hgfe dcba) to (hhgg ffee ddcc
@ -518,24 +533,32 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      _mm_storeu_si128(quaddst + 1, rgba7);
    }
  }
-  }
+}
-  break;
+
-  case GX_TF_C8:
+static void TexDecoder_DecodeImpl_C8(u32* dst, const u8* src, int width, int height, int texformat,
-    if (tlutfmt == GX_TL_RGB5A3)
+                                     const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
 {
  switch (tlutfmt)
  {
  case GX_TL_RGB5A3:
  {
    for (int y = 0; y < height; y += 4)
      for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
        for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
          DecodeBytes_C8_RGB5A3((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlut);
  }
-    else if (tlutfmt == GX_TL_IA8)
+  break;
  case GX_TL_IA8:
  {
    for (int y = 0; y < height; y += 4)
      for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
        for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
          DecodeBytes_C8_IA8(dst + (y + iy) * width + x, src + 8 * xStep, tlut);
  }
-    else if (tlutfmt == GX_TL_RGB565)
+  break;
  case GX_TL_RGB565:
  {
    for (int y = 0; y < height; y += 4)
      for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
@ -543,23 +566,38 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
          DecodeBytes_C8_RGB565(dst + (y + iy) * width + x, src + 8 * xStep, tlut);
  }
  break;
-  case GX_TF_IA4:
+
-  {
+  default:
    break;
  }
 }
 static void TexDecoder_DecodeImpl_IA4(u32* dst, const u8* src, int width, int height, int texformat,
                                      const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
 {
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
      {
        DecodeBytes_IA4(dst + (y + iy) * width + x, src + 8 * xStep);
      }
-  break;
+    }
-  case GX_TF_IA8:
+  }
-  {
+}
 static void TexDecoder_DecodeImpl_IA8_SSSE3(u32* dst, const u8* src, int width, int height,
                                            int texformat, const u8* tlut, TlutFormat tlutfmt,
                                            int Wsteps4, int Wsteps8)
 {
 #if _M_SSE >= 0x301
  // xsacha optimized with SSSE3 intrinsics.
  // Produces an ~50% speed improvement over SSE2 implementation.
    if (cpu_info.bSSSE3)
    {
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
      {
        const __m128i mask = _mm_set_epi8(6, 7, 7, 7, 4, 5, 5, 5, 2, 3, 3, 3, 0, 1, 1, 1);
@ -571,17 +609,23 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        _mm_storeu_si128((__m128i*)(dst + (y + iy) * width + x), r1);
      }
    }
-    else
+  }
 #endif
 }
 static void TexDecoder_DecodeImpl_IA8(u32* dst, const u8* src, int width, int height, int texformat,
                                      const u8* tlut, TlutFormat tlutfmt, int Wsteps4, int Wsteps8)
 {
  // JSD optimized with SSE2 intrinsics.
  // Produces an ~80% speed improvement over reference C implementation.
    {
  const __m128i kMask_xf0 = _mm_set_epi32(0x00000000L, 0x00000000L, 0xff00ff00L, 0xff00ff00L);
  const __m128i kMask_x0f = _mm_set_epi32(0x00000000L, 0x00000000L, 0x00ff00ffL, 0x00ff00ffL);
  const __m128i kMask_xf000 = _mm_set_epi32(0xff000000L, 0xff000000L, 0xff000000L, 0xff000000L);
  const __m128i kMask_x0fff = _mm_set_epi32(0x00ffffffL, 0x00ffffffL, 0x00ffffffL, 0x00ffffffL);
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
      {
        // Expands a 16-bit "IA" to a 32-bit "AIII". Each char is an 8-bit value.
@ -591,8 +635,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        const __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + 8 * xStep));
        // Logical shift all 16-bit words right by 8 bits (0000 0000 hgfe dcba) to (0000 0000
-            // 0h0f 0d0b)
+        // 0h0f 0d0b). This gets us only the I components.
            // This gets us only the I components.
        const __m128i i0 = _mm_srli_epi16(r0, 8);
        // Now join up the I components from their original positions but mask out the A
@ -608,8 +651,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        const __m128i i3 = _mm_and_si128(i2, kMask_x0fff);
         // Now that we have the I components in 32-bit word form, time to work out the A components
-            // into
+        // into their final positions.
            // their final positions.
        // (0000 0000 hgfe dcba) &      kMask_x00FF      -> (0000 0000 0g0e 0c0a)
        const __m128i a0 = _mm_and_si128(r0, kMask_x0f);
@ -629,23 +671,33 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      }
    }
  }
-  break;
+}
-  case GX_TF_C14X2:
+
-    if (tlutfmt == GX_TL_RGB5A3)
+static void TexDecoder_DecodeImpl_C14X2(u32* dst, const u8* src, int width, int height,
                                        int texformat, const u8* tlut, TlutFormat tlutfmt,
                                        int Wsteps4, int Wsteps8)
 {
  switch (tlutfmt)
  {
  case GX_TL_RGB5A3:
  {
    for (int y = 0; y < height; y += 4)
      for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
        for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
          DecodeBytes_C14X2_RGB5A3(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
  }
-    else if (tlutfmt == GX_TL_IA8)
+  break;
  case GX_TL_IA8:
  {
    for (int y = 0; y < height; y += 4)
      for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
        for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
          DecodeBytes_C14X2_IA8(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
  }
-    else if (tlutfmt == GX_TL_RGB565)
+  break;
  case GX_TL_RGB565:
  {
    for (int y = 0; y < height; y += 4)
      for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
@ -653,8 +705,16 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
          DecodeBytes_C14X2_RGB565(dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlut);
  }
  break;
-  case GX_TF_RGB565:
+
-  {
+  default:
    break;
  }
 }
 static void TexDecoder_DecodeImpl_RGB565(u32* dst, const u8* src, int width, int height,
                                         int texformat, const u8* tlut, TlutFormat tlutfmt,
                                         int Wsteps4, int Wsteps8)
 {
  // JSD optimized with SSE2 intrinsics.
  // Produces an ~78% speed improvement over reference C implementation.
  const __m128i kMaskR0 = _mm_set1_epi32(0x000000F8);
@ -663,7 +723,9 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
  const __m128i kMaskB0 = _mm_set1_epi32(0x00F80000);
  const __m128i kAlpha = _mm_set1_epi32(0xFF000000);
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
      {
        __m128i* dxtsrc = (__m128i*)(src + 8 * xStep);
@ -672,9 +734,8 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        const __m128i rgb565x4 = _mm_loadl_epi64(dxtsrc);
        // The big-endian 16-bit colors `ba` and `dc` look like 0b_gggBBBbb_RRRrrGGg in a little
-          // endian xmm register
+        // endian xmm register. Unpack `hgfe dcba` to `hhgg ffee ddcc bbaa`, where each 32-bit word
-          // Unpack `hgfe dcba` to `hhgg ffee ddcc bbaa`, where each 32-bit word is now
+        // is now 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
          // 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
        const __m128i c0 = _mm_unpacklo_epi16(rgb565x4, rgb565x4);
        // swizzle 0b_gggBBBbb_RRRrrGGg_gggBBBbb_RRRrrGGg
@ -718,9 +779,14 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        _mm_storeu_si128(ptr, abgr888x4);
      }
    }
-  break;
+  }
-  case GX_TF_RGB5A3:
+}
-  {
+
 static void TexDecoder_DecodeImpl_RGB5A3_SSSE3(u32* dst, const u8* src, int width, int height,
                                               int texformat, const u8* tlut, TlutFormat tlutfmt,
                                               int Wsteps4, int Wsteps8)
 {
 #if _M_SSE >= 0x301
  const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
  const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
  const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
@ -728,22 +794,20 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
  // for the RGB555 case when (s[x] & 0x8000) is true for all pixels.
  const __m128i aVxff00 = _mm_set1_epi32(0xFF000000L);
 #if _M_SSE >= 0x301
  // xsacha optimized with SSSE3 intrinsics (2 in 4 cases)
  // Produces a ~10% speed improvement over SSE2 implementation
    if (cpu_info.bSSSE3)
    {
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
      {
        u32* newdst = dst + (y + iy) * width + x;
-            const __m128i mask = _mm_set_epi8(-128, -128, 6, 7, -128, -128, 4, 5, -128, -128, 2, 3,
+        const __m128i mask =
-                                              -128, -128, 0, 1);
+            _mm_set_epi8(-128, -128, 6, 7, -128, -128, 4, 5, -128, -128, 2, 3, -128, -128, 0, 1);
        const __m128i valV =
            _mm_shuffle_epi8(_mm_loadl_epi64((const __m128i*)(src + 8 * xStep)), mask);
-            int cmp =
+        int cmp = _mm_movemask_epi8(valV);  // MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
                _mm_movemask_epi8(valV);  // MSB: 0x2 = val0; 0x20=val1; 0x200 = val2; 0x2000=val3
        if ((cmp & 0x2222) ==
            0x2222)  // SSSE3 case #1: all 4 pixels are in RGB555 and alpha = 0xFF.
        {
@ -781,8 +845,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
          // b0 = (((val0    ) & 0xf) << 4) | ((val0    ) & 0xf);
          const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
          const __m128i bV = _mm_or_si128(_mm_slli_epi16(tmpbV, 4), tmpbV);
-              // a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >>
+          // a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
              // 1);
          const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
          const __m128i aV =
              _mm_or_si128(_mm_slli_epi16(tmpaV, 5),
@ -823,13 +886,27 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        }
      }
    }
-    else
+  }
 #endif
 }
 static void TexDecoder_DecodeImpl_RGB5A3(u32* dst, const u8* src, int width, int height,
                                         int texformat, const u8* tlut, TlutFormat tlutfmt,
                                         int Wsteps4, int Wsteps8)
 {
  const __m128i kMask_x1f = _mm_set1_epi32(0x0000001fL);
  const __m128i kMask_x0f = _mm_set1_epi32(0x0000000fL);
  const __m128i kMask_x07 = _mm_set1_epi32(0x00000007L);
  // This is the hard-coded 0xFF alpha constant that is ORed in place after the RGB are calculated
  // for the RGB555 case when (s[x] & 0x8000) is true for all pixels.
  const __m128i aVxff00 = _mm_set1_epi32(0xFF000000L);
  // JSD optimized with SSE2 intrinsics (2 in 4 cases)
  // Produces a ~25% speed improvement over reference C implementation.
    {
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
    {
      for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
      {
        u32* newdst = dst + (y + iy) * width + x;
@ -869,8 +946,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
          // write the final result:
          _mm_storeu_si128((__m128i*)newdst, final);
        }
-            else if (((val0 & 0x8000) | (val1 & 0x8000) | (val2 & 0x8000) | (val3 & 0x8000)) ==
+        else if (((val0 & 0x8000) | (val1 & 0x8000) | (val2 & 0x8000) | (val3 & 0x8000)) == 0x0000)
                     0x0000)
        {
          // SSE2 case #2: all 4 pixels are in RGBA4443.
@ -888,8 +964,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
          const __m128i tmpbV = _mm_and_si128(valV, kMask_x0f);
          const __m128i bV = _mm_or_si128(_mm_slli_epi16(tmpbV, 4), tmpbV);
-              // a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >>
+          // a0 = (((val0>>12) & 0x7) << 5) | (((val0>>12) & 0x7) << 2) | (((val0>>12) & 0x7) >> 1);
              // 1);
          const __m128i tmpaV = _mm_and_si128(_mm_srli_epi16(valV, 12), kMask_x07);
          const __m128i aV =
              _mm_or_si128(_mm_slli_epi16(tmpaV, 5),
@ -933,20 +1008,21 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      }
    }
  }
-  break;
+}
-  case GX_TF_RGBA8:  // speed critical
+
-  {
+static void TexDecoder_DecodeImpl_RGBA8_SSSE3(u32* dst, const u8* src, int width, int height,
                                              int texformat, const u8* tlut, TlutFormat tlutfmt,
                                              int Wsteps4, int Wsteps8)
 {
 #if _M_SSE >= 0x301
   // xsacha optimized with SSSE3 intrinsics
  // Produces a ~30% speed improvement over SSE2 implementation
    if (cpu_info.bSSSE3)
    {
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
    {
      const u8* src2 = src + 64 * yStep;
-          const __m128i mask0312 =
+      const __m128i mask0312 = _mm_set_epi8(12, 15, 13, 14, 8, 11, 9, 10, 4, 7, 5, 6, 0, 3, 1, 2);
              _mm_set_epi8(12, 15, 13, 14, 8, 11, 9, 10, 4, 7, 5, 6, 0, 3, 1, 2);
      const __m128i ar0 = _mm_loadu_si128((__m128i*)src2);
      const __m128i ar1 = _mm_loadu_si128((__m128i*)src2 + 1);
      const __m128i gb0 = _mm_loadu_si128((__m128i*)src2 + 2);
@ -967,34 +1043,36 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      _mm_storeu_si128(dst128, rgba11);
    }
  }
    else
 #endif
 }
 static void TexDecoder_DecodeImpl_RGBA8(u32* dst, const u8* src, int width, int height,
                                        int texformat, const u8* tlut, TlutFormat tlutfmt,
                                        int Wsteps4, int Wsteps8)
 {
  // JSD optimized with SSE2 intrinsics
  // Produces a ~68% speed improvement over reference C implementation.
    {
  for (int y = 0; y < height; y += 4)
  {
    for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
    {
      // Input is divided up into 16-bit words. The texels are split up into AR and GB
-          // components where all
+      // components where all AR components come grouped up first in 32 bytes followed by the GB
-          // AR components come grouped up first in 32 bytes followed by the GB components in 32
+      // components in 32 bytes. We are processing 16 texels per each loop iteration, numbered from
-          // bytes. We are
+      // 0-f.
          // processing 16 texels per each loop iteration, numbered from 0-f.
      //
      // Convention is:
      //   one byte is [component-name texel-number]
      //    __m128i is (4-bytes 4-bytes 4-bytes 4-bytes)
      //
-          // Input  is ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A
+      // Input is:
-          // 0][R 0])
+      //   ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
-          //           ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A
+      //   ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
-          //           8][R 8])
+      //   ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
-          //           ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G
+      //   ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
          //           0][B 0])
          //           ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G
          //           8][B 8])
      //
-          // Output is (RGBA3 RGBA2 RGBA1 RGBA0)
+      // Output is:
      //   (RGBA3 RGBA2 RGBA1 RGBA0)
      //   (RGBA7 RGBA6 RGBA5 RGBA4)
      //   (RGBAb RGBAa RGBA9 RGBA8)
      //   (RGBAf RGBAe RGBAd RGBAc)
@ -1012,28 +1090,21 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      // b][G a][B a] [G 9][B 9][G 8][B 8])
      const __m128i gb1 = _mm_loadu_si128((__m128i*)src2 + 3);
      __m128i rgba00, rgba01, rgba10, rgba11;
-          const __m128i kMask_x000f =
+      const __m128i kMask_x000f = _mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
-              _mm_set_epi32(0x000000FFL, 0x000000FFL, 0x000000FFL, 0x000000FFL);
+      const __m128i kMask_xf000 = _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
-          const __m128i kMask_xf000 =
+      const __m128i kMask_x0ff0 = _mm_set_epi32(0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L);
              _mm_set_epi32(0xFF000000L, 0xFF000000L, 0xFF000000L, 0xFF000000L);
          const __m128i kMask_x0ff0 =
              _mm_set_epi32(0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L, 0x00FFFF00L);
      // Expand the AR components to fill out 32-bit words:
      // ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
-          // -> ([A 3][A 3][R 3][R 3] [A 2][A 2][R 2][R 2] [A 1][A 1][R 1][R 1] [A 0][A 0][R 0][R
+      // -> ([A 3][A 3][R 3][R 3] [A 2][A 2][R 2][R 2] [A 1][A 1][R 1][R 1] [A 0][A 0][R 0][R 0])
          // 0])
      const __m128i aarr00 = _mm_unpacklo_epi8(ar0, ar0);
      // ([A 7][R 7][A 6][R 6] [A 5][R 5][A 4][R 4] [A 3][R 3][A 2][R 2] [A 1][R 1][A 0][R 0])
-          // -> ([A 7][A 7][R 7][R 7] [A 6][A 6][R 6][R 6] [A 5][A 5][R 5][R 5] [A 4][A 4][R 4][R
+      // -> ([A 7][A 7][R 7][R 7] [A 6][A 6][R 6][R 6] [A 5][A 5][R 5][R 5] [A 4][A 4][R 4][R 4])
          // 4])
      const __m128i aarr01 = _mm_unpackhi_epi8(ar0, ar0);
      // ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
-          // -> ([A b][A b][R b][R b] [A a][A a][R a][R a] [A 9][A 9][R 9][R 9] [A 8][A 8][R 8][R
+      // -> ([A b][A b][R b][R b] [A a][A a][R a][R a] [A 9][A 9][R 9][R 9] [A 8][A 8][R 8][R 8])
          // 8])
      const __m128i aarr10 = _mm_unpacklo_epi8(ar1, ar1);
      // ([A f][R f][A e][R e] [A d][R d][A c][R c] [A b][R b][A a][R a] [A 9][R 9][A 8][R 8])
-          // -> ([A f][A f][R f][R f] [A e][A e][R e][R e] [A d][A d][R d][R d] [A c][A c][R c][R
+      // -> ([A f][A f][R f][R f] [A e][A e][R e][R e] [A d][A d][R d][R d] [A c][A c][R c][R c])
          // c])
      const __m128i aarr11 = _mm_unpackhi_epi8(ar1, ar1);
      // Move A right 16 bits and mask off everything but the lowest  8 bits to get A in its
@ -1059,20 +1130,16 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      // Expand the GB components to fill out 32-bit words:
      // ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
-          // -> ([G 3][G 3][B 3][B 3] [G 2][G 2][B 2][B 2] [G 1][G 1][B 1][B 1] [G 0][G 0][B 0][B
+      // -> ([G 3][G 3][B 3][B 3] [G 2][G 2][B 2][B 2] [G 1][G 1][B 1][B 1] [G 0][G 0][B 0][B 0])
          // 0])
      const __m128i ggbb00 = _mm_unpacklo_epi8(gb0, gb0);
      // ([G 7][B 7][G 6][B 6] [G 5][B 5][G 4][B 4] [G 3][B 3][G 2][B 2] [G 1][B 1][G 0][B 0])
-          // -> ([G 7][G 7][B 7][B 7] [G 6][G 6][B 6][B 6] [G 5][G 5][B 5][B 5] [G 4][G 4][B 4][B
+      // -> ([G 7][G 7][B 7][B 7] [G 6][G 6][B 6][B 6] [G 5][G 5][B 5][B 5] [G 4][G 4][B 4][B 4])
          // 4])
      const __m128i ggbb01 = _mm_unpackhi_epi8(gb0, gb0);
      // ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
-          // -> ([G b][G b][B b][B b] [G a][G a][B a][B a] [G 9][G 9][B 9][B 9] [G 8][G 8][B 8][B
+      // -> ([G b][G b][B b][B b] [G a][G a][B a][B a] [G 9][G 9][B 9][B 9] [G 8][G 8][B 8][B 8])
          // 8])
      const __m128i ggbb10 = _mm_unpacklo_epi8(gb1, gb1);
      // ([G f][B f][G e][B e] [G d][B d][G c][B c] [G b][B b][G a][B a] [G 9][B 9][G 8][B 8])
-          // -> ([G f][G f][B f][B f] [G e][G e][B e][B e] [G d][G d][B d][B d] [G c][G c][B c][B
+      // -> ([G f][G f][B f][B f] [G e][G e][B e][B e] [G d][G d][B d][B d] [G c][G c][B c][B c])
          // c])
      const __m128i ggbb11 = _mm_unpackhi_epi8(gb1, gb1);
      // G and B are already in perfect spots in the center, just remove the extra copies in the
@ -1098,28 +1165,25 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      _mm_storeu_si128(dst128, rgba11);
    }
  }
-  }
+}
-  break;
+
-  case GX_TF_CMPR:  // speed critical
+static void TexDecoder_DecodeImpl_CMPR(u32* dst, const u8* src, int width, int height,
                                       int texformat, const u8* tlut, TlutFormat tlutfmt,
                                       int Wsteps4, int Wsteps8)
 {
  // The metroid games use this format almost exclusively.
    {
  // JSD optimized with SSE2 intrinsics.
  // Produces a ~50% improvement for x86 and a ~40% improvement for x64 in speed over reference
-      // C implementation.
+  // C implementation. The x64 compiled reference C code is faster than the x86 compiled reference
-      // The x64 compiled reference C code is faster than the x86 compiled reference C code, but the
+  // C code, but the SSE2 is faster than both.
      // SSE2 is
      // faster than both.
  for (int y = 0; y < height; y += 8)
  {
    for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
    {
-          // We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit
+      // We handle two DXT blocks simultaneously to take full advantage of SSE2's 128-bit registers.
          // registers.
      // This is ideal because a single DXT block contains 2 RGBA colors when decoded from their
-          // 16-bit.
+      // 16-bit. Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is
-          // Two DXT blocks therefore contain 4 RGBA colors to be processed. The processing is
+      // parallelizable at this level, so we do.
          // parallelizable
          // at this level, so we do.
      for (int z = 0, xStep = 2 * yStep; z < 2; ++z, xStep++)
      {
        // JSD NOTE: You may see many strange patterns of behavior in the below code, but they
@ -1127,17 +1191,13 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        // constants is faster than loading their values from memory. Unfortunately, there is no
        // way to inline 128-bit constants from opcodes so they must be loaded from memory. This
        // seems a little ridiculous to me in that you can't even generate a constant value of 1
-            // without
+        // without having to load it from memory. So, I stored the minimal constant I could,
-            // having to load it from memory. So, I stored the minimal constant I could, 128-bits
+        // 128-bits worth of 1s :). Then I use sequences of shifts to squash it to the appropriate
-            // worth
+        // size and bit positions that I need.
            // of 1s :). Then I use sequences of shifts to squash it to the appropriate size and bit
            // positions that I need.
        const __m128i allFFs128 = _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128());
        // Load 128 bits, i.e. two DXTBlocks (64-bits each)
-            const __m128i dxt =
+        const __m128i dxt = _mm_loadu_si128((__m128i*)(src + sizeof(struct DXTBlock) * 2 * xStep));
                _mm_loadu_si128((__m128i*)(src + sizeof(struct DXTBlock) * 2 * xStep));
        // Copy the 2-bit indices from each DXT block:
        alignas(16) u32 dxttmp[4];
@ -1149,8 +1209,8 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        __m128i argb888x4;
        __m128i c1 = _mm_unpackhi_epi16(dxt, dxt);
        c1 = _mm_slli_si128(c1, 8);
-            const __m128i c0 = _mm_or_si128(
+        const __m128i c0 =
-                c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
+            _mm_or_si128(c1, _mm_srli_si128(_mm_slli_si128(_mm_unpacklo_epi16(dxt, dxt), 8), 8));
        // Compare rgb0 to rgb1:
        // Each 32-bit word will contain either 0xFFFFFFFF or 0x00000000 for true/false.
@ -1163,16 +1223,14 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        // green:
        // NOTE: We start with the larger number of bits (6) first for G and shift the mask down
-            // 1 bit to get a 5-bit mask
+        // 1 bit to get a 5-bit mask later for R and B components.
            // later for R and B components.
        // low6mask == _mm_set_epi32(0x0000FC00, 0x0000FC00, 0x0000FC00, 0x0000FC00)
        const __m128i low6mask = _mm_slli_epi32(_mm_srli_epi32(allFFs128, 24 + 2), 8 + 2);
        const __m128i gtmp = _mm_srli_epi32(c0, 3);
        const __m128i g0 = _mm_and_si128(gtmp, low6mask);
        // low3mask == _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300)
-            const __m128i g1 =
+        const __m128i g1 = _mm_and_si128(
-                _mm_and_si128(_mm_srli_epi32(gtmp, 6),
+            _mm_srli_epi32(gtmp, 6), _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
                              _mm_set_epi32(0x00000300, 0x00000300, 0x00000300, 0x00000300));
        argb888x4 = _mm_or_si128(g0, g1);
        // red:
        // low5mask == _mm_set_epi32(0x000000F8, 0x000000F8, 0x000000F8, 0x000000F8)
@ -1263,8 +1321,7 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
          // _mm_srli_epi32( allFFs128, 8 ) == _mm_set_epi32(0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
          // 0x00FFFFFF)
          // Make this color fully transparent:
-              rgb3 = _mm_or_si128(rgb3,
+          rgb3 = _mm_or_si128(rgb3, _mm_and_si128(_mm_and_si128(rgb2, _mm_srli_epi32(allFFs128, 8)),
                                  _mm_and_si128(_mm_and_si128(rgb2, _mm_srli_epi32(allFFs128, 8)),
                                                  _mm_slli_si128(allFFs128, 8)));
        }
@ -1287,11 +1344,10 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
        u32 tmp0[4][4], tmp1[4][4];
        DecodeDXTBlock(&(tmp0[0][0]),
-                           reinterpret_cast<const DXTBlock*>(src + sizeof(DXTBlock) * 2 * xStep),
+                       reinterpret_cast<const DXTBlock*>(src + sizeof(DXTBlock) * 2 * xStep), 4);
        DecodeDXTBlock(&(tmp1[0][0]),
                       reinterpret_cast<const DXTBlock*>((src + sizeof(DXTBlock) * 2 * xStep) + 8),
                       4);
            DecodeDXTBlock(
                &(tmp1[0][0]),
                reinterpret_cast<const DXTBlock*>((src + sizeof(DXTBlock) * 2 * xStep) + 8), 4);
 #endif
        u32* dst32 = (dst + (y + z * 4) * width + x);
@ -1357,7 +1413,95 @@ void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int
      }
    }
  }
 }
 // Entry point that selects the decoder matching `texformat` and forwards every argument to it.
 //
 // All per-format decoders share one signature: the arguments received here, followed by
 // Wsteps4 and Wsteps8 (width divided by 4 and by 8, rounded up). For the formats that have an
 // SSSE3 variant (I4, I8, IA8, RGB5A3, RGBA8), that variant is chosen only when the binary was
 // built with SSSE3 enabled AND the CPU reports SSSE3 at runtime; otherwise the plain variant
 // runs. An unrecognised `texformat` raises a PanicAlert and nothing is decoded.
 void _TexDecoder_DecodeImpl(u32* dst, const u8* src, int width, int height, int texformat,
                             const u8* tlut, TlutFormat tlutfmt)
 {
  // Width rounded up to whole groups of 4 and of 8, expressed as a group count.
  const int Wsteps4 = (width + 3) / 4;
  const int Wsteps8 = (width + 7) / 8;

 // If the binary was not compiled with SSSE3 support, the functions turn into no-ops.
 // Therefore, we shouldn't call them based on what the CPU reports at runtime alone.
 #if _M_SSE >= 0x301
  const bool has_SSSE3 = cpu_info.bSSSE3;
 #else
  const bool has_SSSE3 = false;
 #endif

  switch (texformat)
  {
  case GX_TF_C4:
    TexDecoder_DecodeImpl_C4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
    break;

  case GX_TF_I4:
    if (has_SSSE3)
      TexDecoder_DecodeImpl_I4_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                     Wsteps8);
    else
      TexDecoder_DecodeImpl_I4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
    break;

  case GX_TF_I8:
    if (has_SSSE3)
      TexDecoder_DecodeImpl_I8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                     Wsteps8);
    else
      TexDecoder_DecodeImpl_I8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
    break;

  case GX_TF_C8:
    TexDecoder_DecodeImpl_C8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
    break;

  case GX_TF_IA4:
    TexDecoder_DecodeImpl_IA4(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
    break;

  case GX_TF_IA8:
    if (has_SSSE3)
      TexDecoder_DecodeImpl_IA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                      Wsteps8);
    else
      TexDecoder_DecodeImpl_IA8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                Wsteps8);
    break;

  case GX_TF_C14X2:
    TexDecoder_DecodeImpl_C14X2(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                Wsteps8);
    break;

  case GX_TF_RGB565:
    TexDecoder_DecodeImpl_RGB565(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                 Wsteps8);
    break;

  case GX_TF_RGB5A3:
    if (has_SSSE3)
      TexDecoder_DecodeImpl_RGB5A3_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                         Wsteps8);
    else
      TexDecoder_DecodeImpl_RGB5A3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                   Wsteps8);
    break;

  case GX_TF_RGBA8:
    if (has_SSSE3)
      TexDecoder_DecodeImpl_RGBA8_SSSE3(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                        Wsteps8);
    else
      TexDecoder_DecodeImpl_RGBA8(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4,
                                  Wsteps8);
    break;

  case GX_TF_CMPR:
    TexDecoder_DecodeImpl_CMPR(dst, src, width, height, texformat, tlut, tlutfmt, Wsteps4, Wsteps8);
    break;

  default:
    // No decoder exists for this format value; report it rather than silently doing nothing.
    PanicAlert("Unhandled texture format %d", texformat);
    break;
  }
 }