VideoCommon: Remove support for decoding to ARGB textures

The D3D / OGL backends only ever used RGBA textures, and the Software backend uses its own custom code for sampling, so the ARGB path appears to be dead code. Since the ARGB and RGBA formats are similar, I don't think removing it will make the code harder to read or less useful as a reference. Anyone who wants this code to output ARGB can simply modify the makeRGBA function to put the shift at the other end.
2014-08-10 13:39:20 -04:00 · 2014-08-10 13:39:20 -04:00 · a8e591dc73
parent 9438a30384
commit a8e591dc73
9 changed files with 8 additions and 902 deletions
--- a/Source/Core/VideoBackends/D3D/main.cpp
+++ b/Source/Core/VideoBackends/D3D/main.cpp
@ -72,7 +72,6 @@ void InitBackendInfo()
 	}
 	g_Config.backend_info.APIType = API_D3D;
 	g_Config.backend_info.bUseRGBATextures = true; // the GX formats barely match any D3D11 formats
 	g_Config.backend_info.bUseMinimalMipCount = true;
 	g_Config.backend_info.bSupportsExclusiveFullscreen = true;
 	g_Config.backend_info.bSupportsDualSourceBlend = true;
--- a/Source/Core/VideoBackends/OGL/main.cpp
+++ b/Source/Core/VideoBackends/OGL/main.cpp
@ -132,7 +132,6 @@ static void GetShaders(std::vector<std::string> &shaders)
 static void InitBackendInfo()
 {
 	g_Config.backend_info.APIType = API_OPENGL;
 	g_Config.backend_info.bUseRGBATextures = true;
 	g_Config.backend_info.bUseMinimalMipCount = false;
 	g_Config.backend_info.bSupportsExclusiveFullscreen = false;
 	//g_Config.backend_info.bSupportsDualSourceBlend = true; // is gpu dependent and must be set in renderer
--- a/Source/Core/VideoCommon/TextureCacheBase.cpp
+++ b/Source/Core/VideoCommon/TextureCacheBase.cpp
@ -490,8 +490,7 @@ TextureCache::TCacheEntryBase* TextureCache::Load(unsigned int const stage,
 	{
 		if (!(texformat == GX_TF_RGBA8 && from_tmem))
 		{
-			pcfmt = TexDecoder_Decode(temp, src_data, expandedWidth,
+			pcfmt = TexDecoder_Decode(temp, src_data, expandedWidth, expandedHeight, texformat, tlutaddr, tlutfmt);
 						expandedHeight, texformat, tlutaddr, tlutfmt, g_ActiveConfig.backend_info.bUseRGBATextures);
 		}
 		else
 		{
@ -567,7 +566,7 @@ TextureCache::TCacheEntryBase* TextureCache::Load(unsigned int const stage,
 				const u8*& mip_src_data = from_tmem
 					? ((level % 2) ? ptr_odd : ptr_even)
 					: src_data;
-				TexDecoder_Decode(temp, mip_src_data, expanded_mip_width, expanded_mip_height, texformat, tlutaddr, tlutfmt, g_ActiveConfig.backend_info.bUseRGBATextures);
+				TexDecoder_Decode(temp, mip_src_data, expanded_mip_width, expanded_mip_height, texformat, tlutaddr, tlutfmt);
 				mip_src_data += TexDecoder_GetTextureSizeInBytes(expanded_mip_width, expanded_mip_height, texformat);
 				entry->Load(mip_width, mip_height, expanded_mip_width, level);
--- a/Source/Core/VideoCommon/TextureDecoder.h
+++ b/Source/Core/VideoCommon/TextureDecoder.h
@ -71,7 +71,7 @@ enum PC_TexFormat
 	PC_TEX_FMT_DXT1,
 };
-PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly = false);
+PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt);
 void TexDecoder_DecodeTexel(u8 *dst, const u8 *src, int s, int t, int imageWidth, int texformat, int tlutaddr, int tlutfmt);
 void TexDecoder_DecodeTexelRGBA8FromTmem(u8 *dst, const u8 *src_ar, const u8* src_gb, int s, int t, int imageWidth);
 PC_TexFormat TexDecoder_DecodeRGBA8FromTmem(u8* dst, const u8 *src_ar, const u8 *src_gb, int width, int height);
@ -79,4 +79,4 @@ PC_TexFormat TexDecoder_DecodeRGBA8FromTmem(u8* dst, const u8 *src_ar, const u8
 void TexDecoder_SetTexFmtOverlayOptions(bool enable, bool center);
 /* Internal method, implemented by TextureDecoder_Generic and TextureDecoder_x64. */
-PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly);
+PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt);
--- a/Source/Core/VideoCommon/TextureDecoder_Common.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_Common.cpp
@ -242,9 +242,9 @@ static void TexDecoder_DrawOverlay(u8 *dst, int width, int height, int texformat
 	}
 }
-PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly)
+PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
-	PC_TexFormat pc_texformat = _TexDecoder_DecodeImpl(dst, src, width, height, texformat, tlutaddr, tlutfmt, rgbaOnly);
+	PC_TexFormat pc_texformat = _TexDecoder_DecodeImpl((u32*)dst, src, width, height, texformat, tlutaddr, tlutfmt);
 	if (TexFmt_Overlay_Enable && pc_texformat != PC_TEX_FMT_NONE)
 		TexDecoder_DrawOverlay(dst, width, height, texformat, pc_texformat);
--- a/Source/Core/VideoCommon/TextureDecoder_Generic.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_Generic.cpp
@ -17,26 +17,6 @@
 // Decodes all known GameCube/Wii texture formats.
 // by ector
 // Expands one 16-bit 5A3 value into a 32-bit word laid out as
 // alpha << 24 | red << 16 | green << 8 | blue.
 // (Channel positions assume the ConvertNTo8 helpers yield 8-bit results.)
 static inline u32 decode5A3(u16 val)
 {
 	// Bit 15 selects how the remaining 15 bits are split up.
 	if ((val & 0x8000) != 0)
 	{
 		// Three 5-bit colour fields; alpha is forced to 0xFF.
 		const int a = 0xFF;
 		const int r = Convert5To8((val >> 10) & 0x1F);
 		const int g = Convert5To8((val >> 5) & 0x1F);
 		const int b = Convert5To8(val & 0x1F);
 		return (a << 24) | (r << 16) | (g << 8) | b;
 	}
 	// 3-bit alpha followed by three 4-bit colour fields.
 	const int a = Convert3To8((val >> 12) & 0x7);
 	const int r = Convert4To8((val >> 8) & 0xF);
 	const int g = Convert4To8((val >> 4) & 0xF);
 	const int b = Convert4To8(val & 0xF);
 	return (a << 24) | (r << 16) | (g << 8) | b;
 }
 static inline u32 decode5A3RGBA(u16 val)
 {
 	int r,g,b,a;
@ -80,18 +60,6 @@ struct DXTBlock
 	u8 lines[4];
 };
 //inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt)
 // Decodes 4 source bytes (8 four-bit palette indices, high nibble first)
 // into 8 32-bit texels. Each index selects a byte-swapped 16-bit entry from
 // the table found at texMem + tlutaddr, which is expanded with decode5A3().
 inline void decodebytesC4_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
 {
 	const u16 *palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 4; ++i)
 	{
 		const u8 packed = src[i];
 		dst[2 * i] = decode5A3(Common::swap16(palette[packed >> 4]));
 		dst[2 * i + 1] = decode5A3(Common::swap16(palette[packed & 0xF]));
 	}
 }
 inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr)
 {
 	u16 *tlut = (u16*)(texMem + tlutaddr);
@ -103,17 +71,6 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr)
 	}
 }
 // Decodes 4 source bytes (8 four-bit palette indices, high nibble first)
 // into 8 16-bit values: the byte-swapped entries of the table found at
 // texMem + tlutaddr, with no further colour conversion.
 inline void decodebytesC4_To_Raw16(u16* dst, const u8* src, int tlutaddr)
 {
 	const u16* palette = (const u16*)(texMem+tlutaddr);
 	for (int i = 0; i < 4; ++i)
 	{
 		const u8 packed = src[i];
 		dst[2 * i] = Common::swap16(palette[packed >> 4]);
 		dst[2 * i + 1] = Common::swap16(palette[packed & 0xF]);
 	}
 }
 inline void decodebytesC4IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem+tlutaddr);
@ -136,17 +93,6 @@ inline void decodebytesC4RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr)
 	}
 }
 //inline void decodebytesC8(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt)
 // Decodes 8 source bytes (one 8-bit palette index each) into 8 32-bit
 // texels. Each index selects a byte-swapped 16-bit entry from the table at
 // texMem + tlutaddr, which is expanded with decode5A3().
 inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
 {
 	const u16 *palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 8; ++i)
 	{
 		const u8 index = src[i];
 		dst[i] = decode5A3(Common::swap16(palette[index]));
 	}
 }
 inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr)
 {
 	u16 *tlut = (u16*)(texMem + tlutaddr);
@ -157,16 +103,6 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr)
 	}
 }
 // Decodes 8 source bytes (one 8-bit palette index each) into 8 16-bit
 // values: the byte-swapped entries of the table found at texMem + tlutaddr.
 inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr)
 {
 	const u16* palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 8; ++i)
 	{
 		const u8 index = src[i];
 		dst[i] = Common::swap16(palette[index]);
 	}
 }
 inline void decodebytesC8IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem + tlutaddr);
@ -185,16 +121,6 @@ inline void decodebytesC8RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr)
 	}
 }
 // Decodes 4 source halfwords into 4 32-bit texels. Each halfword is
 // byte-swapped and its low 14 bits index the table at texMem + tlutaddr;
 // the byte-swapped table entry is then expanded with decode5A3().
 inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr)
 {
 	const u16 *palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 4; ++i)
 	{
 		const u16 index = Common::swap16(src[i]);
 		dst[i] = decode5A3(Common::swap16(palette[(index & 0x3FFF)]));
 	}
 }
 inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr)
 {
 	u16 *tlut = (u16*)(texMem + tlutaddr);
@ -205,16 +131,6 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr)
 	}
 }
 // Decodes 4 source halfwords into 4 16-bit values. Each halfword is
 // byte-swapped and its low 14 bits index the table at texMem + tlutaddr;
 // the byte-swapped table entry is stored without colour conversion.
 inline void decodebytesC14X2_To_Raw16(u16* dst, const u16* src, int tlutaddr)
 {
 	const u16* palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 4; ++i)
 	{
 		const u16 index = Common::swap16(src[i]);
 		dst[i] = Common::swap16(palette[(index & 0x3FFF)]);
 	}
 }
 inline void decodebytesC14X2IA8_To_RGBA(u32* dst, const u16* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem + tlutaddr);
@ -235,18 +151,6 @@ inline void decodebytesC14X2rgb565_To_RGBA(u32* dst, const u16* src, int tlutadd
 	}
 }
 // Needs more speed.
 // Expands 8 source bytes into 8 16-bit values. The high nibble of each
 // byte is widened with Convert4To8 and stored in the upper byte of the
 // output; the low nibble is widened the same way into the lower byte.
 inline void decodebytesIA4(u16 *dst, const u8 *src)
 {
 	for (int i = 0; i < 8; ++i)
 	{
 		const u8 packed = src[i];
 		const u8 hi = Convert4To8(packed >> 4);
 		const u8 lo = Convert4To8(packed & 0xF);
 		dst[i] = (hi << 8) | lo;
 	}
 }
 inline void decodebytesIA4RGBA(u32 *dst, const u8 *src)
 {
 	for (int x = 0; x < 8; x++)
@ -258,19 +162,6 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src)
 	}
 }
 // Converts 4 source halfwords into 4 32-bit texels: each halfword is
 // byte-swapped and then expanded with decode5A3().
 inline void decodebytesRGB5A3(u32 *dst, const u16 *src)
 {
 	for (int i = 0; i < 4; ++i)
 		dst[i] = decode5A3(Common::swap16(src[i]));
 }
 inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src)
 {
 #if 0
@ -284,29 +175,6 @@ inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src)
 #endif
 }
 // This one is used by many video formats. It'd therefore be good if it was fast.
 // Needs more speed.
 // Builds 4 32-bit texels from two halfword streams: for each texel the
 // halfword from src2 forms the upper 16 bits, the halfword from src the
 // lower 16 bits, and the combined word is byte-swapped with swap32.
 inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2)
 {
 	for (int i = 0; i < 4; ++i)
 		dst[i] = Common::swap32((src2[i] << 16) | src[i]);
 	// Idea for a faster version: a couple of SSE unpack instructions
 	// followed by pshufb should be enough:
 	//   ABABABABABABABAB 1212121212121212   -- unpack x2 -->
 	//   AB12AB12AB12AB12 AB12AB12AB12AB12   -- pshufb x2 -->
 	//   21BA21BA21BA21BA 21BA21BA21BA21BA
 }
 inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2)
 {
 #if 0
@ -322,59 +190,11 @@ inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2)
 #endif
 }
 // Packs four channel values into one word: a in bits 24+, r in bits 16+,
 // g in bits 8+ and b in the low bits. Inputs are not masked.
 inline u32 makecol(int r, int g, int b, int a)
 {
 	const int upper = (a << 24) | (r << 16);
 	const int lower = (g << 8) | b;
 	return upper | lower;
 }
 // Packs four channel values into one word: a in bits 24+, b in bits 16+,
 // g in bits 8+ and r in the low bits. Inputs are not masked.
 inline u32 makeRGBA(int r, int g, int b, int a)
 {
 	const int upper = (a << 24) | (b << 16);
 	const int lower = (g << 8) | r;
 	return upper | lower;
 }
 // Decodes one DXTBlock into a 4x4 square of 32-bit texels built with makecol().
 //   dst   - first texel of the destination square
 //   src   - block holding two 16-bit endpoint colours and four index bytes
 //   pitch - number of u32 elements added to dst to reach the next output row
 void decodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch)
 {
 	// S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support)
 	// Needs more speed.
 	// The endpoint colours are byte-swapped before their bit fields are read.
 	u16 c1 = Common::swap16(src->color1);
 	u16 c2 = Common::swap16(src->color2);
 	// Split each endpoint into 5-bit blue, 6-bit green and 5-bit red fields
 	// and widen every field with the matching ConvertNTo8 helper.
 	int blue1 = Convert5To8(c1 & 0x1F);
 	int blue2 = Convert5To8(c2 & 0x1F);
 	int green1 = Convert6To8((c1 >> 5) & 0x3F);
 	int green2 = Convert6To8((c2 >> 5) & 0x3F);
 	int red1 = Convert5To8((c1 >> 11) & 0x1F);
 	int red2 = Convert5To8((c2 >> 11) & 0x1F);
 	// Four-entry palette for this block; entries 0 and 1 are the endpoints.
 	int colors[4];
 	colors[0] = makecol(red1, green1, blue1, 255);
 	colors[1] = makecol(red2, green2, blue2, 255);
 	if (c1 > c2)
 	{
 		// Opaque mode: entries 2 and 3 sit between the endpoints. The step
 		// is (diff >> 1) - (diff >> 3), i.e. roughly 3/8 of the difference.
 		int blue3 = ((blue2 - blue1) >> 1) - ((blue2 - blue1) >> 3);
 		int green3 = ((green2 - green1) >> 1) - ((green2 - green1) >> 3);
 		int red3 = ((red2 - red1) >> 1) - ((red2 - red1) >> 3);
 		colors[2] = makecol(red1 + red3, green1 + green3, blue1 + blue3, 255);
 		colors[3] = makecol(red2 - red3, green2 - green3, blue2 - blue3, 255);
 	}
 	else
 	{
 		colors[2] = makecol((red1 + red2 + 1) / 2, // Average
 							(green1 + green2 + 1) / 2,
 							(blue1 + blue2 + 1) / 2, 255);
 		colors[3] = makecol(red2, green2, blue2, 0);  // Color2 but transparent
 	}
 	// Each byte of lines[] encodes one output row as four 2-bit palette
 	// indices, consumed from the most significant pair downwards.
 	for (int y = 0; y < 4; y++)
 	{
 		int val = src->lines[y];
 		for (int x = 0; x < 4; x++)
 		{
 			dst[x] = colors[(val >> 6) & 3];
 			val <<= 2;
 		}
 		dst += pitch;
 	}
 }
 void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch)
 {
 	// S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support)
@ -418,210 +238,6 @@ void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch)
 	}
 }
 #if 0   // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8
 // Disabled helper: copies one 8-byte block from src to dst, byte-swapping
 // the two leading 16-bit values and reordering the 2-bit fields inside
 // each byte of the trailing 32-bit word.
 static void copyDXTBlock(u8* dst, const u8* src)
 {
 	((u16*)dst)[0] = Common::swap16(((u16*)src)[0]);
 	((u16*)dst)[1] = Common::swap16(((u16*)src)[1]);
 	u32 pixels = ((u32*)src)[1];
 	// A bit of trickiness here: the row are in the same order
 	// between the two formats, but the ordering within the rows
 	// is reversed.
 	// First swap the two nibbles of every byte, then swap the two 2-bit
 	// pairs inside every nibble; together this reverses the four 2-bit
 	// fields of each byte.
 	pixels = ((pixels >> 4) & 0x0F0F0F0F) | ((pixels << 4) & 0xF0F0F0F0);
 	pixels = ((pixels >> 2) & 0x33333333) | ((pixels << 2) & 0xCCCCCCCC);
 	((u32*)dst)[1] = pixels;
 }
 #endif
 //switch endianness, unswizzle
 //TODO: to save memory, don't blindly convert everything to argb8888
 //also ARGB order needs to be swapped later, to accommodate modern hardware better
 //need to add DXT support too
 // Decodes a tiled source texture into a linear destination buffer and
 // reports the PC_TexFormat that describes what was written.
 //   dst         - destination buffer; written as 8-, 16- or 32-bit elements
 //                 depending on the source format (see each case)
 //   src         - source texture data, stored tile by tile
 //   width/height- texture size in texels; used directly as the destination
 //                 row stride (presumably already padded to the tile size by
 //                 the caller - TODO confirm)
 //   texformat   - GX_TF_* source format selector
 //   tlutaddr    - offset of the lookup table, forwarded to the palette helpers
 //   tlutfmt     - lookup table format; the value 2 selects the 5A3 path
 // Returns PC_TEX_FMT_NONE when texformat is not handled by the switch.
 PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
 	// Number of 4- and 8-texel-wide tiles per row, rounded up.
 	const int Wsteps4 = (width + 3) / 4;
 	const int Wsteps8 = (width + 7) / 8;
 	switch (texformat)
 	{
 	// 8x8 tiles, 4 source bytes per tile row.
 	case GX_TF_C4:
 		if (tlutfmt == 2)
 		{
 			// Special decoding is required for TLUT format 5A3
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
 						decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
 		}
 		else
 		{
 			// Other table formats are emitted as raw 16-bit table entries.
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
 						decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
 		}
 		return GetPCFormatFromTLUTFormat(tlutfmt);
 	// 8x8 tiles; every source nibble becomes one destination byte.
 	case GX_TF_I4:
 		{
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = yStep * 8 ; iy < 8; iy++,xStep++)
 						for (int ix = 0; ix < 4; ix++)
 						{
 							int val = src[4 * xStep + ix];
 							dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4);
 							dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF);
 						}
 		}
 	   return PC_TEX_FMT_I4_AS_I8;
 	// 8x4 tiles; each tile row is copied as a single 64-bit word.
 	case GX_TF_I8:  // speed critical
 		{
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						((u64*)(dst + (y + iy) * width + x))[0] = ((u64*)(src + 8 * xStep))[0];
 		}
 		return PC_TEX_FMT_I8;
 	// 8x4 tiles, 8 source bytes per tile row.
 	case GX_TF_C8:
 		if (tlutfmt == 2)
 		{
 			// Special decoding is required for TLUT format 5A3
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
 		}
 		else
 		{
 			{
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 							decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src  + 8 * xStep, tlutaddr);
 			}
 		}
 		return GetPCFormatFromTLUTFormat(tlutfmt);
 	// 8x4 tiles; each source byte expands to one 16-bit destination value.
 	case GX_TF_IA4:
 		{
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep);
 		}
 		return PC_TEX_FMT_IA4_AS_IA8;
 	// 4x4 tiles of 16-bit texels; each texel is only byte-swapped.
 	case GX_TF_IA8:
 		{
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++)
 					{
 						u16 *ptr = (u16 *)dst + (y + iy) * width + x;
 						u16 *s = (u16 *)(src + 8 * xStep);
 						for (int j = 0; j < 4; j++)
 							*ptr++ = Common::swap16(*s++);
 					}
 		}
 		return PC_TEX_FMT_IA8;
 	// 4x4 tiles of 16-bit indices.
 	case GX_TF_C14X2:
 		if (tlutfmt == 2)
 		{
 			// Special decoding is required for TLUT format 5A3
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
 		}
 		else
 		{
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x,(u16*)(src + 8 * xStep), tlutaddr);
 		}
 		return GetPCFormatFromTLUTFormat(tlutfmt);
 	// 4x4 tiles of 16-bit texels; each texel is only byte-swapped.
 	case GX_TF_RGB565:
 		{
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 					{
 						u16 *ptr = (u16 *)dst + (y + iy) * width + x;
 						u16 *s = (u16 *)(src + 8 * xStep);
 						for (int j = 0; j < 4; j++)
 							*ptr++ = Common::swap16(*s++);
 					}
 		}
 		return PC_TEX_FMT_RGB565;
 	// 4x4 tiles of 16-bit texels expanded to 32 bits each.
 	case GX_TF_RGB5A3:
 		{
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						//decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4);
 						decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep));
 		}
 		return PC_TEX_FMT_BGRA32;
 	// 4x4 tiles of 64 bytes. For each tile row, one halfword stream starts
 	// at the row's offset in the tile and the second one 16 halfwords
 	// (32 bytes) further on.
 	case GX_TF_RGBA8:  // speed critical
 		{
 			for (int y = 0; y < height; y += 4)
 			{
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 				{
 					const u8* src2 = src + 64 * yStep;
 					for (int iy = 0; iy < 4; iy++)
 						decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src2 + 4 * iy, (u16*)src2 + 4 * iy + 16);
 				}
 			}
 		}
 		return PC_TEX_FMT_BGRA32;
 	case GX_TF_CMPR:  // speed critical
 		// The metroid games use this format almost exclusively.
 		{
 #if 0   // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8
 			// 11111111 22222222 55555555 66666666
 			// 33333333 44444444 77777777 88888888
 			for (int y = 0; y < height; y += 8)
 			{
 				for (int x = 0; x < width; x += 8)
 				{
 					copyDXTBlock(dst+(y/2)*width+x*2, src);
 					src += 8;
 					copyDXTBlock(dst+(y/2)*width+x*2+8, src);
 					src += 8;
 					copyDXTBlock(dst+(y/2+2)*width+x*2, src);
 					src += 8;
 					copyDXTBlock(dst+(y/2+2)*width+x*2+8, src);
 					src += 8;
 				}
 			}
 			return PC_TEX_FMT_DXT1;
 #else
 			// Active path: every 8x8 tile holds four consecutive DXT blocks,
 			// decoded to the top-left, top-right, bottom-left and
 			// bottom-right 4x4 quadrants in that order.
 			for (int y = 0; y < height; y += 8)
 			{
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
 				{
 					const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep;
 					decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width);
 										src2 += sizeof(DXTBlock);
 					decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width);
 										src2 += sizeof(DXTBlock);
 					decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width);
 										src2 += sizeof(DXTBlock);
 					decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width);
 				}
 			}
 #endif
 			return PC_TEX_FMT_BGRA32;
 		}
 	}
 	// The "copy" texture formats, too?
 	return PC_TEX_FMT_NONE;
 }
 // JSD 01/06/11:
 // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to
 // squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128
@ -630,7 +246,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh
 // TODO: complete SSE2 optimization of less often used texture formats.
 // TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
-PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
+PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
 	const int Wsteps4 = (width + 3) / 4;
@ -832,11 +448,3 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he
 	// The "copy" texture formats, too?
 	return PC_TEX_FMT_RGBA32;
 }
 // Dispatches to one of the two decoders: TexDecoder_Decode_RGBA when
 // rgbaOnly is set (dst is then treated as a u32 buffer), otherwise
 // TexDecoder_Decode_real. All other arguments are forwarded unchanged.
 PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly)
 {
 	return rgbaOnly
 		? TexDecoder_Decode_RGBA((u32*)dst, src, width, height, texformat, tlutaddr, tlutfmt)
 		: TexDecoder_Decode_real(dst, src, width, height, texformat, tlutaddr, tlutfmt);
 }
--- a/Source/Core/VideoCommon/TextureDecoder_x64.cpp
+++ b/Source/Core/VideoCommon/TextureDecoder_x64.cpp
@ -37,26 +37,6 @@
 // Decodes all known GameCube/Wii texture formats.
 // by ector
 // Expands one 16-bit 5A3 value into a 32-bit word laid out as
 // alpha << 24 | red << 16 | green << 8 | blue.
 // (Channel positions assume the ConvertNTo8 helpers yield 8-bit results.)
 static inline u32 decode5A3(u16 val)
 {
 	// Bit 15 selects how the remaining 15 bits are split up.
 	if ((val & 0x8000) != 0)
 	{
 		// Three 5-bit colour fields; alpha is forced to 0xFF.
 		const int a = 0xFF;
 		const int r = Convert5To8((val >> 10) & 0x1F);
 		const int g = Convert5To8((val >> 5) & 0x1F);
 		const int b = Convert5To8(val & 0x1F);
 		return (a << 24) | (r << 16) | (g << 8) | b;
 	}
 	// 3-bit alpha followed by three 4-bit colour fields.
 	const int a = Convert3To8((val >> 12) & 0x7);
 	const int r = Convert4To8((val >> 8) & 0xF);
 	const int g = Convert4To8((val >> 4) & 0xF);
 	const int b = Convert4To8(val & 0xF);
 	return (a << 24) | (r << 16) | (g << 8) | b;
 }
 static inline u32 decode5A3RGBA(u16 val)
 {
 	int r,g,b,a;
@ -103,18 +83,6 @@ struct DXTBlock
 	u8 lines[4];
 };
 //inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt)
 // Decodes 4 source bytes (8 four-bit palette indices, high nibble first)
 // into 8 32-bit texels. Each index selects a byte-swapped 16-bit entry from
 // the table found at texMem + tlutaddr, which is expanded with decode5A3().
 inline void decodebytesC4_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
 {
 	const u16 *palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 4; ++i)
 	{
 		const u8 packed = src[i];
 		dst[2 * i] = decode5A3(Common::swap16(palette[packed >> 4]));
 		dst[2 * i + 1] = decode5A3(Common::swap16(palette[packed & 0xF]));
 	}
 }
 inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr)
 {
 	u16 *tlut = (u16*)(texMem + tlutaddr);
@ -126,17 +94,6 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr)
 	}
 }
 // Decodes 4 source bytes (8 four-bit palette indices, high nibble first)
 // into 8 16-bit values: the byte-swapped entries of the table found at
 // texMem + tlutaddr, with no further colour conversion.
 inline void decodebytesC4_To_Raw16(u16* dst, const u8* src, int tlutaddr)
 {
 	const u16* palette = (const u16*)(texMem+tlutaddr);
 	for (int i = 0; i < 4; ++i)
 	{
 		const u8 packed = src[i];
 		dst[2 * i] = Common::swap16(palette[packed >> 4]);
 		dst[2 * i + 1] = Common::swap16(palette[packed & 0xF]);
 	}
 }
 inline void decodebytesC4IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem+tlutaddr);
@ -159,17 +116,6 @@ inline void decodebytesC4RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr)
 	}
 }
 //inline void decodebytesC8(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt)
 // Decodes 8 source bytes (one 8-bit palette index each) into 8 32-bit
 // texels. Each index selects a byte-swapped 16-bit entry from the table at
 // texMem + tlutaddr, which is expanded with decode5A3().
 inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr)
 {
 	const u16 *palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 8; ++i)
 	{
 		const u8 index = src[i];
 		dst[i] = decode5A3(Common::swap16(palette[index]));
 	}
 }
 inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr)
 {
 	u16 *tlut = (u16*)(texMem + tlutaddr);
@ -180,16 +126,6 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr)
 	}
 }
 // Decodes 8 source bytes (one 8-bit palette index each) into 8 16-bit
 // values: the byte-swapped entries of the table found at texMem + tlutaddr.
 inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr)
 {
 	const u16* palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 8; ++i)
 	{
 		const u8 index = src[i];
 		dst[i] = Common::swap16(palette[index]);
 	}
 }
 inline void decodebytesC8IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem + tlutaddr);
@ -208,42 +144,6 @@ inline void decodebytesC8RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr)
 	}
 }
 #if _M_SSE >= 0x301
 // Shuffle control for _mm_shuffle_epi8 that exchanges the two bytes of
 // every 16-bit lane (byte order 1,0,3,2,...,15,14).
 static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L);
 // SSSE3 variant of decodebytesC8_To_Raw16: looks up 8 table entries
 // (one per source byte) at texMem + tlutaddr, byte-swaps all of them with
 // a single shuffle and writes the 16 result bytes to dst.
 // NOTE(review): _mm_stream_si128 requires a 16-byte-aligned address, so dst
 // must be 16-byte aligned - confirm that every caller guarantees this.
 inline void decodebytesC8_To_Raw16_SSSE3(u16* dst, const u8* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem + tlutaddr);
 	// Make 8 16-bits unsigned integer values
 	__m128i a = _mm_setzero_si128();
 	a = _mm_insert_epi16(a, tlut[src[0]], 0);
 	a = _mm_insert_epi16(a, tlut[src[1]], 1);
 	a = _mm_insert_epi16(a, tlut[src[2]], 2);
 	a = _mm_insert_epi16(a, tlut[src[3]], 3);
 	a = _mm_insert_epi16(a, tlut[src[4]], 4);
 	a = _mm_insert_epi16(a, tlut[src[5]], 5);
 	a = _mm_insert_epi16(a, tlut[src[6]], 6);
 	a = _mm_insert_epi16(a, tlut[src[7]], 7);
 	// Apply Common::swap16() to 16-bits unsigned integers at once
 	const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16);
 	// Store values to dst without polluting the caches
 	_mm_stream_si128((__m128i*)dst, b);
 }
 #endif
 // Decodes 4 source halfwords into 4 32-bit texels. Each halfword is
 // byte-swapped and its low 14 bits index the table at texMem + tlutaddr;
 // the byte-swapped table entry is then expanded with decode5A3().
 inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr)
 {
 	const u16 *palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 4; ++i)
 	{
 		const u16 index = Common::swap16(src[i]);
 		dst[i] = decode5A3(Common::swap16(palette[(index & 0x3FFF)]));
 	}
 }
 inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr)
 {
 	u16 *tlut = (u16*)(texMem + tlutaddr);
@ -254,16 +154,6 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr)
 	}
 }
 // Decodes 4 source halfwords into 4 16-bit values. Each halfword is
 // byte-swapped and its low 14 bits index the table at texMem + tlutaddr;
 // the byte-swapped table entry is stored without colour conversion.
 inline void decodebytesC14X2_To_Raw16(u16* dst, const u16* src, int tlutaddr)
 {
 	const u16* palette = (const u16*)(texMem + tlutaddr);
 	for (int i = 0; i < 4; ++i)
 	{
 		const u16 index = Common::swap16(src[i]);
 		dst[i] = Common::swap16(palette[(index & 0x3FFF)]);
 	}
 }
 inline void decodebytesC14X2IA8_To_RGBA(u32* dst, const u16* src, int tlutaddr)
 {
 	u16* tlut = (u16*)(texMem + tlutaddr);
@ -284,18 +174,6 @@ inline void decodebytesC14X2rgb565_To_RGBA(u32* dst, const u16* src, int tlutadd
 	}
 }
 // Needs more speed.
 // Expands 8 source bytes into 8 16-bit values. The high nibble of each
 // byte is widened with Convert4To8 and stored in the upper byte of the
 // output; the low nibble is widened the same way into the lower byte.
 inline void decodebytesIA4(u16 *dst, const u8 *src)
 {
 	for (int i = 0; i < 8; ++i)
 	{
 		const u8 packed = src[i];
 		const u8 hi = Convert4To8(packed >> 4);
 		const u8 lo = Convert4To8(packed & 0xF);
 		dst[i] = (hi << 8) | lo;
 	}
 }
 inline void decodebytesIA4RGBA(u32 *dst, const u8 *src)
 {
 	for (int x = 0; x < 8; x++)
@ -307,19 +185,6 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src)
 	}
 }
 // Converts 4 source halfwords into 4 32-bit texels: each halfword is
 // byte-swapped and then expanded with decode5A3().
 inline void decodebytesRGB5A3(u32 *dst, const u16 *src)
 {
 	for (int i = 0; i < 4; ++i)
 		dst[i] = decode5A3(Common::swap16(src[i]));
 }
 inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src)
 {
 #if 0
@ -333,29 +198,6 @@ inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src)
 #endif
 }
 // This one is used by many video formats. It'd therefore be good if it was fast.
 // Needs more speed.
 // Builds 4 32-bit texels from two halfword streams: for each texel the
 // halfword from src2 forms the upper 16 bits, the halfword from src the
 // lower 16 bits, and the combined word is byte-swapped with swap32.
 inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2)
 {
 	for (int i = 0; i < 4; ++i)
 		dst[i] = Common::swap32((src2[i] << 16) | src[i]);
 	// Idea for a faster version: a couple of SSE unpack instructions
 	// followed by pshufb should be enough:
 	//   ABABABABABABABAB 1212121212121212   -- unpack x2 -->
 	//   AB12AB12AB12AB12 AB12AB12AB12AB12   -- pshufb x2 -->
 	//   21BA21BA21BA21BA 21BA21BA21BA21BA
 }
 inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2)
 {
 #if 0
@ -371,59 +213,11 @@ inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2)
 #endif
 }
 // Packs four channel values into one word: a in bits 24+, r in bits 16+,
 // g in bits 8+ and b in the low bits. Inputs are not masked.
 inline u32 makecol(int r, int g, int b, int a)
 {
 	const int upper = (a << 24) | (r << 16);
 	const int lower = (g << 8) | b;
 	return upper | lower;
 }
 // Packs four channel values into one word: a in bits 24+, b in bits 16+,
 // g in bits 8+ and r in the low bits. Inputs are not masked.
 inline u32 makeRGBA(int r, int g, int b, int a)
 {
 	const int upper = (a << 24) | (b << 16);
 	const int lower = (g << 8) | r;
 	return upper | lower;
 }
 // Decodes one DXTBlock into a 4x4 square of 32-bit texels built with makecol().
 //   dst   - first texel of the destination square
 //   src   - block holding two 16-bit endpoint colours and four index bytes
 //   pitch - number of u32 elements added to dst to reach the next output row
 static void decodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch)
 {
 	// S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support)
 	// Needs more speed.
 	// The endpoint colours are byte-swapped before their bit fields are read.
 	u16 c1 = Common::swap16(src->color1);
 	u16 c2 = Common::swap16(src->color2);
 	// Split each endpoint into 5-bit blue, 6-bit green and 5-bit red fields
 	// and widen every field with the matching ConvertNTo8 helper.
 	int blue1 = Convert5To8(c1 & 0x1F);
 	int blue2 = Convert5To8(c2 & 0x1F);
 	int green1 = Convert6To8((c1 >> 5) & 0x3F);
 	int green2 = Convert6To8((c2 >> 5) & 0x3F);
 	int red1 = Convert5To8((c1 >> 11) & 0x1F);
 	int red2 = Convert5To8((c2 >> 11) & 0x1F);
 	// Four-entry palette for this block; entries 0 and 1 are the endpoints.
 	int colors[4];
 	colors[0] = makecol(red1, green1, blue1, 255);
 	colors[1] = makecol(red2, green2, blue2, 255);
 	if (c1 > c2)
 	{
 		// Opaque mode: entries 2 and 3 sit between the endpoints. The step
 		// is (diff >> 1) - (diff >> 3), i.e. roughly 3/8 of the difference.
 		int blue3 = ((blue2 - blue1) >> 1) - ((blue2 - blue1) >> 3);
 		int green3 = ((green2 - green1) >> 1) - ((green2 - green1) >> 3);
 		int red3 = ((red2 - red1) >> 1) - ((red2 - red1) >> 3);
 		colors[2] = makecol(red1 + red3, green1 + green3, blue1 + blue3, 255);
 		colors[3] = makecol(red2 - red3, green2 - green3, blue2 - blue3, 255);
 	}
 	else
 	{
 		colors[2] = makecol((red1 + red2 + 1) / 2, // Average
 							(green1 + green2 + 1) / 2,
 							(blue1 + blue2 + 1) / 2, 255);
 		colors[3] = makecol(red2, green2, blue2, 0);  // Color2 but transparent
 	}
 	// Each byte of lines[] encodes one output row as four 2-bit palette
 	// indices, consumed from the most significant pair downwards.
 	for (int y = 0; y < 4; y++)
 	{
 		int val = src->lines[y];
 		for (int x = 0; x < 4; x++)
 		{
 			dst[x] = colors[(val >> 6) & 3];
 			val <<= 2;
 		}
 		dst += pitch;
 	}
 }
 #ifdef CHECK
 static void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch)
 {
@ -469,21 +263,6 @@ static void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch)
 }
 #endif
 #if 0   // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8
 // Disabled helper: copies one 8-byte block from src to dst, byte-swapping
 // the two leading 16-bit values and reordering the 2-bit fields inside
 // each byte of the trailing 32-bit word.
 static void copyDXTBlock(u8* dst, const u8* src)
 {
 	((u16*)dst)[0] = Common::swap16(((u16*)src)[0]);
 	((u16*)dst)[1] = Common::swap16(((u16*)src)[1]);
 	u32 pixels = ((u32*)src)[1];
 	// A bit of trickiness here: the row are in the same order
 	// between the two formats, but the ordering within the rows
 	// is reversed.
 	// First swap the two nibbles of every byte, then swap the two 2-bit
 	// pairs inside every nibble; together this reverses the four 2-bit
 	// fields of each byte.
 	pixels = ((pixels >> 4) & 0x0F0F0F0F) | ((pixels << 4) & 0xF0F0F0F0);
 	pixels = ((pixels >> 2) & 0x33333333) | ((pixels << 2) & 0xCCCCCCCC);
 	((u32*)dst)[1] = pixels;
 }
 #endif
 inline void SetOpenMPThreadCount(int width, int height)
 {
 #ifdef _OPENMP
@ -500,274 +279,6 @@ inline void SetOpenMPThreadCount(int width, int height)
 #endif
 }
 //switch endianness, unswizzle
 //TODO: to save memory, don't blindly convert everything to argb8888
 //also ARGB order needs to be swapped later, to accommodate modern hardware better
 //need to add DXT support too
 // Decodes a GX texture from src into dst and returns the PC_TexFormat that
 // describes the decoded data. Formats not handled by the switch below fall
 // through and return PC_TEX_FMT_NONE.
 // width is used both as a loop bound and as the destination row pitch (in
 // destination elements); height is used as a loop bound.
 // NOTE(review): the tile loops always process whole tiles regardless of
 // width/height, so callers presumably pass dimensions already padded to the
 // tile size - confirm against the call site.
 static PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
 	SetOpenMPThreadCount(width, height);
 	// Number of 4-pixel-wide and 8-pixel-wide tile columns per row, rounded up.
 	const int Wsteps4 = (width + 3) / 4;
 	const int Wsteps8 = (width + 7) / 8;
 	switch (texformat)
 	{
 	// 8x8 tiles; each tile row consumes 4 source bytes.
 	case GX_TF_C4:
 		if (tlutfmt == 2)
 		{
 			// Special decoding is required for TLUT format 5A3
 			// (this path writes 32-bit destination pixels, the other one 16-bit).
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
 						decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
 		}
 		else
 		{
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++)
 						decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr);
 		}
 		return GetPCFormatFromTLUTFormat(tlutfmt);
 	// 8x8 tiles; every source byte holds two 4-bit values which are each
 	// expanded with Convert4To8 into one destination byte.
 	case GX_TF_I4:
 		{
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = yStep * 8 ; iy < 8; iy++,xStep++)
 						for (int ix = 0; ix < 4; ix++)
 						{
 							int val = src[4 * xStep + ix];
 							// High nibble is the left pixel, low nibble the right one.
 							dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4);
 							dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF);
 						}
 		}
 	   return PC_TEX_FMT_I4_AS_I8;
 	case GX_TF_I8:  // speed critical
 		{
 			// 8x4 tiles; each tile row is 8 bytes and is copied unchanged
 			// with a single 64-bit load/store.
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 					{
 						((u64*)(dst + (y + iy) * width + x))[0] = ((u64*)(src + 8 * xStep))[0];
 					}
 		}
 		return PC_TEX_FMT_I8;
 	// 8x4 tiles; each tile row consumes 8 source bytes.
 	case GX_TF_C8:
 		if (tlutfmt == 2)
 		{
 			// Special decoding is required for TLUT format 5A3
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
 		}
 		else
 		{
 			// Same traversal in both branches; the SSSE3 variant is only
 			// compiled in when _M_SSE allows it and is selected at run time
 			// through cpu_info.bSSSE3.
 #if _M_SSE >= 0x301
 			if (cpu_info.bSSSE3)
 			{
 				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 							decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr);
 			}
 			else
 #endif
 			{
 				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 						for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 							decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src  + 8 * xStep, tlutaddr);
 			}
 		}
 		return GetPCFormatFromTLUTFormat(tlutfmt);
 	// 8x4 tiles; each tile row consumes 8 source bytes and is written as
 	// 16-bit destination pixels.
 	case GX_TF_IA4:
 		{
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep);
 		}
 		return PC_TEX_FMT_IA4_AS_IA8;
 	// 4x4 tiles; each tile row is four 16-bit values that are byte-swapped
 	// on their way to the destination.
 	case GX_TF_IA8:
 		{
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++)
 					{
 						u16 *ptr = (u16 *)dst + (y + iy) * width + x;
 						u16 *s = (u16 *)(src + 8 * xStep);
 						for (int j = 0; j < 4; j++)
 							*ptr++ = Common::swap16(*s++);
 					}
 		}
 		return PC_TEX_FMT_IA8;
 	// 4x4 tiles; each tile row consumes 8 source bytes (four 16-bit values).
 	case GX_TF_C14X2:
 		if (tlutfmt == 2)
 		{
 			// Special decoding is required for TLUT format 5A3
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr);
 		}
 		else
 		{
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x,(u16*)(src + 8 * xStep), tlutaddr);
 		}
 		return GetPCFormatFromTLUTFormat(tlutfmt);
 	// Same traversal and per-pixel byte swap as GX_TF_IA8; only the returned
 	// format differs.
 	case GX_TF_RGB565:
 		{
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 					{
 						u16 *ptr = (u16 *)dst + (y + iy) * width + x;
 						u16 *s = (u16 *)(src + 8 * xStep);
 						for (int j = 0; j < 4; j++)
 							*ptr++ = Common::swap16(*s++);
 					}
 		}
 		return PC_TEX_FMT_RGB565;
 	// 4x4 tiles; each tile row consumes 8 source bytes and is written as
 	// 32-bit destination pixels.
 	case GX_TF_RGB5A3:
 		{
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 4)
 				for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++)
 						//decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4);
 						decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep));
 		}
 		return PC_TEX_FMT_BGRA32;
 	case GX_TF_RGBA8:  // speed critical
 		{
 #if _M_SSE >= 0x301
 			if (cpu_info.bSSSE3)
 			{
 				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 				{
 					// Start of the source data for this band of 4 rows; p then
 					// advances linearly, 64 bytes (four 128-bit loads) per 4x4 tile.
 					__m128i* p = (__m128i*)(src + y * width * 4);
 					for (int x = 0; x < width; x += 4)
 					{
 						// We use _mm_loadu_si128 instead of _mm_load_si128
 						// because "p" may not be aligned in 16-bytes alignment.
 						// See Issue 3493.
 						const __m128i a0 = _mm_loadu_si128(p++);
 						const __m128i a1 = _mm_loadu_si128(p++);
 						const __m128i a2 = _mm_loadu_si128(p++);
 						const __m128i a3 = _mm_loadu_si128(p++);
 						// Shuffle 16-bit integers by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(),
 						// apply Common::swap32() by _mm_shuffle_epi8() and
 						// store them by _mm_stream_si128().
 						// See decodebytesARGB8_4() about the idea.
 						// a0/a1 are interleaved with a2/a3 (the second 32 bytes of the
 						// tile); each unpack result is one 4-pixel destination row.
 						static const __m128i kMaskSwap32 = _mm_set_epi32(0x0C0D0E0FL, 0x08090A0BL, 0x04050607L, 0x00010203L);
 						const __m128i b0 = _mm_unpacklo_epi16(a0, a2);
 						const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32);
 						_mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0);
 						const __m128i b1 = _mm_unpackhi_epi16(a0, a2);
 						const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32);
 						_mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1);
 						const __m128i b2 = _mm_unpacklo_epi16(a1, a3);
 						const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32);
 						_mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2);
 						const __m128i b3 = _mm_unpackhi_epi16(a1, a3);
 						const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32);
 						_mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3);
 					}
 				}
 			}
 			else
 #endif
 			{
 				// Scalar fallback: 4x4 tiles of 64 source bytes each. For every
 				// tile row the helper receives two u16 pointers that lie 16
 				// u16s (32 bytes) apart inside the tile.
 				#pragma omp parallel for
 				for (int y = 0; y < height; y += 4)
 					for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++)
 					{
 						const u8* src2 = src + 64 * yStep;
 						for (int iy = 0; iy < 4; iy++)
 							decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src2 + 4 * iy, (u16*)src2 + 4 * iy + 16);
 					}
 			}
 		}
 		return PC_TEX_FMT_BGRA32;
 	case GX_TF_CMPR:  // speed critical
 		// The metroid games use this format almost exclusively.
 		{
 			// The first branch below is compiled out; it would pass DXT
 			// blocks through via copyDXTBlock instead of decoding them.
 #if 0   // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8
 			// 11111111 22222222 55555555 66666666
 			// 33333333 44444444 77777777 88888888
 			for (int y = 0; y < height; y += 8)
 			{
 				for (int x = 0; x < width; x += 8)
 				{
 					copyDXTBlock(dst+(y/2)*width+x*2, src);
 					src += 8;
 					copyDXTBlock(dst+(y/2)*width+x*2+8, src);
 					src += 8;
 					copyDXTBlock(dst+(y/2+2)*width+x*2, src);
 					src += 8;
 					copyDXTBlock(dst+(y/2+2)*width+x*2+8, src);
 					src += 8;
 				}
 			}
 			return PC_TEX_FMT_DXT1;
 #else
 			// Active path: every 8x8 tile is four consecutive DXTBlocks, decoded
 			// to 32-bit pixels in the order top-left, top-right, bottom-left,
 			// bottom-right, using width as the destination pitch.
 			#pragma omp parallel for
 			for (int y = 0; y < height; y += 8)
 			{
 				for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++)
 				{
 					const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep;
 					decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width);
 										src2 += sizeof(DXTBlock);
 					decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width);
 										src2 += sizeof(DXTBlock);
 					decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width);
 										src2 += sizeof(DXTBlock);
 					decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width);
 				}
 			}
 #endif
 			return PC_TEX_FMT_BGRA32;
 		}
 	}
 	// The "copy" texture formats, too?
 	// Reached only when texformat matched none of the cases above.
 	return PC_TEX_FMT_NONE;
 }
 // JSD 01/06/11:
 // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to
 // squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128
@ -776,7 +287,7 @@ static PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, in
 // TODO: complete SSE2 optimization of less often used texture formats.
 // TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads.
-static PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
+PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt)
 {
 	SetOpenMPThreadCount(width, height);
@ -1844,11 +1355,3 @@ static PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width,
 	// The "copy" texture formats, too?
 	return PC_TEX_FMT_RGBA32;
 }
 // Dispatches to one of the two decoder implementations: when rgbaOnly is
 // false the format-preserving TexDecoder_Decode_real is used, otherwise
 // dst is reinterpreted as 32-bit pixels and TexDecoder_Decode_RGBA is used.
 // The chosen decoder's PC_TexFormat is returned unchanged.
 PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly)
 {
 	if (!rgbaOnly)
 		return TexDecoder_Decode_real(dst, src, width, height, texformat, tlutaddr, tlutfmt);

 	u32* const rgba_dst = (u32*)dst;
 	return TexDecoder_Decode_RGBA(rgba_dst, src, width, height, texformat, tlutaddr, tlutfmt);
 }
--- a/Source/Core/VideoCommon/VideoConfig.cpp
+++ b/Source/Core/VideoCommon/VideoConfig.cpp
@ -36,7 +36,6 @@ VideoConfig::VideoConfig()
 	// disable all features by default
 	backend_info.APIType = API_NONE;
 	backend_info.bUseRGBATextures = false;
 	backend_info.bUseMinimalMipCount = false;
 	backend_info.bSupportsExclusiveFullscreen = false;
 }
--- a/Source/Core/VideoCommon/VideoConfig.h
+++ b/Source/Core/VideoCommon/VideoConfig.h
@ -137,7 +137,6 @@ struct VideoConfig final
 		std::vector<std::string> AAModes;
 		std::vector<std::string> PPShaders; // post-processing shaders
 		bool bUseRGBATextures; // used for D3D in TextureCache
 		bool bUseMinimalMipCount;
 		bool bSupportsExclusiveFullscreen;
 		bool bSupportsDualSourceBlend;