From a8e591dc73fc3b35f888690879673ba0250bda68 Mon Sep 17 00:00:00 2001 From: "Jasper St. Pierre" Date: Sun, 10 Aug 2014 13:39:20 -0400 Subject: [PATCH] VideoCommon: Remove support for decoding to ARGB textures The D3D / OGL backends only ever used RGBA textures, and the Software backend uses its own custom code for sampling. The ARGB path seems to just be dead code. Since ARGB and RGBA formats are similar, I don't think this will make the code more difficult to read or unable to be used as reference. Somebody who wants to use this code to output ARGB can simply modify the MakeRGBA function to put the shift at the other end. --- Source/Core/VideoBackends/D3D/main.cpp | 1 - Source/Core/VideoBackends/OGL/main.cpp | 1 - Source/Core/VideoCommon/TextureCacheBase.cpp | 5 +- Source/Core/VideoCommon/TextureDecoder.h | 4 +- .../VideoCommon/TextureDecoder_Common.cpp | 4 +- .../VideoCommon/TextureDecoder_Generic.cpp | 394 +------------- .../Core/VideoCommon/TextureDecoder_x64.cpp | 499 +----------------- Source/Core/VideoCommon/VideoConfig.cpp | 1 - Source/Core/VideoCommon/VideoConfig.h | 1 - 9 files changed, 8 insertions(+), 902 deletions(-) diff --git a/Source/Core/VideoBackends/D3D/main.cpp b/Source/Core/VideoBackends/D3D/main.cpp index a21b03a03c..e78d809e1b 100644 --- a/Source/Core/VideoBackends/D3D/main.cpp +++ b/Source/Core/VideoBackends/D3D/main.cpp @@ -72,7 +72,6 @@ void InitBackendInfo() } g_Config.backend_info.APIType = API_D3D; - g_Config.backend_info.bUseRGBATextures = true; // the GX formats barely match any D3D11 formats g_Config.backend_info.bUseMinimalMipCount = true; g_Config.backend_info.bSupportsExclusiveFullscreen = true; g_Config.backend_info.bSupportsDualSourceBlend = true; diff --git a/Source/Core/VideoBackends/OGL/main.cpp b/Source/Core/VideoBackends/OGL/main.cpp index f2741cc9ba..472bb19f43 100644 --- a/Source/Core/VideoBackends/OGL/main.cpp +++ b/Source/Core/VideoBackends/OGL/main.cpp @@ -132,7 +132,6 @@ static void GetShaders(std::vector &shaders) static void InitBackendInfo() { g_Config.backend_info.APIType = API_OPENGL; - g_Config.backend_info.bUseRGBATextures = true; g_Config.backend_info.bUseMinimalMipCount = false; g_Config.backend_info.bSupportsExclusiveFullscreen = false; //g_Config.backend_info.bSupportsDualSourceBlend = true; // is gpu dependent and must be set in renderer diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp index 0bbc336a31..ad414e36cc 100644 --- a/Source/Core/VideoCommon/TextureCacheBase.cpp +++ b/Source/Core/VideoCommon/TextureCacheBase.cpp @@ -490,8 +490,7 @@ TextureCache::TCacheEntryBase* TextureCache::Load(unsigned int const stage, { if (!(texformat == GX_TF_RGBA8 && from_tmem)) { - pcfmt = TexDecoder_Decode(temp, src_data, expandedWidth, - expandedHeight, texformat, tlutaddr, tlutfmt, g_ActiveConfig.backend_info.bUseRGBATextures); + pcfmt = TexDecoder_Decode(temp, src_data, expandedWidth, expandedHeight, texformat, tlutaddr, tlutfmt); } else { @@ -567,7 +566,7 @@ TextureCache::TCacheEntryBase* TextureCache::Load(unsigned int const stage, const u8*& mip_src_data = from_tmem ? ((level % 2) ? ptr_odd : ptr_even) : src_data; - TexDecoder_Decode(temp, mip_src_data, expanded_mip_width, expanded_mip_height, texformat, tlutaddr, tlutfmt, g_ActiveConfig.backend_info.bUseRGBATextures); + TexDecoder_Decode(temp, mip_src_data, expanded_mip_width, expanded_mip_height, texformat, tlutaddr, tlutfmt); mip_src_data += TexDecoder_GetTextureSizeInBytes(expanded_mip_width, expanded_mip_height, texformat); entry->Load(mip_width, mip_height, expanded_mip_width, level); diff --git a/Source/Core/VideoCommon/TextureDecoder.h b/Source/Core/VideoCommon/TextureDecoder.h index efcce209b5..0c5493be71 100644 --- a/Source/Core/VideoCommon/TextureDecoder.h +++ b/Source/Core/VideoCommon/TextureDecoder.h @@ -71,7 +71,7 @@ enum PC_TexFormat PC_TEX_FMT_DXT1, }; -PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly = false); +PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt); void TexDecoder_DecodeTexel(u8 *dst, const u8 *src, int s, int t, int imageWidth, int texformat, int tlutaddr, int tlutfmt); void TexDecoder_DecodeTexelRGBA8FromTmem(u8 *dst, const u8 *src_ar, const u8* src_gb, int s, int t, int imageWidth); PC_TexFormat TexDecoder_DecodeRGBA8FromTmem(u8* dst, const u8 *src_ar, const u8 *src_gb, int width, int height); @@ -79,4 +79,4 @@ PC_TexFormat TexDecoder_DecodeRGBA8FromTmem(u8* dst, const u8 *src_ar, const u8 void TexDecoder_SetTexFmtOverlayOptions(bool enable, bool center); /* Internal method, implemented by TextureDecoder_Generic and TextureDecoder_x64. */ -PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly); +PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt); diff --git a/Source/Core/VideoCommon/TextureDecoder_Common.cpp b/Source/Core/VideoCommon/TextureDecoder_Common.cpp index b65200cb69..63e6dc672e 100644 --- a/Source/Core/VideoCommon/TextureDecoder_Common.cpp +++ b/Source/Core/VideoCommon/TextureDecoder_Common.cpp @@ -242,9 +242,9 @@ static void TexDecoder_DrawOverlay(u8 *dst, int width, int height, int texformat } } -PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly) +PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { - PC_TexFormat pc_texformat = _TexDecoder_DecodeImpl(dst, src, width, height, texformat, tlutaddr, tlutfmt, rgbaOnly); + PC_TexFormat pc_texformat = _TexDecoder_DecodeImpl((u32*)dst, src, width, height, texformat, tlutaddr, tlutfmt); if (TexFmt_Overlay_Enable && pc_texformat != PC_TEX_FMT_NONE) TexDecoder_DrawOverlay(dst, width, height, texformat, pc_texformat); diff --git a/Source/Core/VideoCommon/TextureDecoder_Generic.cpp b/Source/Core/VideoCommon/TextureDecoder_Generic.cpp index 2357f7c322..dd23e4f924 100644 --- a/Source/Core/VideoCommon/TextureDecoder_Generic.cpp +++ b/Source/Core/VideoCommon/TextureDecoder_Generic.cpp @@ -17,26 +17,6 @@ // Decodes all known GameCube/Wii texture formats. // by ector -static inline u32 decode5A3(u16 val) -{ - int r,g,b,a; - if ((val & 0x8000)) - { - a = 0xFF; - r = Convert5To8((val >> 10) & 0x1F); - g = Convert5To8((val >> 5) & 0x1F); - b = Convert5To8(val & 0x1F); - } - else - { - a = Convert3To8((val >> 12) & 0x7); - r = Convert4To8((val >> 8) & 0xF); - g = Convert4To8((val >> 4) & 0xF); - b = Convert4To8(val & 0xF); - } - return (a << 24) | (r << 16) | (g << 8) | b; -} - static inline u32 decode5A3RGBA(u16 val) { int r,g,b,a; @@ -80,18 +60,6 @@ struct DXTBlock u8 lines[4]; }; -//inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) -inline void decodebytesC4_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u8 val = src[x]; - *dst++ = decode5A3(Common::swap16(tlut[val >> 4])); - *dst++ = decode5A3(Common::swap16(tlut[val & 0xF])); - } -} - inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -103,17 +71,6 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) } } -inline void decodebytesC4_To_Raw16(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem+tlutaddr); - for (int x = 0; x < 4; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val >> 4]); - *dst++ = Common::swap16(tlut[val & 0xF]); - } -} - inline void decodebytesC4IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem+tlutaddr); @@ -136,17 +93,6 @@ inline void decodebytesC4RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr) } } -//inline void decodebytesC8(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) -inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = decode5A3(Common::swap16(tlut[val])); - } -} - inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -157,16 +103,6 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) } } -inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val]); - } -} - inline void decodebytesC8IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); @@ -185,16 +121,6 @@ inline void decodebytesC8RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr) } } -inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u16 val = Common::swap16(src[x]); - *dst++ = decode5A3(Common::swap16(tlut[(val & 0x3FFF)])); - } -} - inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -205,16 +131,6 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) } } -inline void decodebytesC14X2_To_Raw16(u16* dst, const u16* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u16 val = Common::swap16(src[x]); - *dst++ = Common::swap16(tlut[(val & 0x3FFF)]); - } -} - inline void decodebytesC14X2IA8_To_RGBA(u32* dst, const u16* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); @@ -235,18 +151,6 @@ inline void decodebytesC14X2rgb565_To_RGBA(u32* dst, const u16* src, int tlutadd } } -// Needs more speed. -inline void decodebytesIA4(u16 *dst, const u8 *src) -{ - for (int x = 0; x < 8; x++) - { - const u8 val = src[x]; - u8 a = Convert4To8(val >> 4); - u8 l = Convert4To8(val & 0xF); - dst[x] = (a << 8) | l; - } -} - inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) { for (int x = 0; x < 8; x++) @@ -258,19 +162,6 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) } } -inline void decodebytesRGB5A3(u32 *dst, const u16 *src) -{ -#if 0 - for (int x = 0; x < 4; x++) - dst[x] = decode5A3(Common::swap16(src[x])); -#else - dst[0] = decode5A3(Common::swap16(src[0])); - dst[1] = decode5A3(Common::swap16(src[1])); - dst[2] = decode5A3(Common::swap16(src[2])); - dst[3] = decode5A3(Common::swap16(src[3])); -#endif -} - inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) { #if 0 @@ -284,29 +175,6 @@ inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) #endif } -// This one is used by many video formats. It'd therefore be good if it was fast. -// Needs more speed. -inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2) -{ -#if 0 - for (int x = 0; x < 4; x++) - dst[x] = Common::swap32((src2[x] << 16) | src[x]); -#else - dst[0] = Common::swap32((src2[0] << 16) | src[0]); - dst[1] = Common::swap32((src2[1] << 16) | src[1]); - dst[2] = Common::swap32((src2[2] << 16) | src[2]); - dst[3] = Common::swap32((src2[3] << 16) | src[3]); -#endif - - // This can probably be done in a few SSE pack/unpack instructions + pshufb - // some unpack instruction x2: - // ABABABABABABABAB 1212121212121212 -> - // AB12AB12AB12AB12 AB12AB12AB12AB12 - // 2x pshufb-> - // 21BA21BA21BA21BA 21BA21BA21BA21BA - // and we are done. -} - inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) { #if 0 @@ -322,59 +190,11 @@ inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) #endif } -inline u32 makecol(int r, int g, int b, int a) -{ - return (a << 24)|(r << 16)|(g << 8)|b; -} - inline u32 makeRGBA(int r, int g, int b, int a) { return (a<<24)|(b<<16)|(g<<8)|r; } -void decodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch) -{ - // S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support) - // Needs more speed. - u16 c1 = Common::swap16(src->color1); - u16 c2 = Common::swap16(src->color2); - int blue1 = Convert5To8(c1 & 0x1F); - int blue2 = Convert5To8(c2 & 0x1F); - int green1 = Convert6To8((c1 >> 5) & 0x3F); - int green2 = Convert6To8((c2 >> 5) & 0x3F); - int red1 = Convert5To8((c1 >> 11) & 0x1F); - int red2 = Convert5To8((c2 >> 11) & 0x1F); - int colors[4]; - colors[0] = makecol(red1, green1, blue1, 255); - colors[1] = makecol(red2, green2, blue2, 255); - if (c1 > c2) - { - int blue3 = ((blue2 - blue1) >> 1) - ((blue2 - blue1) >> 3); - int green3 = ((green2 - green1) >> 1) - ((green2 - green1) >> 3); - int red3 = ((red2 - red1) >> 1) - ((red2 - red1) >> 3); - colors[2] = makecol(red1 + red3, green1 + green3, blue1 + blue3, 255); - colors[3] = makecol(red2 - red3, green2 - green3, blue2 - blue3, 255); - } - else - { - colors[2] = makecol((red1 + red2 + 1) / 2, // Average - (green1 + green2 + 1) / 2, - (blue1 + blue2 + 1) / 2, 255); - colors[3] = makecol(red2, green2, blue2, 0); // Color2 but transparent - } - - for (int y = 0; y < 4; y++) - { - int val = src->lines[y]; - for (int x = 0; x < 4; x++) - { - dst[x] = colors[(val >> 6) & 3]; - val <<= 2; - } - dst += pitch; - } -} - void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch) { // S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support) @@ -418,210 +238,6 @@ void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch) } } -#if 0 // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8 -static void copyDXTBlock(u8* dst, const u8* src) -{ - ((u16*)dst)[0] = Common::swap16(((u16*)src)[0]); - ((u16*)dst)[1] = Common::swap16(((u16*)src)[1]); - u32 pixels = ((u32*)src)[1]; - // A bit of trickiness here: the row are in the same order - // between the two formats, but the ordering within the rows - // is reversed. - pixels = ((pixels >> 4) & 0x0F0F0F0F) | ((pixels << 4) & 0xF0F0F0F0); - pixels = ((pixels >> 2) & 0x33333333) | ((pixels << 2) & 0xCCCCCCCC); - ((u32*)dst)[1] = pixels; -} -#endif - -//switch endianness, unswizzle -//TODO: to save memory, don't blindly convert everything to argb8888 -//also ARGB order needs to be swapped later, to accommodate modern hardware better -//need to add DXT support too -PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) -{ - const int Wsteps4 = (width + 3) / 4; - const int Wsteps8 = (width + 7) / 8; - - switch (texformat) - { - case GX_TF_C4: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++) - decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr); - } - else - { - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++) - decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_I4: - { - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8 ; iy < 8; iy++,xStep++) - for (int ix = 0; ix < 4; ix++) - { - int val = src[4 * xStep + ix]; - dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4); - dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF); - } - } - return PC_TEX_FMT_I4_AS_I8; - case GX_TF_I8: // speed critical - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - ((u64*)(dst + (y + iy) * width + x))[0] = ((u64*)(src + 8 * xStep))[0]; - } - return PC_TEX_FMT_I8; - case GX_TF_C8: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - else - { - - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_IA4: - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep); - } - return PC_TEX_FMT_IA4_AS_IA8; - case GX_TF_IA8: - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)(src + 8 * xStep); - for (int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - } - return PC_TEX_FMT_IA8; - case GX_TF_C14X2: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr); - } - else - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x,(u16*)(src + 8 * xStep), tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_RGB565: - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)(src + 8 * xStep); - for (int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - } - return PC_TEX_FMT_RGB565; - case GX_TF_RGB5A3: - { - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4); - decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep)); - } - return PC_TEX_FMT_BGRA32; - case GX_TF_RGBA8: // speed critical - { - for (int y = 0; y < height; y += 4) - { - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - { - const u8* src2 = src + 64 * yStep; - for (int iy = 0; iy < 4; iy++) - decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src2 + 4 * iy, (u16*)src2 + 4 * iy + 16); - } - } - } - return PC_TEX_FMT_BGRA32; - case GX_TF_CMPR: // speed critical - // The metroid games use this format almost exclusively. - { -#if 0 // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8 - // 11111111 22222222 55555555 66666666 - // 33333333 44444444 77777777 88888888 - for (int y = 0; y < height; y += 8) - { - for (int x = 0; x < width; x += 8) - { - copyDXTBlock(dst+(y/2)*width+x*2, src); - src += 8; - copyDXTBlock(dst+(y/2)*width+x*2+8, src); - src += 8; - copyDXTBlock(dst+(y/2+2)*width+x*2, src); - src += 8; - copyDXTBlock(dst+(y/2+2)*width+x*2+8, src); - src += 8; - } - } - return PC_TEX_FMT_DXT1; -#else - for (int y = 0; y < height; y += 8) - { - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - { - const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep; - decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width); - } - } -#endif - return PC_TEX_FMT_BGRA32; - } - } - - // The "copy" texture formats, too? - return PC_TEX_FMT_NONE; -} - - - // JSD 01/06/11: // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to // squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128 @@ -630,7 +246,7 @@ PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int heigh // TODO: complete SSE2 optimization of less often used texture formats. // TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads. -PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) +PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { const int Wsteps4 = (width + 3) / 4; @@ -832,11 +448,3 @@ PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int he // The "copy" texture formats, too? return PC_TEX_FMT_RGBA32; } - -PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly) -{ - if (rgbaOnly) - return TexDecoder_Decode_RGBA((u32*)dst, src, width, height, texformat, tlutaddr, tlutfmt); - else - return TexDecoder_Decode_real(dst, src, width, height, texformat, tlutaddr, tlutfmt); -} diff --git a/Source/Core/VideoCommon/TextureDecoder_x64.cpp b/Source/Core/VideoCommon/TextureDecoder_x64.cpp index d1bb6ceabb..e9119ff1d1 100644 --- a/Source/Core/VideoCommon/TextureDecoder_x64.cpp +++ b/Source/Core/VideoCommon/TextureDecoder_x64.cpp @@ -37,26 +37,6 @@ // Decodes all known GameCube/Wii texture formats. // by ector -static inline u32 decode5A3(u16 val) -{ - int r,g,b,a; - if ((val & 0x8000)) - { - a = 0xFF; - r = Convert5To8((val >> 10) & 0x1F); - g = Convert5To8((val >> 5) & 0x1F); - b = Convert5To8(val & 0x1F); - } - else - { - a = Convert3To8((val >> 12) & 0x7); - r = Convert4To8((val >> 8) & 0xF); - g = Convert4To8((val >> 4) & 0xF); - b = Convert4To8(val & 0xF); - } - return (a << 24) | (r << 16) | (g << 8) | b; -} - static inline u32 decode5A3RGBA(u16 val) { int r,g,b,a; @@ -103,18 +83,6 @@ struct DXTBlock u8 lines[4]; }; -//inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) -inline void decodebytesC4_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u8 val = src[x]; - *dst++ = decode5A3(Common::swap16(tlut[val >> 4])); - *dst++ = decode5A3(Common::swap16(tlut[val & 0xF])); - } -} - inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -126,17 +94,6 @@ inline void decodebytesC4_5A3_To_rgba32(u32 *dst, const u8 *src, int tlutaddr) } } -inline void decodebytesC4_To_Raw16(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem+tlutaddr); - for (int x = 0; x < 4; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val >> 4]); - *dst++ = Common::swap16(tlut[val & 0xF]); - } -} - inline void decodebytesC4IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem+tlutaddr); @@ -159,17 +116,6 @@ inline void decodebytesC4RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr) } } -//inline void decodebytesC8(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) -inline void decodebytesC8_5A3_To_BGRA32(u32 *dst, const u8 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = decode5A3(Common::swap16(tlut[val])); - } -} - inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -180,16 +126,6 @@ inline void decodebytesC8_5A3_To_RGBA32(u32 *dst, const u8 *src, int tlutaddr) } } -inline void decodebytesC8_To_Raw16(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 8; x++) - { - u8 val = src[x]; - *dst++ = Common::swap16(tlut[val]); - } -} - inline void decodebytesC8IA8_To_RGBA(u32* dst, const u8* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); @@ -208,42 +144,6 @@ inline void decodebytesC8RGB565_To_RGBA(u32* dst, const u8* src, int tlutaddr) } } -#if _M_SSE >= 0x301 -static const __m128i kMaskSwap16 = _mm_set_epi32(0x0E0F0C0DL, 0x0A0B0809L, 0x06070405L, 0x02030001L); - -inline void decodebytesC8_To_Raw16_SSSE3(u16* dst, const u8* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - - // Make 8 16-bits unsigned integer values - __m128i a = _mm_setzero_si128(); - a = _mm_insert_epi16(a, tlut[src[0]], 0); - a = _mm_insert_epi16(a, tlut[src[1]], 1); - a = _mm_insert_epi16(a, tlut[src[2]], 2); - a = _mm_insert_epi16(a, tlut[src[3]], 3); - a = _mm_insert_epi16(a, tlut[src[4]], 4); - a = _mm_insert_epi16(a, tlut[src[5]], 5); - a = _mm_insert_epi16(a, tlut[src[6]], 6); - a = _mm_insert_epi16(a, tlut[src[7]], 7); - - // Apply Common::swap16() to 16-bits unsigned integers at once - const __m128i b = _mm_shuffle_epi8(a, kMaskSwap16); - - // Store values to dst without polluting the caches - _mm_stream_si128((__m128i*)dst, b); -} -#endif - -inline void decodebytesC14X2_5A3_To_BGRA32(u32 *dst, const u16 *src, int tlutaddr) -{ - u16 *tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u16 val = Common::swap16(src[x]); - *dst++ = decode5A3(Common::swap16(tlut[(val & 0x3FFF)])); - } -} - inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) { u16 *tlut = (u16*)(texMem + tlutaddr); @@ -254,16 +154,6 @@ inline void decodebytesC14X2_5A3_To_RGBA(u32 *dst, const u16 *src, int tlutaddr) } } -inline void decodebytesC14X2_To_Raw16(u16* dst, const u16* src, int tlutaddr) -{ - u16* tlut = (u16*)(texMem + tlutaddr); - for (int x = 0; x < 4; x++) - { - u16 val = Common::swap16(src[x]); - *dst++ = Common::swap16(tlut[(val & 0x3FFF)]); - } -} - inline void decodebytesC14X2IA8_To_RGBA(u32* dst, const u16* src, int tlutaddr) { u16* tlut = (u16*)(texMem + tlutaddr); @@ -284,18 +174,6 @@ inline void decodebytesC14X2rgb565_To_RGBA(u32* dst, const u16* src, int tlutadd } } -// Needs more speed. -inline void decodebytesIA4(u16 *dst, const u8 *src) -{ - for (int x = 0; x < 8; x++) - { - const u8 val = src[x]; - u8 a = Convert4To8(val >> 4); - u8 l = Convert4To8(val & 0xF); - dst[x] = (a << 8) | l; - } -} - inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) { for (int x = 0; x < 8; x++) @@ -307,19 +185,6 @@ inline void decodebytesIA4RGBA(u32 *dst, const u8 *src) } } -inline void decodebytesRGB5A3(u32 *dst, const u16 *src) -{ -#if 0 - for (int x = 0; x < 4; x++) - dst[x] = decode5A3(Common::swap16(src[x])); -#else - dst[0] = decode5A3(Common::swap16(src[0])); - dst[1] = decode5A3(Common::swap16(src[1])); - dst[2] = decode5A3(Common::swap16(src[2])); - dst[3] = decode5A3(Common::swap16(src[3])); -#endif -} - inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) { #if 0 @@ -333,29 +198,6 @@ inline void decodebytesRGB5A3rgba(u32 *dst, const u16 *src) #endif } -// This one is used by many video formats. It'd therefore be good if it was fast. -// Needs more speed. -inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2) -{ -#if 0 - for (int x = 0; x < 4; x++) - dst[x] = Common::swap32((src2[x] << 16) | src[x]); -#else - dst[0] = Common::swap32((src2[0] << 16) | src[0]); - dst[1] = Common::swap32((src2[1] << 16) | src[1]); - dst[2] = Common::swap32((src2[2] << 16) | src[2]); - dst[3] = Common::swap32((src2[3] << 16) | src[3]); -#endif - - // This can probably be done in a few SSE pack/unpack instructions + pshufb - // some unpack instruction x2: - // ABABABABABABABAB 1212121212121212 -> - // AB12AB12AB12AB12 AB12AB12AB12AB12 - // 2x pshufb-> - // 21BA21BA21BA21BA 21BA21BA21BA21BA - // and we are done. -} - inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) { #if 0 @@ -371,59 +213,11 @@ inline void decodebytesARGB8_4ToRgba(u32 *dst, const u16 *src, const u16 * src2) #endif } -inline u32 makecol(int r, int g, int b, int a) -{ - return (a << 24)|(r << 16)|(g << 8)|b; -} - inline u32 makeRGBA(int r, int g, int b, int a) { return (a<<24)|(b<<16)|(g<<8)|r; } -static void decodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch) -{ - // S3TC Decoder (Note: GCN decodes differently from PC so we can't use native support) - // Needs more speed. - u16 c1 = Common::swap16(src->color1); - u16 c2 = Common::swap16(src->color2); - int blue1 = Convert5To8(c1 & 0x1F); - int blue2 = Convert5To8(c2 & 0x1F); - int green1 = Convert6To8((c1 >> 5) & 0x3F); - int green2 = Convert6To8((c2 >> 5) & 0x3F); - int red1 = Convert5To8((c1 >> 11) & 0x1F); - int red2 = Convert5To8((c2 >> 11) & 0x1F); - int colors[4]; - colors[0] = makecol(red1, green1, blue1, 255); - colors[1] = makecol(red2, green2, blue2, 255); - if (c1 > c2) - { - int blue3 = ((blue2 - blue1) >> 1) - ((blue2 - blue1) >> 3); - int green3 = ((green2 - green1) >> 1) - ((green2 - green1) >> 3); - int red3 = ((red2 - red1) >> 1) - ((red2 - red1) >> 3); - colors[2] = makecol(red1 + red3, green1 + green3, blue1 + blue3, 255); - colors[3] = makecol(red2 - red3, green2 - green3, blue2 - blue3, 255); - } - else - { - colors[2] = makecol((red1 + red2 + 1) / 2, // Average - (green1 + green2 + 1) / 2, - (blue1 + blue2 + 1) / 2, 255); - colors[3] = makecol(red2, green2, blue2, 0); // Color2 but transparent - } - - for (int y = 0; y < 4; y++) - { - int val = src->lines[y]; - for (int x = 0; x < 4; x++) - { - dst[x] = colors[(val >> 6) & 3]; - val <<= 2; - } - dst += pitch; - } -} - #ifdef CHECK static void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch) { @@ -469,21 +263,6 @@ static void decodeDXTBlockRGBA(u32 *dst, const DXTBlock *src, int pitch) } #endif -#if 0 // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8 -static void copyDXTBlock(u8* dst, const u8* src) -{ - ((u16*)dst)[0] = Common::swap16(((u16*)src)[0]); - ((u16*)dst)[1] = Common::swap16(((u16*)src)[1]); - u32 pixels = ((u32*)src)[1]; - // A bit of trickiness here: the row are in the same order - // between the two formats, but the ordering within the rows - // is reversed. - pixels = ((pixels >> 4) & 0x0F0F0F0F) | ((pixels << 4) & 0xF0F0F0F0); - pixels = ((pixels >> 2) & 0x33333333) | ((pixels << 2) & 0xCCCCCCCC); - ((u32*)dst)[1] = pixels; -} -#endif - inline void SetOpenMPThreadCount(int width, int height) { #ifdef _OPENMP @@ -500,274 +279,6 @@ inline void SetOpenMPThreadCount(int width, int height) #endif } -//switch endianness, unswizzle -//TODO: to save memory, don't blindly convert everything to argb8888 -//also ARGB order needs to be swapped later, to accommodate modern hardware better -//need to add DXT support too -static PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) -{ - SetOpenMPThreadCount(width, height); - - const int Wsteps4 = (width + 3) / 4; - const int Wsteps8 = (width + 7) / 8; - - switch (texformat) - { - case GX_TF_C4: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - #pragma omp parallel for - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++) - decodebytesC4_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr); - } - else - { - #pragma omp parallel for - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8; iy < 8; iy++, xStep++) - decodebytesC4_To_Raw16((u16*)dst + (y + iy) * width + x, src + 4 * xStep, tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_I4: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 8) - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = yStep * 8 ; iy < 8; iy++,xStep++) - for (int ix = 0; ix < 4; ix++) - { - int val = src[4 * xStep + ix]; - dst[(y + iy) * width + x + ix * 2] = Convert4To8(val >> 4); - dst[(y + iy) * width + x + ix * 2 + 1] = Convert4To8(val & 0xF); - } - } - return PC_TEX_FMT_I4_AS_I8; - case GX_TF_I8: // speed critical - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - { - ((u64*)(dst + (y + iy) * width + x))[0] = ((u64*)(src + 8 * xStep))[0]; - } - } - return PC_TEX_FMT_I8; - case GX_TF_C8: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - else - { - -#if _M_SSE >= 0x301 - - if (cpu_info.bSSSE3) - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_To_Raw16_SSSE3((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - else -#endif - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC8_To_Raw16((u16*)dst + (y + iy) * width + x, src + 8 * xStep, tlutaddr); - } - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_IA4: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps8; x < width; x += 8, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesIA4((u16*)dst + (y + iy) * width + x, src + 8 * xStep); - } - return PC_TEX_FMT_IA4_AS_IA8; - case GX_TF_IA8: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = yStep * 4; iy < 4; iy++, xStep++) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)(src + 8 * xStep); - for (int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - - } - return PC_TEX_FMT_IA8; - case GX_TF_C14X2: - if (tlutfmt == 2) - { - // Special decoding is required for TLUT format 5A3 - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC14X2_5A3_To_BGRA32((u32*)dst + (y + iy) * width + x, (u16*)(src + 8 * xStep), tlutaddr); - } - else - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - decodebytesC14X2_To_Raw16((u16*)dst + (y + iy) * width + x,(u16*)(src + 8 * xStep), tlutaddr); - } - return GetPCFormatFromTLUTFormat(tlutfmt); - case GX_TF_RGB565: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - { - u16 *ptr = (u16 *)dst + (y + iy) * width + x; - u16 *s = (u16 *)(src + 8 * xStep); - for (int j = 0; j < 4; j++) - *ptr++ = Common::swap16(*s++); - } - } - return PC_TEX_FMT_RGB565; - case GX_TF_RGB5A3: - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - for (int iy = 0, xStep = 4 * yStep; iy < 4; iy++, xStep++) - //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4); - decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)(src + 8 * xStep)); - } - return PC_TEX_FMT_BGRA32; - case GX_TF_RGBA8: // speed critical - { - -#if _M_SSE >= 0x301 - - if (cpu_info.bSSSE3) - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - { - __m128i* p = (__m128i*)(src + y * width * 4); - for (int x = 0; x < width; x += 4) - { - // We use _mm_loadu_si128 instead of _mm_load_si128 - // because "p" may not be aligned in 16-bytes alignment. - // See Issue 3493. - const __m128i a0 = _mm_loadu_si128(p++); - const __m128i a1 = _mm_loadu_si128(p++); - const __m128i a2 = _mm_loadu_si128(p++); - const __m128i a3 = _mm_loadu_si128(p++); - - // Shuffle 16-bit integeres by _mm_unpacklo_epi16()/_mm_unpackhi_epi16(), - // apply Common::swap32() by _mm_shuffle_epi8() and - // store them by _mm_stream_si128(). - // See decodebytesARGB8_4() about the idea. - - static const __m128i kMaskSwap32 = _mm_set_epi32(0x0C0D0E0FL, 0x08090A0BL, 0x04050607L, 0x00010203L); - - const __m128i b0 = _mm_unpacklo_epi16(a0, a2); - const __m128i c0 = _mm_shuffle_epi8(b0, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 0) * width + x), c0); - - const __m128i b1 = _mm_unpackhi_epi16(a0, a2); - const __m128i c1 = _mm_shuffle_epi8(b1, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 1) * width + x), c1); - - const __m128i b2 = _mm_unpacklo_epi16(a1, a3); - const __m128i c2 = _mm_shuffle_epi8(b2, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 2) * width + x), c2); - - const __m128i b3 = _mm_unpackhi_epi16(a1, a3); - const __m128i c3 = _mm_shuffle_epi8(b3, kMaskSwap32); - _mm_stream_si128((__m128i*)((u32*)dst + (y + 3) * width + x), c3); - } - } - } - else - -#endif - - { - #pragma omp parallel for - for (int y = 0; y < height; y += 4) - for (int x = 0, yStep = (y / 4) * Wsteps4; x < width; x += 4, yStep++) - { - const u8* src2 = src + 64 * yStep; - for (int iy = 0; iy < 4; iy++) - decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src2 + 4 * iy, (u16*)src2 + 4 * iy + 16); - } - } - } - return PC_TEX_FMT_BGRA32; - case GX_TF_CMPR: // speed critical - // The metroid games use this format almost exclusively. - { -#if 0 // TODO - currently does not handle transparency correctly and causes problems when texture dimensions are not multiples of 8 - // 11111111 22222222 55555555 66666666 - // 33333333 44444444 77777777 88888888 - for (int y = 0; y < height; y += 8) - { - for (int x = 0; x < width; x += 8) - { - copyDXTBlock(dst+(y/2)*width+x*2, src); - src += 8; - copyDXTBlock(dst+(y/2)*width+x*2+8, src); - src += 8; - copyDXTBlock(dst+(y/2+2)*width+x*2, src); - src += 8; - copyDXTBlock(dst+(y/2+2)*width+x*2+8, src); - src += 8; - } - } - return PC_TEX_FMT_DXT1; -#else - #pragma omp parallel for - for (int y = 0; y < height; y += 8) - { - for (int x = 0, yStep = (y / 8) * Wsteps8; x < width; x += 8, yStep++) - { - const u8* src2 = src + 4 * sizeof(DXTBlock) * yStep; - decodeDXTBlock((u32*)dst + y * width + x, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + y * width + x + 4, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x, (DXTBlock*)src2, width); - src2 += sizeof(DXTBlock); - decodeDXTBlock((u32*)dst + (y + 4) * width + x + 4, (DXTBlock*)src2, width); - } - } -#endif - return PC_TEX_FMT_BGRA32; - } - } - - // The "copy" texture formats, too? - return PC_TEX_FMT_NONE; -} - - - // JSD 01/06/11: // TODO: we really should ensure BOTH the source and destination addresses are aligned to 16-byte boundaries to // squeeze out a little more performance. _mm_loadu_si128/_mm_storeu_si128 is slower than _mm_load_si128/_mm_store_si128 @@ -776,7 +287,7 @@ static PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, in // TODO: complete SSE2 optimization of less often used texture formats. // TODO: refactor algorithms using _mm_loadl_epi64 unaligned loads to prefer 128-bit aligned loads. -static PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) +PC_TexFormat _TexDecoder_DecodeImpl(u32 * dst, const u8 * src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { SetOpenMPThreadCount(width, height); @@ -1844,11 +1355,3 @@ static PC_TexFormat TexDecoder_Decode_RGBA(u32 * dst, const u8 * src, int width, // The "copy" texture formats, too? return PC_TEX_FMT_RGBA32; } - -PC_TexFormat _TexDecoder_DecodeImpl(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt,bool rgbaOnly) -{ - if (rgbaOnly) - return TexDecoder_Decode_RGBA((u32*)dst, src, width, height, texformat, tlutaddr, tlutfmt); - else - return TexDecoder_Decode_real(dst, src, width, height, texformat, tlutaddr, tlutfmt); -} diff --git a/Source/Core/VideoCommon/VideoConfig.cpp b/Source/Core/VideoCommon/VideoConfig.cpp index d6e2177b6b..8fd7a9efbb 100644 --- a/Source/Core/VideoCommon/VideoConfig.cpp +++ b/Source/Core/VideoCommon/VideoConfig.cpp @@ -36,7 +36,6 @@ VideoConfig::VideoConfig() // disable all features by default backend_info.APIType = API_NONE; - backend_info.bUseRGBATextures = false; backend_info.bUseMinimalMipCount = false; backend_info.bSupportsExclusiveFullscreen = false; } diff --git a/Source/Core/VideoCommon/VideoConfig.h b/Source/Core/VideoCommon/VideoConfig.h index ea63a61763..4d7f87f0ed 100644 --- a/Source/Core/VideoCommon/VideoConfig.h +++ b/Source/Core/VideoCommon/VideoConfig.h @@ -137,7 +137,6 @@ struct VideoConfig final std::vector AAModes; std::vector PPShaders; // post-processing shaders - bool bUseRGBATextures; // used for D3D in TextureCache bool bUseMinimalMipCount; bool bSupportsExclusiveFullscreen; bool bSupportsDualSourceBlend;