// Copyright (C) 2003-2008 Dolphin Project. // This program is free software: you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation, version 2.0. // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License 2.0 for more details. // A copy of the GPL 2.0 should have been included with the program. // If not, see http://www.gnu.org/licenses/ // Official SVN repository and contact information can be found at // http://code.google.com/p/dolphin-emu/ #include "Common.h" #include "TextureDecoder.h" #include "LookUpTables.h" #include //Uncomment this to enable Texture Format ID overlays #define OVERLAY_TEXFMT #ifdef OVERLAY_TEXFMT bool TexFmt_Overlay_Enable=false; bool TexFmt_Overlay_Center=false; #endif // TRAM // STATE_TO_SAVE u8 texMem[TMEM_SIZE]; ////////////////////////////////////////////////////////////////////////// // Gamecube/Wii texture decoder ////////////////////////////////////////////////////////////////////////// // Decodes all known Gamecube/Wii texture formats. // by ector ////////////////////////////////////////////////////////////////////////// int TexDecoder_GetTexelSizeInNibbles(int format) { switch (format & 0x3f) { case GX_TF_I4: return 1; case GX_TF_I8: return 2; case GX_TF_IA4: return 2; case GX_TF_IA8: return 4; case GX_TF_RGB565: return 4; case GX_TF_RGB5A3: return 4; case GX_TF_RGBA8: return 8; case GX_TF_C4: return 1; case GX_TF_C8: return 2; case GX_TF_C14X2: return 4; case GX_TF_CMPR: return 1; default: return 1; } } int TexDecoder_GetTextureSizeInBytes(int width, int height, int format) { return (width * height * TexDecoder_GetTexelSizeInNibbles(format)) / 2; } u32 TexDecoder_GetSafeTextureHash(const u8 *src, int width, int height, int texformat) { int sz = TexDecoder_GetTextureSizeInBytes(width, height, texformat); u32 hash = 0x1337c0de; if (sz < 2048) { for (int i = 0; i < sz / 4; i += 13) { hash = _rotl(hash, 17) ^ ((u32 *)src)[i]; } return hash; } else { int step = sz / 13 / 4; for (int i = 0; i < sz / 4; i += step) { hash = _rotl(hash, 17) ^ ((u32 *)src)[i]; } } return hash; } int TexDecoder_GetBlockWidthInTexels(int format) { switch (format) { case GX_TF_I4: return 8; case GX_TF_I8: return 8; case GX_TF_IA4: return 8; case GX_TF_IA8: return 4; case GX_TF_RGB565: return 4; case GX_TF_RGB5A3: return 4; case GX_TF_RGBA8: return 4; case GX_TF_C4: return 8; case GX_TF_C8: return 8; case GX_TF_C14X2: return 4; case GX_TF_CMPR: return 8; default: return 8; } } //returns bytes int TexDecoder_GetPaletteSize(int format) { switch (format) { case GX_TF_C4: return 16*2; case GX_TF_C8: return 256*2; case GX_TF_C14X2: return 16384*2; default: return 0; } } inline u32 decode565(u16 val) { int r,g,b,a; r=lut5to8[(val>>11) & 0x1f]; g=lut6to8[(val>>5 ) & 0x3f]; b=lut5to8[(val ) & 0x1f]; a=0xFF; return (a<<24) | (r<<16) | (g<<8) | b; } inline u32 decodeIA8(u16 val) { int a=val>>8; int i=val&0xFF; return (a<<24) | (i<<16) | (i<<8) | i; } inline u32 decode5A3(u16 val) { int r,g,b,a; if ((val&0x8000)) { r=lut5to8[(val>>10) & 0x1f]; g=lut5to8[(val>>5 ) & 0x1f]; b=lut5to8[(val ) & 0x1f]; a=0xFF; } else { a=lut3to8[(val>>12) & 0x7]; r=lut4to8[(val>>8 ) & 0xf]; g=lut4to8[(val>>4 ) & 0xf]; b=lut4to8[(val ) & 0xf]; } return (a<<24) | (r<<16) | (g<<8) | b; } struct DXTBlock { u16 color1; u16 color2; u8 lines[4]; }; inline int expand8888(const int j) { int i = j | (j<<8); return i | (i<<16); } //inline void decodebytesI4(u32 *dst, const u8 *src, int numbytes) inline void decodebytesI4(u32 *dst, const u8 *src) { for (int x = 0; x < 4; x++) { int val = src[x]; *dst++ = expand8888(lut4to8[val>>4]); *dst++ = expand8888(lut4to8[val&15]); } } inline void decodebytesI8_8(u32 *dst, const u8 *src) { for (int x = 0; x < 8; x++) dst[x] = src[x] * 0x01010101; //expand8888(src[x]); *0x... may or may not be faster. not sure. Should be faster on P4 at least. } //inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) inline void decodebytesC4(u32 *dst, const u8 *src, int tlutaddr, int tlutfmt) { u16 *tlut = (u16*)(texMem + tlutaddr); for (int x = 0; x < 4; x++) { int val = src[x]; switch (tlutfmt) { case 0: *dst++ = decodeIA8(Common::swap16(tlut[val >> 4])); *dst++ = decodeIA8(Common::swap16(tlut[val & 15])); break; case 1: *dst++ = decode565(Common::swap16(tlut[val >> 4])); *dst++ = decode565(Common::swap16(tlut[val & 15])); break; case 2: *dst++ = decode5A3(Common::swap16(tlut[val >> 4])); *dst++ = decode5A3(Common::swap16(tlut[val & 15])); break; case 3: //ERROR *dst++ = 0xFFFF00FF; *dst++ = 0xFFFF00FF; break; } } } //inline void decodebytesC8(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) inline void decodebytesC8(u32 *dst, const u8 *src, int tlutaddr, int tlutfmt) { u16 *tlut = (u16*)(texMem+tlutaddr); for (int x = 0; x < 8; x++) { int val = src[x]; switch (tlutfmt) { case 0: *dst++ = decodeIA8(Common::swap16(tlut[val])); break; case 1: *dst++ = decode565(Common::swap16(tlut[val])); break; case 2: *dst++ = decode5A3(Common::swap16(tlut[val])); break; case 3: //ERROR *dst++ = 0xFFFF00FF; break; } } } //inline void decodebytesC14X2(u32 *dst, const u16 *src, int numpixels, int tlutaddr, int tlutfmt) inline void decodebytesC14X2(u32 *dst, const u16 *src, int tlutaddr, int tlutfmt) { u16 *tlut = (u16*)(texMem+tlutaddr); for (int x = 0; x < 4; x++) { int val = Common::swap16(src[x]); switch (tlutfmt) { case 0: *dst++ = decodeIA8(Common::swap16(tlut[(val&0x3FFF)])); break; case 1: *dst++ = decode565(Common::swap16(tlut[(val&0x3FFF)])); break; case 2: *dst++ = decode5A3(Common::swap16(tlut[(val&0x3FFF)])); break; case 3: //ERROR *dst++ = 0xFFFF00FF; break; } } } //inline void decodebytesRGB565(u32 *dst, const u16 *src, int numpixels) inline void decodebytesRGB565(u32 *dst, const u16 *src) { for (int x = 0; x < 4; x++) *dst++ = decode565(Common::swap16(src[x])); } //inline void decodebytesIA4(u32 *dst, const u8 *src, int numbytes) inline void decodebytesIA4(u32 *dst, const u8 *src) { for (int x = 0; x < 8; x++) { int val = src[x]; int a = lut4to8[val>>4]; int r = lut4to8[val&15]; dst[x] = (a<<24) | (r<<16) | (r<<8) | r; } } //inline void decodebytesIA8(u32 *dst, const u16 *src, int numpixels) inline void decodebytesIA8(u32 *dst, const u16 *src) { for (int x = 0; x < 4; x++) dst[x] = decodeIA8(Common::swap16(src[x])); } //inline void decodebytesRGB5A3(u32 *dst, const u16 *src, int numpixels) inline void decodebytesRGB5A3(u32 *dst, const u16 *src) { for (int x = 0; x < 4; x++) dst[x] = decode5A3(Common::swap16(src[x])); } // This one is used by many video formats. It'd therefore be good if it was fast. inline void decodebytesARGB8_4(u32 *dst, const u16 *src, const u16 *src2) { for (int x = 0; x < 4; x++) { dst[x] = Common::swap32((src2[x] << 16) | src[x]); } // This can probably be done in a few SSE pack/unpack instructions + pshufb // some unpack instruction x2: // ABABABABABABABAB 1212121212121212 -> // AB12AB12AB12AB12 AB12AB12AB12AB12 // 2x pshufb-> // 21BA21BA21BA21BA 21BA21BA21BA21BA // and we are done. } inline u32 makecol(int r, int g, int b, int a) { return (a<<24)|(r<<16)|(g<<8)|b; } void decodeDXTBlock(u32 *dst, const DXTBlock *src, int pitch) { u16 c1 = Common::swap16(src->color1); u16 c2 = Common::swap16(src->color2); int blue1 = lut5to8[c1 & 0x1F]; int blue2 = lut5to8[c2 & 0x1F]; int green1 = lut6to8[(c1>>5) & 0x3F]; int green2 = lut6to8[(c2>>5) & 0x3F]; int red1 = lut5to8[(c1>>11) & 0x1F]; int red2 = lut5to8[(c2>>11) & 0x1F]; int colors[4]; if (c1 > c2) { colors[0] = makecol(red1, green1, blue1, 255); colors[1] = makecol(red2, green2, blue2, 255); colors[2] = makecol(red1+(red2-red1)/3, green1+(green2-green1)/3, blue1+(blue2-blue1)/3, 255); colors[3] = makecol(red2+(red1-red2)/3, green2+(green1-green2)/3, blue2+(blue1-blue2)/3, 255); } else { colors[0] = makecol(red1, green1, blue1, 255); colors[1] = makecol(red2, green2, blue2, 255); colors[2] = makecol((red1+red2)/2, (green1+green2)/2, (blue1+blue2)/2, 255); colors[3] = makecol(0,0,0,0); //transparent } for (int y = 0; y < 4; y++) { int val = src->lines[y]; for (int x = 0; x < 4; x++) { dst[x] = colors[(val >> 6) & 3]; val <<= 2; } dst += pitch; } } //switch endianness, unswizzle //TODO: to save memory, don't blindly convert everything to argb8888 //also ARGB order needs to be swapped later, to accommodate modern hardware better //need to add DXT support too #ifdef OVERLAY_TEXFMT PC_TexFormat TexDecoder_Decode_real(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) #else PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) #endif { switch (texformat) { case GX_TF_C4: { for (int y = 0; y < height; y += 8) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 8; iy++, src += 4) //decodebytesC4((u32*)dst+(y+iy)*width+x, src, 4, tlutaddr, tlutfmt); decodebytesC4((u32*)dst+(y+iy)*width+x, src, tlutaddr, tlutfmt); } return PC_TEX_FMT_BGRA32; case GX_TF_I4: { // TODO: SSSE3 variant (pshufb), THP videos use this format. // SSSE3 variant could bring even more speed #if (defined(_WIN32) || (defined (_M_X64) && !defined(_WIN32))) __m128i Lmask = _mm_set1_epi8 (0x0F); __m128i Hmask = _mm_set1_epi8 (0xF0); __m128i* sseSrc = (__m128i *)src; __m128i* sseDst = (__m128i *)dst; for (int y = 0; y < height; y += 8) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 8; iy++, sseSrc++) { // TODO (mb2): func and don't let the optimizer perform all the clean up by itself __m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ... __m128i sl = _mm_and_si128 (s, Lmask); // 0b 0d 0f 0h ... __m128i sls = _mm_slli_epi16 (sl, 4); // b0 d0 f0 h0 ... __m128i sl_ = _mm_or_si128 (sl, sls); // bb dd ff ff ... __m128i sh = _mm_and_si128 (s, Hmask); // a0 c0 e0 g0 ... __m128i shs = _mm_srli_epi16 (sh, 4); // 0a 0c 0e g0 ... __m128i sh_ = _mm_or_si128 (sh, shs); // aa cc ee gg ... __m128i rl = _mm_unpacklo_epi8 (sh_, sl_); // bb aa dd cc ... __m128i rh = _mm_unpackhi_epi8 (sh_, sl_); // __m128i ral = _mm_unpacklo_epi8 (rl, rl); // bb bb aa aa ... __m128i rah = _mm_unpackhi_epi8 (rl, rl); // // result part a __m128i rall = _mm_unpacklo_epi16 (ral, ral); // bb bb bb bb ... -> done __m128i ralh = _mm_unpackhi_epi16 (ral, ral); // -> done __m128i rahl = _mm_unpacklo_epi16 (rah, rah); // -> done __m128i rahh = _mm_unpackhi_epi16 (rah, rah); // -> done __m128i rbl = _mm_unpacklo_epi8 (rh, rh); // __m128i rbh = _mm_unpackhi_epi8 (rh, rh); // // result part b __m128i rbll = _mm_unpacklo_epi16 (rbl, rbl); // -> done __m128i rblh = _mm_unpackhi_epi16 (rbl, rbl); // -> done __m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh); // -> done __m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh); // -> done // store sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); // that sucks... too lazy _mm_store_si128 (sseDst++, rall); _mm_store_si128 (sseDst, ralh); iy++; sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); _mm_store_si128 (sseDst++, rahl); _mm_store_si128 (sseDst, rahh); iy++; sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); _mm_store_si128 (sseDst++, rbll); _mm_store_si128 (sseDst, rblh); iy++; sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); _mm_store_si128 (sseDst++, rbhl); _mm_store_si128 (sseDst, rbhh); } #else for (int y = 0; y < height; y += 8) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 8; iy++, src += 4) //decodebytesI4((u32*)dst+(y+iy)*width+x, src, 4); decodebytesI4((u32*)dst+(y+iy)*width+x, src); #endif } return PC_TEX_FMT_BGRA32; case GX_TF_C8: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesC8((u32*)dst+(y+iy)*width+x, src, 8, tlutaddr, tlutfmt); decodebytesC8((u32*)dst+(y+iy)*width+x, src, tlutaddr, tlutfmt); } return PC_TEX_FMT_BGRA32; case GX_TF_I8: // speed critical { #if (defined(_WIN32) || (defined (_M_X64) && !defined(_WIN32))) __m128i *sseSrc = (__m128i *)src; __m128i *sseDst = (__m128i *)dst; for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 4; iy++, sseSrc++) { // TODO (mb2): func and don't let the optimizer perform all the clean up by itself __m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ... __m128i rl = _mm_unpacklo_epi8 (s, s); // ab ab cd cd ... __m128i rh = _mm_unpackhi_epi8 (s, s); // // result __m128i rll = _mm_unpacklo_epi8 (rl, rl); // ab ab ab ab __m128i rlh = _mm_unpackhi_epi8 (rl, rl); __m128i rhl = _mm_unpacklo_epi8 (rh, rh); __m128i rhh = _mm_unpackhi_epi8 (rh, rh); // store sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); _mm_store_si128 (sseDst++, rll); _mm_store_si128 (sseDst, rlh); iy++; sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); _mm_store_si128 (sseDst++, rhl); _mm_store_si128 (sseDst, rhh); } #else for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 4; iy++, src += 8) decodebytesI8_8((u32*)dst+(y+iy)*width+x, src); #endif } return PC_TEX_FMT_BGRA32; case GX_TF_IA4: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 8) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesIA4((u32*)dst+(y+iy)*width+x, src, 8); decodebytesIA4((u32*)dst+(y+iy)*width+x, src); } return PC_TEX_FMT_BGRA32; case GX_TF_IA8: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesIA8((u32*)dst+(y+iy)*width+x, (u16*)src, 4); decodebytesIA8((u32*)dst+(y+iy)*width+x, (u16*)src); } return PC_TEX_FMT_BGRA32; case GX_TF_C14X2: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesC14X2((u32*)dst+(y+iy)*width+x, (u16*)src, 4, tlutaddr, tlutfmt); decodebytesC14X2((u32*)dst+(y+iy)*width+x, (u16*)src, tlutaddr, tlutfmt); } return PC_TEX_FMT_BGRA32; case GX_TF_RGB565: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesRGB565((u32*)dst+(y+iy)*width+x, (u16*)src, 4); decodebytesRGB565((u32*)dst+(y+iy)*width+x, (u16*)src); } return PC_TEX_FMT_BGRA32; case GX_TF_RGB5A3: { for (int y = 0; y < height; y += 4) for (int x = 0; x < width; x += 4) for (int iy = 0; iy < 4; iy++, src += 8) //decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src, 4); decodebytesRGB5A3((u32*)dst+(y+iy)*width+x, (u16*)src); } return PC_TEX_FMT_BGRA32; case GX_TF_RGBA8: // speed critical { for (int y = 0; y < height; y += 4) { for (int x = 0; x < width; x += 4) { for (int iy = 0; iy < 4; iy++) { decodebytesARGB8_4((u32*)dst + (y+iy)*width + x, (u16*)src + 4 * iy, (u16*)src + 4 * iy + 16); } src += 64; } } } return PC_TEX_FMT_BGRA32; case GX_TF_CMPR: // speed critical { // TODO: Shuffle to PC S3TC (DXTC) format instead of converting // 11111111 22222222 55555555 66666666 // 33333333 44444444 77777777 88888888 // The metroid games use this format almost exclusively. for (int y = 0; y < height; y += 8) for (int x = 0; x < width; x += 8) { decodeDXTBlock((u32*)dst+y*width+x, (DXTBlock*)src, width); src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst+y*width+x+4, (DXTBlock*)src, width); src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst+(y+4)*width+x, (DXTBlock*)src, width); src += sizeof(DXTBlock); decodeDXTBlock((u32*)dst+(y+4)*width+x+4, (DXTBlock*)src, width); src += sizeof(DXTBlock); } } return PC_TEX_FMT_BGRA32; } // The "copy" texture formats, too? return PC_TEX_FMT_NONE; } void TexDecoder_SetTexFmtOverlayOptions(bool enable, bool center) { #ifdef OVERLAY_TEXFMT TexFmt_Overlay_Enable = enable; TexFmt_Overlay_Center = center; #endif } #ifdef OVERLAY_TEXFMT extern const char* texfmt[]; extern const unsigned char sfont_map[]; extern const unsigned char sfont_raw[][9*10]; PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt) { PC_TexFormat retval = TexDecoder_Decode_real(dst,src,width,height,texformat,tlutaddr,tlutfmt); if((!TexFmt_Overlay_Enable)||(retval==PC_TEX_FMT_NONE)) return retval; // assume ABGR/ARGB (32bit) int *dtp = (int*)dst; int w = min(width,40); int h = min(height,10); int xoff = (width-w)>>1; int yoff = (height-h)>>1; if(!TexFmt_Overlay_Center) { xoff=0; yoff=0; } const char* fmt = texfmt[texformat&15]; while(*fmt) { int xcnt = 0; int nchar = sfont_map[(int)*fmt]; const unsigned char *ptr = sfont_raw[nchar]; // each char is up to 9x10 for(int x=0;x<9;x++) { if(ptr[x]==0x78) break; xcnt++; } for(int y=0;y<10;y++) { for(int x=0;x