diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/TextureDecoder.cpp index dd7865bc14..161c3d6acb 100644 --- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp @@ -185,150 +185,6 @@ struct DXTBlock u8 lines[4]; }; -inline int expand8888(const int j) -{ - int i = j | (j<<8); - return i | (i<<16); -} - -//inline void decodebytesI4(u32 *dst, const u8 *src, int numbytes) -inline void decodebytesI4(u32 *dst, const u8 *src) -{ - for (int x = 0; x < 4; x++) - { - int val = src[x]; - *dst++ = expand8888(lut4to8[val>>4]); - *dst++ = expand8888(lut4to8[val&15]); - } -} - -inline void sseDecodebytesI4(u8* dst, const __m128i* sseSrc, int height, - int width) { - __m128i* sseDst; - -#ifdef __SSSE3__ - // SSSE3 variant - if(cpu_info.bSSSE3) { - - // TODO(XK): Increase Loop Jump? - - __m128i s, m[8]; - unsigned char *umask; - - for(int i = 0; i < 8; i++) { - umask = (unsigned char *)&(m[i]); - for(int j = 0; j < 14; j += 4) { - umask[j] = 0x00 + (i * 4); - umask[j+1] = 0x01 + (i * 4); - umask[j+2] = 0x02 + (i * 4); - umask[j+3] = 0x03 + (i * 4); - } - } - - for (int y = 0; y < height; y += 8) { - for (int x = 0; x < width; x += 8) { - for (int iy = 0; iy < 8; iy++, sseSrc++) { - s = _mm_load_si128 (sseSrc); - - // TODO: Supplemental Value Lazyness v3 - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[1])); - _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[0])); - iy++; - - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[3])); - _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[2])); - iy++; - - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[5])); - _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[4])); - iy++; - - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[7])); - _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[6])); - iy++; - - } - } - } - - return; - - } -#endif - __m128i Lmask = _mm_set1_epi8 (0x0F); - __m128i Hmask = _mm_set1_epi8 (0xF0); - - for (int y = 0; y < height; y += 8) { - for (int x = 0; x < width; x += 8) { - for (int iy = 0; iy < 8; iy++, sseSrc++) { - // TODO (mb2): Don't let the optimizer perform all the clean up by itself - // (XK): Huh? What clean up? Where? Who? :) - - __m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ... - __m128i sl = _mm_and_si128 (s, Lmask); // 0b 0d 0f 0h ... - __m128i sls = _mm_slli_epi16 (sl, 4); // b0 d0 f0 h0 ... - __m128i sl_ = _mm_or_si128 (sl, sls); // bb dd ff ff ... - - __m128i sh = _mm_and_si128 (s, Hmask); // a0 c0 e0 g0 ... - __m128i shs = _mm_srli_epi16 (sh, 4); // 0a 0c 0e g0 ... - __m128i sh_ = _mm_or_si128 (sh, shs); // aa cc ee gg ... - __m128i rl = _mm_unpacklo_epi8 (sh_, sl_); // bb aa dd cc ... - __m128i rh = _mm_unpackhi_epi8 (sh_, sl_); // ff ee hh gg ... - - // result part a - __m128i ral = _mm_unpacklo_epi8 (rl, rl); // bb bb aa aa ... - __m128i rah = _mm_unpackhi_epi8 (rl, rl); // dd dd cc cc ... - - __m128i rall = _mm_unpacklo_epi16 (ral, ral); // bb bb bb bb ... -> done - __m128i ralh = _mm_unpackhi_epi16 (ral, ral); // aa aa aa aa ... -> done - __m128i rahl = _mm_unpacklo_epi16 (rah, rah); // dd dd dd dd ... -> done - __m128i rahh = _mm_unpackhi_epi16 (rah, rah); // cc cc cc cc ... -> done - - // result part b - __m128i rbl = _mm_unpacklo_epi8 (rh, rh); // ff ff ee ee ... - __m128i rbh = _mm_unpackhi_epi8 (rh, rh); // hh hh gg gg ... - - __m128i rbll = _mm_unpacklo_epi16 (rbl, rbl); // ff ff ff ff ... -> done - __m128i rblh = _mm_unpackhi_epi16 (rbl, rbl); // ee ee ee ee ... -> done - __m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh); // hh hh hh hh ... -> done - __m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh); // gg gg gg gg ... -> done - - // Store - // TODO: Value lazyness - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, rall); - _mm_store_si128 (sseDst, ralh); - iy++; - - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, rahl); - _mm_store_si128 (sseDst, rahh); - iy++; - - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, rbll); - _mm_store_si128 (sseDst, rblh); - iy++; - - - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, rbhl); - _mm_store_si128 (sseDst, rbhh); - } - } - } -} - -inline void decodebytesI8_8(u32 *dst, const u8 *src) -{ - for (int x = 0; x < 8; x++) - dst[x] = src[x] * 0x01010101; //expand8888(src[x]); *0x... may or may not be faster. not sure. Should be faster on P4 at least. -} - //inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt) inline void decodebytesC4(u32 *dst, const u8 *src, int tlutaddr, int tlutfmt) { @@ -523,19 +379,26 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in } return PC_TEX_FMT_BGRA32; case GX_TF_I4: - { - sseDecodebytesI4(dst, (const __m128i *)src, height, - width); - - /* Old non-SSE way - for (int y = 0; y < height; y += 8) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 8; iy++, src += 4) - //decodebytesI4((u32*)dst+(y+iy)*width+x, src, 4); - decodebytesI4((u32*)dst+(y+iy)*width+x, src); - */ + { + for (int y = 0; y < height; y += 8) + for (int x = 0; x < width; x += 8) + for (int iy = 0; iy < 8; iy++, src += 4) + for (int ix = 0; ix < 4; ix++) + { + int val = src[ix]; + dst[(y+iy)*width+x+ix*2] = lut4to8[val>>4]; + dst[(y+iy)*width+x+ix*2+1] = lut4to8[val&15]; + } } - return PC_TEX_FMT_BGRA32; + return PC_TEX_FMT_I8; + case GX_TF_I8: // speed critical + { + for (int y = 0; y < height; y += 4) + for (int x = 0; x < width; x += 8) + for (int iy = 0; iy < 4; iy++, src += 8) + memcpy(dst+(y+iy)*width+x, src, 8); + } + return PC_TEX_FMT_I8; case GX_TF_C8: { for (int y = 0; y < height; y += 4) @@ -545,40 +408,6 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in decodebytesC8((u32*)dst+(y+iy)*width+x, src, tlutaddr, tlutfmt); } return PC_TEX_FMT_BGRA32; - case GX_TF_I8: // speed critical - { -#if 1 - __m128i *sseSrc = (__m128i *)src; - __m128i *sseDst = (__m128i *)dst; - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, sseSrc++) { - // TODO (mb2): func and don't let the optimizer perform all the clean up by itself - __m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ... - __m128i rl = _mm_unpacklo_epi8 (s, s); // ab ab cd cd ... - __m128i rh = _mm_unpackhi_epi8 (s, s); // - // result - __m128i rll = _mm_unpacklo_epi8 (rl, rl); // ab ab ab ab - __m128i rlh = _mm_unpackhi_epi8 (rl, rl); - __m128i rhl = _mm_unpacklo_epi8 (rh, rh); - __m128i rhh = _mm_unpackhi_epi8 (rh, rh); - // store - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, rll); - _mm_store_si128 (sseDst, rlh); - iy++; - sseDst = (__m128i*)(dst+((y+iy)*width+x)*4); - _mm_store_si128 (sseDst++, rhl); - _mm_store_si128 (sseDst, rhh); - } -#else - for (int y = 0; y < height; y += 4) - for (int x = 0; x < width; x += 8) - for (int iy = 0; iy < 4; iy++, src += 8) - decodebytesI8_8((u32*)dst+(y+iy)*width+x, src); -#endif - } - return PC_TEX_FMT_BGRA32; case GX_TF_IA4: { for (int y = 0; y < height; y += 4) diff --git a/Source/Core/VideoCommon/Src/TextureDecoder.h b/Source/Core/VideoCommon/Src/TextureDecoder.h index ceb1d61fff..f41da1b77e 100644 --- a/Source/Core/VideoCommon/Src/TextureDecoder.h +++ b/Source/Core/VideoCommon/Src/TextureDecoder.h @@ -72,7 +72,8 @@ int TexDecoder_GetPaletteSize(int fmt); enum PC_TexFormat { PC_TEX_FMT_NONE = 0, - PC_TEX_FMT_BGRA32 = 1, + PC_TEX_FMT_BGRA32, + PC_TEX_FMT_I8, }; PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt); diff --git a/Source/Plugins/Plugin_VideoDX9/Src/D3DTexture.cpp b/Source/Plugins/Plugin_VideoDX9/Src/D3DTexture.cpp index cfb592fcb9..a24604a211 100644 --- a/Source/Plugins/Plugin_VideoDX9/Src/D3DTexture.cpp +++ b/Source/Plugins/Plugin_VideoDX9/Src/D3DTexture.cpp @@ -44,11 +44,39 @@ LPDIRECT3DTEXTURE9 CreateTexture2D(const u8* buffer, const int width, const int D3DLOCKED_RECT Lock; pTexture->LockRect(level, &Lock, NULL, 0 ); - u32* pIn = pBuffer; switch(fmt) { + case D3DFMT_L8: + case D3DFMT_A8: + { + const u8 *pIn = buffer; + for (int y = 0; y < height; y++) + { + u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch)); + memcpy(pBits, pIn, width); + pIn += pitch; + } + } + break; + case D3DFMT_A8L8: + { + const u8 *pIn = buffer; + // TODO(XK): Find a better way that does not involve either unpacking + // or downsampling (i.e. A4L4) + for (int y = 0; y < height; y++) + { + u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch)); + for(int i = 0; i < width * 2; i += 2) { + pBits[i] = pIn[i / 2]; + pBits[i + 1] = pIn[i / 2]; + } + pIn += pitch; + } + } + break; case D3DFMT_A8R8G8B8: { + u32* pIn = pBuffer; for (int y = 0; y < height; y++) { u32* pBits = (u32*)((u8*)Lock.pBits + (y * Lock.Pitch)); diff --git a/Source/Plugins/Plugin_VideoDX9/Src/TextureCache.cpp b/Source/Plugins/Plugin_VideoDX9/Src/TextureCache.cpp index 1ee1263362..0457f0bd21 100644 --- a/Source/Plugins/Plugin_VideoDX9/Src/TextureCache.cpp +++ b/Source/Plugins/Plugin_VideoDX9/Src/TextureCache.cpp @@ -176,6 +176,12 @@ void TextureCache::Load(int stage, u32 address, int width, int height, int forma case PC_TEX_FMT_BGRA32: d3d_fmt = D3DFMT_A8R8G8B8; break; + /*case PC_TEX_FMT_BGRA16: is this of any use? + d3d_fmt = D3DFMT_A4R4G4B4; + break;*/ + case PC_TEX_FMT_I8: + d3d_fmt = D3DFMT_A8L8; + break; } //Make an entry in the table diff --git a/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.cpp b/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.cpp index efdef69c4f..ec228f2001 100644 --- a/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.cpp +++ b/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.cpp @@ -343,21 +343,30 @@ TextureMngr::TCacheEntry* TextureMngr::Load(int texstage, u32 address, int width glPixelStorei(GL_UNPACK_ROW_LENGTH, expandedWidth); int gl_format; + int gl_iformat; int gl_type; switch (dfmt) { case PC_TEX_FMT_NONE: PanicAlert("Invalid PC texture format %i", dfmt); case PC_TEX_FMT_BGRA32: gl_format = GL_BGRA; + gl_iformat = 4; + gl_type = GL_UNSIGNED_BYTE; + break; + case PC_TEX_FMT_I8: + gl_format = GL_LUMINANCE; + gl_iformat = GL_INTENSITY; gl_type = GL_UNSIGNED_BYTE; break; } if (!entry.isNonPow2 && ((tm0.min_filter & 3) == 1 || (tm0.min_filter & 3) == 2)) { - gluBuild2DMipmaps(GL_TEXTURE_2D, 4, width, height, gl_format, gl_type, temp); + glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP, GL_TRUE); + glTexImage2D(GL_TEXTURE_2D, 0, gl_iformat, width, height, 0, gl_format, gl_type, temp); + glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP, GL_FALSE); entry.bHaveMipMaps = true; } else - glTexImage2D(target, 0, 4, width, height, 0, gl_format, gl_type, temp); + glTexImage2D(target, 0, gl_iformat, width, height, 0, gl_format, gl_type, temp); if (expandedWidth != width) // reset glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);