Commit magumagu9's work on transferring texture decoding to OGL; I also added DX9 support. Speed seems to have increased, but I'm not sure. See issue 581 for more info.

git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@2222 8ced0084-cf51-0410-be5f-012b33b47a6e
2009-02-12 13:54:08 +00:00 · 2009-02-12 13:54:08 +00:00 · d2d097fba5
parent 1bed914bf4
commit d2d097fba5
5 changed files with 67 additions and 194 deletions
--- a/Source/Core/VideoCommon/Src/TextureDecoder.cpp
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.cpp
@ -185,150 +185,6 @@ struct DXTBlock
    u8 lines[4];
 };

-inline int expand8888(const int j)
-{
-    int i = j | (j<<8);
-    return i | (i<<16);
-}
-
-//inline void decodebytesI4(u32 *dst, const u8 *src, int numbytes)
-inline void decodebytesI4(u32 *dst, const u8 *src)
-{
-    for (int x = 0; x < 4; x++)
-    {
-        int val = src[x];
-		*dst++ = expand8888(lut4to8[val>>4]);
-		*dst++ = expand8888(lut4to8[val&15]);
-    }
-}
-
-inline void sseDecodebytesI4(u8* dst, const __m128i* sseSrc, int height,
-                             int width) {
-    __m128i* sseDst;
-
-#ifdef __SSSE3__
-    // SSSE3 variant
-    if(cpu_info.bSSSE3) {
-
-        // TODO(XK): Increase Loop Jump?
-        
-        __m128i s, m[8];
-        unsigned char *umask;
-
-        for(int i = 0; i < 8; i++) {
-            umask = (unsigned char *)&(m[i]);
-            for(int j = 0; j < 14; j += 4) {
-                umask[j]   = 0x00 + (i * 4);
-                umask[j+1] = 0x01 + (i * 4);
-                umask[j+2] = 0x02 + (i * 4);
-                umask[j+3] = 0x03 + (i * 4);
-            }
-        }
-
-        for (int y = 0; y < height; y += 8) {
-            for (int x = 0; x < width; x += 8) {
-                for (int iy = 0; iy < 8; iy++, sseSrc++) {
-                    s = _mm_load_si128 (sseSrc);
-
-                    // TODO: Supplemental Value Lazyness v3
-                    sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-                    _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[1]));
-                    _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[0]));
-                    iy++;
-
-                    sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-                    _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[3]));
-                    _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[2]));
-                    iy++;
-
-                    sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-                    _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[5]));
-                    _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[4]));
-                    iy++;
-
-                    sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-                    _mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[7]));
-                    _mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[6]));
-                    iy++;
-
-                }
-            }
-        }
-
-        return;
-        
-    }
-#endif
-    __m128i Lmask = _mm_set1_epi8 (0x0F);
-    __m128i Hmask = _mm_set1_epi8 (0xF0);
-
-    for (int y = 0; y < height; y += 8) {
-        for (int x = 0; x < width; x += 8) {
-            for (int iy = 0; iy < 8; iy++, sseSrc++) {
-                // TODO (mb2): Don't let the optimizer perform all the clean up by itself
-                // (XK): Huh? What clean up? Where? Who? :)
-                
-                __m128i s = _mm_load_si128 (sseSrc);		// ab cd ef gh ...
-                __m128i sl = _mm_and_si128 (s, Lmask);		// 0b 0d 0f 0h ...
-                __m128i sls = _mm_slli_epi16 (sl, 4);		// b0 d0 f0 h0 ...
-                __m128i sl_ = _mm_or_si128 (sl, sls);		// bb dd ff ff ...
-                        
-                __m128i sh = _mm_and_si128 (s, Hmask);		// a0 c0 e0 g0 ...
-                __m128i shs = _mm_srli_epi16 (sh, 4);		// 0a 0c 0e g0 ...
-                __m128i sh_ = _mm_or_si128 (sh, shs);		// aa cc ee gg ...
-                __m128i rl = _mm_unpacklo_epi8 (sh_, sl_);	// bb aa dd cc ...
-                __m128i rh = _mm_unpackhi_epi8 (sh_, sl_);	// ff ee hh gg ...
-                        
-                // result part a
-                __m128i ral = _mm_unpacklo_epi8 (rl, rl);	// bb bb aa aa ...
-                __m128i rah = _mm_unpackhi_epi8 (rl, rl);	// dd dd cc cc ...
-                
-                __m128i rall = _mm_unpacklo_epi16 (ral, ral);	// bb bb bb bb ...	-> done
-                __m128i ralh = _mm_unpackhi_epi16 (ral, ral);	// aa aa aa aa ...	-> done
-                __m128i rahl = _mm_unpacklo_epi16 (rah, rah);	// dd dd dd dd ...	-> done
-                __m128i rahh = _mm_unpackhi_epi16 (rah, rah);	// cc cc cc cc ...	-> done
-                        
-                // result part b
-                __m128i rbl = _mm_unpacklo_epi8 (rh, rh);	// ff ff ee ee ...
-                __m128i rbh = _mm_unpackhi_epi8 (rh, rh);	// hh hh gg gg ...
-
-                __m128i rbll = _mm_unpacklo_epi16 (rbl, rbl);	// ff ff ff ff ...	-> done
-                __m128i rblh = _mm_unpackhi_epi16 (rbl, rbl);	// ee ee ee ee ...	-> done
-                __m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh);	// hh hh hh hh ...	-> done
-                __m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh);	// gg gg gg gg ...	-> done
-                
-                // Store
-                // TODO: Value lazyness
-                sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-                _mm_store_si128 (sseDst++, rall); 
-                _mm_store_si128 (sseDst, ralh);
-                iy++;
-                
-                sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-                _mm_store_si128 (sseDst++, rahl);
-                _mm_store_si128 (sseDst, rahh);
-                iy++;
-                
-                sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-                _mm_store_si128 (sseDst++, rbll);
-                _mm_store_si128 (sseDst, rblh);
-                iy++;
-                
-                
-                sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-                _mm_store_si128 (sseDst++, rbhl);
-                _mm_store_si128 (sseDst, rbhh);
-            }
-        }
-    }
-}
-
-inline void decodebytesI8_8(u32 *dst, const u8 *src)
-{
-	for (int x = 0; x < 8; x++)
-        dst[x] = src[x] * 0x01010101; //expand8888(src[x]);  *0x... may or may not be faster. not sure. Should be faster on P4 at least.
-}
-
 //inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt)
 inline void decodebytesC4(u32 *dst, const u8 *src, int tlutaddr, int tlutfmt)
 {
@ -524,18 +380,25 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in
        return PC_TEX_FMT_BGRA32;
    case GX_TF_I4:
 		{
-            sseDecodebytesI4(dst, (const __m128i *)src, height,
-                             width);
-            
-            /* Old non-SSE way
 			for (int y = 0; y < height; y += 8)
 				for (int x = 0; x < width; x += 8)
 					for (int iy = 0; iy < 8; iy++, src += 4)
-                        //decodebytesI4((u32*)dst+(y+iy)*width+x, src, 4);
-                        decodebytesI4((u32*)dst+(y+iy)*width+x, src);
-            */
+						for (int ix = 0; ix < 4; ix++)
+						{
+							int val = src[ix];
+							dst[(y+iy)*width+x+ix*2] = lut4to8[val>>4];
+							dst[(y+iy)*width+x+ix*2+1] = lut4to8[val&15];
 						}
-        return PC_TEX_FMT_BGRA32;
+        }
+       return PC_TEX_FMT_I8;
+	case GX_TF_I8:  // speed critical
+		{
+			for (int y = 0; y < height; y += 4)
+				for (int x = 0; x < width; x += 8)
+					for (int iy = 0; iy < 4; iy++, src += 8)
+						memcpy(dst+(y+iy)*width+x, src, 8);
+		}
+		return PC_TEX_FMT_I8;
    case GX_TF_C8:
        {
            for (int y = 0; y < height; y += 4)
@ -545,40 +408,6 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in
                        decodebytesC8((u32*)dst+(y+iy)*width+x, src, tlutaddr, tlutfmt);
        }
        return PC_TEX_FMT_BGRA32;
-    case GX_TF_I8:  // speed critical
-		{
-#if 1
-			__m128i *sseSrc  = (__m128i *)src;
-			__m128i *sseDst  = (__m128i *)dst;
-            for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-					for (int iy = 0; iy < 4; iy++, sseSrc++) {
-						// TODO (mb2): func and don't let the optimizer perform all the clean up by itself
-						__m128i s = _mm_load_si128 (sseSrc);			// ab cd ef gh ...
-						__m128i rl = _mm_unpacklo_epi8 (s, s);			// ab ab cd cd ...
-						__m128i rh = _mm_unpackhi_epi8 (s, s);			// 
-						// result
-						__m128i rll = _mm_unpacklo_epi8 (rl, rl);		// ab ab ab ab 
-						__m128i rlh = _mm_unpackhi_epi8 (rl, rl);
-						__m128i rhl = _mm_unpacklo_epi8 (rh, rh);
-						__m128i rhh = _mm_unpackhi_epi8 (rh, rh);
-						// store
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rll); 
-						_mm_store_si128 (sseDst, rlh);
-						iy++;
-						sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
-						_mm_store_si128 (sseDst++, rhl);
-						_mm_store_si128 (sseDst, rhh);
-					}
-#else
-            for (int y = 0; y < height; y += 4)
-                for (int x = 0; x < width; x += 8)
-                    for (int iy = 0; iy < 4; iy++, src += 8)
-                        decodebytesI8_8((u32*)dst+(y+iy)*width+x, src);
-#endif
-		}
-        return PC_TEX_FMT_BGRA32;
    case GX_TF_IA4:
        {
            for (int y = 0; y < height; y += 4)
--- a/Source/Core/VideoCommon/Src/TextureDecoder.h
+++ b/Source/Core/VideoCommon/Src/TextureDecoder.h
@ -72,7 +72,8 @@ int TexDecoder_GetPaletteSize(int fmt);
 enum PC_TexFormat
 {
 	PC_TEX_FMT_NONE = 0,
-	PC_TEX_FMT_BGRA32 = 1,
+	PC_TEX_FMT_BGRA32,
+	PC_TEX_FMT_I8,
 };

 PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt);
--- a/Source/Plugins/Plugin_VideoDX9/Src/D3DTexture.cpp
+++ b/Source/Plugins/Plugin_VideoDX9/Src/D3DTexture.cpp
@ -44,11 +44,39 @@ LPDIRECT3DTEXTURE9 CreateTexture2D(const u8* buffer, const int width, const int
 	D3DLOCKED_RECT Lock;
 	pTexture->LockRect(level, &Lock, NULL, 0 );

-	u32* pIn = pBuffer;
 	switch(fmt) 
 	{
+	case D3DFMT_L8:
+	case D3DFMT_A8:
+	{
+		const u8 *pIn = buffer;
+		for (int y = 0; y < height; y++)
+		{
+			u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch));
+			memcpy(pBits, pIn, width);
+			pIn += pitch;
+		}
+	}
+	break;
+	case D3DFMT_A8L8:
+		{
+			const u8 *pIn = buffer;
+			// TODO(XK): Find a better way that does not involve either unpacking
+			//           or downsampling (i.e. A4L4)
+			for (int y = 0; y < height; y++)
+			{
+				u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch));
+				for(int i = 0; i < width * 2; i += 2) {
+					pBits[i] = pIn[i / 2];
+					pBits[i + 1] = pIn[i / 2];
+				}
+				pIn += pitch;
+			}
+		}
+		break;
 	case D3DFMT_A8R8G8B8:
 		{
+			u32* pIn = pBuffer;
 			for (int y = 0; y < height; y++)
 			{
 				u32* pBits = (u32*)((u8*)Lock.pBits + (y * Lock.Pitch));
--- a/Source/Plugins/Plugin_VideoDX9/Src/TextureCache.cpp
+++ b/Source/Plugins/Plugin_VideoDX9/Src/TextureCache.cpp
@ -176,6 +176,12 @@ void TextureCache::Load(int stage, u32 address, int width, int height, int forma
 	case PC_TEX_FMT_BGRA32:
 		d3d_fmt = D3DFMT_A8R8G8B8;
 		break;
+	/*case PC_TEX_FMT_BGRA16: is this of any use?
+		d3d_fmt = D3DFMT_A4R4G4B4;
+		break;*/
+	case PC_TEX_FMT_I8:
+		d3d_fmt = D3DFMT_A8L8;
+		break;
 	}

 	//Make an entry in the table
--- a/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.cpp
+++ b/Source/Plugins/Plugin_VideoOGL/Src/TextureMngr.cpp
@ -343,21 +343,30 @@ TextureMngr::TCacheEntry* TextureMngr::Load(int texstage, u32 address, int width
        glPixelStorei(GL_UNPACK_ROW_LENGTH, expandedWidth);

 	int gl_format;
+	int gl_iformat;
 	int gl_type;
 	switch (dfmt) {
 	case PC_TEX_FMT_NONE:
 		PanicAlert("Invalid PC texture format %i", dfmt); 
 	case PC_TEX_FMT_BGRA32:
 		gl_format = GL_BGRA;
+		gl_iformat = 4;
+		gl_type = GL_UNSIGNED_BYTE;
+		break;
+	case PC_TEX_FMT_I8:
+		gl_format = GL_LUMINANCE;
+		gl_iformat = GL_INTENSITY;
 		gl_type = GL_UNSIGNED_BYTE;
 		break;
 	}
    if (!entry.isNonPow2 && ((tm0.min_filter & 3) == 1 || (tm0.min_filter & 3) == 2)) {
-        gluBuild2DMipmaps(GL_TEXTURE_2D, 4, width, height, gl_format, gl_type, temp);
+        glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP, GL_TRUE);
+        glTexImage2D(GL_TEXTURE_2D, 0, gl_iformat, width, height, 0, gl_format, gl_type, temp);
+        glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP, GL_FALSE);
        entry.bHaveMipMaps = true;
    }
    else
-        glTexImage2D(target, 0, 4, width, height, 0, gl_format, gl_type, temp);
+        glTexImage2D(target, 0, gl_iformat, width, height, 0, gl_format, gl_type, temp);

    if (expandedWidth != width) // reset
        glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);