Committing magumagu9's work on transferring texture decoding to OGL; I also added DX9 support. Speed seems to have increased, but I'm not sure. See issue 581 for more info.
git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@2222 8ced0084-cf51-0410-be5f-012b33b47a6e
This commit is contained in:
parent
1bed914bf4
commit
d2d097fba5
|
@ -185,150 +185,6 @@ struct DXTBlock
|
|||
u8 lines[4];
|
||||
};
|
||||
|
||||
inline int expand8888(const int j)
|
||||
{
|
||||
int i = j | (j<<8);
|
||||
return i | (i<<16);
|
||||
}
|
||||
|
||||
//inline void decodebytesI4(u32 *dst, const u8 *src, int numbytes)
|
||||
inline void decodebytesI4(u32 *dst, const u8 *src)
|
||||
{
|
||||
for (int x = 0; x < 4; x++)
|
||||
{
|
||||
int val = src[x];
|
||||
*dst++ = expand8888(lut4to8[val>>4]);
|
||||
*dst++ = expand8888(lut4to8[val&15]);
|
||||
}
|
||||
}
|
||||
|
||||
inline void sseDecodebytesI4(u8* dst, const __m128i* sseSrc, int height,
|
||||
int width) {
|
||||
__m128i* sseDst;
|
||||
|
||||
#ifdef __SSSE3__
|
||||
// SSSE3 variant
|
||||
if(cpu_info.bSSSE3) {
|
||||
|
||||
// TODO(XK): Increase Loop Jump?
|
||||
|
||||
__m128i s, m[8];
|
||||
unsigned char *umask;
|
||||
|
||||
for(int i = 0; i < 8; i++) {
|
||||
umask = (unsigned char *)&(m[i]);
|
||||
for(int j = 0; j < 14; j += 4) {
|
||||
umask[j] = 0x00 + (i * 4);
|
||||
umask[j+1] = 0x01 + (i * 4);
|
||||
umask[j+2] = 0x02 + (i * 4);
|
||||
umask[j+3] = 0x03 + (i * 4);
|
||||
}
|
||||
}
|
||||
|
||||
for (int y = 0; y < height; y += 8) {
|
||||
for (int x = 0; x < width; x += 8) {
|
||||
for (int iy = 0; iy < 8; iy++, sseSrc++) {
|
||||
s = _mm_load_si128 (sseSrc);
|
||||
|
||||
// TODO: Supplemental Value Lazyness v3
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[1]));
|
||||
_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[0]));
|
||||
iy++;
|
||||
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[3]));
|
||||
_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[2]));
|
||||
iy++;
|
||||
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[5]));
|
||||
_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[4]));
|
||||
iy++;
|
||||
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, _mm_shuffle_epi8(s, m[7]));
|
||||
_mm_store_si128 (sseDst, _mm_shuffle_epi8(s, m[6]));
|
||||
iy++;
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
}
|
||||
#endif
|
||||
__m128i Lmask = _mm_set1_epi8 (0x0F);
|
||||
__m128i Hmask = _mm_set1_epi8 (0xF0);
|
||||
|
||||
for (int y = 0; y < height; y += 8) {
|
||||
for (int x = 0; x < width; x += 8) {
|
||||
for (int iy = 0; iy < 8; iy++, sseSrc++) {
|
||||
// TODO (mb2): Don't let the optimizer perform all the clean up by itself
|
||||
// (XK): Huh? What clean up? Where? Who? :)
|
||||
|
||||
__m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ...
|
||||
__m128i sl = _mm_and_si128 (s, Lmask); // 0b 0d 0f 0h ...
|
||||
__m128i sls = _mm_slli_epi16 (sl, 4); // b0 d0 f0 h0 ...
|
||||
__m128i sl_ = _mm_or_si128 (sl, sls); // bb dd ff ff ...
|
||||
|
||||
__m128i sh = _mm_and_si128 (s, Hmask); // a0 c0 e0 g0 ...
|
||||
__m128i shs = _mm_srli_epi16 (sh, 4); // 0a 0c 0e g0 ...
|
||||
__m128i sh_ = _mm_or_si128 (sh, shs); // aa cc ee gg ...
|
||||
__m128i rl = _mm_unpacklo_epi8 (sh_, sl_); // bb aa dd cc ...
|
||||
__m128i rh = _mm_unpackhi_epi8 (sh_, sl_); // ff ee hh gg ...
|
||||
|
||||
// result part a
|
||||
__m128i ral = _mm_unpacklo_epi8 (rl, rl); // bb bb aa aa ...
|
||||
__m128i rah = _mm_unpackhi_epi8 (rl, rl); // dd dd cc cc ...
|
||||
|
||||
__m128i rall = _mm_unpacklo_epi16 (ral, ral); // bb bb bb bb ... -> done
|
||||
__m128i ralh = _mm_unpackhi_epi16 (ral, ral); // aa aa aa aa ... -> done
|
||||
__m128i rahl = _mm_unpacklo_epi16 (rah, rah); // dd dd dd dd ... -> done
|
||||
__m128i rahh = _mm_unpackhi_epi16 (rah, rah); // cc cc cc cc ... -> done
|
||||
|
||||
// result part b
|
||||
__m128i rbl = _mm_unpacklo_epi8 (rh, rh); // ff ff ee ee ...
|
||||
__m128i rbh = _mm_unpackhi_epi8 (rh, rh); // hh hh gg gg ...
|
||||
|
||||
__m128i rbll = _mm_unpacklo_epi16 (rbl, rbl); // ff ff ff ff ... -> done
|
||||
__m128i rblh = _mm_unpackhi_epi16 (rbl, rbl); // ee ee ee ee ... -> done
|
||||
__m128i rbhl = _mm_unpacklo_epi16 (rbh, rbh); // hh hh hh hh ... -> done
|
||||
__m128i rbhh = _mm_unpackhi_epi16 (rbh, rbh); // gg gg gg gg ... -> done
|
||||
|
||||
// Store
|
||||
// TODO: Value lazyness
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, rall);
|
||||
_mm_store_si128 (sseDst, ralh);
|
||||
iy++;
|
||||
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, rahl);
|
||||
_mm_store_si128 (sseDst, rahh);
|
||||
iy++;
|
||||
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, rbll);
|
||||
_mm_store_si128 (sseDst, rblh);
|
||||
iy++;
|
||||
|
||||
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, rbhl);
|
||||
_mm_store_si128 (sseDst, rbhh);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
inline void decodebytesI8_8(u32 *dst, const u8 *src)
|
||||
{
|
||||
for (int x = 0; x < 8; x++)
|
||||
dst[x] = src[x] * 0x01010101; //expand8888(src[x]); *0x... may or may not be faster. not sure. Should be faster on P4 at least.
|
||||
}
|
||||
|
||||
//inline void decodebytesC4(u32 *dst, const u8 *src, int numbytes, int tlutaddr, int tlutfmt)
|
||||
inline void decodebytesC4(u32 *dst, const u8 *src, int tlutaddr, int tlutfmt)
|
||||
{
|
||||
|
@ -524,18 +380,25 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in
|
|||
return PC_TEX_FMT_BGRA32;
|
||||
case GX_TF_I4:
|
||||
{
|
||||
sseDecodebytesI4(dst, (const __m128i *)src, height,
|
||||
width);
|
||||
|
||||
/* Old non-SSE way
|
||||
for (int y = 0; y < height; y += 8)
|
||||
for (int x = 0; x < width; x += 8)
|
||||
for (int iy = 0; iy < 8; iy++, src += 4)
|
||||
//decodebytesI4((u32*)dst+(y+iy)*width+x, src, 4);
|
||||
decodebytesI4((u32*)dst+(y+iy)*width+x, src);
|
||||
*/
|
||||
for (int ix = 0; ix < 4; ix++)
|
||||
{
|
||||
int val = src[ix];
|
||||
dst[(y+iy)*width+x+ix*2] = lut4to8[val>>4];
|
||||
dst[(y+iy)*width+x+ix*2+1] = lut4to8[val&15];
|
||||
}
|
||||
return PC_TEX_FMT_BGRA32;
|
||||
}
|
||||
return PC_TEX_FMT_I8;
|
||||
case GX_TF_I8: // speed critical
|
||||
{
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 8)
|
||||
for (int iy = 0; iy < 4; iy++, src += 8)
|
||||
memcpy(dst+(y+iy)*width+x, src, 8);
|
||||
}
|
||||
return PC_TEX_FMT_I8;
|
||||
case GX_TF_C8:
|
||||
{
|
||||
for (int y = 0; y < height; y += 4)
|
||||
|
@ -545,40 +408,6 @@ PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, in
|
|||
decodebytesC8((u32*)dst+(y+iy)*width+x, src, tlutaddr, tlutfmt);
|
||||
}
|
||||
return PC_TEX_FMT_BGRA32;
|
||||
case GX_TF_I8: // speed critical
|
||||
{
|
||||
#if 1
|
||||
__m128i *sseSrc = (__m128i *)src;
|
||||
__m128i *sseDst = (__m128i *)dst;
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 8)
|
||||
for (int iy = 0; iy < 4; iy++, sseSrc++) {
|
||||
// TODO (mb2): func and don't let the optimizer perform all the clean up by itself
|
||||
__m128i s = _mm_load_si128 (sseSrc); // ab cd ef gh ...
|
||||
__m128i rl = _mm_unpacklo_epi8 (s, s); // ab ab cd cd ...
|
||||
__m128i rh = _mm_unpackhi_epi8 (s, s); //
|
||||
// result
|
||||
__m128i rll = _mm_unpacklo_epi8 (rl, rl); // ab ab ab ab
|
||||
__m128i rlh = _mm_unpackhi_epi8 (rl, rl);
|
||||
__m128i rhl = _mm_unpacklo_epi8 (rh, rh);
|
||||
__m128i rhh = _mm_unpackhi_epi8 (rh, rh);
|
||||
// store
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, rll);
|
||||
_mm_store_si128 (sseDst, rlh);
|
||||
iy++;
|
||||
sseDst = (__m128i*)(dst+((y+iy)*width+x)*4);
|
||||
_mm_store_si128 (sseDst++, rhl);
|
||||
_mm_store_si128 (sseDst, rhh);
|
||||
}
|
||||
#else
|
||||
for (int y = 0; y < height; y += 4)
|
||||
for (int x = 0; x < width; x += 8)
|
||||
for (int iy = 0; iy < 4; iy++, src += 8)
|
||||
decodebytesI8_8((u32*)dst+(y+iy)*width+x, src);
|
||||
#endif
|
||||
}
|
||||
return PC_TEX_FMT_BGRA32;
|
||||
case GX_TF_IA4:
|
||||
{
|
||||
for (int y = 0; y < height; y += 4)
|
||||
|
|
|
@ -72,7 +72,8 @@ int TexDecoder_GetPaletteSize(int fmt);
|
|||
enum PC_TexFormat
|
||||
{
|
||||
PC_TEX_FMT_NONE = 0,
|
||||
PC_TEX_FMT_BGRA32 = 1,
|
||||
PC_TEX_FMT_BGRA32,
|
||||
PC_TEX_FMT_I8,
|
||||
};
|
||||
|
||||
PC_TexFormat TexDecoder_Decode(u8 *dst, const u8 *src, int width, int height, int texformat, int tlutaddr, int tlutfmt);
|
||||
|
|
|
@ -44,11 +44,39 @@ LPDIRECT3DTEXTURE9 CreateTexture2D(const u8* buffer, const int width, const int
|
|||
D3DLOCKED_RECT Lock;
|
||||
pTexture->LockRect(level, &Lock, NULL, 0 );
|
||||
|
||||
u32* pIn = pBuffer;
|
||||
switch(fmt)
|
||||
{
|
||||
case D3DFMT_L8:
|
||||
case D3DFMT_A8:
|
||||
{
|
||||
const u8 *pIn = buffer;
|
||||
for (int y = 0; y < height; y++)
|
||||
{
|
||||
u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch));
|
||||
memcpy(pBits, pIn, width);
|
||||
pIn += pitch;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case D3DFMT_A8L8:
|
||||
{
|
||||
const u8 *pIn = buffer;
|
||||
// TODO(XK): Find a better way that does not involve either unpacking
|
||||
// or downsampling (i.e. A4L4)
|
||||
for (int y = 0; y < height; y++)
|
||||
{
|
||||
u8* pBits = ((u8*)Lock.pBits + (y * Lock.Pitch));
|
||||
for(int i = 0; i < width * 2; i += 2) {
|
||||
pBits[i] = pIn[i / 2];
|
||||
pBits[i + 1] = pIn[i / 2];
|
||||
}
|
||||
pIn += pitch;
|
||||
}
|
||||
}
|
||||
break;
|
||||
case D3DFMT_A8R8G8B8:
|
||||
{
|
||||
u32* pIn = pBuffer;
|
||||
for (int y = 0; y < height; y++)
|
||||
{
|
||||
u32* pBits = (u32*)((u8*)Lock.pBits + (y * Lock.Pitch));
|
||||
|
|
|
@ -176,6 +176,12 @@ void TextureCache::Load(int stage, u32 address, int width, int height, int forma
|
|||
case PC_TEX_FMT_BGRA32:
|
||||
d3d_fmt = D3DFMT_A8R8G8B8;
|
||||
break;
|
||||
/*case PC_TEX_FMT_BGRA16: is this of any use?
|
||||
d3d_fmt = D3DFMT_A4R4G4B4;
|
||||
break;*/
|
||||
case PC_TEX_FMT_I8:
|
||||
d3d_fmt = D3DFMT_A8L8;
|
||||
break;
|
||||
}
|
||||
|
||||
//Make an entry in the table
|
||||
|
|
|
@ -343,21 +343,30 @@ TextureMngr::TCacheEntry* TextureMngr::Load(int texstage, u32 address, int width
|
|||
glPixelStorei(GL_UNPACK_ROW_LENGTH, expandedWidth);
|
||||
|
||||
int gl_format;
|
||||
int gl_iformat;
|
||||
int gl_type;
|
||||
switch (dfmt) {
|
||||
case PC_TEX_FMT_NONE:
|
||||
PanicAlert("Invalid PC texture format %i", dfmt);
|
||||
case PC_TEX_FMT_BGRA32:
|
||||
gl_format = GL_BGRA;
|
||||
gl_iformat = 4;
|
||||
gl_type = GL_UNSIGNED_BYTE;
|
||||
break;
|
||||
case PC_TEX_FMT_I8:
|
||||
gl_format = GL_LUMINANCE;
|
||||
gl_iformat = GL_INTENSITY;
|
||||
gl_type = GL_UNSIGNED_BYTE;
|
||||
break;
|
||||
}
|
||||
if (!entry.isNonPow2 && ((tm0.min_filter & 3) == 1 || (tm0.min_filter & 3) == 2)) {
|
||||
gluBuild2DMipmaps(GL_TEXTURE_2D, 4, width, height, gl_format, gl_type, temp);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP, GL_TRUE);
|
||||
glTexImage2D(GL_TEXTURE_2D, 0, gl_iformat, width, height, 0, gl_format, gl_type, temp);
|
||||
glTexParameteri(GL_TEXTURE_2D, GL_GENERATE_MIPMAP, GL_FALSE);
|
||||
entry.bHaveMipMaps = true;
|
||||
}
|
||||
else
|
||||
glTexImage2D(target, 0, 4, width, height, 0, gl_format, gl_type, temp);
|
||||
glTexImage2D(target, 0, gl_iformat, width, height, 0, gl_format, gl_type, temp);
|
||||
|
||||
if (expandedWidth != width) // reset
|
||||
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
|
||||
|
|
Loading…
Reference in New Issue