GSdx: optimizations here and there

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1161 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2009-05-09 08:37:02 +00:00
parent e10ca2ba49
commit 6ea395be4b
6 changed files with 238 additions and 165 deletions

View File

@ -510,7 +510,9 @@ REG64_(GIFReg, ALPHA)
UINT32 FIX:8; UINT32 FIX:8;
UINT32 _PAD2:24; UINT32 _PAD2:24;
REG_END2 REG_END2
__forceinline bool IsOpaque() const {return (A == B || C == 2 && FIX == 0) && D == 0 || (A == 0 && B == 2 && C == 2 && D == 2 && FIX == 0x80);} // output will be Cs/As // opaque => output will be Cs/As
// Reports whether this ALPHA setting counts as opaque, judged from the register fields alone.
// Two field combinations qualify:
//   1) D == 0 together with either A == B, or C == 2 with FIX == 0;
//   2) A == 0, B == D, C == 2 and FIX == 0x80.
__forceinline bool IsOpaque() const
{
	if(D == 0 && (A == B || (C == 2 && FIX == 0)))
	{
		return true;
	}

	return A == 0 && B == D && C == 2 && FIX == 0x80;
}
// Variant of IsOpaque() that takes a caller-supplied alpha range [amin, amax]
// instead of reading the C and FIX fields.
// Two combinations qualify:
//   1) D == 0 together with either A == B or amax == 0;
//   2) A == 0, B == D and the range collapsed to exactly 0x80 (amin == amax == 0x80).
__forceinline bool IsOpaque(int amin, int amax) const
{
	const bool first = (A == B || amax == 0) && D == 0;
	const bool second = A == 0 && B == D && amin == 0x80 && amax == 0x80;

	return first || second;
}
REG_END2 REG_END2
REG64_(GIFReg, BITBLTBUF) REG64_(GIFReg, BITBLTBUF)
@ -1061,21 +1063,41 @@ REG_SET_END
__declspec(align(16)) struct GIFPath __declspec(align(16)) struct GIFPath
{ {
GIFTag tag; GIFTag tag;
UINT32 reg;
UINT32 nreg; UINT32 nreg;
UINT32 _pad[3]; UINT32 nloop;
UINT32 adonly;
GSVector4i regs; GSVector4i regs;
void SetTag(const void* mem) void SetTag(const void* mem)
{ {
GSVector4i v = GSVector4i::load<false>(mem); GSVector4i v = GSVector4i::load<false>(mem);
GSVector4i::store<true>(&tag, v); GSVector4i::store<true>(&tag, v);
nreg = 0; reg = 0;
regs = v.uph8(v >> 4) & 0x0f0f0f0f; regs = v.uph8(v >> 4) & 0x0f0f0f0f;
nreg = tag.NREG;
nloop = tag.NLOOP;
adonly = nreg == 1 && regs.u8[0] == GIF_REG_A_D;
} }
DWORD GetReg() __forceinline DWORD GetReg()
{ {
return regs.u8[nreg]; // (DWORD)GET_GIF_REG(tag, nreg); return regs.u8[reg]; // (DWORD)GET_GIF_REG(tag, nreg);
}
__forceinline bool StepReg()
{
if((++reg & 0xf) == nreg)
{
reg = 0;
if(--nloop == 0)
{
return false;
}
}
return true;
} }
}; };

View File

@ -117,9 +117,74 @@ protected:
return true; return true;
} }
// Computes the min/max source alpha of the traced vertices after the texture function has been
// applied, and caches the result in m_vtrace.m_alpha (min, max, valid).
// Returns immediately if the cached range is already valid.
void GetAlphaMinMax()
{
	if(m_vtrace.m_alpha.valid)
	{
		return;
	}

	const GSDrawingEnvironment& env = m_env;
	const GSDrawingContext* context = m_context;

	// a.x / a.z: vertex alpha min / max, and also the lanes that carry the final result.
	// a.y / a.w: overwritten below with the texture alpha min / max when texturing is on.
	// NOTE(review): assumes wwww(max) yields (min.w, min.w, max.w, max.w) and that the
	// colour is stored scaled by 1 << 7 - confirm against GSVector4.
	GSVector4i a = GSVector4i(m_vtrace.m_min.c.wwww(m_vtrace.m_max.c)) >> 7;

	if(PRIM->TME && context->TEX0.TCC)
	{
		DWORD bpp = GSLocalMemory::m_psm[context->TEX0.PSM].trbpp;

		if(bpp == 32)
		{
			// nothing is known about the texels, assume the full range
			a.y = 0;
			a.w = 0xff;
		}
		else if(bpp == 24)
		{
			a.y = env.TEXA.AEM ? 0 : env.TEXA.TA0;
			a.w = env.TEXA.TA0;
		}
		else if(bpp == 16)
		{
			a.y = env.TEXA.AEM ? 0 : min(env.TEXA.TA0, env.TEXA.TA1);
			a.w = max(env.TEXA.TA0, env.TEXA.TA1);
		}
		else
		{
			// remaining formats: take the range from the CLUT
			m_mem.m_clut.GetAlphaMinMax32(a.y, a.w);
		}

		switch(context->TEX0.TFX)
		{
		case TFX_MODULATE:
			a.x = (a.x * a.y) >> 7;
			a.z = (a.z * a.w) >> 7;
			if(a.x > 0xff) a.x = 0xff;
			if(a.z > 0xff) a.z = 0xff;
			break;
		case TFX_DECAL:
			// the texture alpha is the result; the vertex alpha must not leak through
			a.x = a.y;
			a.z = a.w;
			break;
		case TFX_HIGHLIGHT:
			a.x = a.x + a.y;
			a.z = a.z + a.w;
			if(a.x > 0xff) a.x = 0xff;
			if(a.z > 0xff) a.z = 0xff;
			break;
		case TFX_HIGHLIGHT2:
			// same as decal: the texture alpha is the result
			a.x = a.y;
			a.z = a.w;
			break;
		default:
			__assume(0);
		}
	}

	m_vtrace.m_alpha.min = a.x;
	m_vtrace.m_alpha.max = a.z;
	m_vtrace.m_alpha.valid = true;
}
bool TryAlphaTest(DWORD& fm, DWORD& zm) bool TryAlphaTest(DWORD& fm, DWORD& zm)
{ {
const GSDrawingEnvironment& env = m_env;
const GSDrawingContext* context = m_context; const GSDrawingContext* context = m_context;
bool pass = true; bool pass = true;
@ -130,62 +195,10 @@ protected:
} }
else if(context->TEST.ATST != ATST_ALWAYS) else if(context->TEST.ATST != ATST_ALWAYS)
{ {
GSVector4i af = GSVector4i(m_vtrace.m_min.c.wwww(m_vtrace.m_max.c)) >> 7; GetAlphaMinMax();
int amin, amax; int amin = m_vtrace.m_alpha.min;
int amax = m_vtrace.m_alpha.max;
if(PRIM->TME && context->TEX0.TCC)
{
DWORD bpp = GSLocalMemory::m_psm[context->TEX0.PSM].trbpp;
DWORD cbpp = GSLocalMemory::m_psm[context->TEX0.CPSM].trbpp;
DWORD pal = GSLocalMemory::m_psm[context->TEX0.PSM].pal;
if(bpp == 32)
{
return false;
}
else if(bpp == 24)
{
amin = env.TEXA.AEM ? 0 : env.TEXA.TA0;
amax = env.TEXA.TA0;
}
else if(bpp == 16)
{
amin = env.TEXA.AEM ? 0 : min(env.TEXA.TA0, env.TEXA.TA1);
amax = max(env.TEXA.TA0, env.TEXA.TA1);
}
else
{
m_mem.m_clut.GetAlphaMinMax32(amin, amax);
}
switch(context->TEX0.TFX)
{
case TFX_MODULATE:
amin = (amin * af.x) >> 7;
amax = (amax * af.z) >> 7;
if(amin > 255) amin = 255;
if(amax > 255) amax = 255;
break;
case TFX_DECAL:
break;
case TFX_HIGHLIGHT:
amin = amin + af.x;
amax = amax + af.z;
if(amin > 255) amin = 255;
if(amax > 255) amax = 255;
break;
case TFX_HIGHLIGHT2:
break;
default:
__assume(0);
}
}
else
{
amin = af.x;
amax = af.z;
}
int aref = context->TEST.AREF; int aref = context->TEST.AREF;
@ -252,7 +265,7 @@ protected:
const GSDrawingEnvironment& env = m_env; const GSDrawingEnvironment& env = m_env;
const GSDrawingContext* context = m_context; const GSDrawingContext* context = m_context;
p.vm = m_mem.m_vm32; p.vm = m_mem.m_vm8;
p.fbo = m_mem.GetOffset(context->FRAME.Block(), context->FRAME.FBW, context->FRAME.PSM); p.fbo = m_mem.GetOffset(context->FRAME.Block(), context->FRAME.FBW, context->FRAME.PSM);
p.zbo = m_mem.GetOffset(context->ZBUF.Block(), context->FRAME.FBW, context->ZBUF.PSM); p.zbo = m_mem.GetOffset(context->ZBUF.Block(), context->FRAME.FBW, context->ZBUF.PSM);
@ -446,7 +459,31 @@ protected:
p.sel.datm = context->TEST.DATM; p.sel.datm = context->TEST.DATM;
} }
if(PRIM->ABE && !context->ALPHA.IsOpaque() || PRIM->AA1) int amin = 0, amax = 0xff;
// Narrow [amin, amax] to the range of the blend coefficient selected by ALPHA.C, so that
// IsOpaque(amin, amax) can detect more cases. Left at the caller's defaults when nothing is known.
if(PRIM->ABE && context->ALPHA.A != context->ALPHA.B && !PRIM->AA1)
{
	if(context->ALPHA.C == 0)
	{
		// range of the source alpha after the texture function
		GetAlphaMinMax();

		amin = m_vtrace.m_alpha.min;
		amax = m_vtrace.m_alpha.max;
	}
	else if(context->ALPHA.C == 1)
	{
		// NOTE(review): presumably the destination alpha case, where frame format 1
		// stores no alpha and always reads back 0x80 - confirm.
		if(p.sel.fpsm == 1)
		{
			amin = amax = 0x80;
		}
	}
	else if(context->ALPHA.C == 2)
	{
		// fixed coefficient; this branch used to test C == 1 again and was unreachable
		amin = amax = context->ALPHA.FIX;
	}
}
if(PRIM->ABE && !context->ALPHA.IsOpaque(amin, amax) || PRIM->AA1)
{ {
p.sel.abe = PRIM->ABE; p.sel.abe = PRIM->ABE;
p.sel.ababcd = context->ALPHA.ai32[0]; p.sel.ababcd = context->ALPHA.ai32[0];
@ -581,7 +618,7 @@ protected:
if(s_savez) {m_mem.SaveBMP(str, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameSize().cx, 512);} if(s_savez) {m_mem.SaveBMP(str, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameSize().cx, 512);}
} }
if(0)//stats.ticks > 1000000) if(0)//stats.ticks > 5000000)
{ {
printf("* [%I64d | %012I64x] ticks %I64d prims %d (%d) pixels %d (%d)\n", printf("* [%I64d | %012I64x] ticks %I64d prims %d (%d) pixels %d (%d)\n",
m_perfmon.GetFrame(), p.sel.key, m_perfmon.GetFrame(), p.sel.key,

View File

@ -80,7 +80,7 @@ GSState::GSState(BYTE* base, bool mt, void (*irq)())
m_sssize += sizeof(m_tr.x); m_sssize += sizeof(m_tr.x);
m_sssize += sizeof(m_tr.y); m_sssize += sizeof(m_tr.y);
m_sssize += m_mem.m_vmsize; m_sssize += m_mem.m_vmsize;
m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].nreg)) * 3; m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * 3;
m_sssize += sizeof(m_q); m_sssize += sizeof(m_q);
ASSERT(base); ASSERT(base);
@ -453,14 +453,6 @@ void GSState::GIFPackedRegHandlerA_D(GIFPackedReg* r)
(this->*m_fpGIFRegHandlers[(BYTE)r->A_D.ADDR])(&r->r); (this->*m_fpGIFRegHandlers[(BYTE)r->A_D.ADDR])(&r->r);
} }
void GSState::GIFPackedRegHandlerA_D(GIFPackedReg* r, int size)
{
for(int i = 0; i < size; i++)
{
(this->*m_fpGIFRegHandlers[(BYTE)r[i].A_D.ADDR])(&r[i].r);
}
}
void GSState::GIFPackedRegHandlerNOP(GIFPackedReg* r) void GSState::GIFPackedRegHandlerNOP(GIFPackedReg* r)
{ {
} }
@ -1181,7 +1173,7 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
while(size > 0) while(size > 0)
{ {
if(path.tag.NLOOP == 0) if(path.nloop == 0)
{ {
path.SetTag(mem); path.SetTag(mem);
@ -1193,15 +1185,11 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
m_path3hack = 1; m_path3hack = 1;
} }
if(path.tag.NLOOP > 0) // eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and values other than the EOP field are disregarded." if(path.nloop > 0) // eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and values other than the EOP field are disregarded."
{ {
m_q = 1.0f; m_q = 1.0f;
if(path.tag.PRE) if(path.tag.PRE && (path.tag.FLG & 2) == 0)
{
ASSERT(path.tag.FLG != GIF_FLG_IMAGE); // kingdom hearts, ffxii, tales of abyss, berserk
if((path.tag.FLG & 2) == 0)
{ {
GIFReg r; GIFReg r;
r.i64 = path.tag.PRIM; r.i64 = path.tag.PRIM;
@ -1209,7 +1197,6 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
} }
} }
} }
}
else else
{ {
switch(path.tag.FLG) switch(path.tag.FLG)
@ -1218,37 +1205,44 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
// first try a shortcut for a very common case // first try a shortcut for a very common case
if(path.nreg == 0 && path.tag.NREG == 1 && size >= path.tag.NLOOP && path.GetReg() == GIF_REG_A_D) if(path.adonly && size >= path.nloop)
{ {
int n = path.tag.NLOOP; size -= path.nloop;
GIFPackedRegHandlerA_D((GIFPackedReg*)mem, n); do
{
(this->*m_fpGIFRegHandlers[(BYTE)((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r);
mem += n * sizeof(GIFPackedReg); mem += sizeof(GIFPackedReg);
size -= n; }
while(--path.nloop > 0);
path.tag.NLOOP = 0;
} }
else else
{ {
while(size > 0) do
{ {
(this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem); DWORD reg = path.GetReg();
size--; switch(reg)
mem += sizeof(GIFPackedReg);
if((++path.nreg & 0xf) == path.tag.NREG)
{
path.nreg = 0;
path.tag.NLOOP--;
if(path.tag.NLOOP == 0)
{ {
case GIF_REG_RGBA:
GIFPackedRegHandlerRGBA((GIFPackedReg*)mem);
break;
case GIF_REG_STQ:
GIFPackedRegHandlerSTQ((GIFPackedReg*)mem);
break;
case GIF_REG_UV:
GIFPackedRegHandlerUV((GIFPackedReg*)mem);
break;
default:
(this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem);
break; break;
} }
mem += sizeof(GIFPackedReg);
size--;
} }
} while(path.StepReg() && size > 0);
} }
break; break;
@ -1257,24 +1251,14 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
size *= 2; size *= 2;
while(size > 0) do
{ {
(this->*m_fpGIFRegHandlers[path.GetReg()])((GIFReg*)mem); (this->*m_fpGIFRegHandlers[path.GetReg()])((GIFReg*)mem);
size--;
mem += sizeof(GIFReg); mem += sizeof(GIFReg);
size--;
if((++path.nreg & 0xf) == path.tag.NREG)
{
path.nreg = 0;
path.tag.NLOOP--;
if(path.tag.NLOOP == 0)
{
break;
}
}
} }
while(path.StepReg() && size > 0);
if(size & 1) mem += sizeof(GIFReg); if(size & 1) mem += sizeof(GIFReg);
@ -1286,13 +1270,13 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
ASSERT(0); ASSERT(0);
path.tag.NLOOP = 0; path.nloop = 0;
break; break;
case GIF_FLG_IMAGE: case GIF_FLG_IMAGE:
{ {
int len = (int)min(size, path.tag.NLOOP); int len = (int)min(size, path.nloop);
//ASSERT(!(len&3)); //ASSERT(!(len&3));
@ -1315,7 +1299,7 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
} }
mem += len * 16; mem += len * 16;
path.tag.NLOOP -= len; path.nloop -= len;
size -= len; size -= len;
} }
@ -1328,7 +1312,7 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
if(index == 0) if(index == 0)
{ {
if(path.tag.EOP && path.tag.NLOOP == 0) if(path.tag.EOP && path.nloop == 0)
{ {
break; break;
} }
@ -1342,13 +1326,13 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
if(index == 0) if(index == 0)
{ {
if(size == 0 && path.tag.NLOOP > 0) if(size == 0 && path.nloop > 0)
{ {
if(m_mt) if(m_mt)
{ {
// TODO // TODO
path.tag.NLOOP = 0; path.nloop = 0;
} }
else else
{ {
@ -1433,8 +1417,11 @@ int GSState::Freeze(GSFreezeData* fd, bool sizeonly)
for(int i = 0; i < 3; i++) for(int i = 0; i < 3; i++)
{ {
m_path[i].tag.NREG = m_path[i].nreg;
m_path[i].tag.NLOOP = m_path[i].nloop;
WriteState(data, &m_path[i].tag); WriteState(data, &m_path[i].tag);
WriteState(data, &m_path[i].nreg); WriteState(data, &m_path[i].reg);
} }
WriteState(data, &m_q); WriteState(data, &m_q);
@ -1525,7 +1512,7 @@ int GSState::Defrost(const GSFreezeData* fd)
for(int i = 0; i < 3; i++) for(int i = 0; i < 3; i++)
{ {
ReadState(&m_path[i].tag, data); ReadState(&m_path[i].tag, data);
ReadState(&m_path[i].nreg, data); ReadState(&m_path[i].reg, data);
m_path[i].SetTag(&m_path[i].tag); // expand regs m_path[i].SetTag(&m_path[i].tag); // expand regs
} }

View File

@ -55,7 +55,6 @@ class GSState : public GSAlignedClass<16>
void GIFPackedRegHandlerXYZF3(GIFPackedReg* r); void GIFPackedRegHandlerXYZF3(GIFPackedReg* r);
void GIFPackedRegHandlerXYZ3(GIFPackedReg* r); void GIFPackedRegHandlerXYZ3(GIFPackedReg* r);
void GIFPackedRegHandlerA_D(GIFPackedReg* r); void GIFPackedRegHandlerA_D(GIFPackedReg* r);
void GIFPackedRegHandlerA_D(GIFPackedReg* r, int size);
void GIFPackedRegHandlerNOP(GIFPackedReg* r); void GIFPackedRegHandlerNOP(GIFPackedReg* r);
typedef void (GSState::*GIFRegHandler)(GIFReg* r); typedef void (GSState::*GIFRegHandler)(GIFReg* r);

View File

@ -283,6 +283,8 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
DWORD blocks = 0; DWORD blocks = 0;
if(tw <= (bw << 6))
{
for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o) for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o)
{ {
DWORD base = psm.bn(0, y, bp, bw); DWORD base = psm.bn(0, y, bp, bw);
@ -291,33 +293,52 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
{ {
DWORD block = base + psm.blockOffset[x >> 3]; DWORD block = base + psm.blockOffset[x >> 3];
if(block >= MAX_BLOCKS) if(block < MAX_BLOCKS)
{ {
continue;
}
DWORD row = block >> 5; DWORD row = block >> 5;
DWORD col = 1 << (block & 31); DWORD col = 1 << (block & 31);
if(m_valid[row] & col) if((m_valid[row] & col) == 0)
{ {
continue; m_valid[row] |= col;
}
// unfortunatelly a block may be part of the same texture multiple times at different places (when (1 << tw) > (tbw << 6), ex. 1024 > 640),
// so just can't set the block's bit to valid in one pass, even if 99.9% of the games don't address the repeated part at the right side
// TODO: still bogus if those repeated parts aren't fetched together
// m_valid[row] |= col;
(mem.*rtxb)(block, &dst[x * bytes], pitch, TEXA); (mem.*rtxb)(block, &dst[x * bytes], pitch, TEXA);
blocks++; blocks++;
} }
} }
}
}
}
else
{
// unfortunately a block may be part of the same texture multiple times at different places (tw 1024 > tbw 640, between 640 -> 1024 it is repeated from the next row),
// so we can't just set the block's bit to valid in one pass, even if 99.9% of the games don't address the repeated part at the right side
m_state->m_perfmon.Put(GSPerfMon::Unswizzle, s.cx * s.cy * bytes * blocks); // TODO: still bogus if those repeated parts aren't fetched together
for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o)
{
DWORD base = psm.bn(0, y, bp, bw);
for(int x = r.left; x < r.right; x += s.cx)
{
DWORD block = base + psm.blockOffset[x >> 3];
if(block < MAX_BLOCKS)
{
DWORD row = block >> 5;
DWORD col = 1 << (block & 31);
if((m_valid[row] & col) == 0)
{
(mem.*rtxb)(block, &dst[x * bytes], pitch, TEXA);
blocks++;
}
}
}
}
for(int y = r.top; y < r.bottom; y += s.cy) for(int y = r.top; y < r.bottom; y += s.cy)
{ {
@ -327,17 +348,18 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
{ {
DWORD block = base + psm.blockOffset[x >> 3]; DWORD block = base + psm.blockOffset[x >> 3];
if(block >= MAX_BLOCKS) if(block < MAX_BLOCKS)
{ {
continue;
}
DWORD row = block >> 5; DWORD row = block >> 5;
DWORD col = 1 << (block & 31); DWORD col = 1 << (block & 31);
m_valid[row] |= col; m_valid[row] |= col;
} }
} }
}
}
m_state->m_perfmon.Put(GSPerfMon::Unswizzle, s.cx * s.cy * bytes * blocks);
return true; return true;
} }

View File

@ -237,6 +237,8 @@ __declspec(align(16)) class GSVertexTrace
public: public:
GSVertexSW m_min, m_max; GSVertexSW m_min, m_max;
struct {int min, max; bool valid;} m_alpha; // source alpha range after tfx
union union
{ {
DWORD value; DWORD value;
@ -256,9 +258,10 @@ public:
m_map.Lookup(key)(v, count, m_min, m_max); m_map.Lookup(key)(v, count, m_min, m_max);
m_eq.value = (m_min.p == m_max.p).mask() | ((m_min.t == m_max.t).mask() << 4) | ((m_min.c == m_max.c).mask() << 8); m_eq.value = (m_min.p == m_max.p).mask() | ((m_min.t == m_max.t).mask() << 4) | ((m_min.c == m_max.c).mask() << 8);
m_alpha.valid = false;
} }
/* /*
*/
void Update(const GSVertexSW* v, int count) void Update(const GSVertexSW* v, int count)
{ {
GSVertexSW min, max; GSVertexSW min, max;
@ -284,5 +287,8 @@ public:
m_max = max; m_max = max;
m_eq.value = (min.p == max.p).mask() | ((min.t == max.t).mask() << 4) | ((min.c == max.c).mask() << 8); m_eq.value = (min.p == max.p).mask() | ((min.t == max.t).mask() << 4) | ((min.c == max.c).mask() << 8);
m_alpha.valid = false;
} }
*/
}; };