GSdx: optimizations here and there

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1161 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2009-05-09 08:37:02 +00:00
parent e10ca2ba49
commit 6ea395be4b
6 changed files with 238 additions and 165 deletions

View File

@ -510,7 +510,9 @@ REG64_(GIFReg, ALPHA)
UINT32 FIX:8;
UINT32 _PAD2:24;
REG_END2
__forceinline bool IsOpaque() const {return (A == B || C == 2 && FIX == 0) && D == 0 || (A == 0 && B == 2 && C == 2 && D == 2 && FIX == 0x80);} // output will be Cs/As
// opaque => output will be Cs/As

// Decides opacity from the register fields alone: either D == 0 together with A == B or a
// zero FIX coefficient (C == 2), or A == 0, B == D and a FIX coefficient of exactly 0x80.
__forceinline bool IsOpaque() const
{
	if(D == 0 && (A == B || (C == 2 && FIX == 0)))
	{
		return true;
	}

	return A == 0 && B == D && C == 2 && FIX == 0x80;
}

// Same test, but the coefficient is described by a caller-supplied [amin, amax] range
// instead of being read from C/FIX.
__forceinline bool IsOpaque(int amin, int amax) const
{
	if(D == 0 && (A == B || amax == 0))
	{
		return true;
	}

	return A == 0 && B == D && amin == 0x80 && amax == 0x80;
}
REG_END2
REG64_(GIFReg, BITBLTBUF)
@ -1061,21 +1063,41 @@ REG_SET_END
__declspec(align(16)) struct GIFPath
{
GIFTag tag;
UINT32 reg;
UINT32 nreg;
UINT32 _pad[3];
UINT32 nloop;
UINT32 adonly;
GSVector4i regs;
void SetTag(const void* mem)
{
GSVector4i v = GSVector4i::load<false>(mem);
GSVector4i::store<true>(&tag, v);
nreg = 0;
reg = 0;
regs = v.uph8(v >> 4) & 0x0f0f0f0f;
nreg = tag.NREG;
nloop = tag.NLOOP;
adonly = nreg == 1 && regs.u8[0] == GIF_REG_A_D;
}
DWORD GetReg()
__forceinline DWORD GetReg()
{
return regs.u8[nreg]; // (DWORD)GET_GIF_REG(tag, nreg);
return regs.u8[reg]; // (DWORD)GET_GIF_REG(tag, nreg);
}
// Advances to the next register slot of the current tag.
// When the low 4 bits of the incremented index equal nreg, the index wraps to 0 and one
// loop iteration is consumed. Returns false once nloop reaches zero, true otherwise.
__forceinline bool StepReg()
{
	reg++;

	if((reg & 0xf) != nreg)
	{
		return true; // still inside the current loop iteration
	}

	reg = 0;
	nloop--;

	return nloop != 0;
}
};

View File

@ -117,9 +117,74 @@ protected:
return true;
}
// Computes the range of source alpha values for the current primitive and caches it in
// m_vtrace.m_alpha (min, max, valid). Does nothing if the cached range is still valid.
//
// Working vector layout: a.x / a.z hold the min / max alpha that end up as the result,
// a.y / a.w receive the min / max alpha the texture can deliver.
void GetAlphaMinMax()
{
	if(m_vtrace.m_alpha.valid)
	{
		return;
	}

	const GSDrawingEnvironment& env = m_env;
	const GSDrawingContext* context = m_context;

	// start from the vertex alpha range taken from the vertex trace
	GSVector4i a = GSVector4i(m_vtrace.m_min.c.wwww(m_vtrace.m_max.c)) >> 7;

	if(PRIM->TME && context->TEX0.TCC)
	{
		DWORD bpp = GSLocalMemory::m_psm[context->TEX0.PSM].trbpp;

		if(bpp == 32)
		{
			// any alpha value may be stored in the texture
			a.y = 0;
			a.w = 0xff;
		}
		else if(bpp == 24)
		{
			a.y = env.TEXA.AEM ? 0 : env.TEXA.TA0;
			a.w = env.TEXA.TA0;
		}
		else if(bpp == 16)
		{
			a.y = env.TEXA.AEM ? 0 : min(env.TEXA.TA0, env.TEXA.TA1);
			a.w = max(env.TEXA.TA0, env.TEXA.TA1);
		}
		else
		{
			// remaining formats: take the range from the CLUT
			m_mem.m_clut.GetAlphaMinMax32(a.y, a.w);
		}

		switch(context->TEX0.TFX)
		{
		case TFX_MODULATE:
			a.x = (a.x * a.y) >> 7;
			a.z = (a.z * a.w) >> 7;
			if(a.x > 0xff) a.x = 0xff;
			if(a.z > 0xff) a.z = 0xff;
			break;
		case TFX_DECAL:
		case TFX_HIGHLIGHT2:
			// The result is the texture alpha alone. The inline code this function replaced in
			// TryAlphaTest kept the texture range for these two modes; leaving a.x / a.z untouched
			// here would report the vertex alpha range instead.
			a.x = a.y;
			a.z = a.w;
			break;
		case TFX_HIGHLIGHT:
			a.x = a.x + a.y;
			a.z = a.z + a.w;
			if(a.x > 0xff) a.x = 0xff;
			if(a.z > 0xff) a.z = 0xff;
			break;
		default:
			__assume(0);
		}
	}

	m_vtrace.m_alpha.min = a.x;
	m_vtrace.m_alpha.max = a.z;
	m_vtrace.m_alpha.valid = true;
}
bool TryAlphaTest(DWORD& fm, DWORD& zm)
{
const GSDrawingEnvironment& env = m_env;
const GSDrawingContext* context = m_context;
bool pass = true;
@ -130,62 +195,10 @@ protected:
}
else if(context->TEST.ATST != ATST_ALWAYS)
{
GSVector4i af = GSVector4i(m_vtrace.m_min.c.wwww(m_vtrace.m_max.c)) >> 7;
GetAlphaMinMax();
int amin, amax;
if(PRIM->TME && context->TEX0.TCC)
{
DWORD bpp = GSLocalMemory::m_psm[context->TEX0.PSM].trbpp;
DWORD cbpp = GSLocalMemory::m_psm[context->TEX0.CPSM].trbpp;
DWORD pal = GSLocalMemory::m_psm[context->TEX0.PSM].pal;
if(bpp == 32)
{
return false;
}
else if(bpp == 24)
{
amin = env.TEXA.AEM ? 0 : env.TEXA.TA0;
amax = env.TEXA.TA0;
}
else if(bpp == 16)
{
amin = env.TEXA.AEM ? 0 : min(env.TEXA.TA0, env.TEXA.TA1);
amax = max(env.TEXA.TA0, env.TEXA.TA1);
}
else
{
m_mem.m_clut.GetAlphaMinMax32(amin, amax);
}
switch(context->TEX0.TFX)
{
case TFX_MODULATE:
amin = (amin * af.x) >> 7;
amax = (amax * af.z) >> 7;
if(amin > 255) amin = 255;
if(amax > 255) amax = 255;
break;
case TFX_DECAL:
break;
case TFX_HIGHLIGHT:
amin = amin + af.x;
amax = amax + af.z;
if(amin > 255) amin = 255;
if(amax > 255) amax = 255;
break;
case TFX_HIGHLIGHT2:
break;
default:
__assume(0);
}
}
else
{
amin = af.x;
amax = af.z;
}
int amin = m_vtrace.m_alpha.min;
int amax = m_vtrace.m_alpha.max;
int aref = context->TEST.AREF;
@ -252,7 +265,7 @@ protected:
const GSDrawingEnvironment& env = m_env;
const GSDrawingContext* context = m_context;
p.vm = m_mem.m_vm32;
p.vm = m_mem.m_vm8;
p.fbo = m_mem.GetOffset(context->FRAME.Block(), context->FRAME.FBW, context->FRAME.PSM);
p.zbo = m_mem.GetOffset(context->ZBUF.Block(), context->FRAME.FBW, context->ZBUF.PSM);
@ -446,7 +459,31 @@ protected:
p.sel.datm = context->TEST.DATM;
}
if(PRIM->ABE && !context->ALPHA.IsOpaque() || PRIM->AA1)
// Range of the blend coefficient selected by ALPHA.C, passed to IsOpaque(amin, amax) below.
// Defaults to the full range when nothing more precise is known.
int amin = 0, amax = 0xff;

if(PRIM->ABE && context->ALPHA.A != context->ALPHA.B && !PRIM->AA1)
{
	if(context->ALPHA.C == 0)
	{
		// coefficient is the source alpha: use the traced range
		GetAlphaMinMax();

		amin = m_vtrace.m_alpha.min;
		amax = m_vtrace.m_alpha.max;
	}
	else if(context->ALPHA.C == 1)
	{
		// NOTE(review): presumably C == 1 selects destination alpha and fpsm == 1 is a frame
		// format without stored alpha, hence the constant 0x80 - confirm
		if(p.sel.fpsm == 1)
		{
			amin = amax = 0x80;
		}
	}
	else if(context->ALPHA.C == 2)
	{
		// Coefficient is the FIX constant. This branch previously tested C == 1 a second time
		// and could never be taken; IsOpaque() pairs FIX with C == 2.
		amin = amax = context->ALPHA.FIX;
	}
}
if(PRIM->ABE && !context->ALPHA.IsOpaque(amin, amax) || PRIM->AA1)
{
p.sel.abe = PRIM->ABE;
p.sel.ababcd = context->ALPHA.ai32[0];
@ -581,7 +618,7 @@ protected:
if(s_savez) {m_mem.SaveBMP(str, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameSize().cx, 512);}
}
if(0)//stats.ticks > 1000000)
if(0)//stats.ticks > 5000000)
{
printf("* [%I64d | %012I64x] ticks %I64d prims %d (%d) pixels %d (%d)\n",
m_perfmon.GetFrame(), p.sel.key,

View File

@ -80,7 +80,7 @@ GSState::GSState(BYTE* base, bool mt, void (*irq)())
m_sssize += sizeof(m_tr.x);
m_sssize += sizeof(m_tr.y);
m_sssize += m_mem.m_vmsize;
m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].nreg)) * 3;
m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * 3;
m_sssize += sizeof(m_q);
ASSERT(base);
@ -453,14 +453,6 @@ void GSState::GIFPackedRegHandlerA_D(GIFPackedReg* r)
(this->*m_fpGIFRegHandlers[(BYTE)r->A_D.ADDR])(&r->r);
}
void GSState::GIFPackedRegHandlerA_D(GIFPackedReg* r, int size)
{
for(int i = 0; i < size; i++)
{
(this->*m_fpGIFRegHandlers[(BYTE)r[i].A_D.ADDR])(&r[i].r);
}
}
void GSState::GIFPackedRegHandlerNOP(GIFPackedReg* r)
{
}
@ -1181,7 +1173,7 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
while(size > 0)
{
if(path.tag.NLOOP == 0)
if(path.nloop == 0)
{
path.SetTag(mem);
@ -1193,20 +1185,15 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
m_path3hack = 1;
}
if(path.tag.NLOOP > 0) // eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and values other than the EOP field are disregarded."
if(path.nloop > 0) // eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and values other than the EOP field are disregarded."
{
m_q = 1.0f;
if(path.tag.PRE)
if(path.tag.PRE && (path.tag.FLG & 2) == 0)
{
ASSERT(path.tag.FLG != GIF_FLG_IMAGE); // kingdom hearts, ffxii, tales of abyss, berserk
if((path.tag.FLG & 2) == 0)
{
GIFReg r;
r.i64 = path.tag.PRIM;
(this->*m_fpGIFRegHandlers[GIF_A_D_REG_PRIM])(&r);
}
GIFReg r;
r.i64 = path.tag.PRIM;
(this->*m_fpGIFRegHandlers[GIF_A_D_REG_PRIM])(&r);
}
}
}
@ -1218,37 +1205,44 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
// first try a shortcut for a very common case
if(path.nreg == 0 && path.tag.NREG == 1 && size >= path.tag.NLOOP && path.GetReg() == GIF_REG_A_D)
if(path.adonly && size >= path.nloop)
{
int n = path.tag.NLOOP;
size -= path.nloop;
GIFPackedRegHandlerA_D((GIFPackedReg*)mem, n);
do
{
(this->*m_fpGIFRegHandlers[(BYTE)((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r);
mem += n * sizeof(GIFPackedReg);
size -= n;
path.tag.NLOOP = 0;
mem += sizeof(GIFPackedReg);
}
while(--path.nloop > 0);
}
else
{
while(size > 0)
do
{
(this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem);
DWORD reg = path.GetReg();
size--;
mem += sizeof(GIFPackedReg);
if((++path.nreg & 0xf) == path.tag.NREG)
switch(reg)
{
path.nreg = 0;
path.tag.NLOOP--;
if(path.tag.NLOOP == 0)
{
break;
}
case GIF_REG_RGBA:
GIFPackedRegHandlerRGBA((GIFPackedReg*)mem);
break;
case GIF_REG_STQ:
GIFPackedRegHandlerSTQ((GIFPackedReg*)mem);
break;
case GIF_REG_UV:
GIFPackedRegHandlerUV((GIFPackedReg*)mem);
break;
default:
(this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem);
break;
}
mem += sizeof(GIFPackedReg);
size--;
}
while(path.StepReg() && size > 0);
}
break;
@ -1257,24 +1251,14 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
size *= 2;
while(size > 0)
do
{
(this->*m_fpGIFRegHandlers[path.GetReg()])((GIFReg*)mem);
size--;
mem += sizeof(GIFReg);
if((++path.nreg & 0xf) == path.tag.NREG)
{
path.nreg = 0;
path.tag.NLOOP--;
if(path.tag.NLOOP == 0)
{
break;
}
}
size--;
}
while(path.StepReg() && size > 0);
if(size & 1) mem += sizeof(GIFReg);
@ -1286,13 +1270,13 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
ASSERT(0);
path.tag.NLOOP = 0;
path.nloop = 0;
break;
case GIF_FLG_IMAGE:
{
int len = (int)min(size, path.tag.NLOOP);
int len = (int)min(size, path.nloop);
//ASSERT(!(len&3));
@ -1315,7 +1299,7 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
}
mem += len * 16;
path.tag.NLOOP -= len;
path.nloop -= len;
size -= len;
}
@ -1328,7 +1312,7 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
if(index == 0)
{
if(path.tag.EOP && path.tag.NLOOP == 0)
if(path.tag.EOP && path.nloop == 0)
{
break;
}
@ -1342,13 +1326,13 @@ template<int index> void GSState::Transfer(BYTE* mem, UINT32 size)
if(index == 0)
{
if(size == 0 && path.tag.NLOOP > 0)
if(size == 0 && path.nloop > 0)
{
if(m_mt)
{
// TODO
path.tag.NLOOP = 0;
path.nloop = 0;
}
else
{
@ -1433,8 +1417,11 @@ int GSState::Freeze(GSFreezeData* fd, bool sizeonly)
for(int i = 0; i < 3; i++)
{
m_path[i].tag.NREG = m_path[i].nreg;
m_path[i].tag.NLOOP = m_path[i].nloop;
WriteState(data, &m_path[i].tag);
WriteState(data, &m_path[i].nreg);
WriteState(data, &m_path[i].reg);
}
WriteState(data, &m_q);
@ -1525,7 +1512,7 @@ int GSState::Defrost(const GSFreezeData* fd)
for(int i = 0; i < 3; i++)
{
ReadState(&m_path[i].tag, data);
ReadState(&m_path[i].nreg, data);
ReadState(&m_path[i].reg, data);
m_path[i].SetTag(&m_path[i].tag); // expand regs
}

View File

@ -55,7 +55,6 @@ class GSState : public GSAlignedClass<16>
void GIFPackedRegHandlerXYZF3(GIFPackedReg* r);
void GIFPackedRegHandlerXYZ3(GIFPackedReg* r);
void GIFPackedRegHandlerA_D(GIFPackedReg* r);
void GIFPackedRegHandlerA_D(GIFPackedReg* r, int size);
void GIFPackedRegHandlerNOP(GIFPackedReg* r);
typedef void (GSState::*GIFRegHandler)(GIFReg* r);

View File

@ -283,61 +283,83 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX
DWORD blocks = 0;
for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o)
if(tw <= (bw << 6))
{
DWORD base = psm.bn(0, y, bp, bw);
for(int x = r.left; x < r.right; x += s.cx)
for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o)
{
DWORD block = base + psm.blockOffset[x >> 3];
DWORD base = psm.bn(0, y, bp, bw);
if(block >= MAX_BLOCKS)
for(int x = r.left; x < r.right; x += s.cx)
{
continue;
DWORD block = base + psm.blockOffset[x >> 3];
if(block < MAX_BLOCKS)
{
DWORD row = block >> 5;
DWORD col = 1 << (block & 31);
if((m_valid[row] & col) == 0)
{
m_valid[row] |= col;
(mem.*rtxb)(block, &dst[x * bytes], pitch, TEXA);
blocks++;
}
}
}
}
}
else
{
// unfortunately a block may be part of the same texture multiple times at different places (tw 1024 > tbw 640, between 640 -> 1024 it is repeated from the next row),
// so we can't just set the block's bit to valid in one pass, even if 99.9% of the games don't address the repeated part at the right side
DWORD row = block >> 5;
DWORD col = 1 << (block & 31);
// TODO: still bogus if those repeated parts aren't fetched together
if(m_valid[row] & col)
for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o)
{
DWORD base = psm.bn(0, y, bp, bw);
for(int x = r.left; x < r.right; x += s.cx)
{
continue;
DWORD block = base + psm.blockOffset[x >> 3];
if(block < MAX_BLOCKS)
{
DWORD row = block >> 5;
DWORD col = 1 << (block & 31);
if((m_valid[row] & col) == 0)
{
(mem.*rtxb)(block, &dst[x * bytes], pitch, TEXA);
blocks++;
}
}
}
}
// unfortunatelly a block may be part of the same texture multiple times at different places (when (1 << tw) > (tbw << 6), ex. 1024 > 640),
// so just can't set the block's bit to valid in one pass, even if 99.9% of the games don't address the repeated part at the right side
for(int y = r.top; y < r.bottom; y += s.cy)
{
DWORD base = psm.bn(0, y, bp, bw);
// TODO: still bogus if those repeated parts aren't fetched together
for(int x = r.left; x < r.right; x += s.cx)
{
DWORD block = base + psm.blockOffset[x >> 3];
// m_valid[row] |= col;
if(block < MAX_BLOCKS)
{
DWORD row = block >> 5;
DWORD col = 1 << (block & 31);
(mem.*rtxb)(block, &dst[x * bytes], pitch, TEXA);
blocks++;
m_valid[row] |= col;
}
}
}
}
m_state->m_perfmon.Put(GSPerfMon::Unswizzle, s.cx * s.cy * bytes * blocks);
for(int y = r.top; y < r.bottom; y += s.cy)
{
DWORD base = psm.bn(0, y, bp, bw);
for(int x = r.left; x < r.right; x += s.cx)
{
DWORD block = base + psm.blockOffset[x >> 3];
if(block >= MAX_BLOCKS)
{
continue;
}
DWORD row = block >> 5;
DWORD col = 1 << (block & 31);
m_valid[row] |= col;
}
}
return true;
}

View File

@ -237,6 +237,8 @@ __declspec(align(16)) class GSVertexTrace
public:
GSVertexSW m_min, m_max;
struct {int min, max; bool valid;} m_alpha; // source alpha range after tfx
union
{
DWORD value;
@ -256,9 +258,10 @@ public:
m_map.Lookup(key)(v, count, m_min, m_max);
m_eq.value = (m_min.p == m_max.p).mask() | ((m_min.t == m_max.t).mask() << 4) | ((m_min.c == m_max.c).mask() << 8);
m_alpha.valid = false;
}
/*
*/
void Update(const GSVertexSW* v, int count)
{
GSVertexSW min, max;
@ -284,5 +287,8 @@ public:
m_max = max;
m_eq.value = (min.p == max.p).mask() | ((min.t == max.t).mask() << 4) | ((min.c == max.c).mask() << 8);
m_alpha.valid = false;
}
*/
};