From 6ea395be4b7bfe3a20acf4db5d608ada586b643c Mon Sep 17 00:00:00 2001 From: gabest11 Date: Sat, 9 May 2009 08:37:02 +0000 Subject: [PATCH] GSdx: optimizations here and there git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1161 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GS.h | 32 +++++- plugins/GSdx/GSRendererSW.h | 155 ++++++++++++++++++------------ plugins/GSdx/GSState.cpp | 109 +++++++++------------ plugins/GSdx/GSState.h | 1 - plugins/GSdx/GSTextureCacheSW.cpp | 98 +++++++++++-------- plugins/GSdx/GSVertexSW.h | 8 +- 6 files changed, 238 insertions(+), 165 deletions(-) diff --git a/plugins/GSdx/GS.h b/plugins/GSdx/GS.h index f670531aa4..43585a0aab 100644 --- a/plugins/GSdx/GS.h +++ b/plugins/GSdx/GS.h @@ -510,7 +510,9 @@ REG64_(GIFReg, ALPHA) UINT32 FIX:8; UINT32 _PAD2:24; REG_END2 - __forceinline bool IsOpaque() const {return (A == B || C == 2 && FIX == 0) && D == 0 || (A == 0 && B == 2 && C == 2 && D == 2 && FIX == 0x80);} // output will be Cs/As + // opaque => output will be Cs/As + __forceinline bool IsOpaque() const {return (A == B || C == 2 && FIX == 0) && D == 0 || (A == 0 && B == D && C == 2 && FIX == 0x80);} + __forceinline bool IsOpaque(int amin, int amax) const {return (A == B || amax == 0) && D == 0 || A == 0 && B == D && amin == 0x80 && amax == 0x80;} REG_END2 REG64_(GIFReg, BITBLTBUF) @@ -1061,21 +1063,41 @@ REG_SET_END __declspec(align(16)) struct GIFPath { GIFTag tag; + UINT32 reg; UINT32 nreg; - UINT32 _pad[3]; + UINT32 nloop; + UINT32 adonly; GSVector4i regs; void SetTag(const void* mem) { GSVector4i v = GSVector4i::load(mem); GSVector4i::store(&tag, v); - nreg = 0; + reg = 0; regs = v.uph8(v >> 4) & 0x0f0f0f0f; + nreg = tag.NREG; + nloop = tag.NLOOP; + adonly = nreg == 1 && regs.u8[0] == GIF_REG_A_D; } - DWORD GetReg() + __forceinline DWORD GetReg() { - return regs.u8[nreg]; // (DWORD)GET_GIF_REG(tag, nreg); + return regs.u8[reg]; // (DWORD)GET_GIF_REG(tag, nreg); + } + + __forceinline bool StepReg() + { + if((++reg & 0xf) == 
nreg) + { + reg = 0; + + if(--nloop == 0) + { + return false; + } + } + + return true; } }; diff --git a/plugins/GSdx/GSRendererSW.h b/plugins/GSdx/GSRendererSW.h index c0d2cb0889..2f540cacc6 100644 --- a/plugins/GSdx/GSRendererSW.h +++ b/plugins/GSdx/GSRendererSW.h @@ -117,9 +117,74 @@ protected: return true; } + void GetAlphaMinMax() + { + if(m_vtrace.m_alpha.valid) + { + return; + } + + const GSDrawingEnvironment& env = m_env; + const GSDrawingContext* context = m_context; + + GSVector4i a = GSVector4i(m_vtrace.m_min.c.wwww(m_vtrace.m_max.c)) >> 7; + + if(PRIM->TME && context->TEX0.TCC) + { + DWORD bpp = GSLocalMemory::m_psm[context->TEX0.PSM].trbpp; + DWORD cbpp = GSLocalMemory::m_psm[context->TEX0.CPSM].trbpp; + DWORD pal = GSLocalMemory::m_psm[context->TEX0.PSM].pal; + + if(bpp == 32) + { + a.y = 0; + a.w = 0xff; + } + else if(bpp == 24) + { + a.y = env.TEXA.AEM ? 0 : env.TEXA.TA0; + a.w = env.TEXA.TA0; + } + else if(bpp == 16) + { + a.y = env.TEXA.AEM ? 0 : min(env.TEXA.TA0, env.TEXA.TA1); + a.w = max(env.TEXA.TA0, env.TEXA.TA1); + } + else + { + m_mem.m_clut.GetAlphaMinMax32(a.y, a.w); + } + + switch(context->TEX0.TFX) + { + case TFX_MODULATE: + a.x = (a.x * a.y) >> 7; + a.z = (a.z * a.w) >> 7; + if(a.x > 0xff) a.x = 0xff; + if(a.z > 0xff) a.z = 0xff; + break; + case TFX_DECAL: + break; + case TFX_HIGHLIGHT: + a.x = a.x + a.y; + a.z = a.z + a.w; + if(a.x > 0xff) a.x = 0xff; + if(a.z > 0xff) a.z = 0xff; + break; + case TFX_HIGHLIGHT2: + break; + default: + __assume(0); + } + } + + m_vtrace.m_alpha.min = a.x; + m_vtrace.m_alpha.max = a.z; + m_vtrace.m_alpha.valid = true; + } + bool TryAlphaTest(DWORD& fm, DWORD& zm) { - const GSDrawingEnvironment& env = m_env; const GSDrawingContext* context = m_context; bool pass = true; @@ -130,62 +195,10 @@ protected: } else if(context->TEST.ATST != ATST_ALWAYS) { - GSVector4i af = GSVector4i(m_vtrace.m_min.c.wwww(m_vtrace.m_max.c)) >> 7; + GetAlphaMinMax(); - int amin, amax; - - if(PRIM->TME && context->TEX0.TCC) - { - 
DWORD bpp = GSLocalMemory::m_psm[context->TEX0.PSM].trbpp; - DWORD cbpp = GSLocalMemory::m_psm[context->TEX0.CPSM].trbpp; - DWORD pal = GSLocalMemory::m_psm[context->TEX0.PSM].pal; - - if(bpp == 32) - { - return false; - } - else if(bpp == 24) - { - amin = env.TEXA.AEM ? 0 : env.TEXA.TA0; - amax = env.TEXA.TA0; - } - else if(bpp == 16) - { - amin = env.TEXA.AEM ? 0 : min(env.TEXA.TA0, env.TEXA.TA1); - amax = max(env.TEXA.TA0, env.TEXA.TA1); - } - else - { - m_mem.m_clut.GetAlphaMinMax32(amin, amax); - } - - switch(context->TEX0.TFX) - { - case TFX_MODULATE: - amin = (amin * af.x) >> 7; - amax = (amax * af.z) >> 7; - if(amin > 255) amin = 255; - if(amax > 255) amax = 255; - break; - case TFX_DECAL: - break; - case TFX_HIGHLIGHT: - amin = amin + af.x; - amax = amax + af.z; - if(amin > 255) amin = 255; - if(amax > 255) amax = 255; - break; - case TFX_HIGHLIGHT2: - break; - default: - __assume(0); - } - } - else - { - amin = af.x; - amax = af.z; - } + int amin = m_vtrace.m_alpha.min; + int amax = m_vtrace.m_alpha.max; int aref = context->TEST.AREF; @@ -252,7 +265,7 @@ protected: const GSDrawingEnvironment& env = m_env; const GSDrawingContext* context = m_context; - p.vm = m_mem.m_vm32; + p.vm = m_mem.m_vm8; p.fbo = m_mem.GetOffset(context->FRAME.Block(), context->FRAME.FBW, context->FRAME.PSM); p.zbo = m_mem.GetOffset(context->ZBUF.Block(), context->FRAME.FBW, context->ZBUF.PSM); @@ -446,7 +459,31 @@ protected: p.sel.datm = context->TEST.DATM; } - if(PRIM->ABE && !context->ALPHA.IsOpaque() || PRIM->AA1) + int amin = 0, amax = 0xff; + + if(PRIM->ABE && context->ALPHA.A != context->ALPHA.B && !PRIM->AA1) + { + if(context->ALPHA.C == 0) + { + GetAlphaMinMax(); + + amin = m_vtrace.m_alpha.min; + amax = m_vtrace.m_alpha.max; + } + else if(context->ALPHA.C == 1) + { + if(p.sel.fpsm == 1) + { + amin = amax = 0x80; + } + } + else if(context->ALPHA.C == 2) /* was a duplicate "C == 1" test (unreachable); C == 2 is the case IsOpaque() pairs with FIX, so the alpha range collapses to that constant */ + { + amin = amax = context->ALPHA.FIX; + } + } + + if(PRIM->ABE && !context->ALPHA.IsOpaque(amin, amax) || PRIM->AA1) {
p.sel.abe = PRIM->ABE; p.sel.ababcd = context->ALPHA.ai32[0]; @@ -581,7 +618,7 @@ protected: if(s_savez) {m_mem.SaveBMP(str, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameSize().cx, 512);} } - if(0)//stats.ticks > 1000000) + if(0)//stats.ticks > 5000000) { printf("* [%I64d | %012I64x] ticks %I64d prims %d (%d) pixels %d (%d)\n", m_perfmon.GetFrame(), p.sel.key, diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index d6a9a07a30..81bba7ec3d 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -80,7 +80,7 @@ GSState::GSState(BYTE* base, bool mt, void (*irq)()) m_sssize += sizeof(m_tr.x); m_sssize += sizeof(m_tr.y); m_sssize += m_mem.m_vmsize; - m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].nreg)) * 3; + m_sssize += (sizeof(m_path[0].tag) + sizeof(m_path[0].reg)) * 3; m_sssize += sizeof(m_q); ASSERT(base); @@ -453,14 +453,6 @@ void GSState::GIFPackedRegHandlerA_D(GIFPackedReg* r) (this->*m_fpGIFRegHandlers[(BYTE)r->A_D.ADDR])(&r->r); } -void GSState::GIFPackedRegHandlerA_D(GIFPackedReg* r, int size) -{ - for(int i = 0; i < size; i++) - { - (this->*m_fpGIFRegHandlers[(BYTE)r[i].A_D.ADDR])(&r[i].r); - } -} - void GSState::GIFPackedRegHandlerNOP(GIFPackedReg* r) { } @@ -1181,7 +1173,7 @@ template void GSState::Transfer(BYTE* mem, UINT32 size) while(size > 0) { - if(path.tag.NLOOP == 0) + if(path.nloop == 0) { path.SetTag(mem); @@ -1193,20 +1185,15 @@ template void GSState::Transfer(BYTE* mem, UINT32 size) m_path3hack = 1; } - if(path.tag.NLOOP > 0) // eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and values other than the EOP field are disregarded." + if(path.nloop > 0) // eeuser 7.2.2. GIFtag: "... when NLOOP is 0, the GIF does not output anything, and values other than the EOP field are disregarded." 
{ m_q = 1.0f; - if(path.tag.PRE) + if(path.tag.PRE && (path.tag.FLG & 2) == 0) { - ASSERT(path.tag.FLG != GIF_FLG_IMAGE); // kingdom hearts, ffxii, tales of abyss, berserk - - if((path.tag.FLG & 2) == 0) - { - GIFReg r; - r.i64 = path.tag.PRIM; - (this->*m_fpGIFRegHandlers[GIF_A_D_REG_PRIM])(&r); - } + GIFReg r; + r.i64 = path.tag.PRIM; + (this->*m_fpGIFRegHandlers[GIF_A_D_REG_PRIM])(&r); } } } @@ -1218,37 +1205,44 @@ template void GSState::Transfer(BYTE* mem, UINT32 size) // first try a shortcut for a very common case - if(path.nreg == 0 && path.tag.NREG == 1 && size >= path.tag.NLOOP && path.GetReg() == GIF_REG_A_D) + if(path.adonly && size >= path.nloop) { - int n = path.tag.NLOOP; + size -= path.nloop; - GIFPackedRegHandlerA_D((GIFPackedReg*)mem, n); + do + { + (this->*m_fpGIFRegHandlers[(BYTE)((GIFPackedReg*)mem)->A_D.ADDR])(&((GIFPackedReg*)mem)->r); - mem += n * sizeof(GIFPackedReg); - size -= n; - - path.tag.NLOOP = 0; + mem += sizeof(GIFPackedReg); + } + while(--path.nloop > 0); } else { - while(size > 0) + do { - (this->*m_fpGIFPackedRegHandlers[path.GetReg()])((GIFPackedReg*)mem); + DWORD reg = path.GetReg(); - size--; - mem += sizeof(GIFPackedReg); - - if((++path.nreg & 0xf) == path.tag.NREG) + switch(reg) { - path.nreg = 0; - path.tag.NLOOP--; - - if(path.tag.NLOOP == 0) - { - break; - } + case GIF_REG_RGBA: + GIFPackedRegHandlerRGBA((GIFPackedReg*)mem); + break; + case GIF_REG_STQ: + GIFPackedRegHandlerSTQ((GIFPackedReg*)mem); + break; + case GIF_REG_UV: + GIFPackedRegHandlerUV((GIFPackedReg*)mem); + break; + default: + (this->*m_fpGIFPackedRegHandlers[reg])((GIFPackedReg*)mem); + break; } + + mem += sizeof(GIFPackedReg); + size--; } + while(path.StepReg() && size > 0); } break; @@ -1257,24 +1251,14 @@ template void GSState::Transfer(BYTE* mem, UINT32 size) size *= 2; - while(size > 0) + do { (this->*m_fpGIFRegHandlers[path.GetReg()])((GIFReg*)mem); - size--; mem += sizeof(GIFReg); - - if((++path.nreg & 0xf) == path.tag.NREG) - { - path.nreg = 0; - 
path.tag.NLOOP--; - - if(path.tag.NLOOP == 0) - { - break; - } - } + size--; } + while(path.StepReg() && size > 0); if(size & 1) mem += sizeof(GIFReg); @@ -1286,13 +1270,13 @@ template void GSState::Transfer(BYTE* mem, UINT32 size) ASSERT(0); - path.tag.NLOOP = 0; + path.nloop = 0; break; case GIF_FLG_IMAGE: { - int len = (int)min(size, path.tag.NLOOP); + int len = (int)min(size, path.nloop); //ASSERT(!(len&3)); @@ -1315,7 +1299,7 @@ template void GSState::Transfer(BYTE* mem, UINT32 size) } mem += len * 16; - path.tag.NLOOP -= len; + path.nloop -= len; size -= len; } @@ -1328,7 +1312,7 @@ template void GSState::Transfer(BYTE* mem, UINT32 size) if(index == 0) { - if(path.tag.EOP && path.tag.NLOOP == 0) + if(path.tag.EOP && path.nloop == 0) { break; } @@ -1342,13 +1326,13 @@ template void GSState::Transfer(BYTE* mem, UINT32 size) if(index == 0) { - if(size == 0 && path.tag.NLOOP > 0) + if(size == 0 && path.nloop > 0) { if(m_mt) { // TODO - path.tag.NLOOP = 0; + path.nloop = 0; } else { @@ -1433,8 +1417,11 @@ int GSState::Freeze(GSFreezeData* fd, bool sizeonly) for(int i = 0; i < 3; i++) { + m_path[i].tag.NREG = m_path[i].nreg; + m_path[i].tag.NLOOP = m_path[i].nloop; + WriteState(data, &m_path[i].tag); - WriteState(data, &m_path[i].nreg); + WriteState(data, &m_path[i].reg); } WriteState(data, &m_q); @@ -1525,7 +1512,7 @@ int GSState::Defrost(const GSFreezeData* fd) for(int i = 0; i < 3; i++) { ReadState(&m_path[i].tag, data); - ReadState(&m_path[i].nreg, data); + ReadState(&m_path[i].reg, data); m_path[i].SetTag(&m_path[i].tag); // expand regs } diff --git a/plugins/GSdx/GSState.h b/plugins/GSdx/GSState.h index 5aeb4fe202..d22a814890 100644 --- a/plugins/GSdx/GSState.h +++ b/plugins/GSdx/GSState.h @@ -55,7 +55,6 @@ class GSState : public GSAlignedClass<16> void GIFPackedRegHandlerXYZF3(GIFPackedReg* r); void GIFPackedRegHandlerXYZ3(GIFPackedReg* r); void GIFPackedRegHandlerA_D(GIFPackedReg* r); - void GIFPackedRegHandlerA_D(GIFPackedReg* r, int size); void 
GIFPackedRegHandlerNOP(GIFPackedReg* r); typedef void (GSState::*GIFRegHandler)(GIFReg* r); diff --git a/plugins/GSdx/GSTextureCacheSW.cpp b/plugins/GSdx/GSTextureCacheSW.cpp index 0344e0c2a7..9342c20dda 100644 --- a/plugins/GSdx/GSTextureCacheSW.cpp +++ b/plugins/GSdx/GSTextureCacheSW.cpp @@ -283,61 +283,83 @@ bool GSTextureCacheSW::GSTexture::Update(const GIFRegTEX0& TEX0, const GIFRegTEX DWORD blocks = 0; - for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o) + if(tw <= (bw << 6)) { - DWORD base = psm.bn(0, y, bp, bw); - - for(int x = r.left; x < r.right; x += s.cx) + for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o) { - DWORD block = base + psm.blockOffset[x >> 3]; + DWORD base = psm.bn(0, y, bp, bw); - if(block >= MAX_BLOCKS) + for(int x = r.left; x < r.right; x += s.cx) { - continue; + DWORD block = base + psm.blockOffset[x >> 3]; + + if(block < MAX_BLOCKS) + { + DWORD row = block >> 5; + DWORD col = 1 << (block & 31); + + if((m_valid[row] & col) == 0) + { + m_valid[row] |= col; + + (mem.*rtxb)(block, &dst[x * bytes], pitch, TEXA); + + blocks++; + } + } } + } + } + else + { + // unfortunatelly a block may be part of the same texture multiple times at different places (tw 1024 > tbw 640, between 640 -> 1024 it is repeated from the next row), + // so just can't set the block's bit to valid in one pass, even if 99.9% of the games don't address the repeated part at the right side + + // TODO: still bogus if those repeated parts aren't fetched together - DWORD row = block >> 5; - DWORD col = 1 << (block & 31); + for(int y = r.top, o = pitch * s.cy; y < r.bottom; y += s.cy, dst += o) + { + DWORD base = psm.bn(0, y, bp, bw); - if(m_valid[row] & col) + for(int x = r.left; x < r.right; x += s.cx) { - continue; + DWORD block = base + psm.blockOffset[x >> 3]; + + if(block < MAX_BLOCKS) + { + DWORD row = block >> 5; + DWORD col = 1 << (block & 31); + + if((m_valid[row] & col) == 0) + { + (mem.*rtxb)(block, &dst[x * bytes], pitch, 
TEXA); + + blocks++; + } + } } + } - // unfortunatelly a block may be part of the same texture multiple times at different places (when (1 << tw) > (tbw << 6), ex. 1024 > 640), - // so just can't set the block's bit to valid in one pass, even if 99.9% of the games don't address the repeated part at the right side - - // TODO: still bogus if those repeated parts aren't fetched together + for(int y = r.top; y < r.bottom; y += s.cy) + { + DWORD base = psm.bn(0, y, bp, bw); - // m_valid[row] |= col; + for(int x = r.left; x < r.right; x += s.cx) + { + DWORD block = base + psm.blockOffset[x >> 3]; - (mem.*rtxb)(block, &dst[x * bytes], pitch, TEXA); + if(block < MAX_BLOCKS) + { + DWORD row = block >> 5; + DWORD col = 1 << (block & 31); - blocks++; + m_valid[row] |= col; + } + } } } m_state->m_perfmon.Put(GSPerfMon::Unswizzle, s.cx * s.cy * bytes * blocks); - for(int y = r.top; y < r.bottom; y += s.cy) - { - DWORD base = psm.bn(0, y, bp, bw); - - for(int x = r.left; x < r.right; x += s.cx) - { - DWORD block = base + psm.blockOffset[x >> 3]; - - if(block >= MAX_BLOCKS) - { - continue; - } - - DWORD row = block >> 5; - DWORD col = 1 << (block & 31); - - m_valid[row] |= col; - } - } - return true; } diff --git a/plugins/GSdx/GSVertexSW.h b/plugins/GSdx/GSVertexSW.h index ac21c0bd6d..146aa1531b 100644 --- a/plugins/GSdx/GSVertexSW.h +++ b/plugins/GSdx/GSVertexSW.h @@ -237,6 +237,8 @@ __declspec(align(16)) class GSVertexTrace public: GSVertexSW m_min, m_max; + struct {int min, max; bool valid;} m_alpha; // source alpha range after tfx + union { DWORD value; @@ -256,9 +258,10 @@ public: m_map.Lookup(key)(v, count, m_min, m_max); m_eq.value = (m_min.p == m_max.p).mask() | ((m_min.t == m_max.t).mask() << 4) | ((m_min.c == m_max.c).mask() << 8); + + m_alpha.valid = false; } /* -*/ void Update(const GSVertexSW* v, int count) { GSVertexSW min, max; @@ -284,5 +287,8 @@ public: m_max = max; m_eq.value = (min.p == max.p).mask() | ((min.t == max.t).mask() << 4) | ((min.c == max.c).mask() 
<< 8); + + m_alpha.valid = false; } +*/ };