GSdx: Small optimizations here and there; just saving changes before trying to add an index buffer, which might help reduce the load on the main GS thread a bit. That's where I think the bottleneck currently is in games with high polygon counts.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5036 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-12-31 15:41:07 +00:00
parent b97dff6c89
commit 5325f9b490
9 changed files with 144 additions and 78 deletions

View File

@ -449,7 +449,7 @@ GSLocalMemory::~GSLocalMemory()
for_each(m_omap.begin(), m_omap.end(), aligned_free_second());
for_each(m_po4map.begin(), m_po4map.end(), aligned_free_second());
for(hash_map<uint32, list<GSVector2i>*>::iterator i = m_p2tmap.begin(); i != m_p2tmap.end(); i++)
for(hash_map<uint64, vector<GSVector2i>*>::iterator i = m_p2tmap.begin(); i != m_p2tmap.end(); i++)
{
delete [] i->second;
}
@ -526,11 +526,11 @@ GSPixelOffset4* GSLocalMemory::GetPixelOffset4(const GIFRegFRAME& FRAME, const G
// Strict-weak-ordering predicate for std::sort: orders GSVector2i entries by ascending x only (y is ignored).
static bool cmp_vec2x(const GSVector2i& a, const GSVector2i& b)
{
	const bool a_before_b = b.x > a.x;

	return a_before_b;
}
list<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)
vector<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)
{
uint32 hash = TEX0.TBP0 | (TEX0.TBW << 14) | (TEX0.PSM << 20) | (TEX0.TW << 26);
uint64 hash = TEX0.u64 & 0x3ffffffffull; // TBP0 TBW PSM TW TH
hash_map<uint32, list<GSVector2i>*>::iterator i = m_p2tmap.find(hash);
hash_map<uint64, vector<GSVector2i>*>::iterator i = m_p2tmap.find(hash);
if(i != m_p2tmap.end())
{
@ -540,13 +540,13 @@ list<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)
GSVector2i bs = m_psm[TEX0.PSM].bs;
int tw = std::max<int>(1 << TEX0.TW, bs.x);
// int th = std::max<int>(1 << TEX0.TH, bs.y);
int th = std::max<int>(1 << TEX0.TH, bs.y);
const GSOffset* o = GetOffset(TEX0.TBP0, TEX0.TBW, TEX0.PSM);
hash_map<uint32, hash_set<uint32> > tmp; // key = page, value = y:x, 7 bits each, max 128x128 tiles for the worst case (1024x1024 32bpp 8x8 blocks)
for(int y = 0; y < 1024; y += bs.y) // the hash is a little short on bits for TEX0.TH, hard-coding it to 1024 lines
for(int y = 0; y < th; y += bs.y)
{
uint32 base = o->block.row[y >> 3];
@ -563,7 +563,7 @@ list<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)
// combine the lower 5 bits of the address into a 9:5 pointer:mask form, so the "valid bits" can be tested against an uint32 array
list<GSVector2i>* p2t = new list<GSVector2i>[MAX_PAGES];
vector<GSVector2i>* p2t = new vector<GSVector2i>[MAX_PAGES];
for(hash_map<uint32, hash_set<uint32> >::iterator i = tmp.begin(); i != tmp.end(); i++)
{
@ -594,16 +594,12 @@ list<GSVector2i>* GSLocalMemory::GetPage2TileMap(const GIFRegTEX0& TEX0)
// sort by x and flip the mask (it will be used to erase a lot of bits in a loop, [x] &= ~y)
vector<GSVector2i> tmp;
for(hash_map<uint32, uint32>::iterator j = m.begin(); j != m.end(); j++)
{
tmp.push_back(GSVector2i(j->first, ~j->second));
p2t[page].push_back(GSVector2i(j->first, ~j->second));
}
std::sort(tmp.begin(), tmp.end(), cmp_vec2x);
p2t[page].insert(p2t[page].end(), tmp.begin(), tmp.end());
std::sort(p2t[page].begin(), p2t[page].end(), cmp_vec2x);
}
m_p2tmap[hash] = p2t;

View File

@ -156,7 +156,7 @@ protected:
hash_map<uint32, GSOffset*> m_omap;
hash_map<uint32, GSPixelOffset4*> m_po4map;
hash_map<uint32, list<GSVector2i>*> m_p2tmap;
hash_map<uint64, vector<GSVector2i>*> m_p2tmap;
public:
GSLocalMemory();
@ -164,7 +164,7 @@ public:
GSOffset* GetOffset(uint32 bp, uint32 bw, uint32 psm);
GSPixelOffset4* GetPixelOffset4(const GIFRegFRAME& FRAME, const GIFRegZBUF& ZBUF);
list<GSVector2i>* GetPage2TileMap(const GIFRegTEX0& TEX0);
vector<GSVector2i>* GetPage2TileMap(const GIFRegTEX0& TEX0);
// address

View File

@ -115,7 +115,8 @@ void GSRasterizer::Draw(shared_ptr<GSRasterizerData> data)
bool scissor_test = !data->bbox.eq(data->bbox.rintersect(data->scissor));
m_scissor = data->scissor;
m_fscissor = GSVector4(data->scissor);
m_fscissor_x = GSVector4(data->scissor).xzxz();
m_fscissor_y = GSVector4(data->scissor).ywyw();
m_pixels = 0;
@ -163,7 +164,7 @@ void GSRasterizer::Draw(shared_ptr<GSRasterizerData> data)
uint64 ticks = __rdtsc() - start;
_InterlockedExchangeAdd(&data->ticks, ticks);
_InterlockedExchangeAdd(&data->ticks, (long)ticks);
_InterlockedExchangeAdd(&data->pixels, m_pixels);
m_ds->EndDraw(data->frame, ticks, m_pixels);
@ -228,11 +229,9 @@ void GSRasterizer::DrawLine(const GSVertexSW* v)
if(m_scissor.top <= p.y && p.y < m_scissor.bottom && IsOneOfMyScanlines(p.y))
{
GSVector4 scissor = m_fscissor.xzxz();
GSVector4 lrf = scan.p.upl(v[1].p.blend32(v[0].p, mask)).ceil();
GSVector4 l = lrf.max(scissor);
GSVector4 r = lrf.min(scissor);
GSVector4 l = lrf.max(m_fscissor_x);
GSVector4 r = lrf.min(m_fscissor_x);
GSVector4i lr = GSVector4i(l.xxyy(r));
int left = lr.extract32<0>();
@ -333,8 +332,8 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
if(i == 7) return; // y0 == y1 == y2
GSVector4 tbf = y0011.xzxz(y1221).ceil();
GSVector4 tbmax = tbf.max(m_fscissor.ywyw());
GSVector4 tbmin = tbf.min(m_fscissor.ywyw());
GSVector4 tbmax = tbf.max(m_fscissor_y);
GSVector4 tbmin = tbf.min(m_fscissor_y);
GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin));
dv[0] = v[1] - v[0];
@ -452,7 +451,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& edge, co
GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
GSVector4 scissor = m_fscissor.xzxz();
GSVector4 scissor = m_fscissor_x;
top = FindMyNextScanline(top);
@ -516,6 +515,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, bool solidrect)
if(solidrect)
{
/*
if(m_id == 0)
{
m_ds->DrawRect(r, scan);
@ -523,6 +523,33 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, bool solidrect)
m_pixels += r.width() * r.height();
}
return;
*/
if(m_threads == 1)
{
m_ds->DrawRect(r, scan);
m_pixels += r.width() * r.height();
}
else
{
int top = FindMyNextScanline(r.top);
int bottom = r.bottom;
while(top < bottom)
{
r.top = top;
r.bottom = std::min<int>((top + (1 << THREAD_HEIGHT)) & ~((1 << THREAD_HEIGHT) - 1), bottom);
m_ds->DrawRect(r, scan);
m_pixels += r.width() * r.height();
top = r.bottom + ((m_threads - 1) << THREAD_HEIGHT);
}
}
return;
}
@ -575,13 +602,12 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
GSVertexSW* RESTRICT e = &m_edge.buff[m_edge.count];
GSVector4 lrtb = v0.p.upl(v1.p).ceil();
if(orientation)
{
GSVector4 tbmax = lrtb.max(m_fscissor.yyyy());
GSVector4 tbmin = lrtb.min(m_fscissor.wwww());
GSVector4i tb = GSVector4i(tbmax.zwzw(tbmin));
GSVector4 tbf = v0.p.yyyy(v1.p).ceil(); // t t b b
GSVector4 tbmax = tbf.max(m_fscissor_y); // max(t, st) max(t, sb) max(b, st) max(b, sb)
GSVector4 tbmin = tbf.min(m_fscissor_y); // min(t, st) min(t, sb) min(b, st) min(b, sb)
GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin)); // max(t, st) max(b, sb) min(t, st) min(b, sb)
int top, bottom;
@ -589,27 +615,27 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
if((dv.p >= GSVector4::zero()).mask() & 2)
{
top = tb.extract32<0>();
bottom = tb.extract32<3>();
top = tb.extract32<0>(); // max(t, st)
bottom = tb.extract32<3>(); // min(b, sb)
if(top >= bottom) return;
edge = v0;
dedge = dv / dv.p.yyyy();
edge += dedge * (tbmax.zzzz() - edge.p.yyyy());
edge += dedge * (tbmax.xxxx() - edge.p.yyyy());
}
else
{
top = tb.extract32<1>();
bottom = tb.extract32<2>();
top = tb.extract32<1>(); // max(b, st)
bottom = tb.extract32<2>(); // min(t, sb)
if(top >= bottom) return;
edge = v1;
dedge = dv / dv.p.yyyy();
edge += dedge * (tbmax.wwww() - edge.p.yyyy());
edge += dedge * (tbmax.zzzz() - edge.p.yyyy());
}
GSVector4i p = GSVector4i(edge.p.upl(dedge.p) * 0x10000);
@ -664,9 +690,10 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
}
else
{
GSVector4 lrmax = lrtb.max(m_fscissor.xxxx());
GSVector4 lrmin = lrtb.min(m_fscissor.zzzz());
GSVector4i lr = GSVector4i(lrmax.xyxy(lrmin));
GSVector4 lrf = v0.p.xxxx(v1.p).ceil(); // l l r r
GSVector4 lrmax = lrf.max(m_fscissor_x); // max(l, sl) max(l, sr) max(r, sl) max(r, sr)
GSVector4 lrmin = lrf.min(m_fscissor_x); // min(l, sl) min(l, sr) min(r, sl) min(r, sr)
GSVector4i lr = GSVector4i(lrmax.xzyw(lrmin)); // max(l, sl) max(r, sl) min(l, sr) min(r, sr)
int left, right;
@ -674,8 +701,8 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
if((dv.p >= GSVector4::zero()).mask() & 1)
{
left = lr.extract32<0>();
right = lr.extract32<3>();
left = lr.extract32<0>(); // max(l, sl)
right = lr.extract32<3>(); // min(r, sr)
if(left >= right) return;
@ -686,15 +713,15 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
}
else
{
left = lr.extract32<1>();
right = lr.extract32<2>();
left = lr.extract32<1>(); // max(r, sl)
right = lr.extract32<2>(); // min(l, sr)
if(left >= right) return;
edge = v1;
dedge = dv / dv.p.xxxx();
edge += dedge * (lrmax.yyyy() - edge.p.xxxx());
edge += dedge * (lrmax.zzzz() - edge.p.xxxx());
}
GSVector4i p = GSVector4i(edge.p.upl(dedge.p) * 0x10000);
@ -811,6 +838,8 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo
// Constructs the rasterizer list with its statistics counters zeroed.
// m_syncpoint_count and m_solidrect_count are incremented in Process() for items flagged
// syncpoint / solidrect; the (currently commented-out) debug printfs in GSRendererSW::VSync
// print and reset them.
GSRasterizerList::GSRasterizerList()
: GSJobQueue<shared_ptr<GSRasterizerData> >()
, m_sync_count(0)
, m_syncpoint_count(0)
, m_solidrect_count(0)
{
}
@ -849,6 +878,11 @@ void GSRasterizerList::Sync()
void GSRasterizerList::Process(shared_ptr<GSRasterizerData>& item)
{
if(item->solidrect)
{
m_solidrect_count++;
}
/*
if(m_workers.size() > 1 && item->solidrect) // TODO: clip to thread area and dispatch?
{
for(size_t i = 0; i < m_workers.size(); i++)
@ -860,13 +894,15 @@ void GSRasterizerList::Process(shared_ptr<GSRasterizerData>& item)
return;
}
*/
if(item->syncpoint)
{
for(size_t i = 0; i < m_workers.size(); i++)
{
m_workers[i]->Wait();
}
m_syncpoint_count++;
}
for(size_t i = 0; i < m_workers.size(); i++)

View File

@ -126,7 +126,8 @@ protected:
int m_threads;
uint8* m_myscanline;
GSVector4i m_scissor;
GSVector4 m_fscissor;
GSVector4 m_fscissor_x;
GSVector4 m_fscissor_y;
struct {GSVertexSW* buff; int count;} m_edge;
int m_pixels;
@ -213,6 +214,8 @@ public:
}
int m_sync_count;
int m_syncpoint_count;
int m_solidrect_count;
// IRasterizer

View File

@ -85,8 +85,9 @@ void GSRendererSW::VSync(int field)
//
printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0;
*/
printf("m_syncpoint_count = %d\n", ((GSRasterizerList*)m_rl)->m_syncpoint_count); ((GSRasterizerList*)m_rl)->m_syncpoint_count = 0;
printf("m_solidrect_count = %d\n", ((GSRasterizerList*)m_rl)->m_solidrect_count); ((GSRasterizerList*)m_rl)->m_solidrect_count = 0;
*/
GSRendererT<GSVertexSW>::VSync(field);
m_tc->IncAge();
@ -513,7 +514,12 @@ bool GSRendererSW::GetScanlineGlobalData(GSRasterizerData2* data2)
bool zwrite = zm != 0xffffffff;
bool ztest = context->TEST.ZTE && context->TEST.ZTST > ZTST_ALWAYS;
/*
printf("%05x %d %05x %d %05x %d %dx%d\n",
fwrite || ftest ? m_context->FRAME.Block() : 0xfffff, m_context->FRAME.PSM,
zwrite || ztest ? m_context->ZBUF.Block() : 0xfffff, m_context->ZBUF.PSM,
PRIM->TME ? m_context->TEX0.TBP0 : 0xfffff, m_context->TEX0.PSM, (int)m_context->TEX0.TW, (int)m_context->TEX0.TH);
*/
if(!fwrite && !zwrite) return false;
gd.sel.fwrite = fwrite;
@ -1003,24 +1009,25 @@ void GSRendererSW::VertexKick(bool skip)
if(GSVertexSW* v = DrawingKick<prim>(skip, count))
{
GS_PRIM_CLASS primclass = GSUtil::GetPrimClass(prim);
if(!m_dump)
{
GSVector4 pmin, pmax;
switch(primclass)
switch(prim)
{
case GS_POINT_CLASS:
case GS_POINTLIST:
pmin = v[0].p;
pmax = v[0].p;
break;
case GS_LINE_CLASS:
case GS_SPRITE_CLASS:
case GS_LINELIST:
case GS_LINESTRIP:
case GS_SPRITE:
pmin = v[0].p.min(v[1].p);
pmax = v[0].p.max(v[1].p);
break;
case GS_TRIANGLE_CLASS:
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
pmin = v[0].p.min(v[1].p).min(v[2].p);
pmax = v[0].p.max(v[1].p).max(v[2].p);
break;
@ -1030,17 +1037,21 @@ if(!m_dump)
GSVector4 test = (pmax < scissor) | (pmin > scissor.zwxy());
switch(primclass)
switch(prim)
{
case GS_TRIANGLE_CLASS:
case GS_SPRITE_CLASS:
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_SPRITE:
test |= pmin.ceil() == pmax.ceil();
break;
}
switch(primclass)
switch(prim)
{
case GS_TRIANGLE_CLASS:
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
// are in line or just two of them are the same (cross product == 0)
GSVector4 tmp = (v[1].p - v[0].p) * (v[2].p - v[0].p).yxwz();
test |= tmp == tmp.yxwz();
@ -1052,26 +1063,42 @@ if(!m_dump)
return;
}
}
switch(primclass)
switch(prim)
{
case GS_POINT_CLASS:
case GS_POINTLIST:
break;
case GS_LINE_CLASS:
case GS_LINELIST:
case GS_LINESTRIP:
if(PRIM->IIP == 0) {v[0].c = v[1].c;}
break;
case GS_TRIANGLE_CLASS:
case GS_TRIANGLELIST:
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
if(PRIM->IIP == 0) {v[0].c = v[2].c; v[1].c = v[2].c;}
break;
case GS_SPRITE_CLASS:
case GS_SPRITE:
break;
}
if(m_count < 30 && m_count >= 3)
{
GSVertexSW* v = &m_vertices[m_count - 3];
int tl = 0;
int br = 0;
if(primclass == GS_TRIANGLE_CLASS && GSVertexSW::IsQuad(&m_vertices[m_count - 3], tl, br))
bool isquad = false;
switch(prim)
{
case GS_TRIANGLESTRIP:
case GS_TRIANGLEFAN:
case GS_TRIANGLELIST:
isquad = GSVertexSW::IsQuad(v, tl, br);
break;
}
if(isquad)
{
m_count -= 3;

View File

@ -337,22 +337,24 @@ void GSTextureCache::InvalidateVideoMem(GSOffset* o, const GSVector4i& rect, boo
if(GSUtil::HasSharedBits(psm, s->m_TEX0.PSM))
{
uint32* RESTRICT valid = s->m_valid;
bool b = bp == s->m_TEX0.TBP0;
if(!s->m_target)
{
if(s->m_repeating)
{
list<GSVector2i>& l = s->m_p2t[page];
vector<GSVector2i>& l = s->m_p2t[page];
for(list<GSVector2i>::iterator k = l.begin(); k != l.end(); k++)
for(vector<GSVector2i>::iterator k = l.begin(); k != l.end(); k++)
{
s->m_valid[k->x] &= k->y;
valid[k->x] &= k->y;
}
}
else
{
s->m_valid[page] = 0;
valid[page] = 0;
}
s->m_complete = false;

View File

@ -75,7 +75,7 @@ public:
bool m_target;
bool m_complete;
bool m_repeating;
list<GSVector2i>* m_p2t;
vector<GSVector2i>* m_p2t;
public:
Source(GSRenderer* r, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA, uint8* temp);

View File

@ -97,18 +97,20 @@ void GSTextureCacheSW::InvalidatePages(const vector<uint32>* pages, uint32 psm)
if(GSUtil::HasSharedBits(psm, t->m_TEX0.PSM))
{
uint32* RESTRICT valid = t->m_valid;
if(t->m_repeating)
{
list<GSVector2i>& l = t->m_p2t[page];
vector<GSVector2i>& l = t->m_p2t[page];
for(list<GSVector2i>::iterator j = l.begin(); j != l.end(); j++)
for(vector<GSVector2i>::iterator j = l.begin(); j != l.end(); j++)
{
t->m_valid[j->x] &= j->y;
valid[j->x] &= j->y;
}
}
else
{
t->m_valid[page] = 0;
valid[page] = 0;
}
t->m_complete = false;

View File

@ -38,7 +38,7 @@ public:
uint32 m_age;
bool m_complete;
bool m_repeating;
list<GSVector2i>* m_p2t;
vector<GSVector2i>* m_p2t;
uint32 m_valid[MAX_PAGES];
struct {uint32 bm[16]; const vector<uint32>* n;} m_pages;