GSdx: Polished the recent changes a bit. Single-threaded mode should be back to normal; 2-4 threads might be faster or slower. All in all, it has a lot more potential now. Rendering is almost as separated as with d3d: everything needed is packed and copied for the worker threads, and synchronization between local memory and the temporary buffers is done properly. This model could also be back-ported to d3d. Alternatively, the software rasterizers could be hardware assisted somehow, since there are now far fewer sync points where those buffers have to match the contents of the local memory.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4993 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-12-18 21:57:48 +00:00
parent f318e84aca
commit b86e3ebd19
7 changed files with 236 additions and 146 deletions

View File

@ -64,14 +64,16 @@ bool GSRasterizer::IsOneOfMyScanlines(int scanline) const
bool GSRasterizer::IsOneOfMyScanlines(int top, int bottom) const
{
top >>= THREAD_HEIGHT;
bottom >>= THREAD_HEIGHT;
top = top >> THREAD_HEIGHT;
bottom = (bottom + (1 << THREAD_HEIGHT) - 1) >> THREAD_HEIGHT;
do
while(top < bottom)
{
if(m_myscanline[top]) return true;
if(m_myscanline[top++])
{
return true;
}
}
while(top++ < bottom);
return false;
}
@ -292,8 +294,6 @@ void GSRasterizer::DrawTriangle(const GSVertexSW* vertices)
GSVector4 tbmin = tbf.min(m_fscissor.ywyw());
GSVector4i tb = GSVector4i(tbmax.xzyw(tbmin));
if(m_threads > 1 && !IsOneOfMyScanlines(tb.x, tb.w)) return;
dv[0] = v[1] - v[0];
dv[1] = v[2] - v[0];
dv[2] = v[2] - v[1];
@ -845,6 +845,7 @@ void GSRasterizerMT::ThreadProc()
GSRasterizerList::GSRasterizerList()
: m_sync_count(0)
, m_count(0)
, m_dispatched(0)
{
}
@ -856,22 +857,34 @@ GSRasterizerList::~GSRasterizerList()
}
}
// Draws 'data' right away on the first rasterizer of the list instead of queueing it.
// Sync() is called beforehand (presumably so that previously queued work has finished
// before the direct draw starts - confirm against GSRasterizerList::Sync).
void GSRasterizerList::Draw(shared_ptr<GSRasterizerData> data)
{
	this->Sync();

	GSRasterizer* first = front();

	first->Draw(data);
}
void GSRasterizerList::Queue(shared_ptr<GSRasterizerData> data)
{
// TODO: do not send data to every thread, try to bin them (based on bbox & scissor)
if(data->solidrect)
if(size() > 1)
{
Sync();
ASSERT(!data->solidrect); // should call Draw instead, but it will work anyway
front()->Draw(data);
return;
data->solidrect = false;
}
GSVector4i bbox = data->bbox.rintersect(data->scissor);
for(int i = 0; i < size(); i++)
{
(*this)[i]->Queue(data);
GSRasterizer* r = (*this)[i];
if(r->IsOneOfMyScanlines(bbox.top, bbox.bottom))
{
r->Queue(data);
m_dispatched++;
}
}
m_count++;
@ -888,6 +901,9 @@ void GSRasterizerList::Sync()
m_sync_count++;
//printf("%d %d%%\n", m_count, 100 * m_dispatched / (m_count * size()));
m_count = 0;
m_dispatched = 0;
}
}

View File

@ -127,8 +127,6 @@ protected:
void DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, int orientation, int side);
__forceinline bool IsOneOfMyScanlines(int scanline) const;
__forceinline bool IsOneOfMyScanlines(int top, int bottom) const;
__forceinline void AddScanline(GSVertexSW* e, int pixels, int left, int top, const GSVertexSW& scan);
__forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false);
@ -136,6 +134,9 @@ public:
GSRasterizer(IDrawScanline* ds, int id, int threads);
virtual ~GSRasterizer();
__forceinline bool IsOneOfMyScanlines(int scanline) const;
__forceinline bool IsOneOfMyScanlines(int top, int bottom) const;
void Draw(shared_ptr<GSRasterizerData> data);
// IRasterizer
@ -169,6 +170,7 @@ class GSRasterizerList : public IRasterizer, protected vector<GSRasterizer*>
{
protected:
int m_count;
int m_dispatched;
GSRasterizerList();
@ -189,6 +191,12 @@ public:
return rl;
}
size_t IsMultiThreaded() const {return size();}
void Draw(shared_ptr<GSRasterizerData> data);
// IRasterizer
void Queue(shared_ptr<GSRasterizerData> data);
void Sync();

View File

@ -36,6 +36,9 @@ GSRendererSW::GSRendererSW(int threads)
m_rl = GSRasterizerList::Create<GSDrawScanline>(threads);
m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
memset(m_tex_pages, 0, sizeof(m_tex_pages));
memset(m_fzb_pages, 0, sizeof(m_fzb_pages));
}
GSRendererSW::~GSRendererSW()
@ -68,7 +71,7 @@ void GSRendererSW::VSync(int field)
Sync(); // IncAge might delete a cached texture in use
// printf("m_sync_count = %d\n", m_rl->m_sync_count); m_rl->m_sync_count = 0;
//printf("m_sync_count = %d\n", m_rl->m_sync_count); m_rl->m_sync_count = 0;
m_tc->IncAge();
@ -133,8 +136,6 @@ void GSRendererSW::Draw()
{
if(m_dump) m_dump.Object(m_vertices, m_count, m_vt.m_primclass);
// TODO: palette may be rendered (point-list in a few visual novels) and not ready by the time it needs to be loaded => vm to clut transfer (TEX0.CLD) should wait for the rasterizers to finish, if needed
if(m_fzb != m_context->offset.fzb)
{
// rasterizers must write the same outputs at the same time, this makes sure each thread has its own private surface area
@ -157,7 +158,7 @@ void GSRendererSW::Draw()
data->scissor = GSVector4i(m_context->scissor.in);
data->scissor.z = std::min<int>(data->scissor.z, (int)m_context->FRAME.FBW * 64); // TODO: find a game that overflows and check which one is the right behaviour
data->bbox = GSVector4i(m_vt.m_min.p.xyxy(m_vt.m_max.p));
data->bbox = GSVector4i(m_vt.m_min.p.floor().xyxy(m_vt.m_max.p.ceil()));
data->primclass = m_vt.m_primclass;
data->vertices = (GSVertexSW*)_aligned_malloc(sizeof(GSVertexSW) * m_count, 16); // TODO: detach m_vertices and reallocate later?
memcpy(data->vertices, m_vertices, sizeof(GSVertexSW) * m_count); // TODO: m_vt.Update fetches all the vertices already, could also store them here
@ -165,15 +166,6 @@ void GSRendererSW::Draw()
data->solidrect = gd->sel.IsSolidRect();
data->frame = m_perfmon.GetFrame();
if(s_dump)
{
if(data->solidrect) Sync();
((GSRasterizerData2*)data.get())->DumpInput();
}
m_rl->Queue(data);
GSVector4i r = data->bbox.rintersect(data->scissor);
if(gd->sel.fwrite)
@ -186,7 +178,83 @@ void GSRendererSW::Draw()
m_tc->InvalidateVideoMem(m_context->offset.zb, r);
}
// Sync();
if(!m_rl->IsMultiThreaded() || data->solidrect || s_dump)
{
if(s_dump)
{
uint64 frame = m_perfmon.GetFrame();
string s;
if(s_save && s_n >= s_saven && PRIM->TME)
{
s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM);
m_mem.SaveBMP(s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH);
}
s_n++;
if(s_save && s_n >= s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
}
if(s_savez && s_n >= s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
}
s_n++;
}
m_rl->Draw(data);
Sync();
if(s_dump)
{
uint64 frame = m_perfmon.GetFrame();
string s;
if(s_save && s_n >= s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM);
m_mem.SaveBMP(s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512);
}
if(s_savez && s_n >= s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM);
m_mem.SaveBMP(s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512);
}
s_n++;
}
}
else
{
m_rl->Queue(data);
if(gd->sel.fwrite)
{
InvalidatePages(m_context->offset.fb, r);
}
if(gd->sel.zwrite)
{
InvalidatePages(m_context->offset.zb, r);
}
// Sync();
}
// TODO: m_perfmon.Put(GSPerfMon::Prim, stats.prims);
// TODO: m_perfmon.Put(GSPerfMon::Fillrate, stats.pixels);
@ -205,21 +273,108 @@ void GSRendererSW::Draw()
void GSRendererSW::Sync()
{
//printf("sync\n");
m_rl->Sync();
m_tc->ResetInvalidPages();
memset(m_tex_pages, 0, sizeof(m_tex_pages));
memset(m_fzb_pages, 0, sizeof(m_fzb_pages));
}
void GSRendererSW::InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
m_tc->InvalidateVideoMem(m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM), r);
//printf("ivm %05x %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
Sync(); // TODO: not needed if nothing uses the affected pages (this is the most frequently called Sync! get rid of it)
GSOffset* o = m_mem.GetOffset(BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
m_tc->InvalidateVideoMem(o, r);
if(CheckPages(o, r)) // check if the changing pages either used as a texture or a target
{
Sync();
}
}
void GSRendererSW::InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r)
{
Sync(); // TODO: not needed if nothing uses the affected pages
//printf("ilm %05x %d %d\n", BITBLTBUF.DBP, BITBLTBUF.DBW, BITBLTBUF.DPSM);
GSOffset* o = m_mem.GetOffset(BITBLTBUF.SBP, BITBLTBUF.SBW, BITBLTBUF.SPSM);
if(CheckPages(o, r)) // TODO: only checking m_fzb_pages would be enough (read-backs are rare anyway)
{
Sync();
}
}
// Registers the pages of texture 't' in m_tex_pages so that CheckPages can later
// tell whether a memory transfer touches a texture that is still in use.
//
// If any page of the texture is also marked in m_fzb_pages (a frame/z buffer page
// with drawing queued), Sync() is called first. GSRendererSW::Sync() zeroes both
// m_tex_pages and m_fzb_pages, therefore the texture pages are recorded only after
// the overlap check - and they are recorded even when a sync happened, because the
// draw currently being set up is still going to use this texture.
void GSRendererSW::InvalidatePages(const GSTextureCacheSW::Texture* t)
{
	//printf("tex %05x %d %d\n", t->m_TEX0.TBP0, t->m_TEX0.TBW, t->m_TEX0.PSM);

	for(size_t i = 0; i < countof(t->m_pages); i++)
	{
		if(m_fzb_pages[i] & t->m_pages[i]) // currently being drawn to? => sync
		{
			Sync();

			break; // one sync is enough, both page bitmaps are empty now
		}
	}

	for(size_t i = 0; i < countof(t->m_pages); i++)
	{
		m_tex_pages[i] |= t->m_pages[i]; // remember which texture pages are used
	}
}
// Marks every page touched by rectangle 'rect' of the buffer described by 'o'
// in the m_fzb_pages bitmap (one bit per page, 32 pages per array element).
//
// o    - offset table of the buffer; its block.row/block.col tables are indexed
//        by the coordinate divided by 8 and summed to get a block number
// rect - area being written; it is aligned outwards to the step size first
void GSRendererSW::InvalidatePages(const GSOffset* o, const GSVector4i& rect)
{
	//printf("fzb %05x %d %d\n", o->bp, o->bw, o->psm);

	// step size: 'pgs' when the base pointer is a multiple of 32 blocks, 'bs' otherwise
	// (presumably page size vs. block size of the pixel format - confirm in GSLocalMemory)

	GSVector2i bs = (o->bp & 31) == 0 ? GSLocalMemory::m_psm[o->psm].pgs : GSLocalMemory::m_psm[o->psm].bs;

	GSVector4i r = rect.ralign<Align_Outside>(bs);

	for(int y = r.top; y < r.bottom; y += bs.y)
	{
		uint32 base = o->block.row[y >> 3];

		for(int x = r.left; x < r.right; x += bs.x)
		{
			uint32 page = (base + o->block.col[x >> 3]) >> 5; // block number / 32

			if(page < MAX_PAGES)
			{
				// unsigned literal: shifting a signed 1 into bit 31 overflows int

				m_fzb_pages[page >> 5] |= 1u << (page & 31);
			}
		}
	}
}
// Tells whether any page touched by rectangle 'rect' of the buffer described by 'o'
// is currently marked in m_tex_pages or m_fzb_pages.
//
// o    - offset table of the buffer; its block.row/block.col tables are indexed
//        by the coordinate divided by 8 and summed to get a block number
// rect - area to test; it is aligned outwards to the step size first
//
// Returns true on the first marked page found, false if none of the pages is marked.
bool GSRendererSW::CheckPages(const GSOffset* o, const GSVector4i& rect)
{
	// step size: 'pgs' when the base pointer is a multiple of 32 blocks, 'bs' otherwise
	// (presumably page size vs. block size of the pixel format - confirm in GSLocalMemory)

	GSVector2i bs = (o->bp & 31) == 0 ? GSLocalMemory::m_psm[o->psm].pgs : GSLocalMemory::m_psm[o->psm].bs;

	GSVector4i r = rect.ralign<Align_Outside>(bs);

	for(int y = r.top; y < r.bottom; y += bs.y)
	{
		uint32 base = o->block.row[y >> 3];

		for(int x = r.left; x < r.right; x += bs.x)
		{
			uint32 page = (base + o->block.col[x >> 3]) >> 5; // block number / 32

			if(page < MAX_PAGES)
			{
				// unsigned literal: shifting a signed 1 into bit 31 overflows int

				uint32 mask = 1u << (page & 31);

				if((m_tex_pages[page >> 5] | m_fzb_pages[page >> 5]) & mask)
				{
					return true;
				}
			}
		}
	}

	return false;
}
#include "GSTextureSW.h"
@ -335,7 +490,7 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
if(t == NULL) {ASSERT(0); return false;}
if(!m_tc->CanUpdate(t)) Sync();
InvalidatePages(t);
GSVector4i r;
@ -488,7 +643,7 @@ bool GSRendererSW::GetScanlineGlobalData(GSScanlineGlobalData& gd)
if(t == NULL) {ASSERT(0); return false;}
if(!m_tc->CanUpdate(t)) Sync();
InvalidatePages(t);
GSVector4i r;

View File

@ -29,13 +29,6 @@ class GSRendererSW : public GSRendererT<GSVertexSW>
{
class GSRasterizerData2 : public GSRasterizerData
{
GSRenderer* renderer;
GIFRegFRAME FRAME;
GIFRegZBUF ZBUF;
GIFRegTEX0 TEX0;
uint32 TME;
GSVector2i framesize;
public:
GSRasterizerData2(GSRenderer* r)
{
@ -45,13 +38,6 @@ class GSRendererSW : public GSRendererT<GSVertexSW>
gd->dimx = NULL;
param = gd;
renderer = r;
FRAME = r->m_context->FRAME;
ZBUF = r->m_context->ZBUF;
TEX0 = r->m_context->TEX0;
TME = r->PRIM->TME;
framesize = GSVector2i(r->GetFrameRect().width(), 512);
}
virtual ~GSRasterizerData2()
@ -62,73 +48,6 @@ class GSRendererSW : public GSRendererT<GSVertexSW>
if(gd->dimx) _aligned_free(gd->dimx);
_aligned_free(gd);
DumpOutput();
}
// FIXME: not really possible to save whole input/output anymore, strips of the picture may lag in multi-threaded mode
void DumpInput()
{
if(!renderer->s_dump) return; // || !(m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0))
GSAutoLock l(&renderer->s_lock);
uint64 frame = renderer->m_perfmon.GetFrame();
string s;
if(renderer->s_save && renderer->s_n >= renderer->s_saven && TME)
{
s = format("c:\\temp1\\_%05d_f%lld_tex_%05x_%d.bmp", renderer->s_n, frame, (int)TEX0.TBP0, (int)TEX0.PSM);
renderer->m_mem.SaveBMP(s, TEX0.TBP0, TEX0.TBW, TEX0.PSM, 1 << TEX0.TW, 1 << TEX0.TH);
}
renderer->s_n++;
if(renderer->s_save && renderer->s_n >= renderer->s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rt0_%05x_%d.bmp", renderer->s_n, frame, FRAME.Block(), FRAME.PSM);
renderer->m_mem.SaveBMP(s, FRAME.Block(), FRAME.FBW, FRAME.PSM, framesize.x, framesize.y);
}
if(renderer->s_savez && renderer->s_n >= renderer->s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rz0_%05x_%d.bmp", renderer->s_n, frame, ZBUF.Block(), ZBUF.PSM);
renderer->m_mem.SaveBMP(s, ZBUF.Block(), FRAME.FBW, ZBUF.PSM, framesize.x, framesize.y);
}
renderer->s_n++;
}
void DumpOutput()
{
if(!renderer->s_dump) return; // || !(m_context->TEX1.MXL > 0 && m_context->TEX1.MMIN >= 2 && m_context->TEX1.MMIN <= 5 && m_vt.m_lod.x > 0)
GSAutoLock l(&renderer->s_lock);
uint64 frame = renderer->m_perfmon.GetFrame();
string s;
if(renderer->s_save && renderer->s_n >= renderer->s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rt1_%05x_%d.bmp", renderer->s_n, frame, FRAME.Block(), FRAME.PSM);
renderer->m_mem.SaveBMP(s, FRAME.Block(), FRAME.FBW, FRAME.PSM, framesize.x, framesize.y);
}
if(renderer->s_savez && renderer->s_n >= renderer->s_saven)
{
s = format("c:\\temp1\\_%05d_f%lld_rz1_%05x_%d.bmp", renderer->s_n, frame, ZBUF.Block(), ZBUF.PSM);
renderer->m_mem.SaveBMP(s, ZBUF.Block(), FRAME.FBW, ZBUF.PSM, framesize.x, framesize.y);
}
renderer->s_n++;
}
};
@ -139,6 +58,8 @@ protected:
uint8* m_output;
bool m_reset;
GSPixelOffset4* m_fzb;
uint32 m_fzb_pages[16];
uint32 m_tex_pages[16];
void Reset();
void VSync(int field);
@ -150,6 +71,10 @@ protected:
void InvalidateVideoMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
void InvalidateLocalMem(const GIFRegBITBLTBUF& BITBLTBUF, const GSVector4i& r);
void InvalidatePages(const GSOffset* o, const GSVector4i& rect);
void InvalidatePages(const GSTextureCacheSW::Texture* t);
bool CheckPages(const GSOffset* o, const GSVector4i& rect);
bool GetScanlineGlobalData(GSScanlineGlobalData& gd);
public:

View File

@ -613,6 +613,19 @@ void GSState::ApplyTEX0(int i, GIFRegTEX0& TEX0)
if(wt)
{
GIFRegBITBLTBUF BITBLTBUF;
BITBLTBUF.SBP = TEX0.CBP;
BITBLTBUF.SBW = 1;
BITBLTBUF.SPSM = TEX0.CSM;
GSVector4i r = GSVector4i::zero();
r.right = GSLocalMemory::m_psm[TEX0.CPSM].pgs.x;
r.bottom = GSLocalMemory::m_psm[TEX0.CPSM].pgs.y;
InvalidateLocalMem(BITBLTBUF, r);
m_mem.m_clut.Write(m_env.CTXT[i].TEX0, m_env.TEXCLUT);
}
}

View File

@ -25,7 +25,6 @@
GSTextureCacheSW::GSTextureCacheSW(GSState* state)
: m_state(state)
{
memset(m_invalid, 0, sizeof(m_invalid));
}
GSTextureCacheSW::~GSTextureCacheSW()
@ -102,11 +101,9 @@ GSTextureCacheSW::Texture* GSTextureCacheSW::Lookup(const GIFRegTEX0& TEX0, cons
void GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& rect)
{
uint32 bp = o->bp;
uint32 bw = o->bw;
uint32 psm = o->psm;
GSVector2i bs = (bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;
GSVector2i bs = (o->bp & 31) == 0 ? GSLocalMemory::m_psm[psm].pgs : GSLocalMemory::m_psm[psm].bs;
GSVector4i r = rect.ralign<Align_Outside>(bs);
@ -120,8 +117,6 @@ void GSTextureCacheSW::InvalidateVideoMem(const GSOffset* o, const GSVector4i& r
if(page < MAX_PAGES)
{
m_invalid[page >> 5] |= 1 << (page & 31); // remember which pages might be invalid for future texture updates
const list<Texture*>& map = m_map[page];
for(list<Texture*>::const_iterator i = map.begin(); i != map.end(); i++)
@ -198,24 +193,6 @@ void GSTextureCacheSW::IncAge()
}
}
bool GSTextureCacheSW::CanUpdate(Texture* t)
{
for(size_t i = 0; i < countof(m_invalid); i++)
{
if(m_invalid[i] & t->m_pages[i])
{
return false;
}
}
return true;
}
void GSTextureCacheSW::ResetInvalidPages()
{
memset(m_invalid, 0, sizeof(m_invalid));
}
//
GSTextureCacheSW::Texture::Texture(GSState* state, const GSOffset* offset, uint32 tw0, const GIFRegTEX0& TEX0, const GIFRegTEXA& TEXA)

View File

@ -57,7 +57,6 @@ protected:
GSState* m_state;
hash_set<Texture*> m_textures;
list<Texture*> m_map[MAX_PAGES];
uint32 m_invalid[16];
public:
GSTextureCacheSW(GSState* state);
@ -70,7 +69,4 @@ public:
void RemoveAll();
void RemoveAt(Texture* t);
void IncAge();
bool CanUpdate(Texture* t);
void ResetInvalidPages();
};