GSdx: Saving the condition-variable update (Vista or later) before I try a new idea again. That Sync() call is wasting too much time; if there were only one queue, the main thread could also grab and process elements instead of just waiting for the workers.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5005 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gabest11 2011-12-22 01:48:16 +00:00
parent 5e0e4ce6a8
commit 2421c68bee
11 changed files with 240 additions and 49 deletions

View File

@ -127,7 +127,7 @@ bool GPURenderer::Merge()
void GPURenderer::VSync() void GPURenderer::VSync()
{ {
GSPerfMonAutoTimer pmat(m_perfmon); GSPerfMonAutoTimer pmat(&m_perfmon);
m_perfmon.Put(GSPerfMon::Frame); m_perfmon.Put(GSPerfMon::Frame);

View File

@ -29,7 +29,7 @@ GPURendererSW::GPURendererSW(GSDevice* dev, int threads)
{ {
m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 16); m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 16);
m_rl = GSRasterizerList::Create<GPUDrawScanline>(threads); m_rl = GSRasterizerList::Create<GPUDrawScanline>(threads, &m_perfmon);
} }
GPURendererSW::~GPURendererSW() GPURendererSW::~GPURendererSW()

View File

@ -141,7 +141,7 @@ void GPUState::Invalidate(const GSVector4i& r)
void GPUState::WriteData(const uint8* mem, uint32 size) void GPUState::WriteData(const uint8* mem, uint32 size)
{ {
GSPerfMonAutoTimer pmat(m_perfmon); GSPerfMonAutoTimer pmat(&m_perfmon);
size <<= 2; size <<= 2;
@ -165,7 +165,7 @@ void GPUState::WriteData(const uint8* mem, uint32 size)
void GPUState::ReadData(uint8* mem, uint32 size) void GPUState::ReadData(uint8* mem, uint32 size)
{ {
GSPerfMonAutoTimer pmat(m_perfmon); GSPerfMonAutoTimer pmat(&m_perfmon);
int remaining = m_read.bytes - m_read.cur; int remaining = m_read.bytes - m_read.cur;
@ -194,7 +194,7 @@ void GPUState::ReadData(uint8* mem, uint32 size)
void GPUState::WriteStatus(uint32 status) void GPUState::WriteStatus(uint32 status)
{ {
GSPerfMonAutoTimer pmat(m_perfmon); GSPerfMonAutoTimer pmat(&m_perfmon);
uint32 b = status >> 24; uint32 b = status >> 24;
@ -205,7 +205,7 @@ void GPUState::WriteStatus(uint32 status)
uint32 GPUState::ReadStatus() uint32 GPUState::ReadStatus()
{ {
GSPerfMonAutoTimer pmat(m_perfmon); GSPerfMonAutoTimer pmat(&m_perfmon);
m_env.STATUS.LCF = ~m_env.STATUS.LCF; // ? m_env.STATUS.LCF = ~m_env.STATUS.LCF; // ?

View File

@ -1411,7 +1411,7 @@ void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v)
} }
else else
{ {
if(m == 0) if((m & 0xffff) == 0)
{ {
DrawRectT<uint16, false>(zbr, zbc, r, z, m); DrawRectT<uint16, false>(zbr, zbc, r, z, m);
} }
@ -1451,7 +1451,7 @@ void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v)
{ {
c = ((c & 0xf8) >> 3) | ((c & 0xf800) >> 6) | ((c & 0xf80000) >> 9) | ((c & 0x80000000) >> 16); c = ((c & 0xf8) >> 3) | ((c & 0xf800) >> 6) | ((c & 0xf80000) >> 9) | ((c & 0x80000000) >> 16);
if(m == 0) if((m & 0xffff) == 0)
{ {
DrawRectT<uint16, false>(fbr, fbc, r, c, m); DrawRectT<uint16, false>(fbr, fbc, r, c, m);
} }
@ -1477,6 +1477,8 @@ void GSDrawScanline::DrawRectT(const int* RESTRICT row, const int* RESTRICT col,
mask = mask.xxzzlh(); mask = mask.xxzzlh();
} }
if(masked) ASSERT(mask.u32[0] != 0);
color = color.andnot(mask); color = color.andnot(mask);
GSVector4i br = r.ralign<Align_Inside>(GSVector2i(8 * 4 / sizeof(T), 8)); GSVector4i br = r.ralign<Align_Inside>(GSVector2i(8 * 4 / sizeof(T), 8));

View File

@ -23,14 +23,14 @@
#include "GSPerfMon.h" #include "GSPerfMon.h"
GSPerfMon::GSPerfMon() GSPerfMon::GSPerfMon()
: m_total(0) : m_frame(0)
, m_begin(0)
, m_frame(0)
, m_lastframe(0) , m_lastframe(0)
, m_count(0) , m_count(0)
{ {
memset(m_counters, 0, sizeof(m_counters)); memset(m_counters, 0, sizeof(m_counters));
memset(m_stats, 0, sizeof(m_stats)); memset(m_stats, 0, sizeof(m_stats));
memset(m_total, 0, sizeof(m_total));
memset(m_begin, 0, sizeof(m_begin));
} }
void GSPerfMon::Put(counter_t c, double val) void GSPerfMon::Put(counter_t c, double val)
@ -69,32 +69,35 @@ void GSPerfMon::Update()
memset(m_counters, 0, sizeof(m_counters)); memset(m_counters, 0, sizeof(m_counters));
} }
void GSPerfMon::Start() void GSPerfMon::Start(int timer)
{ {
m_start = __rdtsc(); m_start[timer] = __rdtsc();
if(m_begin == 0) if(m_begin[timer] == 0)
{ {
m_begin = m_start; m_begin[timer] = m_start[timer];
} }
} }
void GSPerfMon::Stop() void GSPerfMon::Stop(int timer)
{ {
if(m_start > 0) if(m_start[timer] > 0)
{ {
m_total += __rdtsc() - m_start; m_total[timer] += __rdtsc() - m_start[timer];
m_start = 0; m_start[timer] = 0;
} }
} }
int GSPerfMon::CPU() int GSPerfMon::CPU(int timer, bool reset)
{ {
int percent = (int)(100 * m_total / (__rdtsc() - m_begin)); int percent = m_total[timer] / 1000; // (int)(100 * m_total[timer] / (__rdtsc() - m_begin[timer]));
m_begin = 0; if(reset)
m_start = 0; {
m_total = 0; m_begin[timer] = 0;
m_start[timer] = 0;
m_total[timer] = 0;
}
return percent; return percent;
} }

View File

@ -24,18 +24,30 @@
class GSPerfMon class GSPerfMon
{ {
public: public:
enum counter_t {Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, CounterLast}; enum timer_t
{
Main,
Sync,
WorkerDraw0, WorkerDraw1, WorkerDraw2, WorkerDraw3, WorkerDraw4, WorkerDraw5, WorkerDraw6, WorkerDraw7, WorkerDraw8, WorkerDraw9, WorkerDraw10, WorkerDraw11, WorkerDraw12, WorkerDraw13, WorkerDraw14, WorkerDraw15,
WorkerSync0, WorkerSync1, WorkerSync2, WorkerSync3, WorkerSync4, WorkerSync5, WorkerSync6, WorkerSync7, WorkerSync8, WorkerSync9, WorkerSync10, WorkerSync11, WorkerSync12, WorkerSync13, WorkerSync14, WorkerSync15,
WorkerSleep0, WorkerSleep1, WorkerSleep2, WorkerSleep3, WorkerSleep4, WorkerSleep5, WorkerSleep6, WorkerSleep7, WorkerSleep8, WorkerSleep9, WorkerSleep10, WorkerSleep11, WorkerSleep12, WorkerSleep13, WorkerSleep14, WorkerSleep15,
TimerLast,
};
enum counter_t
{
Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad,
CounterLast,
};
protected: protected:
double m_counters[CounterLast]; double m_counters[CounterLast];
double m_stats[CounterLast]; double m_stats[CounterLast];
uint64 m_begin, m_total, m_start, m_frame; uint64 m_begin[TimerLast], m_total[TimerLast], m_start[TimerLast];
uint64 m_frame;
clock_t m_lastframe; clock_t m_lastframe;
int m_count; int m_count;
void Start();
void Stop();
friend class GSPerfMonAutoTimer; friend class GSPerfMonAutoTimer;
public: public:
@ -43,17 +55,22 @@ public:
void SetFrame(uint64 frame) {m_frame = frame;} void SetFrame(uint64 frame) {m_frame = frame;}
uint64 GetFrame() {return m_frame;} uint64 GetFrame() {return m_frame;}
void Put(counter_t c, double val = 0); void Put(counter_t c, double val = 0);
double Get(counter_t c) {return m_stats[c];} double Get(counter_t c) {return m_stats[c];}
void Update(); void Update();
int CPU();
void Start(int timer = Main);
void Stop(int timer = Main);
int CPU(int timer = Main, bool reset = true);
}; };
class GSPerfMonAutoTimer class GSPerfMonAutoTimer
{ {
GSPerfMon* m_pm; GSPerfMon* m_pm;
int m_timer;
public: public:
GSPerfMonAutoTimer(GSPerfMon& pm) {(m_pm = &pm)->Start();} GSPerfMonAutoTimer(GSPerfMon* pm, int timer = GSPerfMon::Main) {m_timer = timer; (m_pm = pm)->Start(m_timer);}
~GSPerfMonAutoTimer() {m_pm->Stop();} ~GSPerfMonAutoTimer() {m_pm->Stop(m_timer);}
}; };

View File

@ -30,10 +30,11 @@
#define THREAD_HEIGHT 4 #define THREAD_HEIGHT 4
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads) GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon)
: m_ds(ds) : m_ds(ds)
, m_id(id) , m_id(id)
, m_threads(threads) , m_threads(threads)
, m_perfmon(perfmon)
{ {
m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false); m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false);
m_edge.count = 0; m_edge.count = 0;
@ -88,6 +89,8 @@ void GSRasterizer::Queue(shared_ptr<GSRasterizerData> data)
void GSRasterizer::Draw(shared_ptr<GSRasterizerData> data) void GSRasterizer::Draw(shared_ptr<GSRasterizerData> data)
{ {
GSPerfMonAutoTimer pmat(m_perfmon, GSPerfMon::WorkerDraw0 + m_id);
m_ds->BeginDraw(data->param); m_ds->BeginDraw(data->param);
const GSVertexSW* vertices = data->vertices; const GSVertexSW* vertices = data->vertices;
@ -763,8 +766,8 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo
// //
GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads) GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon)
: GSRasterizer(ds, id, threads) : GSRasterizer(ds, id, threads, perfmon)
, m_exit(false) , m_exit(false)
, m_break(true) , m_break(true)
{ {
@ -840,6 +843,94 @@ void GSRasterizerMT::ThreadProc()
} }
} }
#ifdef _WINDOWS
// Constructs a rasterizer whose work queue is protected by an SRW lock and two
// condition variables instead of the critical section + event used by GSRasterizerMT.
// ds, id, threads and perfmon are forwarded unchanged to the GSRasterizer base.
GSRasterizerMT2::GSRasterizerMT2(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon)
: GSRasterizer(ds, id, threads, perfmon)
{
// m_lock guards m_queue; m_notempty is signalled by Queue(), m_empty by ThreadProc().
InitializeSRWLock(&m_lock);
InitializeConditionVariable(&m_notempty);
InitializeConditionVariable(&m_empty);
// Called last so the synchronization objects above are already initialized.
// NOTE(review): presumably CreateThread() (from GSThread) starts ThreadProc() - confirm.
CreateThread();
}
// Asks the worker to exit by queueing an empty shared_ptr, which ThreadProc()
// treats as its termination sentinel, then wakes the worker and closes the thread.
GSRasterizerMT2::~GSRasterizerMT2()
{
    // The push must happen under m_lock like every other access to m_queue:
    // without it the worker's locked empty()/front()/pop() calls race with this
    // push, and the wakeup below can be lost if the worker has tested empty()
    // but has not yet gone to sleep on m_notempty.
    AcquireSRWLockExclusive(&m_lock);
    m_queue.push(shared_ptr<GSRasterizerData>());
    ReleaseSRWLockExclusive(&m_lock);

    // Signal after releasing the lock, mirroring Queue().
    WakeConditionVariable(&m_notempty);

    // NOTE(review): presumably CloseThread() (from GSThread) waits for ThreadProc() to return - confirm.
    CloseThread();
}
// Appends one work item to this rasterizer's queue and signals m_notempty,
// the condition variable ThreadProc() sleeps on while the queue is empty.
void GSRasterizerMT2::Queue(shared_ptr<GSRasterizerData> data)
{
// m_queue is only touched while m_lock is held exclusively.
AcquireSRWLockExclusive(&m_lock);
m_queue.push(data);
ReleaseSRWLockExclusive(&m_lock);
// Signalled after the lock has been released.
WakeConditionVariable(&m_notempty);
}
// Blocks the caller until m_queue is empty. ThreadProc() pops an item only after
// drawing it, so an empty queue also means no item is still being drawn.
void GSRasterizerMT2::Sync()
{
AcquireSRWLockExclusive(&m_lock);
// Re-test the predicate after every wakeup; the loop exits only once the queue is really empty.
while(!m_queue.empty())
{
// TODO: instead of just waiting for the workers, help finish their queues!
// TODO: to do that, the queues need to be merged and id'ed, and threads must switch m_myscanline on the fly
// Each wait is accounted to this rasterizer's WorkerSync timer slot.
GSPerfMonAutoTimer pmat(m_perfmon, GSPerfMon::WorkerSync0 + m_id);
// Releases m_lock while sleeping and holds it again on return; signalled by ThreadProc() via m_empty.
SleepConditionVariableSRW(&m_empty, &m_lock, INFINITE, 0);
}
ReleaseSRWLockExclusive(&m_lock);
}
// Worker loop: sleeps until the queue has an item, draws the front item with the
// lock released, then pops it and signals m_empty when the queue has drained.
// Returns when the front item is an empty shared_ptr (pushed by the destructor).
void GSRasterizerMT2::ThreadProc()
{
// The lock is held at the top of every iteration.
AcquireSRWLockExclusive(&m_lock);
while(true)
{
while(m_queue.empty())
{
// Idle time is accounted to this rasterizer's WorkerSleep timer slot.
GSPerfMonAutoTimer pmat(m_perfmon, GSPerfMon::WorkerSleep0 + m_id);
// Releases m_lock while sleeping and holds it again on return; signalled by Queue() and the destructor.
SleepConditionVariableSRW(&m_notempty, &m_lock, INFINITE, 0);
}
shared_ptr<GSRasterizerData> data;
// Copy the front item but leave it in the queue, so Sync() keeps seeing a
// non-empty queue while the item is being drawn.
data = m_queue.front();
// Drawing happens without the lock so Queue() can keep adding items.
ReleaseSRWLockExclusive(&m_lock);
if(data == NULL)
{
// Exit sentinel; the lock has already been released above.
break;
}
Draw(data);
AcquireSRWLockExclusive(&m_lock);
// The finished item is removed only now, after Draw() has returned.
m_queue.pop();
if(m_queue.empty())
{
// Lets a caller blocked in Sync() continue.
WakeConditionVariable(&m_empty);
}
}
}
#endif
// //
GSRasterizerList::GSRasterizerList() GSRasterizerList::GSRasterizerList()

View File

@ -103,9 +103,12 @@ public:
virtual void Sync() = 0; virtual void Sync() = 0;
}; };
#include "GSPerfMon.h"
__aligned(class, 32) GSRasterizer : public IRasterizer __aligned(class, 32) GSRasterizer : public IRasterizer
{ {
protected: protected:
GSPerfMon* m_perfmon;
IDrawScanline* m_ds; IDrawScanline* m_ds;
int m_id; int m_id;
int m_threads; int m_threads;
@ -131,7 +134,7 @@ protected:
__forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false); __forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false);
public: public:
GSRasterizer(IDrawScanline* ds, int id, int threads); GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon);
virtual ~GSRasterizer(); virtual ~GSRasterizer();
__forceinline bool IsOneOfMyScanlines(int scanline) const; __forceinline bool IsOneOfMyScanlines(int scanline) const;
@ -150,14 +153,14 @@ class GSRasterizerMT : public GSRasterizer, private GSThread
protected: protected:
volatile bool m_exit; volatile bool m_exit;
volatile bool m_break; volatile bool m_break;
GSCritSec m_lock;
GSEvent m_draw; GSEvent m_draw;
queue<shared_ptr<GSRasterizerData> > m_queue; queue<shared_ptr<GSRasterizerData> > m_queue;
GSCritSec m_lock;
void ThreadProc(); void ThreadProc();
public: public:
GSRasterizerMT(IDrawScanline* ds, int id, int threads); GSRasterizerMT(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon);
virtual ~GSRasterizerMT(); virtual ~GSRasterizerMT();
// IRasterizer // IRasterizer
@ -166,6 +169,30 @@ public:
void Sync(); void Sync();
}; };
#ifdef _WINDOWS
// Threaded rasterizer built on an SRW lock and condition variables.
// GSRasterizerList::Create uses it instead of GSRasterizerMT when
// GetVersionEx reports dwMajorVersion >= 6; it is compiled only under _WINDOWS.
class GSRasterizerMT2 : public GSRasterizer, private GSThread
{
protected:
// Guards every access to m_queue.
SRWLOCK m_lock;
// Signalled when an item has been pushed; the worker sleeps on it while the queue is empty.
CONDITION_VARIABLE m_notempty;
// Signalled by the worker when the queue becomes empty; Sync() sleeps on it.
CONDITION_VARIABLE m_empty;
// Pending work items; an empty shared_ptr tells the worker to exit.
queue<shared_ptr<GSRasterizerData> > m_queue;
// Worker loop that draws the queued items.
void ThreadProc();
public:
GSRasterizerMT2(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon);
virtual ~GSRasterizerMT2();
// IRasterizer
// Adds a work item for the worker thread.
void Queue(shared_ptr<GSRasterizerData> data);
// Blocks until the queue has been drained.
void Sync();
};
#endif
class GSRasterizerList : public IRasterizer, protected vector<GSRasterizer*> class GSRasterizerList : public IRasterizer, protected vector<GSRasterizer*>
{ {
protected: protected:
@ -177,21 +204,40 @@ protected:
public: public:
virtual ~GSRasterizerList(); virtual ~GSRasterizerList();
template<class DS> static IRasterizer* Create(int threads) template<class DS> static IRasterizer* Create(int threads, GSPerfMon* perfmon)
{ {
threads = std::max<int>(threads, 0); threads = std::max<int>(threads, 0);
if(threads == 0) if(threads == 0)
{ {
return new GSRasterizer(new DS(), 0, 1); return new GSRasterizer(new DS(), 0, 1, perfmon);
} }
else else
{ {
GSRasterizerList* rl = new GSRasterizerList(); GSRasterizerList* rl = new GSRasterizerList();
#ifdef _WINDOWS
OSVERSIONINFOEX version;
memset(&version, 0, sizeof(version));
version.dwOSVersionInfoSize = sizeof(version);
GetVersionEx((OSVERSIONINFO*)&version);
if(version.dwMajorVersion >= 6)
{
for(int i = 0; i < threads; i++)
{
rl->push_back(new GSRasterizerMT2(new DS(), i, threads, perfmon));
}
return rl;
}
#endif
for(int i = 0; i < threads; i++) for(int i = 0; i < threads; i++)
{ {
rl->push_back(new GSRasterizerMT(new DS(), i, threads)); rl->push_back(new GSRasterizerMT(new DS(), i, threads, perfmon));
} }
return rl; return rl;

View File

@ -289,7 +289,7 @@ void GSRenderer::SetVSync(bool enabled)
void GSRenderer::VSync(int field) void GSRenderer::VSync(int field)
{ {
GSPerfMonAutoTimer pmat(m_perfmon); GSPerfMonAutoTimer pmat(&m_perfmon);
m_perfmon.Put(GSPerfMon::Frame); m_perfmon.Put(GSPerfMon::Frame);

View File

@ -33,7 +33,7 @@ GSRendererSW::GSRendererSW(int threads)
memset(m_texture, 0, sizeof(m_texture)); memset(m_texture, 0, sizeof(m_texture));
m_rl = GSRasterizerList::Create<GSDrawScanline>(threads); m_rl = GSRasterizerList::Create<GSDrawScanline>(threads, &m_perfmon);
m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
@ -67,11 +67,41 @@ void GSRendererSW::Reset()
void GSRendererSW::VSync(int field) void GSRendererSW::VSync(int field)
{ {
GSRendererT<GSVertexSW>::VSync(field);
Sync(); // IncAge might delete a cached texture in use Sync(); // IncAge might delete a cached texture in use
/*
printf("CPU %d Sync %d W %d %d %d | %d %d %d | %d %d %d | %d %d %d | %d %d %d | %d %d %d | %d %d %d | %d %d %d\n",
m_perfmon.CPU(GSPerfMon::Main),
m_perfmon.CPU(GSPerfMon::Sync),
m_perfmon.CPU(GSPerfMon::WorkerSync0),
m_perfmon.CPU(GSPerfMon::WorkerSleep0),
m_perfmon.CPU(GSPerfMon::WorkerDraw0),
m_perfmon.CPU(GSPerfMon::WorkerSync1),
m_perfmon.CPU(GSPerfMon::WorkerSleep1),
m_perfmon.CPU(GSPerfMon::WorkerDraw1),
m_perfmon.CPU(GSPerfMon::WorkerSync2),
m_perfmon.CPU(GSPerfMon::WorkerSleep2),
m_perfmon.CPU(GSPerfMon::WorkerDraw2),
m_perfmon.CPU(GSPerfMon::WorkerSync3),
m_perfmon.CPU(GSPerfMon::WorkerSleep3),
m_perfmon.CPU(GSPerfMon::WorkerDraw3),
m_perfmon.CPU(GSPerfMon::WorkerSync4),
m_perfmon.CPU(GSPerfMon::WorkerSleep4),
m_perfmon.CPU(GSPerfMon::WorkerDraw4),
m_perfmon.CPU(GSPerfMon::WorkerSync5),
m_perfmon.CPU(GSPerfMon::WorkerSleep5),
m_perfmon.CPU(GSPerfMon::WorkerDraw5),
m_perfmon.CPU(GSPerfMon::WorkerSync6),
m_perfmon.CPU(GSPerfMon::WorkerSleep6),
m_perfmon.CPU(GSPerfMon::WorkerDraw6),
m_perfmon.CPU(GSPerfMon::WorkerSync7),
m_perfmon.CPU(GSPerfMon::WorkerSleep7),
m_perfmon.CPU(GSPerfMon::WorkerDraw7));
//printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0; //
printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0;
*/
GSRendererT<GSVertexSW>::VSync(field);
m_tc->IncAge(); m_tc->IncAge();
@ -265,6 +295,8 @@ void GSRendererSW::Sync()
{ {
//printf("sync\n"); //printf("sync\n");
GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync);
m_rl->Sync(); m_rl->Sync();
memset(m_tex_pages, 0, sizeof(m_tex_pages)); memset(m_tex_pages, 0, sizeof(m_tex_pages));
@ -303,7 +335,7 @@ void GSRendererSW::InvalidatePages(const GSTextureCacheSW::Texture* t)
for(size_t i = 0; i < countof(t->m_pages); i++) for(size_t i = 0; i < countof(t->m_pages); i++)
{ {
if(m_fzb_pages[i] & t->m_pages[i]) // currently begin drawn to? => sync if(m_fzb_pages[i] & t->m_pages[i]) // currently being drawn to? => sync
{ {
Sync(); Sync();

View File

@ -1477,7 +1477,7 @@ void GSState::SoftReset(uint32 mask)
void GSState::ReadFIFO(uint8* mem, int size) void GSState::ReadFIFO(uint8* mem, int size)
{ {
GSPerfMonAutoTimer pmat(m_perfmon); GSPerfMonAutoTimer pmat(&m_perfmon);
Flush(); Flush();
@ -1498,7 +1498,7 @@ template void GSState::Transfer<3>(const uint8* mem, uint32 size);
template<int index> void GSState::Transfer(const uint8* mem, uint32 size) template<int index> void GSState::Transfer(const uint8* mem, uint32 size)
{ {
GSPerfMonAutoTimer pmat(m_perfmon); GSPerfMonAutoTimer pmat(&m_perfmon);
const uint8* start = mem; const uint8* start = mem;