From 2421c68bee5b96d6870fd29a54ae450971294571 Mon Sep 17 00:00:00 2001 From: gabest11 Date: Thu, 22 Dec 2011 01:48:16 +0000 Subject: [PATCH] GSdx: Saving the conditional var update (vista or better) before I try a new idea again. That Sync() call is wasting too much time, if there was only one queue then the main thread could also grab and process elements instead of just waiting for the workers. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@5005 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/GSdx/GPURenderer.cpp | 2 +- plugins/GSdx/GPURendererSW.cpp | 2 +- plugins/GSdx/GPUState.cpp | 8 +-- plugins/GSdx/GSDrawScanline.cpp | 6 +- plugins/GSdx/GSPerfMon.cpp | 35 ++++++------ plugins/GSdx/GSPerfMon.h | 33 ++++++++--- plugins/GSdx/GSRasterizer.cpp | 97 ++++++++++++++++++++++++++++++++- plugins/GSdx/GSRasterizer.h | 58 ++++++++++++++++++-- plugins/GSdx/GSRenderer.cpp | 2 +- plugins/GSdx/GSRendererSW.cpp | 42 ++++++++++++-- plugins/GSdx/GSState.cpp | 4 +- 11 files changed, 240 insertions(+), 49 deletions(-) diff --git a/plugins/GSdx/GPURenderer.cpp b/plugins/GSdx/GPURenderer.cpp index 3cf0509f25..7b71a41d49 100644 --- a/plugins/GSdx/GPURenderer.cpp +++ b/plugins/GSdx/GPURenderer.cpp @@ -127,7 +127,7 @@ bool GPURenderer::Merge() void GPURenderer::VSync() { - GSPerfMonAutoTimer pmat(m_perfmon); + GSPerfMonAutoTimer pmat(&m_perfmon); m_perfmon.Put(GSPerfMon::Frame); diff --git a/plugins/GSdx/GPURendererSW.cpp b/plugins/GSdx/GPURendererSW.cpp index 6eb1f078e3..3a40e3988f 100644 --- a/plugins/GSdx/GPURendererSW.cpp +++ b/plugins/GSdx/GPURendererSW.cpp @@ -29,7 +29,7 @@ GPURendererSW::GPURendererSW(GSDevice* dev, int threads) { m_output = (uint32*)_aligned_malloc(m_mem.GetWidth() * m_mem.GetHeight() * sizeof(uint32), 16); - m_rl = GSRasterizerList::Create(threads); + m_rl = GSRasterizerList::Create(threads, &m_perfmon); } GPURendererSW::~GPURendererSW() diff --git a/plugins/GSdx/GPUState.cpp b/plugins/GSdx/GPUState.cpp index a657aff1a5..b1dbcb250c 100644 --- a/plugins/GSdx/GPUState.cpp +++ b/plugins/GSdx/GPUState.cpp @@ -141,7 +141,7 @@ void GPUState::Invalidate(const GSVector4i& r) void GPUState::WriteData(const uint8* mem, uint32 size) { - GSPerfMonAutoTimer pmat(m_perfmon); + GSPerfMonAutoTimer pmat(&m_perfmon); size <<= 2; @@ -165,7 +165,7 @@ void GPUState::WriteData(const uint8* mem, uint32 size) void GPUState::ReadData(uint8* mem, uint32 size) { - GSPerfMonAutoTimer pmat(m_perfmon); + GSPerfMonAutoTimer pmat(&m_perfmon); int remaining = m_read.bytes - m_read.cur; @@ -194,7 +194,7 @@ void GPUState::ReadData(uint8* mem, uint32 size) void GPUState::WriteStatus(uint32 status) { - GSPerfMonAutoTimer pmat(m_perfmon); + GSPerfMonAutoTimer pmat(&m_perfmon); uint32 b = status >> 24; @@ -205,7 +205,7 @@ void GPUState::WriteStatus(uint32 status) uint32 GPUState::ReadStatus() { - GSPerfMonAutoTimer pmat(m_perfmon); + GSPerfMonAutoTimer pmat(&m_perfmon); m_env.STATUS.LCF = ~m_env.STATUS.LCF; // ? diff --git a/plugins/GSdx/GSDrawScanline.cpp b/plugins/GSdx/GSDrawScanline.cpp index c694c09c59..ce87ed8b81 100644 --- a/plugins/GSdx/GSDrawScanline.cpp +++ b/plugins/GSdx/GSDrawScanline.cpp @@ -1411,7 +1411,7 @@ void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v) } else { - if(m == 0) + if((m & 0xffff) == 0) { DrawRectT(zbr, zbc, r, z, m); } @@ -1451,7 +1451,7 @@ void GSDrawScanline::DrawRect(const GSVector4i& r, const GSVertexSW& v) { c = ((c & 0xf8) >> 3) | ((c & 0xf800) >> 6) | ((c & 0xf80000) >> 9) | ((c & 0x80000000) >> 16); - if(m == 0) + if((m & 0xffff) == 0) { DrawRectT(fbr, fbc, r, c, m); } @@ -1477,6 +1477,8 @@ void GSDrawScanline::DrawRectT(const int* RESTRICT row, const int* RESTRICT col, mask = mask.xxzzlh(); } + if(masked) ASSERT(mask.u32[0] != 0); + color = color.andnot(mask); GSVector4i br = r.ralign(GSVector2i(8 * 4 / sizeof(T), 8)); diff --git a/plugins/GSdx/GSPerfMon.cpp b/plugins/GSdx/GSPerfMon.cpp index 724cc41325..6c5ec05238 100644 --- a/plugins/GSdx/GSPerfMon.cpp +++ b/plugins/GSdx/GSPerfMon.cpp @@ -23,14 +23,14 @@ #include "GSPerfMon.h" GSPerfMon::GSPerfMon() - : m_total(0) - , m_begin(0) - , m_frame(0) + : m_frame(0) , m_lastframe(0) , m_count(0) { memset(m_counters, 0, sizeof(m_counters)); memset(m_stats, 0, sizeof(m_stats)); + memset(m_total, 0, sizeof(m_total)); + memset(m_begin, 0, sizeof(m_begin)); } void GSPerfMon::Put(counter_t c, double val) @@ -69,32 +69,35 @@ void GSPerfMon::Update() memset(m_counters, 0, sizeof(m_counters)); } -void GSPerfMon::Start() +void GSPerfMon::Start(int timer) { - m_start = __rdtsc(); + m_start[timer] = __rdtsc(); - if(m_begin == 0) + if(m_begin[timer] == 0) { - m_begin = m_start; + m_begin[timer] = m_start[timer]; } } -void GSPerfMon::Stop() +void GSPerfMon::Stop(int timer) { - if(m_start > 0) + if(m_start[timer] > 0) { - m_total += __rdtsc() - m_start; - m_start = 0; + m_total[timer] += __rdtsc() - m_start[timer]; + m_start[timer] = 0; } } -int GSPerfMon::CPU() +int GSPerfMon::CPU(int timer, bool reset) { - int percent = (int)(100 * m_total / (__rdtsc() - m_begin)); + int percent = m_total[timer] / 1000; // (int)(100 * m_total[timer] / (__rdtsc() - m_begin[timer])); - m_begin = 0; - m_start = 0; - m_total = 0; + if(reset) + { + m_begin[timer] = 0; + m_start[timer] = 0; + m_total[timer] = 0; + } return percent; } diff --git a/plugins/GSdx/GSPerfMon.h b/plugins/GSdx/GSPerfMon.h index 4df83c13f4..53483ccc82 100644 --- a/plugins/GSdx/GSPerfMon.h +++ b/plugins/GSdx/GSPerfMon.h @@ -24,18 +24,30 @@ class GSPerfMon { public: - enum counter_t {Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, CounterLast}; + enum timer_t + { + Main, + Sync, + WorkerDraw0, WorkerDraw1, WorkerDraw2, WorkerDraw3, WorkerDraw4, WorkerDraw5, WorkerDraw6, WorkerDraw7, WorkerDraw8, WorkerDraw9, WorkerDraw10, WorkerDraw11, WorkerDraw12, WorkerDraw13, WorkerDraw14, WorkerDraw15, + WorkerSync0, WorkerSync1, WorkerSync2, WorkerSync3, WorkerSync4, WorkerSync5, WorkerSync6, WorkerSync7, WorkerSync8, WorkerSync9, WorkerSync10, WorkerSync11, WorkerSync12, WorkerSync13, WorkerSync14, WorkerSync15, + WorkerSleep0, WorkerSleep1, WorkerSleep2, WorkerSleep3, WorkerSleep4, WorkerSleep5, WorkerSleep6, WorkerSleep7, WorkerSleep8, WorkerSleep9, WorkerSleep10, WorkerSleep11, WorkerSleep12, WorkerSleep13, WorkerSleep14, WorkerSleep15, + TimerLast, + }; + + enum counter_t + { + Frame, Prim, Draw, Swizzle, Unswizzle, Fillrate, Quad, + CounterLast, + }; protected: double m_counters[CounterLast]; double m_stats[CounterLast]; - uint64 m_begin, m_total, m_start, m_frame; + uint64 m_begin[TimerLast], m_total[TimerLast], m_start[TimerLast]; + uint64 m_frame; clock_t m_lastframe; int m_count; - void Start(); - void Stop(); - friend class GSPerfMonAutoTimer; public: @@ -43,17 +55,22 @@ public: void SetFrame(uint64 frame) {m_frame = frame;} uint64 GetFrame() {return m_frame;} + void Put(counter_t c, double val = 0); double Get(counter_t c) {return m_stats[c];} void Update(); - int CPU(); + + void Start(int timer = Main); + void Stop(int timer = Main); + int CPU(int timer = Main, bool reset = true); }; class GSPerfMonAutoTimer { GSPerfMon* m_pm; + int m_timer; public: - GSPerfMonAutoTimer(GSPerfMon& pm) {(m_pm = &pm)->Start();} - ~GSPerfMonAutoTimer() {m_pm->Stop();} + GSPerfMonAutoTimer(GSPerfMon* pm, int timer = GSPerfMon::Main) {m_timer = timer; (m_pm = pm)->Start(m_timer);} + ~GSPerfMonAutoTimer() {m_pm->Stop(m_timer);} }; diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 3a32367d3b..605c32812e 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -30,10 +30,11 @@ #define THREAD_HEIGHT 4 -GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads) +GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon) : m_ds(ds) , m_id(id) , m_threads(threads) + , m_perfmon(perfmon) { m_edge.buff = (GSVertexSW*)vmalloc(sizeof(GSVertexSW) * 2048, false); m_edge.count = 0; @@ -88,6 +89,8 @@ void GSRasterizer::Queue(shared_ptr data) void GSRasterizer::Draw(shared_ptr data) { + GSPerfMonAutoTimer pmat(m_perfmon, GSPerfMon::WorkerDraw0 + m_id); + m_ds->BeginDraw(data->param); const GSVertexSW* vertices = data->vertices; @@ -763,8 +766,8 @@ void GSRasterizer::Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bo // -GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads) - : GSRasterizer(ds, id, threads) +GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon) + : GSRasterizer(ds, id, threads, perfmon) , m_exit(false) , m_break(true) { @@ -840,6 +843,94 @@ void GSRasterizerMT::ThreadProc() } } +#ifdef _WINDOWS + +GSRasterizerMT2::GSRasterizerMT2(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon) + : GSRasterizer(ds, id, threads, perfmon) +{ + InitializeSRWLock(&m_lock); + InitializeConditionVariable(&m_notempty); + InitializeConditionVariable(&m_empty); + + CreateThread(); +} + +GSRasterizerMT2::~GSRasterizerMT2() +{ + m_queue.push(shared_ptr()); + + WakeConditionVariable(&m_notempty); + + CloseThread(); +} + +void GSRasterizerMT2::Queue(shared_ptr data) +{ + AcquireSRWLockExclusive(&m_lock); + + m_queue.push(data); + + ReleaseSRWLockExclusive(&m_lock); + + WakeConditionVariable(&m_notempty); +} + +void GSRasterizerMT2::Sync() +{ + AcquireSRWLockExclusive(&m_lock); + + while(!m_queue.empty()) + { + // TODO: instead of just waiting for the workers, help finishing their queues! + // TODO: to do that, queues needs to be merged and id'ed, and threads must switch m_myscanline on the fly + + GSPerfMonAutoTimer pmat(m_perfmon, GSPerfMon::WorkerSync0 + m_id); + + SleepConditionVariableSRW(&m_empty, &m_lock, INFINITE, 0); + } + + ReleaseSRWLockExclusive(&m_lock); +} + +void GSRasterizerMT2::ThreadProc() +{ + AcquireSRWLockExclusive(&m_lock); + + while(true) + { + while(m_queue.empty()) + { + GSPerfMonAutoTimer pmat(m_perfmon, GSPerfMon::WorkerSleep0 + m_id); + + SleepConditionVariableSRW(&m_notempty, &m_lock, INFINITE, 0); + } + + shared_ptr data; + + data = m_queue.front(); + + ReleaseSRWLockExclusive(&m_lock); + + if(data == NULL) + { + break; + } + + Draw(data); + + AcquireSRWLockExclusive(&m_lock); + + m_queue.pop(); + + if(m_queue.empty()) + { + WakeConditionVariable(&m_empty); + } + } +} + +#endif + // GSRasterizerList::GSRasterizerList() diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 67b02565c9..ebbbac20bc 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -103,9 +103,12 @@ public: virtual void Sync() = 0; }; +#include "GSPerfMon.h" + __aligned(class, 32) GSRasterizer : public IRasterizer { protected: + GSPerfMon* m_perfmon; IDrawScanline* m_ds; int m_id; int m_threads; @@ -131,7 +134,7 @@ protected: __forceinline void Flush(const GSVertexSW* vertices, const GSVertexSW& dscan, bool edge = false); public: - GSRasterizer(IDrawScanline* ds, int id, int threads); + GSRasterizer(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon); virtual ~GSRasterizer(); __forceinline bool IsOneOfMyScanlines(int scanline) const; @@ -150,14 +153,14 @@ class GSRasterizerMT : public GSRasterizer, private GSThread protected: volatile bool m_exit; volatile bool m_break; + GSCritSec m_lock; GSEvent m_draw; queue > m_queue; - GSCritSec m_lock; void ThreadProc(); public: - GSRasterizerMT(IDrawScanline* ds, int id, int threads); + GSRasterizerMT(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon); virtual ~GSRasterizerMT(); // IRasterizer @@ -166,6 +169,30 @@ public: void Sync(); }; +#ifdef _WINDOWS + +class GSRasterizerMT2 : public GSRasterizer, private GSThread +{ +protected: + SRWLOCK m_lock; + CONDITION_VARIABLE m_notempty; + CONDITION_VARIABLE m_empty; + queue > m_queue; + + void ThreadProc(); + +public: + GSRasterizerMT2(IDrawScanline* ds, int id, int threads, GSPerfMon* perfmon); + virtual ~GSRasterizerMT2(); + + // IRasterizer + + void Queue(shared_ptr data); + void Sync(); +}; + +#endif + class GSRasterizerList : public IRasterizer, protected vector { protected: @@ -177,21 +204,40 @@ protected: public: virtual ~GSRasterizerList(); - template static IRasterizer* Create(int threads) + template static IRasterizer* Create(int threads, GSPerfMon* perfmon) { threads = std::max(threads, 0); if(threads == 0) { - return new GSRasterizer(new DS(), 0, 1); + return new GSRasterizer(new DS(), 0, 1, perfmon); } else { GSRasterizerList* rl = new GSRasterizerList(); + #ifdef _WINDOWS + + OSVERSIONINFOEX version; + memset(&version, 0, sizeof(version)); + version.dwOSVersionInfoSize = sizeof(version); + GetVersionEx((OSVERSIONINFO*)&version); + + if(version.dwMajorVersion >= 6) + { + for(int i = 0; i < threads; i++) + { + rl->push_back(new GSRasterizerMT2(new DS(), i, threads, perfmon)); + } + + return rl; + } + + #endif + for(int i = 0; i < threads; i++) { - rl->push_back(new GSRasterizerMT(new DS(), i, threads)); + rl->push_back(new GSRasterizerMT(new DS(), i, threads, perfmon)); } return rl; diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp index f7076e7ae5..b5d434dcf8 100644 --- a/plugins/GSdx/GSRenderer.cpp +++ b/plugins/GSdx/GSRenderer.cpp @@ -289,7 +289,7 @@ void GSRenderer::SetVSync(bool enabled) void GSRenderer::VSync(int field) { - GSPerfMonAutoTimer pmat(m_perfmon); + GSPerfMonAutoTimer pmat(&m_perfmon); m_perfmon.Put(GSPerfMon::Frame); diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 6e5dff5064..24f85bb1ff 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -33,7 +33,7 @@ GSRendererSW::GSRendererSW(int threads) memset(m_texture, 0, sizeof(m_texture)); - m_rl = GSRasterizerList::Create(threads); + m_rl = GSRasterizerList::Create(threads, &m_perfmon); m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); @@ -67,11 +67,41 @@ void GSRendererSW::Reset() void GSRendererSW::VSync(int field) { - GSRendererT::VSync(field); - Sync(); // IncAge might delete a cached texture in use + /* + printf("CPU %d Sync %d W %d %d %d | %d %d %d | %d %d %d | %d %d %d | %d %d %d | %d %d %d | %d %d %d | %d %d %d\n", + m_perfmon.CPU(GSPerfMon::Main), + m_perfmon.CPU(GSPerfMon::Sync), + m_perfmon.CPU(GSPerfMon::WorkerSync0), + m_perfmon.CPU(GSPerfMon::WorkerSleep0), + m_perfmon.CPU(GSPerfMon::WorkerDraw0), + m_perfmon.CPU(GSPerfMon::WorkerSync1), + m_perfmon.CPU(GSPerfMon::WorkerSleep1), + m_perfmon.CPU(GSPerfMon::WorkerDraw1), + m_perfmon.CPU(GSPerfMon::WorkerSync2), + m_perfmon.CPU(GSPerfMon::WorkerSleep2), + m_perfmon.CPU(GSPerfMon::WorkerDraw2), + m_perfmon.CPU(GSPerfMon::WorkerSync3), + m_perfmon.CPU(GSPerfMon::WorkerSleep3), + m_perfmon.CPU(GSPerfMon::WorkerDraw3), + m_perfmon.CPU(GSPerfMon::WorkerSync4), + m_perfmon.CPU(GSPerfMon::WorkerSleep4), + m_perfmon.CPU(GSPerfMon::WorkerDraw4), + m_perfmon.CPU(GSPerfMon::WorkerSync5), + m_perfmon.CPU(GSPerfMon::WorkerSleep5), + m_perfmon.CPU(GSPerfMon::WorkerDraw5), + m_perfmon.CPU(GSPerfMon::WorkerSync6), + m_perfmon.CPU(GSPerfMon::WorkerSleep6), + m_perfmon.CPU(GSPerfMon::WorkerDraw6), + m_perfmon.CPU(GSPerfMon::WorkerSync7), + m_perfmon.CPU(GSPerfMon::WorkerSleep7), + m_perfmon.CPU(GSPerfMon::WorkerDraw7)); - //printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0; + // + printf("m_sync_count = %d\n", ((GSRasterizerList*)m_rl)->m_sync_count); ((GSRasterizerList*)m_rl)->m_sync_count = 0; + */ + + GSRendererT::VSync(field); m_tc->IncAge(); @@ -265,6 +295,8 @@ void GSRendererSW::Sync() { //printf("sync\n"); + GSPerfMonAutoTimer pmat(&m_perfmon, GSPerfMon::Sync); + m_rl->Sync(); memset(m_tex_pages, 0, sizeof(m_tex_pages)); @@ -303,7 +335,7 @@ void GSRendererSW::InvalidatePages(const GSTextureCacheSW::Texture* t) for(size_t i = 0; i < countof(t->m_pages); i++) { - if(m_fzb_pages[i] & t->m_pages[i]) // currently begin drawn to? => sync + if(m_fzb_pages[i] & t->m_pages[i]) // currently being drawn to? => sync { Sync(); diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index f0b900e67b..9719537bcd 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -1477,7 +1477,7 @@ void GSState::SoftReset(uint32 mask) void GSState::ReadFIFO(uint8* mem, int size) { - GSPerfMonAutoTimer pmat(m_perfmon); + GSPerfMonAutoTimer pmat(&m_perfmon); Flush(); @@ -1498,7 +1498,7 @@ template void GSState::Transfer<3>(const uint8* mem, uint32 size); template void GSState::Transfer(const uint8* mem, uint32 size) { - GSPerfMonAutoTimer pmat(m_perfmon); + GSPerfMonAutoTimer pmat(&m_perfmon); const uint8* start = mem;