GSdx: SW rasterizer converted to use pthreads semaphores in place of spinwaits. Performance mileage will vary on this; it probably favors dual-core machines over quads or i7s. Some tinkering might eke some more fps out of it and make it a speedup in all cases, though.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2296 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-12-03 23:08:52 +00:00
parent 2413af5b6e
commit 97041701ae
3 changed files with 1510 additions and 1465 deletions

View File

@ -24,6 +24,24 @@
#include "StdAfx.h" #include "StdAfx.h"
#include "GSRasterizer.h" #include "GSRasterizer.h"
#include "pthread.h"
// Build-time tuning switches for the multithreaded software rasterizer.

// UseSpinningFinish: when nonzero, the main (MTGS) thread spin-waits on the shared sync
// bitmask until every worker thread has cleared its bit, instead of waiting on the
// m_finished semaphore once per worker. Spinning here is apparently still a big win; it
// leaves one spinwait in the rasterizer, which is still far better than the 2-6 spinning
// threads used before.
#define UseSpinningFinish 1

// UseConstThreadCount: set this to 1 to remove a lot of non-const div/modulus ops from the
// rasterization process by selecting scanlines with a compile-time bitmask. Likely a
// measurable speedup, but it limits threading to 1, 2, 4, or 8 threads (the mask only
// works for power-of-two counts).
#define UseConstThreadCount 0

// The constants below must exist exactly when the switch is ON, because the masked branch
// of IsOneOfMyScanlines references ThreadMaskConst in that configuration. (The guard used
// to be inverted, which broke the build as soon as the switch was enabled.)
#if UseConstThreadCount
// ThreadsConst - const number of threads. The user-configured thread count (in the GSdx
// panel) must match this value while UseConstThreadCount is enabled. [hacky for now]
static const int ThreadsConst = 2;
// ThreadMaskConst - bitmask applied to a scanline index to find its owning thread id.
static const int ThreadMaskConst = ThreadsConst-1;
#endif
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads) GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads)
: m_ds(ds) : m_ds(ds)
, m_id(id) , m_id(id)
@ -36,6 +54,15 @@ GSRasterizer::~GSRasterizer()
delete m_ds; delete m_ds;
} }
// Decides whether this rasterizer instance should process the given line index.
// Lines are dealt out round-robin: an index belongs to the instance whose m_id equals
// the index reduced by the thread count (modulo, or bitmask in the const-count build).
// DrawEdge also passes column indices through this same test.
// NOTE(review): the modulo path divides by m_threads, and the constructor declaration
// defaults `threads` to 0 -- confirm m_threads is always non-zero by the time this runs.
__forceinline bool GSRasterizer::IsOneOfMyScanlines(int scanline) const
{
#if !UseConstThreadCount
	// Runtime thread count: general, but costs an integer division per query.
	const int owner = scanline % m_threads;
	return owner == m_id;
#else
	// Compile-time thread count: a zero mask means a single thread owns every line.
	if(ThreadMaskConst == 0) return true;
	return (scanline & ThreadMaskConst) == m_id;
#endif
}
void GSRasterizer::Draw(const GSRasterizerData* data) void GSRasterizer::Draw(const GSRasterizerData* data)
{ {
m_dsf.ssl = NULL; m_dsf.ssl = NULL;
@ -96,7 +123,7 @@ void GSRasterizer::DrawPoint(const GSVertexSW* v, const GSVector4i& scissor)
if(scissor.left <= p.x && p.x < scissor.right && scissor.top <= p.y && p.y < scissor.bottom) if(scissor.left <= p.x && p.x < scissor.right && scissor.top <= p.y && p.y < scissor.bottom)
{ {
if((p.y % m_threads) == m_id) if(IsOneOfMyScanlines(p.y))
{ {
m_dsf.ssp(v, *v); m_dsf.ssp(v, *v);
@ -458,7 +485,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
{ {
do do
{ {
if((top % m_threads) == m_id) if(IsOneOfMyScanlines(top))
{ {
GSVector4 lr = l.p.xyxy(r).ceil(); GSVector4 lr = l.p.xyxy(r).ceil();
@ -499,7 +526,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
{ {
do do
{ {
if((top % m_threads) == m_id) if(IsOneOfMyScanlines(top))
{ {
GSVector4 lr = l.p.ceil(); GSVector4 lr = l.p.ceil();
@ -586,7 +613,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scis
for(; r.top < r.bottom; r.top++, scan.t += dedge.t) for(; r.top < r.bottom; r.top++, scan.t += dedge.t)
{ {
if((r.top % m_threads) == m_id) if(IsOneOfMyScanlines(r.top))
{ {
m_dsf.ssl(r.right, r.left, r.top, scan); m_dsf.ssl(r.right, r.left, r.top, scan);
@ -661,7 +688,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
int xi = x >> 16; int xi = x >> 16;
int xf = x & 0xffff; int xf = x & 0xffff;
if(scissor.left <= xi && xi < scissor.right && (xi % m_threads) == m_id) if(scissor.left <= xi && xi < scissor.right && IsOneOfMyScanlines(xi))
{ {
m_stats.pixels++; m_stats.pixels++;
@ -689,7 +716,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
int xi = (x >> 16) + 1; int xi = (x >> 16) + 1;
int xf = x & 0xffff; int xf = x & 0xffff;
if(scissor.left <= xi && xi < scissor.right && (xi % m_threads) == m_id) if(scissor.left <= xi && xi < scissor.right && IsOneOfMyScanlines(xi))
{ {
m_stats.pixels++; m_stats.pixels++;
@ -759,7 +786,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
int yi = y >> 16; int yi = y >> 16;
int yf = y & 0xffff; int yf = y & 0xffff;
if(scissor.top <= yi && yi < scissor.bottom && (yi % m_threads) == m_id) if(scissor.top <= yi && yi < scissor.bottom && IsOneOfMyScanlines(yi))
{ {
m_stats.pixels++; m_stats.pixels++;
@ -787,7 +814,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
int yi = (y >> 16) + 1; int yi = (y >> 16) + 1;
int yf = y & 0xffff; int yf = y & 0xffff;
if(scissor.top <= yi && yi < scissor.bottom && (yi % m_threads) == m_id) if(scissor.top <= yi && yi < scissor.bottom && IsOneOfMyScanlines(yi))
{ {
m_stats.pixels++; m_stats.pixels++;
@ -811,108 +838,108 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
// //
GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads, long* sync) GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads, sem_t& finished, volatile long& sync)
: GSRasterizer(ds, id, threads) : GSRasterizer(ds, id, threads)
, m_finished(finished)
, m_sync(sync) , m_sync(sync)
, m_exit(false) , m_exit(false)
, m_data(NULL) , m_data(NULL)
{ {
if(id > 0) sem_init(&m_semaphore, false, 0);
{ sem_init(&m_stopped, false, 0);
CreateThread(); CreateThread();
}
} }
GSRasterizerMT::~GSRasterizerMT() GSRasterizerMT::~GSRasterizerMT()
{ {
m_exit = true; m_exit = true;
sem_post(&m_semaphore);
sem_wait(&m_stopped);
sem_destroy(&m_semaphore);
sem_destroy(&m_stopped);
} }
void GSRasterizerMT::Draw(const GSRasterizerData* data) void GSRasterizerMT::Draw(const GSRasterizerData* data)
{ {
if(m_id == 0)
{
__super::Draw(data);
}
else
{
m_data = data; m_data = data;
sem_post(&m_semaphore);
_interlockedbittestandset(m_sync, m_id);
}
} }
void GSRasterizerMT::ThreadProc() void GSRasterizerMT::ThreadProc()
{ {
// _mm_setcsr(MXCSR); // _mm_setcsr(MXCSR);
while(!m_exit) while( true )
{
if(*m_sync & (1 << m_id))
{ {
sem_wait(&m_semaphore);
if(m_exit) break;
__super::Draw(m_data); __super::Draw(m_data);
_interlockedbittestandreset(m_sync, m_id); if( UseSpinningFinish )
} _interlockedbittestandreset( &m_sync, m_id );
else else
{ sem_post(&m_finished);
_mm_pause();
}
} }
sem_post(&m_stopped);
} }
// //
GSRasterizerList::GSRasterizerList() GSRasterizerList::GSRasterizerList()
{ {
// User/Source Coding Rule 24. (M impact, ML generality) Place each m_threadcount = 0;
// synchronization variable alone, separated by 128 bytes or in a separate cache sem_init(&m_finished, false, 0);
// line.
m_sync = (long*)_aligned_malloc(128, 64);
*m_sync = 0;
} }
GSRasterizerList::~GSRasterizerList() GSRasterizerList::~GSRasterizerList()
{ {
FreeRasterizers(); FreeRasterizers();
sem_destroy(&m_finished);
_aligned_free(m_sync);
} }
void GSRasterizerList::FreeRasterizers() void GSRasterizerList::FreeRasterizers()
{ {
for_each(begin(), end(), delete_object()); for(unsigned i=0; i<size(); ++i) delete (*this)[i];
clear(); clear();
} }
void GSRasterizerList::Draw(const GSRasterizerData* data) void GSRasterizerList::Draw(const GSRasterizerData* data)
{ {
*m_sync = 0;
m_stats.Reset(); m_stats.Reset();
int64 start = __rdtsc(); int64 start = __rdtsc();
for(list<IRasterizer*>::reverse_iterator i = rbegin(); i != rend(); i++) m_sync = m_syncstart;
for(unsigned i=1; i<size(); ++i)
{ {
(*i)->Draw(data); (*this)[i]->Draw(data);
} }
while(*m_sync) (*this)[0]->Draw(data);
if( UseSpinningFinish )
{ {
_mm_pause(); while(m_sync) _mm_pause();
}
else
{
for(unsigned i=1; i<size(); ++i )
sem_wait(&m_finished);
} }
m_stats.ticks = __rdtsc() - start; m_stats.ticks = __rdtsc() - start;
for(list<IRasterizer*>::iterator i = begin(); i != end(); i++) for(unsigned i=0; i<size(); ++i)
{ {
GSRasterizerStats s; GSRasterizerStats s;
(*i)->GetStats(s); (*this)[i]->GetStats(s);
m_stats.pixels += s.pixels; m_stats.pixels += s.pixels;
m_stats.prims = max(m_stats.prims, s.prims); m_stats.prims = max(m_stats.prims, s.prims);

View File

@ -27,6 +27,9 @@
#include "GSThread.h" #include "GSThread.h"
#include "GSAlignedClass.h" #include "GSAlignedClass.h"
#include "pthread.h"
#include "semaphore.h"
__declspec(align(16)) class GSRasterizerData __declspec(align(16)) class GSRasterizerData
{ {
public: public:
@ -93,6 +96,8 @@ protected:
void DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, const GSVector4i& scissor, int orientation, int side); void DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, const GSVector4i& scissor, int orientation, int side);
inline bool IsOneOfMyScanlines(int scanline) const;
public: public:
GSRasterizer(IDrawScanline* ds, int id = 0, int threads = 0); GSRasterizer(IDrawScanline* ds, int id = 0, int threads = 0);
virtual ~GSRasterizer(); virtual ~GSRasterizer();
@ -106,14 +111,18 @@ public:
class GSRasterizerMT : public GSRasterizer, private GSThread class GSRasterizerMT : public GSRasterizer, private GSThread
{ {
long* m_sync; protected:
sem_t& m_finished;
volatile long& m_sync;
sem_t m_semaphore;
sem_t m_stopped;
bool m_exit; bool m_exit;
const GSRasterizerData* m_data; const GSRasterizerData* m_data;
void ThreadProc(); void ThreadProc();
public: public:
GSRasterizerMT(IDrawScanline* ds, int id, int threads, long* sync); GSRasterizerMT(IDrawScanline* ds, int id, int threads, sem_t& finished, volatile long& sync);
virtual ~GSRasterizerMT(); virtual ~GSRasterizerMT();
// IRasterizer // IRasterizer
@ -121,11 +130,14 @@ public:
void Draw(const GSRasterizerData* data); void Draw(const GSRasterizerData* data);
}; };
class GSRasterizerList : protected list<IRasterizer*>, public IRasterizer class GSRasterizerList : protected vector<IRasterizer*>, public IRasterizer
{ {
long* m_sync; protected:
int m_threadcount;
sem_t m_finished;
volatile long m_sync;
long m_syncstart;
GSRasterizerStats m_stats; GSRasterizerStats m_stats;
void FreeRasterizers(); void FreeRasterizers();
public: public:
@ -138,9 +150,13 @@ public:
threads = max(threads, 1); // TODO: min(threads, number of cpu cores) threads = max(threads, 1); // TODO: min(threads, number of cpu cores)
for(int i = 0; i < threads; i++) push_back(new GSRasterizer(new DS(parent, 0), 0, threads));
m_syncstart = 0;
for(int i = 1; i < threads; i++)
{ {
push_back(new GSRasterizerMT(new DS(parent, i), i, threads, m_sync)); push_back(new GSRasterizerMT(new DS(parent, i), i, threads, m_finished, m_sync));
_interlockedbittestandset(&m_syncstart, i);
} }
} }

File diff suppressed because it is too large Load Diff