GSdx: SW rasterizer converted to use pthreads semaphores in the place of spinwaits. Performance mileage will vary on this; probably favors dual core machines over quads or i7's. Some tinkering might ink some more fps out of it and get it to be a speedup in all cases though.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2296 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-12-03 23:08:52 +00:00
parent 2413af5b6e
commit 97041701ae
3 changed files with 1510 additions and 1465 deletions

View File

@ -24,6 +24,24 @@
#include "StdAfx.h"
#include "GSRasterizer.h"
#include "pthread.h"
// Using a spinning finish on the main (MTGS) thread is apparently a big win still, over trying
// to wait out all the pending m_finished semaphores. It leaves one spinwait in the rasterizer,
// but that's still worlds better than 2-6 spinning threads like before.
#define UseSpinningFinish 1
// Set this to 1 to remove a lot of non-const div/modulus ops from the rasterization process.
// Might likely be a measurable speedup but limits threading to 1, 2, 4, and 8 threads.
#define UseConstThreadCount 0
#if !UseConstThreadCount
// ThreadsConst - const number of threads. User-configured threads (in GSdx panel) must match
// this value if UseConstThreadCount is enabled. [yeah, it's hacky for now]
static const int ThreadsConst = 2;
static const int ThreadMaskConst = ThreadsConst-1;
#endif
GSRasterizer::GSRasterizer(IDrawScanline* ds, int id, int threads)
: m_ds(ds)
, m_id(id)
@ -36,6 +54,15 @@ GSRasterizer::~GSRasterizer()
delete m_ds;
}
__forceinline bool GSRasterizer::IsOneOfMyScanlines(int scanline) const
{
#if UseConstThreadCount
return (ThreadMaskConst==0) || ((scanline & ThreadMaskConst) == m_id);
#else
return (scanline % m_threads) == m_id;
#endif
}
void GSRasterizer::Draw(const GSRasterizerData* data)
{
m_dsf.ssl = NULL;
@ -96,7 +123,7 @@ void GSRasterizer::DrawPoint(const GSVertexSW* v, const GSVector4i& scissor)
if(scissor.left <= p.x && p.x < scissor.right && scissor.top <= p.y && p.y < scissor.bottom)
{
if((p.y % m_threads) == m_id)
if(IsOneOfMyScanlines(p.y))
{
m_dsf.ssp(v, *v);
@ -458,7 +485,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
{
do
{
if((top % m_threads) == m_id)
if(IsOneOfMyScanlines(top))
{
GSVector4 lr = l.p.xyxy(r).ceil();
@ -499,7 +526,7 @@ void GSRasterizer::DrawTriangleSection(int top, int bottom, GSVertexSW& l, const
{
do
{
if((top % m_threads) == m_id)
if(IsOneOfMyScanlines(top))
{
GSVector4 lr = l.p.ceil();
@ -586,7 +613,7 @@ void GSRasterizer::DrawSprite(const GSVertexSW* vertices, const GSVector4i& scis
for(; r.top < r.bottom; r.top++, scan.t += dedge.t)
{
if((r.top % m_threads) == m_id)
if(IsOneOfMyScanlines(r.top))
{
m_dsf.ssl(r.right, r.left, r.top, scan);
@ -661,7 +688,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
int xi = x >> 16;
int xf = x & 0xffff;
if(scissor.left <= xi && xi < scissor.right && (xi % m_threads) == m_id)
if(scissor.left <= xi && xi < scissor.right && IsOneOfMyScanlines(xi))
{
m_stats.pixels++;
@ -689,7 +716,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
int xi = (x >> 16) + 1;
int xf = x & 0xffff;
if(scissor.left <= xi && xi < scissor.right && (xi % m_threads) == m_id)
if(scissor.left <= xi && xi < scissor.right && IsOneOfMyScanlines(xi))
{
m_stats.pixels++;
@ -759,7 +786,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
int yi = y >> 16;
int yf = y & 0xffff;
if(scissor.top <= yi && yi < scissor.bottom && (yi % m_threads) == m_id)
if(scissor.top <= yi && yi < scissor.bottom && IsOneOfMyScanlines(yi))
{
m_stats.pixels++;
@ -787,7 +814,7 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
int yi = (y >> 16) + 1;
int yf = y & 0xffff;
if(scissor.top <= yi && yi < scissor.bottom && (yi % m_threads) == m_id)
if(scissor.top <= yi && yi < scissor.bottom && IsOneOfMyScanlines(yi))
{
m_stats.pixels++;
@ -811,108 +838,108 @@ void GSRasterizer::DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GS
//
GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads, long* sync)
GSRasterizerMT::GSRasterizerMT(IDrawScanline* ds, int id, int threads, sem_t& finished, volatile long& sync)
: GSRasterizer(ds, id, threads)
, m_finished(finished)
, m_sync(sync)
, m_exit(false)
, m_data(NULL)
{
if(id > 0)
{
CreateThread();
}
sem_init(&m_semaphore, false, 0);
sem_init(&m_stopped, false, 0);
CreateThread();
}
GSRasterizerMT::~GSRasterizerMT()
{
m_exit = true;
sem_post(&m_semaphore);
sem_wait(&m_stopped);
sem_destroy(&m_semaphore);
sem_destroy(&m_stopped);
}
void GSRasterizerMT::Draw(const GSRasterizerData* data)
{
if(m_id == 0)
{
__super::Draw(data);
}
else
{
m_data = data;
_interlockedbittestandset(m_sync, m_id);
}
m_data = data;
sem_post(&m_semaphore);
}
void GSRasterizerMT::ThreadProc()
{
// _mm_setcsr(MXCSR);
while(!m_exit)
while( true )
{
if(*m_sync & (1 << m_id))
{
__super::Draw(m_data);
sem_wait(&m_semaphore);
_interlockedbittestandreset(m_sync, m_id);
}
if(m_exit) break;
__super::Draw(m_data);
if( UseSpinningFinish )
_interlockedbittestandreset( &m_sync, m_id );
else
{
_mm_pause();
}
sem_post(&m_finished);
}
sem_post(&m_stopped);
}
//
GSRasterizerList::GSRasterizerList()
{
// User/Source Coding Rule 24. (M impact, ML generality) Place each
// synchronization variable alone, separated by 128 bytes or in a separate cache
// line.
m_sync = (long*)_aligned_malloc(128, 64);
*m_sync = 0;
m_threadcount = 0;
sem_init(&m_finished, false, 0);
}
GSRasterizerList::~GSRasterizerList()
{
FreeRasterizers();
_aligned_free(m_sync);
sem_destroy(&m_finished);
}
void GSRasterizerList::FreeRasterizers()
{
for_each(begin(), end(), delete_object());
for(unsigned i=0; i<size(); ++i) delete (*this)[i];
clear();
}
void GSRasterizerList::Draw(const GSRasterizerData* data)
{
*m_sync = 0;
m_stats.Reset();
int64 start = __rdtsc();
for(list<IRasterizer*>::reverse_iterator i = rbegin(); i != rend(); i++)
m_sync = m_syncstart;
for(unsigned i=1; i<size(); ++i)
{
(*i)->Draw(data);
(*this)[i]->Draw(data);
}
while(*m_sync)
(*this)[0]->Draw(data);
if( UseSpinningFinish )
{
_mm_pause();
while(m_sync) _mm_pause();
}
else
{
for(unsigned i=1; i<size(); ++i )
sem_wait(&m_finished);
}
m_stats.ticks = __rdtsc() - start;
for(list<IRasterizer*>::iterator i = begin(); i != end(); i++)
for(unsigned i=0; i<size(); ++i)
{
GSRasterizerStats s;
(*i)->GetStats(s);
(*this)[i]->GetStats(s);
m_stats.pixels += s.pixels;
m_stats.prims = max(m_stats.prims, s.prims);

View File

@ -27,6 +27,9 @@
#include "GSThread.h"
#include "GSAlignedClass.h"
#include "pthread.h"
#include "semaphore.h"
__declspec(align(16)) class GSRasterizerData
{
public:
@ -93,6 +96,8 @@ protected:
void DrawEdge(const GSVertexSW& v0, const GSVertexSW& v1, const GSVertexSW& dv, const GSVector4i& scissor, int orientation, int side);
inline bool IsOneOfMyScanlines(int scanline) const;
public:
GSRasterizer(IDrawScanline* ds, int id = 0, int threads = 0);
virtual ~GSRasterizer();
@ -106,14 +111,18 @@ public:
class GSRasterizerMT : public GSRasterizer, private GSThread
{
long* m_sync;
protected:
sem_t& m_finished;
volatile long& m_sync;
sem_t m_semaphore;
sem_t m_stopped;
bool m_exit;
const GSRasterizerData* m_data;
void ThreadProc();
public:
GSRasterizerMT(IDrawScanline* ds, int id, int threads, long* sync);
GSRasterizerMT(IDrawScanline* ds, int id, int threads, sem_t& finished, volatile long& sync);
virtual ~GSRasterizerMT();
// IRasterizer
@ -121,11 +130,14 @@ public:
void Draw(const GSRasterizerData* data);
};
class GSRasterizerList : protected list<IRasterizer*>, public IRasterizer
class GSRasterizerList : protected vector<IRasterizer*>, public IRasterizer
{
long* m_sync;
protected:
int m_threadcount;
sem_t m_finished;
volatile long m_sync;
long m_syncstart;
GSRasterizerStats m_stats;
void FreeRasterizers();
public:
@ -138,9 +150,13 @@ public:
threads = max(threads, 1); // TODO: min(threads, number of cpu cores)
for(int i = 0; i < threads; i++)
push_back(new GSRasterizer(new DS(parent, 0), 0, threads));
m_syncstart = 0;
for(int i = 1; i < threads; i++)
{
push_back(new GSRasterizerMT(new DS(parent, i), i, threads, m_sync));
push_back(new GSRasterizerMT(new DS(parent, i), i, threads, m_finished, m_sync));
_interlockedbittestandset(&m_syncstart, i);
}
}

File diff suppressed because it is too large Load Diff