Merge branch 'gsdx-boost-queue'

2015-04-17 19:13:32 +02:00 · 2015-04-17 19:13:32 +02:00 · 1d70865f09
parent 545c1d387c e605ed1d09
commit 1d70865f09
15 changed files with 666 additions and 21 deletions
--- a/plugins/GSdx/GS.cpp
+++ b/plugins/GSdx/GS.cpp
@ -853,7 +853,11 @@ EXPORT_C GSgetTitleInfo2(char* dest, size_t length)
 	if(s_gs->m_GStitleInfoBuffer[0])
 	{
 #ifdef _CX11_
 		std::lock_guard<std::mutex> lock(s_gs->m_pGSsetTitle_Crit);
 #else
 		GSAutoLock lock(&s_gs->m_pGSsetTitle_Crit);
 #endif
 		s = format("GSdx | %s", s_gs->m_GStitleInfoBuffer);
--- a/plugins/GSdx/GSCapture.cpp
+++ b/plugins/GSdx/GSCapture.cpp
@ -386,7 +386,11 @@ GSCapture::~GSCapture()
 bool GSCapture::BeginCapture(float fps)
 {
-	GSAutoLock lock(this);
+#ifdef _CX11_
 	std::lock_guard<std::mutex> lock(m_lock);
 #else
 	GSAutoLock lock(&m_lock);
 #endif
 	ASSERT(fps != 0);
@ -481,7 +485,11 @@ bool GSCapture::BeginCapture(float fps)
 bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba)
 {
-	GSAutoLock lock(this);
+#ifdef _CX11_
 	std::lock_guard<std::mutex> lock(m_lock);
 #else
 	GSAutoLock lock(&m_lock);
 #endif
 	if(bits == NULL || pitch == 0)
 	{
@ -506,7 +514,11 @@ bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba)
 bool GSCapture::EndCapture()
 {
-	GSAutoLock lock(this);
+#ifdef _CX11_
 	std::lock_guard<std::mutex> lock(m_lock);
 #else
 	GSAutoLock lock(&m_lock);
 #endif
 #ifdef _WINDOWS
--- a/plugins/GSdx/GSCapture.h
+++ b/plugins/GSdx/GSCapture.h
@ -22,14 +22,21 @@
 #pragma once
 #include "GSVector.h"
 #ifndef _CX11_
 #include "GSThread.h"
 #endif
 #ifdef _WINDOWS
 #include "GSCaptureDlg.h"
 #endif
-class GSCapture : protected GSCritSec
+class GSCapture
 {
 #ifdef _CX11_
 	std::mutex m_lock;
 #else
 	GSCritSec m_lock;
 #endif
 	bool m_capturing;
 	GSVector2i m_size;
--- a/plugins/GSdx/GSLinuxDialog.cpp
+++ b/plugins/GSdx/GSLinuxDialog.cpp
@ -165,12 +165,12 @@ bool RunLinuxDialog()
 	GtkWidget *fsaa_combo_box, *render_combo_box, *filter_combo_box;
 	GtkWidget *shader, *shader_conf, *shader_label, *shader_conf_label;
-	GtkWidget *shadeboost_check, *paltex_check, *fba_check, *aa_check,  *native_res_check, *stretch_hack_check, *fxaa_check, *shaderfx_check, *align_sprite_check;
+	GtkWidget *shadeboost_check, *paltex_check, *fba_check, *aa_check,  *native_res_check, *fxaa_check, *shaderfx_check, *spin_thread_check;
 	GtkWidget *sb_contrast, *sb_brightness, *sb_saturation;
 	GtkWidget *resx_spin, *resy_spin;
 	GtkWidget *hack_table, *hack_skipdraw_label, *hack_box, *hack_frame;
-	GtkWidget *hack_alpha_check, *hack_date_check, *hack_offset_check, *hack_skipdraw_spin, *hack_sprite_check, * hack_wild_check, *hack_enble_check, *hack_logz_check;
+	GtkWidget *hack_alpha_check, *hack_date_check, *hack_offset_check, *hack_skipdraw_spin, *hack_sprite_check, * hack_wild_check, *hack_enble_check, *hack_logz_check, *align_sprite_check, *stretch_hack_check;
 	GtkWidget *hack_tco_label, *hack_tco_entry;
 	GtkWidget *gl_box, *gl_frame, *gl_table;
@ -352,6 +352,7 @@ bool RunLinuxDialog()
 	paltex_check     = gtk_check_button_new_with_label("Allow 8 bits textures");
 	fba_check        = gtk_check_button_new_with_label("Alpha correction (FBA)");
 	aa_check         = gtk_check_button_new_with_label("Edge anti-aliasing (AA1)");
 	spin_thread_check= gtk_check_button_new_with_label("Disable thread sleeping (6+ cores CPU)");
 	fxaa_check       = gtk_check_button_new_with_label("Fxaa shader");
 	shaderfx_check   = gtk_check_button_new_with_label("External shader");
@ -360,6 +361,7 @@ bool RunLinuxDialog()
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(paltex_check), theApp.GetConfig("paltex", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(fba_check), theApp.GetConfig("fba", 1));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(aa_check), theApp.GetConfig("aa1", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(spin_thread_check), theApp.GetConfig("spin_thread", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(fxaa_check), theApp.GetConfig("fxaa", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(shaderfx_check), theApp.GetConfig("shaderfx", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(native_res_check), theApp.GetConfig("nativeres", 0));
@ -414,6 +416,7 @@ bool RunLinuxDialog()
 	gtk_container_add(GTK_CONTAINER(sw_box), threads_box);
 	gtk_container_add(GTK_CONTAINER(sw_box), aa_check);
 	gtk_container_add(GTK_CONTAINER(sw_box), spin_thread_check);
 	// Tables are strange. The numbers are for their position: left, right, top, bottom.
 	gtk_table_attach_defaults(GTK_TABLE(shader_table), fxaa_check, 0, 1, 0, 1);
@ -544,6 +547,7 @@ override_GL_ARB_shading_language_420pack = -1
 		theApp.SetConfig("paltex", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(paltex_check)));
 		theApp.SetConfig("fba", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(fba_check)));
 		theApp.SetConfig("aa1", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(aa_check)));
 		theApp.SetConfig("spin_thread", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(spin_thread_check)));
 		theApp.SetConfig("fxaa", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(fxaa_check)));
 		theApp.SetConfig("shaderfx", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(shaderfx_check)));
 		theApp.SetConfig("nativeres", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(native_res_check)));
--- a/plugins/GSdx/GSLocalMemory.h
+++ b/plugins/GSdx/GSLocalMemory.h
@ -26,7 +26,9 @@
 #include "GSVector.h"
 #include "GSBlock.h"
 #include "GSClut.h"
 #ifndef _CX11_
 #include "GSThread.h"
 #endif
 class GSOffset : public GSAlignedClass<32>
 {
--- a/plugins/GSdx/GSRasterizer.cpp
+++ b/plugins/GSdx/GSRasterizer.cpp
@ -104,7 +104,7 @@ int GSRasterizer::FindMyNextScanline(int top) const
 	return top;
 }
-void GSRasterizer::Queue(shared_ptr<GSRasterizerData> data)
+void GSRasterizer::Queue(const shared_ptr<GSRasterizerData>& data)
 {
 	Draw(data.get());
 }
@ -1147,7 +1147,7 @@ GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
 GSRasterizerList::~GSRasterizerList()
 {
-	for(vector<GSWorker*>::iterator i = m_workers.begin(); i != m_workers.end(); i++)
+	for(auto i = m_workers.begin(); i != m_workers.end(); i++)
 	{
 		delete *i;
 	}
@ -1155,7 +1155,7 @@ GSRasterizerList::~GSRasterizerList()
 	_aligned_free(m_scanline);
 }
-void GSRasterizerList::Queue(shared_ptr<GSRasterizerData> data)
+void GSRasterizerList::Queue(const shared_ptr<GSRasterizerData>& data)
 {
 	GSVector4i r = data->bbox.rintersect(data->scissor);
@ -1232,3 +1232,29 @@ void GSRasterizerList::GSWorker::Process(shared_ptr<GSRasterizerData>& item)
 {
 	m_r->Draw(item.get());
 }
 // GSRasterizerList::GSWorkerSpin
 #ifdef ENABLE_BOOST
 // Creates a spinning worker bound to rasterizer `r`. The pointer is kept in
 // m_r and is deleted by ~GSWorkerSpin().
 GSRasterizerList::GSWorkerSpin::GSWorkerSpin(GSRasterizer* r)
 	: GSJobQueueSpin<shared_ptr<GSRasterizerData> >(), m_r(r)
 {
 }
 // Waits for every queued item to be processed, then frees the rasterizer.
 GSRasterizerList::GSWorkerSpin::~GSWorkerSpin()
 {
 	// Drain first: Process() dereferences m_r for each pending item.
 	this->Wait();
 	delete m_r;
 }
 // Forwards to the owned rasterizer's GetPixels(), passing `reset` through.
 int GSRasterizerList::GSWorkerSpin::GetPixels(bool reset)
 {
 	const int pixels = m_r->GetPixels(reset);
 	return pixels;
 }
 // Queue callback: draws one queued item with the owned rasterizer.
 void GSRasterizerList::GSWorkerSpin::Process(shared_ptr<GSRasterizerData>& item)
 {
 	GSRasterizerData* data = item.get();
 	m_r->Draw(data);
 }
 #endif
--- a/plugins/GSdx/GSRasterizer.h
+++ b/plugins/GSdx/GSRasterizer.h
@ -24,9 +24,13 @@
 #include "GS.h"
 #include "GSVertexSW.h"
 #include "GSFunctionMap.h"
 #include "GSThread.h"
 #include "GSAlignedClass.h"
 #include "GSPerfMon.h"
 #ifdef ENABLE_BOOST
 #include "GSThread_CXX11.h"
 #else
 #include "GSThread.h"
 #endif
 __aligned(class, 32) GSRasterizerData : public GSAlignedClass<32>
 {
@ -115,7 +119,7 @@ class IRasterizer : public GSAlignedClass<32>
 public:
 	virtual ~IRasterizer() {}
-	virtual void Queue(shared_ptr<GSRasterizerData> data) = 0;
+	virtual void Queue(const shared_ptr<GSRasterizerData>& data) = 0;
 	virtual void Sync() = 0;
 	virtual bool IsSynced() const = 0;
 	virtual int GetPixels(bool reset = true) = 0;
@ -170,7 +174,7 @@ public:
 	// IRasterizer
-	void Queue(shared_ptr<GSRasterizerData> data);
+	void Queue(const shared_ptr<GSRasterizerData>& data);
 	void Sync() {}
 	bool IsSynced() const {return true;}
 	int GetPixels(bool reset);
@ -195,8 +199,29 @@ protected:
 		void Process(shared_ptr<GSRasterizerData>& item);
 	};
 #ifdef ENABLE_BOOST
 	// Worker backed by the spinning job queue (GSJobQueueSpin). Create() picks
 	// this worker type instead of GSWorker when its spin_thread argument is true.
 	class GSWorkerSpin : public GSJobQueueSpin<shared_ptr<GSRasterizerData> >
 	{
 		// Rasterizer that performs the drawing; deleted in the destructor.
 		GSRasterizer* m_r;
 	public:
 		GSWorkerSpin(GSRasterizer* r);
 		virtual ~GSWorkerSpin();
 		// Forwards to m_r->GetPixels(reset).
 		int GetPixels(bool reset);
 		// GSJobQueue
 		// Invoked by the queue for each item; draws it with m_r.
 		void Process(shared_ptr<GSRasterizerData>& item);
 	};
 #endif
 	GSPerfMon* m_perfmon;
 #ifdef ENABLE_BOOST
 	vector<IGSJobQueue<shared_ptr<GSRasterizerData> > *> m_workers;
 #else
 	vector<GSWorker*> m_workers;
 #endif
 	uint8* m_scanline;
 	GSRasterizerList(int threads, GSPerfMon* perfmon);
@ -204,7 +229,7 @@ protected:
 public:
 	virtual ~GSRasterizerList();
-	template<class DS> static IRasterizer* Create(int threads, GSPerfMon* perfmon)
+	template<class DS> static IRasterizer* Create(int threads, GSPerfMon* perfmon, bool spin_thread = false)
 	{
 		threads = std::max<int>(threads, 0);
@ -218,7 +243,14 @@ public:
 			for(int i = 0; i < threads; i++)
 			{
 #ifdef ENABLE_BOOST
 				if (spin_thread)
 					rl->m_workers.push_back(new GSWorkerSpin(new GSRasterizer(new DS(), i, threads, perfmon)));
 				else
 					rl->m_workers.push_back(new GSWorker(new GSRasterizer(new DS(), i, threads, perfmon)));
 #else
 				rl->m_workers.push_back(new GSWorker(new GSRasterizer(new DS(), i, threads, perfmon)));
 #endif
 			}
 			return rl;
@ -227,7 +259,7 @@ public:
 	// IRasterizer
-	void Queue(shared_ptr<GSRasterizerData> data);
+	void Queue(const shared_ptr<GSRasterizerData>& data);
 	void Sync();
 	bool IsSynced() const;
 	int GetPixels(bool reset);
--- a/plugins/GSdx/GSRenderer.cpp
+++ b/plugins/GSdx/GSRenderer.cpp
@ -406,7 +406,11 @@ void GSRenderer::VSync(int field)
 			// be noticeable).  Besides, these locks are extremely short -- overhead of conditional
 			// is way more expensive than just waiting for the CriticalSection in 1 of 10,000,000 tries. --air
 #ifdef _CX11_
 			std::lock_guard<std::mutex> lock(m_pGSsetTitle_Crit);
 #else
 			GSAutoLock lock(&m_pGSsetTitle_Crit);
 #endif
 			strncpy(m_GStitleInfoBuffer, s.c_str(), countof(m_GStitleInfoBuffer) - 1);
--- a/plugins/GSdx/GSRenderer.h
+++ b/plugins/GSdx/GSRenderer.h
@ -78,7 +78,11 @@ public:
 	virtual void EndCapture();
 public:
 #ifdef _CX11_
 	std::mutex m_pGSsetTitle_Crit;
 #else
 	GSCritSec m_pGSsetTitle_Crit;
 #endif
 	char m_GStitleInfoBuffer[128];
 };
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@ -41,7 +41,8 @@ GSRendererSW::GSRendererSW(int threads)
 	memset(m_texture, 0, sizeof(m_texture));
-	m_rl = GSRasterizerList::Create<GSDrawScanline>(threads, &m_perfmon);
+	bool spin_thread = !!theApp.GetConfig("spin_thread", 0);
 	m_rl = GSRasterizerList::Create<GSDrawScanline>(threads, &m_perfmon, spin_thread);
 	m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
--- a/plugins/GSdx/GSThread.cpp
+++ b/plugins/GSdx/GSThread.cpp
@ -20,10 +20,15 @@
 */
 #include "stdafx.h"
 #ifdef ENABLE_BOOST
 #include "GSThread_CXX11.h"
 #else
 #include "GSThread.h"
 #endif
 #ifdef _WINDOWS
 #ifndef ENABLE_BOOST
 InitializeConditionVariablePtr pInitializeConditionVariable;
 WakeConditionVariablePtr pWakeConditionVariable;
 WakeAllConditionVariablePtr pWakeAllConditionVariable;
@ -65,6 +70,7 @@ public:
 };
 static InitCondVar s_icv;
 #endif
 #endif
--- a/plugins/GSdx/GSThread.h
+++ b/plugins/GSdx/GSThread.h
@ -152,9 +152,6 @@ public:
 #include <pthread.h>
 #endif
 #include <mutex>
 #include <condition_variable>
 class GSThread : public IGSThread
 {
    #ifdef _STD_THREAD_
--- a/plugins/GSdx/GSThread_CXX11.h
+++ b/plugins/GSdx/GSThread_CXX11.h
@ -0,0 +1,355 @@
 /*
 *	Copyright (C) 2007-2009 Gabest
 *	http://www.gabest.org
 *
 *  This Program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2, or (at your option)
 *  any later version.
 *
 *  This Program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with GNU Make; see the file COPYING.  If not, write to
 *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
 *  http://www.gnu.org/copyleft/gpl.html
 *
 */
 #pragma once
 #include "GSdx.h"
 #define BOOST_STAND_ALONE
 #ifdef BOOST_STAND_ALONE
 #include "boost_spsc_queue.hpp"
 #else
 #include <boost/lockfree/spsc_queue.hpp>
 #endif
 // Minimal thread interface: derived classes supply the thread body.
 class IGSThread
 {
 protected:
 	// Thread body; overridden by the job queues below with their consumer loop.
 	virtual void ThreadProc() = 0;
 };
 // Define _STD_THREAD_ below to use std::thread; leave it commented out to use pthread.
 // There are currently some bugs/limitations with std::thread (see the various comments),
 // so for the moment keep pthread but use the new std objects (mutex, condition_variable).
 //#define _STD_THREAD_
 #ifdef _WINDOWS
 // Windows flavour of the thread wrapper (selected by _WINDOWS).
 // Member functions are only declared here; their definitions are not in this header.
 class GSThread : public IGSThread
 {
    DWORD m_ThreadId;
    HANDLE m_hThread;
 	// Static entry point with the Win32 thread-procedure signature.
 	// NOTE(review): presumably lpParam is the GSThread instance -- confirm in the .cpp.
 	static DWORD WINAPI StaticThreadProc(void* lpParam);
 protected:
 	// Called by the job queues from their constructor / destructor respectively.
 	void CreateThread();
 	void CloseThread();
 public:
 	GSThread();
 	virtual ~GSThread();
 };
 #else
 #ifdef _STD_THREAD_
 #include <thread>
 #else
 #include <pthread.h>
 #endif
 // Non-Windows flavour of the thread wrapper. The thread handle is either a
 // std::thread (when _STD_THREAD_ is defined) or a pthread handle plus attributes.
 // Member functions are only declared here; their definitions are not in this header.
 class GSThread : public IGSThread
 {
    #ifdef _STD_THREAD_
    std::thread *t;
    #else
    pthread_attr_t m_thread_attr;
    pthread_t m_thread;
    #endif
    // Static entry point with the pthread start-routine signature.
    // NOTE(review): presumably param is the GSThread instance -- confirm in the .cpp.
    static void* StaticThreadProc(void* param);
 protected:
 	// Called by the job queues from their constructor / destructor respectively.
 	void CreateThread();
 	void CloseThread();
 public:
 	GSThread();
 	virtual ~GSThread();
 };
 #endif
 // To allow switching between queue dynamically
 // Common interface of the job queues in this header, so that callers can hold
 // either implementation through an IGSJobQueue<T>* pointer.
 template<class T> class IGSJobQueue : public GSThread
 {
 public:
 	IGSJobQueue() {}
 	virtual ~IGSJobQueue() {}
 	// True when no pushed item is still pending.
 	virtual bool IsEmpty() const = 0;
 	// Enqueues a copy of item for the worker thread.
 	virtual void Push(const T& item) = 0;
 	// Blocks the caller until the pending items have been processed.
 	virtual void Wait() = 0;
 	// Per-item callback, invoked by the queue implementations for each consumed item.
 	virtual void Process(T& item) = 0;
 	// Implemented by the concrete workers (not by the queues in this header).
 	virtual int GetPixels(bool reset) = 0;
 };
 // This queue doesn't reserve any thread. It would be nicer for 2c/4c CPU.
 // pros: no hard limit on thread numbers
 // cons: less performance by thread
 // Condition-variable backed job queue: the worker thread sleeps on m_notempty
 // while idle, and Wait() sleeps on m_empty until the backlog is gone.
 // m_queue is a single-producer/single-consumer ring, so Push()/Wait() are
 // presumably meant to be called from one producer thread only.
 template<class T> class GSJobQueue : public IGSJobQueue<T>
 {
 protected:
 	// Items pushed minus items accounted as consumed. May be transiently
 	// negative (see ThreadProc), hence the "<= 0" tests.
 	std::atomic<int16_t> m_count;
 	// Set by the destructor to ask the worker thread to return.
 	std::atomic<bool> m_exit;
 #ifdef BOOST_STAND_ALONE
 	ringbuffer_base<T, 256> m_queue;
 #else
 	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<255> > m_queue;
 #endif
 	// Protects the m_count / m_exit checks that are paired with the condition variables.
 	std::mutex m_lock;
 	// Signalled by the worker when the backlog reaches zero.
 	std::condition_variable m_empty;
 	// Signalled by Push() and by the destructor to wake the worker.
 	std::condition_variable m_notempty;
 	// Worker loop: sleep while idle, drain the queue without the lock, then
 	// publish the consumed count under the lock.
 	void ThreadProc() {
 		std::unique_lock<std::mutex> l(m_lock);
 		while (true) {
 			while (m_count == 0) {
 				if (m_exit.load(memory_order_acquire)) return;
 				m_notempty.wait(l);
 			}
 			l.unlock();
 			int16_t consumed = 0;
 			// Runs m_count + 1 times: the extra attempt can pick up an item
 			// that is already in the ring but not yet counted by Push().
 			for (int16_t nb = m_count; nb >= 0; nb--) {
 				if (m_queue.consume_one(*this))
 					consumed++;
 			}
 			l.lock();
 			m_count -= consumed;
 			if (m_count <= 0)
 				m_empty.notify_one();
 		}
 	}
 public:
 	// Starts the worker thread immediately.
 	GSJobQueue() :
 		m_count(0),
 		m_exit(false)
 	{
 		this->CreateThread();
 	}
 	// Requests the worker to exit and closes the thread.
 	virtual ~GSJobQueue() {
 		{
 			// Set the flag while holding m_lock. The worker tests m_exit and
 			// then waits with the lock held, so the store cannot slip between
 			// its test and its wait() and the notification cannot be lost.
 			std::lock_guard<std::mutex> l(m_lock);
 			m_exit.store(true, memory_order_release);
 		}
 		m_notempty.notify_one();
 		this->CloseThread();
 	}
 	// True when no pushed item is pending.
 	bool IsEmpty() const {
 		ASSERT(m_count >= 0);
 		return m_count == 0;
 	}
 	// Enqueues item (yielding while the ring is full), then counts it under
 	// the lock and wakes the worker.
 	void Push(const T& item) {
 		while(!m_queue.push(item))
 			std::this_thread::yield();
 		std::unique_lock<std::mutex> l(m_lock);
 		m_count++;
 		l.unlock();
 		m_notempty.notify_one();
 	}
 	// Blocks until the worker has consumed everything pushed so far.
 	void Wait() {
 		if (m_count > 0) {
 			std::unique_lock<std::mutex> l(m_lock);
 			while (m_count > 0) {
 				m_empty.wait(l);
 			}
 		}
 		ASSERT(m_count == 0);
 	}
 	// Functor interface used by m_queue.consume_one(*this).
 	void operator() (T& item) {
 		this->Process(item);
 	}
 };
 // This queue reserves 'only' RENDERING threads mostly the same performance as a no reservation queue if the CPU is fast enough
 // pros: nearly best fps by thread
 // cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU.
 // Note: I'm not sure of the source of the speedup
 //		1/ It could be related to less MT logic (lock, cond var)
 //		2/ But I highly suspect that waking up thread is rather slow.  My guess
 //		is that low power feature (like C state) increases latency. In this case
 //		gain will be smaller if PCSX2 is running or in limited core CPU (<=4)
 // Spinning job queue: the worker thread never sleeps on a condition variable;
 // it yields in a loop while idle. Only Wait() blocks (on m_empty).
 template<class T> class GSJobQueueSpin : public IGSJobQueue<T>
 {
 protected:
 	// Items pushed minus items accounted as consumed; tests use "<= 0" because
 	// the consumer may account an item before Push() has counted it.
 	std::atomic<int16_t> m_count;
 	// Set by the destructor to ask the worker thread to return.
 	std::atomic<bool> m_exit;
 #ifdef BOOST_STAND_ALONE
 	ringbuffer_base<T, 256> m_queue;
 #else
 	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<255> > m_queue;
 #endif
 	// Only used to pair the m_count decrement with waiters on m_empty.
 	std::mutex m_lock;
 	// Signalled by the worker when the backlog reaches zero.
 	std::condition_variable m_empty;
 	// Worker loop: spin (with yield) until work or exit, drain, publish the count.
 	void ThreadProc() {
 		// Lock object is created unlocked; it is only taken around the decrement.
 		std::unique_lock<std::mutex> l(m_lock, defer_lock);
 		while (true) {
 			while (m_count == 0) {
 				if (m_exit.load(memory_order_acquire)) return;
 				std::this_thread::yield();
 			}
 			int16_t consumed = 0;
 			// Runs m_count + 1 times; attempts on an empty ring simply return false.
 			for (int16_t nb = m_count; nb >= 0; nb--) {
 				if (m_queue.consume_one(*this))
 					consumed++;
 			}
 			// Decrement under the lock so a thread inside Wait() either sees the
 			// new value or is already waiting when notify_one() is issued.
 			l.lock();
 			m_count -= consumed;
 			l.unlock();
 			if (m_count <= 0)
 				m_empty.notify_one();
 		}
 	}
 public:
 	// Starts the worker thread immediately.
 	GSJobQueueSpin() :
 		m_count(0),
 		m_exit(false)
 	{
 		this->CreateThread();
 	};
 	// Requests the worker to exit (it polls m_exit while idle) and closes the thread.
 	virtual ~GSJobQueueSpin() {
 		m_exit.store(true, memory_order_release);
 		this->CloseThread();
 	}
 	// True when no pushed item is pending.
 	bool IsEmpty() const {
 		ASSERT(m_count >= 0);
 		return m_count == 0;
 	}
 	// Enqueues item (yielding while the ring is full), then counts it.
 	// No lock or notification: the worker polls m_count.
 	void Push(const T& item) {
 		while(!m_queue.push(item))
 			std::this_thread::yield();
 		m_count++;
 	}
 	// Blocks until the worker has consumed everything pushed so far.
 	void Wait() {
 		if (m_count > 0) {
 			std::unique_lock<std::mutex> l(m_lock);
 			while (m_count > 0) {
 				m_empty.wait(l);
 			}
 		}
 		ASSERT(m_count == 0);
 	}
 	// Functor interface used by m_queue.consume_one(*this).
 	void operator() (T& item) {
 		this->Process(item);
 	}
 };
 // This queue reserves RENDERING threads + GS threads onto dedicated CPU
 // pros: best fps by thread
 // cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU.
 #if 0
 // Fully spinning variant (compiled out): neither the worker nor Wait() ever
 // blocks; both sides yield in a loop and communicate through m_count only.
 template<class T> class GSJobQueue : public IGSJobQueue<T>
 {
 protected:
 	std::atomic<int16_t> m_count;
 	std::atomic<bool> m_exit;
 	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<255> > m_queue;
 	// Worker loop: spin until work or exit, then drain everything available.
 	void ThreadProc() {
 		while (true) {
 			while (m_count == 0) {
 				if (m_exit.load(memory_order_acquire)) return;
 				std::this_thread::yield();
 			}
 			m_count -= m_queue.consume_all(*this);
 		}
 	}
 public:
 	GSJobQueue() :
 		m_count(0),
 		m_exit(false)
 	{
 		CreateThread();
 	};
 	virtual ~GSJobQueue() {
 		m_exit = true;
 		CloseThread();
 	}
 	bool IsEmpty() const {
 		ASSERT(m_count >= 0);
 		return m_count == 0;
 	}
 	// Counts the item before it is enqueued, then yields until the ring accepts it.
 	void Push(const T& item) {
 		m_count++;
 		while(!m_queue.push(item))
 			std::this_thread::yield();
 	}
 	// Busy-waits (with yield) until every counted item has been consumed.
 	void Wait() {
 		while (m_count > 0)
 			std::this_thread::yield();
 		ASSERT(m_count == 0);
 	}
 	virtual void Process(T& item) = 0;
 	// Functor interface used by m_queue.consume_all(*this).
 	void operator() (T& item) {
 		this->Process(item);
 	}
 };
 #endif
--- a/plugins/GSdx/boost_spsc_queue.hpp
+++ b/plugins/GSdx/boost_spsc_queue.hpp
@ -0,0 +1,177 @@
 // This version is a stripped down version of boost/lockfree/spsc_queue.hpp boost_spsc_queue.hpp
 // Rationale
 // * Performance is better on linux than the standard std::queue
 // * Performance is the same on windows
 // => 100-200MB of dependency feel rather unfriendly
 // Potential optimization
 // * plug condition variable into the queue directly to avoid redundant m_count
 // * Restore boost optimization
 //   => unlikely or replace it with a % (if size is 2^n)
 //  lock-free single-producer/single-consumer ringbuffer
 //  this algorithm is implemented in various projects (linux kernel)
 //
 //  Copyright (C) 2009-2013 Tim Blechmann
 //
 //  Distributed under the Boost Software License, Version 1.0. (See
 //  accompanying file LICENSE_1_0.txt or copy at
 //  http://www.boost.org/LICENSE_1_0.txt)
 // Boost Software License - Version 1.0 - August 17th, 2003
 //
 // Permission is hereby granted, free of charge, to any person or organization
 // obtaining a copy of the software and accompanying documentation covered by
 // this license (the "Software") to use, reproduce, display, distribute,
 // execute, and transmit the Software, and to prepare derivative works of the
 // Software, and to permit third-parties to whom the Software is furnished to
 // do so, all subject to the following:
 //
 // The copyright notices in the Software and this entire statement, including
 // the above license grant, this restriction and the following disclaimer,
 // must be included in all copies of the Software, in whole or in part, and
 // all derivative works of the Software, unless such copies or derivative
 // works are solely in the form of machine-executable object code generated by
 // a source language processor.
 //
 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 // FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
 // SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
 // FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
 // ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 // DEALINGS IN THE SOFTWARE.
 // Fixed-size single-producer/single-consumer ring of T.
 // One slot is always left unused: push() reports "full" when the next write
 // index would equal the read index, so at most max_size - 1 items are stored.
 template <typename T, size_t max_size>
 class ringbuffer_base
 {
    static const int padding_size = 64 - sizeof(size_t);
    // Next slot to write; stored only by push() (and reset()).
    atomic<size_t> write_index_;
    char padding1[padding_size]; /* force read_index and write_index to different cache lines */
    // Next slot to read; stored only by pop()/consume_one() (and reset()).
    atomic<size_t> read_index_;
    // Raw storage for max_size elements; elements are constructed/destroyed in place.
    T *buffer;
    ringbuffer_base(ringbuffer_base const &) = delete;
    ringbuffer_base(ringbuffer_base &&)      = delete;
    const ringbuffer_base& operator=( const ringbuffer_base& ) = delete;
 public:
    ringbuffer_base(void):
        write_index_(0), read_index_(0)
    {
        // Use dynamically allocation here with no T object dependency
        // Otherwise the ringbuffer_base destructor will call the destructor
        // of T which crash if T is a (invalid) shared_ptr.
        //
        // Note another solution will be to create a char buffer as union of T
        // NOTE(review): the allocation result is not checked for failure here.
        buffer = (T*)_aligned_malloc(sizeof(T)*max_size, 32);
    }
    ~ringbuffer_base(void) {
        // destroy all remaining items
        // (pop() copies each item into 'out' and destroys the stored one, so T
        // must be default-constructible and copy-assignable.)
        T out;
        while (pop(out)) {};
        _aligned_free(buffer);
    }
    // Returns arg + 1 wrapped into [0, max_size).
    static size_t next_index(size_t arg)
    {
        size_t ret = arg + 1;
 #if 0
        while (unlikely(ret >= max_size))
 #else
        while (ret >= max_size)
 #endif
            ret -= max_size;
        return ret;
    }
    // Producer side: copy-constructs t into the next free slot.
    // Returns false, without storing anything, when the ring is full.
    bool push(T const & t)
    {
        const size_t write_index = write_index_.load(memory_order_relaxed);  // only written from push thread
        const size_t next = next_index(write_index);
        if (next == read_index_.load(memory_order_acquire))
            return false; /* ringbuffer is full */
        new (buffer + write_index) T(t); // copy-construct
        // Release store: publishes the constructed element to the consumer.
        write_index_.store(next, memory_order_release);
        return true;
    }
    // Consumer side: copy-assigns the oldest item into ret and destroys the
    // stored one. Returns false when the ring is empty.
    bool pop (T & ret)
    {
        const size_t write_index = write_index_.load(memory_order_acquire);
        const size_t read_index  = read_index_.load(memory_order_relaxed); // only written from pop thread
        if (empty(write_index, read_index))
            return false;
        ret = buffer[read_index];
        buffer[read_index].~T();
        size_t next = next_index(read_index);
        // Release store: hands the freed slot back to the producer.
        read_index_.store(next, memory_order_release);
        return true;
    }
    // Consumer side: calls f on the oldest item in place (no copy), then
    // destroys it. Returns false, without calling f, when the ring is empty.
    template <typename Functor>
    bool consume_one(Functor & f)
    {
        const size_t write_index = write_index_.load(memory_order_acquire);
        const size_t read_index  = read_index_.load(memory_order_relaxed); // only written from pop thread
        if (empty(write_index, read_index))
            return false;
        f(buffer[read_index]);
        buffer[read_index].~T();
        size_t next = next_index(read_index);
        read_index_.store(next, memory_order_release);
        return true;
    }
 public:
    /** reset the ringbuffer
     *
     * \note Not thread-safe
     * \note Only rewinds both indices; stored items are not destroyed.
     * */
    void reset(void)
    {
        write_index_.store(0, memory_order_relaxed);
        read_index_.store(0, memory_order_release);
    }
    /** Check if the ringbuffer is empty
     *
     * \return true, if the ringbuffer is empty, false otherwise
     * \note Due to the concurrent nature of the ringbuffer the result may be inaccurate.
     * */
    bool empty(void)
    {
        return empty(write_index_.load(memory_order_relaxed), read_index_.load(memory_order_relaxed));
    }
    /**
     * \return true, if implementation is lock-free.
     *
     * */
    bool is_lock_free(void) const
    {
        return write_index_.is_lock_free() && read_index_.is_lock_free();
    }
 private:
    // The ring is empty exactly when both indices are equal.
    bool empty(size_t write_index, size_t read_index)
    {
        return write_index == read_index;
    }
 };
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@ -60,6 +60,12 @@
 #endif
 // Require at least Visual Studio 2012
 #if defined(__linux__) || (defined(_MSC_VER) && (_MSC_VER >= 1700))
 #define _CX11_
 #define ENABLE_BOOST // queue is from boost but it doesn't require a full boost install
 #endif
 // put these into vc9/common7/ide/usertype.dat to have them highlighted
 typedef unsigned char uint8;
@ -96,6 +102,14 @@ typedef uint32 uptr;
 #include <set>
 #include <queue>
 #include <algorithm>
 #ifdef _CX11_
 #include <thread>
 #include <atomic>
 #endif
 #if defined(__linux__) || defined(_CX11_)
 #include <mutex>
 #include <condition_variable>
 #endif
 using namespace std;