From 8deee6afbc564d7c7d87d7e7d650be7ff70fab94 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 3 Mar 2015 10:00:43 +0100
Subject: [PATCH 01/11] gsdx: include some C++11 headers for later use

---
 plugins/GSdx/GSThread.h | 3 ---
 plugins/GSdx/stdafx.h   | 8 ++++++++
 2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/plugins/GSdx/GSThread.h b/plugins/GSdx/GSThread.h
index b97d34027e..2d10c68a77 100644
--- a/plugins/GSdx/GSThread.h
+++ b/plugins/GSdx/GSThread.h
@@ -152,9 +152,6 @@ public:
 #include <pthread.h>
 #endif
 
-#include <mutex>
-#include <condition_variable>
-
 class GSThread : public IGSThread
 {
     #ifdef _STD_THREAD_
diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h
index cda44482f1..d21c72716e 100644
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@@ -96,6 +96,14 @@ typedef uint32 uptr;
 #include <set>
 #include <queue>
 #include <algorithm>
+#ifdef _CX11_
+#include <thread>
+#include <atomic>
+#endif
+#if defined(__linux__) || defined(_CX11_)
+#include <mutex>
+#include <condition_variable>
+#endif
 
 using namespace std;
 

From 9ad5933120eab35286d3fe74239b0e3bdde9be71 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 3 Mar 2015 18:22:09 +0100
Subject: [PATCH 02/11] gsdx: Use composition instead of inheritance to support
 lock

To ease update to C++11
---
 plugins/GSdx/GSCapture.cpp | 6 +++---
 plugins/GSdx/GSCapture.h   | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/plugins/GSdx/GSCapture.cpp b/plugins/GSdx/GSCapture.cpp
index 4595760945..904bfd5a0d 100644
--- a/plugins/GSdx/GSCapture.cpp
+++ b/plugins/GSdx/GSCapture.cpp
@@ -386,7 +386,7 @@ GSCapture::~GSCapture()
 
 bool GSCapture::BeginCapture(float fps)
 {
-	GSAutoLock lock(this);
+	GSAutoLock lock(&m_lock);
 
 	ASSERT(fps != 0);
 
@@ -481,7 +481,7 @@ bool GSCapture::BeginCapture(float fps)
 
 bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba)
 {
-	GSAutoLock lock(this);
+	GSAutoLock lock(&m_lock);
 
 	if(bits == NULL || pitch == 0)
 	{
@@ -506,7 +506,7 @@ bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba)
 
 bool GSCapture::EndCapture()
 {
-	GSAutoLock lock(this);
+	GSAutoLock lock(&m_lock);
 
 #ifdef _WINDOWS
 
diff --git a/plugins/GSdx/GSCapture.h b/plugins/GSdx/GSCapture.h
index 65125c05e8..eb6e45601c 100644
--- a/plugins/GSdx/GSCapture.h
+++ b/plugins/GSdx/GSCapture.h
@@ -28,8 +28,9 @@
 #include "GSCaptureDlg.h"
 #endif
 
-class GSCapture : protected GSCritSec
+class GSCapture
 {
+	GSCritSec m_lock;
 	bool m_capturing;
 	GSVector2i m_size;
 

From a75d78bd7e3784ad9aa8bbcd371bd39def29caa7 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 3 Mar 2015 18:29:21 +0100
Subject: [PATCH 03/11] gsdx: use standard lock_guard instead of GSAutoLock

---
 plugins/GSdx/GS.cpp          |  4 ++++
 plugins/GSdx/GSCapture.cpp   | 12 ++++++++++++
 plugins/GSdx/GSCapture.h     |  6 ++++++
 plugins/GSdx/GSLocalMemory.h |  2 ++
 plugins/GSdx/GSRenderer.cpp  |  4 ++++
 plugins/GSdx/GSRenderer.h    |  4 ++++
 6 files changed, 32 insertions(+)

diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp
index 7cca083196..c09fb2596e 100644
--- a/plugins/GSdx/GS.cpp
+++ b/plugins/GSdx/GS.cpp
@@ -853,7 +853,11 @@ EXPORT_C GSgetTitleInfo2(char* dest, size_t length)
 
 	if(s_gs->m_GStitleInfoBuffer[0])
 	{
+#ifdef _CX11_
+		std::lock_guard<std::mutex> lock(s_gs->m_pGSsetTitle_Crit);
+#else
 		GSAutoLock lock(&s_gs->m_pGSsetTitle_Crit);
+#endif
 
 		s = format("GSdx | %s", s_gs->m_GStitleInfoBuffer);
 
diff --git a/plugins/GSdx/GSCapture.cpp b/plugins/GSdx/GSCapture.cpp
index 904bfd5a0d..d5c7b34615 100644
--- a/plugins/GSdx/GSCapture.cpp
+++ b/plugins/GSdx/GSCapture.cpp
@@ -386,7 +386,11 @@ GSCapture::~GSCapture()
 
 bool GSCapture::BeginCapture(float fps)
 {
+#ifdef _CX11_
+	std::lock_guard<std::mutex> lock(m_lock);
+#else
 	GSAutoLock lock(&m_lock);
+#endif
 
 	ASSERT(fps != 0);
 
@@ -481,7 +485,11 @@ bool GSCapture::BeginCapture(float fps)
 
 bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba)
 {
+#ifdef _CX11_
+	std::lock_guard<std::mutex> lock(m_lock);
+#else
 	GSAutoLock lock(&m_lock);
+#endif
 
 	if(bits == NULL || pitch == 0)
 	{
@@ -506,7 +514,11 @@ bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba)
 
 bool GSCapture::EndCapture()
 {
+#ifdef _CX11_
+	std::lock_guard<std::mutex> lock(m_lock);
+#else
 	GSAutoLock lock(&m_lock);
+#endif
 
 #ifdef _WINDOWS
 
diff --git a/plugins/GSdx/GSCapture.h b/plugins/GSdx/GSCapture.h
index eb6e45601c..f6a0d56c0e 100644
--- a/plugins/GSdx/GSCapture.h
+++ b/plugins/GSdx/GSCapture.h
@@ -22,7 +22,9 @@
 #pragma once
 
 #include "GSVector.h"
+#ifndef _CX11_
 #include "GSThread.h"
+#endif
 
 #ifdef _WINDOWS
 #include "GSCaptureDlg.h"
@@ -30,7 +32,11 @@
 
 class GSCapture
 {
+#ifdef _CX11_
+	std::mutex m_lock;
+#else
 	GSCritSec m_lock;
+#endif
 	bool m_capturing;
 	GSVector2i m_size;
 
diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h
index 70e171090f..591e654369 100644
--- a/plugins/GSdx/GSLocalMemory.h
+++ b/plugins/GSdx/GSLocalMemory.h
@@ -26,7 +26,9 @@
 #include "GSVector.h"
 #include "GSBlock.h"
 #include "GSClut.h"
+#ifndef _CX11_
 #include "GSThread.h"
+#endif
 
 class GSOffset : public GSAlignedClass<32>
 {
diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp
index 876ba5fea4..eb23a9607d 100644
--- a/plugins/GSdx/GSRenderer.cpp
+++ b/plugins/GSdx/GSRenderer.cpp
@@ -406,7 +406,11 @@ void GSRenderer::VSync(int field)
 			// be noticeable).  Besides, these locks are extremely short -- overhead of conditional
 			// is way more expensive than just waiting for the CriticalSection in 1 of 10,000,000 tries. --air
 
+#ifdef _CX11_
+			std::lock_guard<std::mutex> lock(m_pGSsetTitle_Crit);
+#else
 			GSAutoLock lock(&m_pGSsetTitle_Crit);
+#endif
 
 			strncpy(m_GStitleInfoBuffer, s.c_str(), countof(m_GStitleInfoBuffer) - 1);
 
diff --git a/plugins/GSdx/GSRenderer.h b/plugins/GSdx/GSRenderer.h
index 0a68c16c3c..ad4eb22e64 100644
--- a/plugins/GSdx/GSRenderer.h
+++ b/plugins/GSdx/GSRenderer.h
@@ -78,7 +78,11 @@ public:
 	virtual void EndCapture();
 
 public:
+#ifdef _CX11_
+	std::mutex m_pGSsetTitle_Crit;
+#else
 	GSCritSec m_pGSsetTitle_Crit;
+#endif
 
 	char m_GStitleInfoBuffer[128];
 };

From 96820614729ff832c6613883c954021031d33b26 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Tue, 3 Mar 2015 10:01:22 +0100
Subject: [PATCH 04/11] gsdx-queue: add a new job dispatcher queue based on
 boost and C++11

It is faster on Linux, requires less code, and is "portable".

It requires boost (header-only, just the .hpp files) and MSVC 2013 (for std::atomic); MSVC 2012 seems doable too.

There are currently several queue variants, which either spin (busy-wait) or fully sleep while waiting for work.
---
 plugins/GSdx/GSRasterizer.h   |   6 +-
 plugins/GSdx/GSThread.cpp     |   6 +
 plugins/GSdx/GSThread_CXX11.h | 339 ++++++++++++++++++++++++++++++++++
 3 files changed, 350 insertions(+), 1 deletion(-)
 create mode 100644 plugins/GSdx/GSThread_CXX11.h

diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h
index 998a744512..bd86546ec0 100644
--- a/plugins/GSdx/GSRasterizer.h
+++ b/plugins/GSdx/GSRasterizer.h
@@ -24,9 +24,13 @@
 #include "GS.h"
 #include "GSVertexSW.h"
 #include "GSFunctionMap.h"
-#include "GSThread.h"
 #include "GSAlignedClass.h"
 #include "GSPerfMon.h"
+#ifdef ENABLE_BOOST
+#include "GSThread_CXX11.h"
+#else
+#include "GSThread.h"
+#endif
 
 __aligned(class, 32) GSRasterizerData : public GSAlignedClass<32>
 {
diff --git a/plugins/GSdx/GSThread.cpp b/plugins/GSdx/GSThread.cpp
index 9e63c055ed..0b2588feda 100644
--- a/plugins/GSdx/GSThread.cpp
+++ b/plugins/GSdx/GSThread.cpp
@@ -20,10 +20,15 @@
  */
 
 #include "stdafx.h"
+#ifdef ENABLE_BOOST
+#include "GSThread_CXX11.h"
+#else
 #include "GSThread.h"
+#endif
 
 #ifdef _WINDOWS
 
+#ifndef ENABLE_BOOST
 InitializeConditionVariablePtr pInitializeConditionVariable;
 WakeConditionVariablePtr pWakeConditionVariable;
 WakeAllConditionVariablePtr pWakeAllConditionVariable;
@@ -65,6 +70,7 @@ public:
 };
 
 static InitCondVar s_icv;
+#endif
 
 #endif
 
diff --git a/plugins/GSdx/GSThread_CXX11.h b/plugins/GSdx/GSThread_CXX11.h
new file mode 100644
index 0000000000..33a87a4906
--- /dev/null
+++ b/plugins/GSdx/GSThread_CXX11.h
@@ -0,0 +1,339 @@
+/*
+ *	Copyright (C) 2007-2009 Gabest
+ *	http://www.gabest.org
+ *
+ *  This Program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This Program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with GNU Make; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA USA.
+ *  http://www.gnu.org/copyleft/gpl.html
+ *
+ */
+
+#pragma once
+
+#include "GSdx.h"
+#include <boost/lockfree/spsc_queue.hpp>
+
+class IGSThread
+{
+protected:
+	virtual void ThreadProc() = 0;
+};
+
+// let us use std::thread for now, comment out the definition to go back to pthread
+// There are currently some bugs/limitations to std::thread (see various comment)
+// For the moment let's keep pthread but uses new std object (mutex, cond_var)
+//#define _STD_THREAD_
+
+#ifdef _WINDOWS
+
+class GSThread : public IGSThread
+{
+    DWORD m_ThreadId;
+    HANDLE m_hThread;
+
+	static DWORD WINAPI StaticThreadProc(void* lpParam);
+
+protected:
+	void CreateThread();
+	void CloseThread();
+
+public:
+	GSThread();
+	virtual ~GSThread();
+};
+
+#else
+
+#ifdef _STD_THREAD_
+#include <thread>
+#else
+#include <pthread.h>
+#endif
+
+class GSThread : public IGSThread
+{
+    #ifdef _STD_THREAD_
+    std::thread *t;
+    #else
+    pthread_attr_t m_thread_attr;
+    pthread_t m_thread;
+    #endif
+    static void* StaticThreadProc(void* param);
+
+protected:
+	void CreateThread();
+	void CloseThread();
+
+public:
+	GSThread();
+	virtual ~GSThread();
+};
+
+#endif
+
+// Activate only a single define (From the lowest latency to better CPU usage)
+
+// This queue locks RENDERING threads + GS threads onto dedicated CPU
+// pros: best fps by thread
+// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU.
+//#define NO_WAIT_BUT_CPU_INTENSIVE
+
+// This queue locks 'only' RENDERING threads mostly the same performance as above it the CPU is fast enough
+// pros: nearly best fps by thread
+// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU.
+//#define WAIT_ON_GS_STILL_CPU_INTENSIVE
+
+// This queue doesn't lock any thread. It would be nicer for 2c/4c CPU.
+// pros: no hard limit on thread numbers
+// cons: less performance by thread
+#define FULL_WAIT_LESS_CPU_INTENSIVE
+
+#if defined(FULL_WAIT_LESS_CPU_INTENSIVE)
+
+template<class T> class GSJobQueue : private GSThread
+{
+protected:
+	std::atomic<int16_t> m_count;
+	std::atomic<bool> m_exit;
+	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<256> > m_queue;
+
+	std::mutex m_lock;
+	std::condition_variable m_empty;
+	std::condition_variable m_notempty;
+
+	void ThreadProc() {
+		std::unique_lock<std::mutex> l(m_lock);
+
+		while (true) {
+
+			while (m_count == 0) {
+				if (m_exit.load(memory_order_acquire)) return;
+				m_notempty.wait(l);
+			}
+
+			l.unlock();
+
+			int16_t consumed = 0;
+			for (int16_t nb = m_count; nb >= 0; nb--) {
+				if (m_queue.consume_one(*this))
+					consumed++;
+			}
+
+			l.lock();
+
+			m_count -= consumed;
+
+			if (m_count <= 0)
+				m_empty.notify_one();
+
+		}
+	}
+
+public:
+	GSJobQueue() :
+		m_count(0),
+		m_exit(false)
+	{
+		CreateThread();
+	};
+
+	virtual ~GSJobQueue() {
+		m_exit = true;
+		m_notempty.notify_one();
+		CloseThread();
+	}
+
+	bool IsEmpty() const {
+		ASSERT(m_count >= 0);
+
+		return m_count == 0;
+	}
+
+	void Push(const T& item) {
+		while(!m_queue.push(item))
+			std::this_thread::yield();
+
+		std::unique_lock<std::mutex> l(m_lock);
+
+		m_count++;
+
+		l.unlock();
+
+		m_notempty.notify_one();
+	}
+
+	void Wait() {
+		if (m_count > 0) {
+			std::unique_lock<std::mutex> l(m_lock);
+			while (m_count > 0) {
+				m_empty.wait(l);
+			}
+		}
+
+		ASSERT(m_count == 0);
+	}
+
+	virtual void Process(T& item) = 0;
+
+	void operator()(T& item) {
+		Process(item);
+	}
+};
+
+#elif defined(WAIT_ON_GS_STILL_CPU_INTENSIVE)
+
+template<class T> class GSJobQueue : private GSThread
+{
+protected:
+	std::atomic<int16_t> m_count;
+	std::atomic<bool> m_exit;
+	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<256> > m_queue;
+
+	std::mutex m_lock;
+	std::condition_variable m_empty;
+
+	void ThreadProc() {
+		std::unique_lock<std::mutex> l(m_lock, defer_lock);
+
+		while (true) {
+
+			while (m_count == 0) {
+				if (m_exit.load(memory_order_acquire)) return;
+				std::this_thread::yield();
+			}
+
+			int16_t consumed = 0;
+			for (int16_t nb = m_count; nb >= 0; nb--) {
+				if (m_queue.consume_one(*this))
+					consumed++;
+			}
+
+			l.lock();
+
+			m_count -= consumed;
+
+			l.unlock();
+
+			if (m_count <= 0)
+				m_empty.notify_one();
+
+		}
+	}
+
+public:
+	GSJobQueue() :
+		m_count(0),
+		m_exit(false)
+	{
+		CreateThread();
+	};
+
+	virtual ~GSJobQueue() {
+		m_exit = true;
+		CloseThread();
+	}
+
+	bool IsEmpty() const {
+		ASSERT(m_count >= 0);
+
+		return m_count == 0;
+	}
+
+	void Push(const T& item) {
+		while(!m_queue.push(item))
+			std::this_thread::yield();
+
+		m_count++;
+	}
+
+	void Wait() {
+		if (m_count > 0) {
+			std::unique_lock<std::mutex> l(m_lock);
+			while (m_count > 0) {
+				m_empty.wait(l);
+			}
+		}
+
+		ASSERT(m_count == 0);
+	}
+
+	virtual void Process(T& item) = 0;
+
+	void operator()(T& item) {
+		Process(item);
+	}
+};
+
+#elif defined(NO_WAIT_BUT_CPU_INTENSIVE)
+
+template<class T> class GSJobQueue : private GSThread
+{
+protected:
+	std::atomic<int16_t> m_count;
+	std::atomic<bool> m_exit;
+	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<256> > m_queue;
+
+	void ThreadProc() {
+		while (true) {
+			while (m_count == 0) {
+				if (m_exit.load(memory_order_acquire)) return;
+				std::this_thread::yield();
+			}
+
+			m_count -= m_queue.consume_all(*this);
+		}
+	}
+
+public:
+	GSJobQueue() :
+		m_count(0),
+		m_exit(false)
+	{
+		CreateThread();
+	};
+
+	virtual ~GSJobQueue() {
+		m_exit = true;
+		CloseThread();
+	}
+
+	bool IsEmpty() const {
+		ASSERT(m_count >= 0);
+
+		return m_count == 0;
+	}
+
+	void Push(const T& item) {
+		m_count++;
+		while(!m_queue.push(item))
+			std::this_thread::yield();
+	}
+
+	void Wait() {
+		while (m_count > 0)
+			std::this_thread::yield();
+
+		ASSERT(m_count == 0);
+	}
+
+	virtual void Process(T& item) = 0;
+
+	void operator()(T& item) {
+		Process(item);
+	}
+};
+
+#else
+	#very bad
+#endif

From 0aac47ca59196512cab23046bb76cb28e717c7b6 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Fri, 13 Mar 2015 19:52:04 +0100
Subject: [PATCH 05/11] gsdx-queue: add a new option "spin_thread" to select
 the queue behavior at runtime

If someone has a more elegant solution, feel free to share it

spin_thread = 0
spin_thread = 1 // the fastest, but the GS thread never goes idle; very bad for laptops
---
 plugins/GSdx/GSRasterizer.cpp | 34 +++++++++++++++--
 plugins/GSdx/GSRasterizer.h   | 30 ++++++++++++++-
 plugins/GSdx/GSRendererSW.cpp |  3 +-
 plugins/GSdx/GSThread_CXX11.h | 70 +++++++++++++++++------------------
 4 files changed, 96 insertions(+), 41 deletions(-)

diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp
index 3462ab4082..e957b7ae76 100644
--- a/plugins/GSdx/GSRasterizer.cpp
+++ b/plugins/GSdx/GSRasterizer.cpp
@@ -1147,7 +1147,7 @@ GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon)
 
 GSRasterizerList::~GSRasterizerList()
 {
-	for(vector<GSWorker*>::iterator i = m_workers.begin(); i != m_workers.end(); i++)
+	for(auto i = m_workers.begin(); i != m_workers.end(); i++)
 	{
 		delete *i;
 	}
@@ -1210,13 +1210,13 @@ int GSRasterizerList::GetPixels(bool reset)
 
 // GSRasterizerList::GSWorker
 
-GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r) 
+GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r)
 	: GSJobQueue<shared_ptr<GSRasterizerData> >()
 	, m_r(r)
 {
 }
 
-GSRasterizerList::GSWorker::~GSWorker() 
+GSRasterizerList::GSWorker::~GSWorker()
 {
 	Wait();
 
@@ -1228,7 +1228,33 @@ int GSRasterizerList::GSWorker::GetPixels(bool reset)
 	return m_r->GetPixels(reset);
 }
 
-void GSRasterizerList::GSWorker::Process(shared_ptr<GSRasterizerData>& item) 
+void GSRasterizerList::GSWorker::Process(shared_ptr<GSRasterizerData>& item)
 {
 	m_r->Draw(item.get());
 }
+
+// GSRasterizerList::GSWorkerSpin
+#ifdef ENABLE_BOOST
+GSRasterizerList::GSWorkerSpin::GSWorkerSpin(GSRasterizer* r)
+	: GSJobQueueSpin<shared_ptr<GSRasterizerData> >()
+	, m_r(r)
+{
+}
+
+GSRasterizerList::GSWorkerSpin::~GSWorkerSpin()
+{
+	Wait();
+
+	delete m_r;
+}
+
+int GSRasterizerList::GSWorkerSpin::GetPixels(bool reset)
+{
+	return m_r->GetPixels(reset);
+}
+
+void GSRasterizerList::GSWorkerSpin::Process(shared_ptr<GSRasterizerData>& item)
+{
+	m_r->Draw(item.get());
+}
+#endif
diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h
index bd86546ec0..8fcc66255c 100644
--- a/plugins/GSdx/GSRasterizer.h
+++ b/plugins/GSdx/GSRasterizer.h
@@ -199,8 +199,29 @@ protected:
 		void Process(shared_ptr<GSRasterizerData>& item);
 	};
 
+#ifdef ENABLE_BOOST
+	class GSWorkerSpin : public GSJobQueueSpin<shared_ptr<GSRasterizerData> >
+	{
+		GSRasterizer* m_r;
+
+	public:
+		GSWorkerSpin(GSRasterizer* r);
+		virtual ~GSWorkerSpin();
+
+		int GetPixels(bool reset);
+
+		// GSJobQueue
+
+		void Process(shared_ptr<GSRasterizerData>& item);
+	};
+#endif
+
 	GSPerfMon* m_perfmon;
+#ifdef ENABLE_BOOST
+	vector<IGSJobQueue<shared_ptr<GSRasterizerData> > *> m_workers;
+#else
 	vector<GSWorker*> m_workers;
+#endif
 	uint8* m_scanline;
 
 	GSRasterizerList(int threads, GSPerfMon* perfmon);
@@ -208,7 +229,7 @@ protected:
 public:
 	virtual ~GSRasterizerList();
 
-	template<class DS> static IRasterizer* Create(int threads, GSPerfMon* perfmon)
+	template<class DS> static IRasterizer* Create(int threads, GSPerfMon* perfmon, bool spin_thread = false)
 	{
 		threads = std::max<int>(threads, 0);
 
@@ -222,7 +243,14 @@ public:
 
 			for(int i = 0; i < threads; i++)
 			{
+#ifdef ENABLE_BOOST
+				if (spin_thread)
+					rl->m_workers.push_back(new GSWorkerSpin(new GSRasterizer(new DS(), i, threads, perfmon)));
+				else
+					rl->m_workers.push_back(new GSWorker(new GSRasterizer(new DS(), i, threads, perfmon)));
+#else
 				rl->m_workers.push_back(new GSWorker(new GSRasterizer(new DS(), i, threads, perfmon)));
+#endif
 			}
 
 			return rl;
diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp
index 83fd2402f3..3dfa829c57 100644
--- a/plugins/GSdx/GSRendererSW.cpp
+++ b/plugins/GSdx/GSRendererSW.cpp
@@ -41,7 +41,8 @@ GSRendererSW::GSRendererSW(int threads)
 
 	memset(m_texture, 0, sizeof(m_texture));
 
-	m_rl = GSRasterizerList::Create<GSDrawScanline>(threads, &m_perfmon);
+	bool spin_thread = !!theApp.GetConfig("spin_thread", 0);
+	m_rl = GSRasterizerList::Create<GSDrawScanline>(threads, &m_perfmon, spin_thread);
 
 	m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32);
 
diff --git a/plugins/GSdx/GSThread_CXX11.h b/plugins/GSdx/GSThread_CXX11.h
index 33a87a4906..4cf7264d34 100644
--- a/plugins/GSdx/GSThread_CXX11.h
+++ b/plugins/GSdx/GSThread_CXX11.h
@@ -82,26 +82,25 @@ public:
 
 #endif
 
-// Activate only a single define (From the lowest latency to better CPU usage)
+// To allow switching between queue dynamically
+template<class T> class IGSJobQueue : public GSThread
+{
+public:
+	IGSJobQueue() {}
+	virtual ~IGSJobQueue() {}
 
-// This queue locks RENDERING threads + GS threads onto dedicated CPU
-// pros: best fps by thread
-// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU.
-//#define NO_WAIT_BUT_CPU_INTENSIVE
+	virtual bool IsEmpty() const = 0;
+	virtual void Push(const T& item) = 0;
+	virtual void Wait() = 0;
 
-// This queue locks 'only' RENDERING threads mostly the same performance as above it the CPU is fast enough
-// pros: nearly best fps by thread
-// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU.
-//#define WAIT_ON_GS_STILL_CPU_INTENSIVE
+	virtual void Process(T& item) = 0;
+	virtual int GetPixels(bool reset) = 0;
+};
 
 // This queue doesn't lock any thread. It would be nicer for 2c/4c CPU.
 // pros: no hard limit on thread numbers
 // cons: less performance by thread
-#define FULL_WAIT_LESS_CPU_INTENSIVE
-
-#if defined(FULL_WAIT_LESS_CPU_INTENSIVE)
-
-template<class T> class GSJobQueue : private GSThread
+template<class T> class GSJobQueue : public IGSJobQueue<T>
 {
 protected:
 	std::atomic<int16_t> m_count;
@@ -145,13 +144,13 @@ public:
 		m_count(0),
 		m_exit(false)
 	{
-		CreateThread();
-	};
+		this->CreateThread();
+	}
 
 	virtual ~GSJobQueue() {
 		m_exit = true;
 		m_notempty.notify_one();
-		CloseThread();
+		this->CloseThread();
 	}
 
 	bool IsEmpty() const {
@@ -184,16 +183,16 @@ public:
 		ASSERT(m_count == 0);
 	}
 
-	virtual void Process(T& item) = 0;
-
-	void operator()(T& item) {
-		Process(item);
+	void operator() (T& item) {
+		this->Process(item);
 	}
 };
 
-#elif defined(WAIT_ON_GS_STILL_CPU_INTENSIVE)
 
-template<class T> class GSJobQueue : private GSThread
+// This queue locks 'only' RENDERING threads mostly the same performance as above if the CPU is fast enough
+// pros: nearly best fps by thread
+// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU.
+template<class T> class GSJobQueueSpin : public IGSJobQueue<T>
 {
 protected:
 	std::atomic<int16_t> m_count;
@@ -232,16 +231,16 @@ protected:
 	}
 
 public:
-	GSJobQueue() :
+	GSJobQueueSpin() :
 		m_count(0),
 		m_exit(false)
 	{
-		CreateThread();
+		this->CreateThread();
 	};
 
-	virtual ~GSJobQueue() {
+	virtual ~GSJobQueueSpin() {
 		m_exit = true;
-		CloseThread();
+		this->CloseThread();
 	}
 
 	bool IsEmpty() const {
@@ -270,14 +269,17 @@ public:
 
 	virtual void Process(T& item) = 0;
 
-	void operator()(T& item) {
-		Process(item);
+	void operator() (T& item) {
+		this->Process(item);
 	}
 };
 
-#elif defined(NO_WAIT_BUT_CPU_INTENSIVE)
+// This queue locks RENDERING threads + GS threads onto dedicated CPU
+// pros: best fps by thread
+// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU.
+#if 0
 
-template<class T> class GSJobQueue : private GSThread
+template<class T> class GSJobQueue : public IGSJobQueue<T>
 {
 protected:
 	std::atomic<int16_t> m_count;
@@ -329,11 +331,9 @@ public:
 
 	virtual void Process(T& item) = 0;
 
-	void operator()(T& item) {
-		Process(item);
+	void operator() (T& item) {
+		this->Process(item);
 	}
 };
 
-#else
-	#very bad
 #endif

From c9194301a0b02ce30fcc124310728a3e4d36efd0 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Fri, 13 Mar 2015 19:59:31 +0100
Subject: [PATCH 06/11] gsdx-queue: (linux) add a GUI option to select the
 queue

---
 plugins/GSdx/GSLinuxDialog.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/plugins/GSdx/GSLinuxDialog.cpp b/plugins/GSdx/GSLinuxDialog.cpp
index ba652bd46d..1e9686d5fd 100644
--- a/plugins/GSdx/GSLinuxDialog.cpp
+++ b/plugins/GSdx/GSLinuxDialog.cpp
@@ -165,12 +165,12 @@ bool RunLinuxDialog()
 
 	GtkWidget *fsaa_combo_box, *render_combo_box, *filter_combo_box;
 	GtkWidget *shader, *shader_conf, *shader_label, *shader_conf_label;
-	GtkWidget *shadeboost_check, *paltex_check, *fba_check, *aa_check,  *native_res_check, *stretch_hack_check, *fxaa_check, *shaderfx_check, *align_sprite_check;
+	GtkWidget *shadeboost_check, *paltex_check, *fba_check, *aa_check,  *native_res_check, *fxaa_check, *shaderfx_check, *spin_thread_check;
 	GtkWidget *sb_contrast, *sb_brightness, *sb_saturation;
 	GtkWidget *resx_spin, *resy_spin;
 
 	GtkWidget *hack_table, *hack_skipdraw_label, *hack_box, *hack_frame;
-	GtkWidget *hack_alpha_check, *hack_date_check, *hack_offset_check, *hack_skipdraw_spin, *hack_sprite_check, * hack_wild_check, *hack_enble_check, *hack_logz_check;
+	GtkWidget *hack_alpha_check, *hack_date_check, *hack_offset_check, *hack_skipdraw_spin, *hack_sprite_check, * hack_wild_check, *hack_enble_check, *hack_logz_check, *align_sprite_check, *stretch_hack_check;
 	GtkWidget *hack_tco_label, *hack_tco_entry;
 	GtkWidget *gl_box, *gl_frame, *gl_table;
 
@@ -352,6 +352,7 @@ bool RunLinuxDialog()
 	paltex_check     = gtk_check_button_new_with_label("Allow 8 bits textures");
 	fba_check        = gtk_check_button_new_with_label("Alpha correction (FBA)");
 	aa_check         = gtk_check_button_new_with_label("Edge anti-aliasing (AA1)");
+	spin_thread_check= gtk_check_button_new_with_label("Disable thread sleeping (6+ cores CPU)");
 	fxaa_check       = gtk_check_button_new_with_label("Fxaa shader");
 	shaderfx_check   = gtk_check_button_new_with_label("External shader");
 
@@ -360,6 +361,7 @@ bool RunLinuxDialog()
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(paltex_check), theApp.GetConfig("paltex", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(fba_check), theApp.GetConfig("fba", 1));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(aa_check), theApp.GetConfig("aa1", 0));
+	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(spin_thread_check), theApp.GetConfig("spin_thread", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(fxaa_check), theApp.GetConfig("fxaa", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(shaderfx_check), theApp.GetConfig("shaderfx", 0));
 	gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(native_res_check), theApp.GetConfig("nativeres", 0));
@@ -414,6 +416,7 @@ bool RunLinuxDialog()
 
 	gtk_container_add(GTK_CONTAINER(sw_box), threads_box);
 	gtk_container_add(GTK_CONTAINER(sw_box), aa_check);
+	gtk_container_add(GTK_CONTAINER(sw_box), spin_thread_check);
 
 	// Tables are strange. The numbers are for their position: left, right, top, bottom.
 	gtk_table_attach_defaults(GTK_TABLE(shader_table), fxaa_check, 0, 1, 0, 1);
@@ -544,6 +547,7 @@ override_GL_ARB_shading_language_420pack = -1
 		theApp.SetConfig("paltex", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(paltex_check)));
 		theApp.SetConfig("fba", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(fba_check)));
 		theApp.SetConfig("aa1", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(aa_check)));
+		theApp.SetConfig("spin_thread", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(spin_thread_check)));
 		theApp.SetConfig("fxaa", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(fxaa_check)));
 		theApp.SetConfig("shaderfx", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(shaderfx_check)));
 		theApp.SetConfig("nativeres", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(native_res_check)));

From 90794c302a6dc9c07525a283117c2e9a01adc6ef Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Sat, 21 Mar 2015 15:39:31 +0100
Subject: [PATCH 07/11] gsdx-queue: import spsc_queue of boost

I removed 80% of the file to keep only the ring buffer core.

Same speed as boost, but without the boost dependency.
---
 plugins/GSdx/boost_spsc_queue.hpp | 177 ++++++++++++++++++++++++++++++
 1 file changed, 177 insertions(+)
 create mode 100644 plugins/GSdx/boost_spsc_queue.hpp

diff --git a/plugins/GSdx/boost_spsc_queue.hpp b/plugins/GSdx/boost_spsc_queue.hpp
new file mode 100644
index 0000000000..c1104a5de7
--- /dev/null
+++ b/plugins/GSdx/boost_spsc_queue.hpp
@@ -0,0 +1,177 @@
+// This version is a stripped down version of boost/lockfree/spsc_queue.hpp boost_spsc_queue.hpp
+// Rational
+// * Performance is better on linux than the standard std::queue
+// * Performance in the same on windows
+// => 100-200MB of dependency feel rather unfriendly
+
+// Potential optimization
+// * plug condition variable into the queue directly to avoid redundant m_count
+
+// * Restore boost optimization
+//   => unlikely or replace it with a % (if size is 2^n)
+
+
+//  lock-free single-producer/single-consumer ringbuffer
+//  this algorithm is implemented in various projects (linux kernel)
+//
+//  Copyright (C) 2009-2013 Tim Blechmann
+//
+//  Distributed under the Boost Software License, Version 1.0. (See
+//  accompanying file LICENSE_1_0.txt or copy at
+//  http://www.boost.org/LICENSE_1_0.txt)
+
+// Boost Software License - Version 1.0 - August 17th, 2003
+//
+// Permission is hereby granted, free of charge, to any person or organization
+// obtaining a copy of the software and accompanying documentation covered by
+// this license (the "Software") to use, reproduce, display, distribute,
+// execute, and transmit the Software, and to prepare derivative works of the
+// Software, and to permit third-parties to whom the Software is furnished to
+// do so, all subject to the following:
+//
+// The copyright notices in the Software and this entire statement, including
+// the above license grant, this restriction and the following disclaimer,
+// must be included in all copies of the Software, in whole or in part, and
+// all derivative works of the Software, unless such copies or derivative
+// works are solely in the form of machine-executable object code generated by
+// a source language processor.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+
+template <typename T, size_t max_size>
+class ringbuffer_base
+{
+    static const int padding_size = 64 - sizeof(size_t);
+
+    atomic<size_t> write_index_;
+    char padding1[padding_size]; /* force read_index and write_index to different cache lines */
+    atomic<size_t> read_index_;
+
+    T *buffer;
+
+    ringbuffer_base(ringbuffer_base const &) = delete;
+    ringbuffer_base(ringbuffer_base &&)      = delete;
+    const ringbuffer_base& operator=( const ringbuffer_base& ) = delete;
+
+public:
+    ringbuffer_base(void):
+        write_index_(0), read_index_(0)
+    {
+        // Use dynamically allocation here with no T object dependency
+        // Otherwise the ringbuffer_base destructor will call the destructor
+        // of T which crash if T is a (invalid) shared_ptr.
+        //
+        // Note another solution will be to create a char buffer as union of T
+        buffer = (T*)_aligned_malloc(sizeof(T)*max_size, 32);
+    }
+
+    ~ringbuffer_base(void) {
+        // destroy all remaining items
+        T out;
+        while (pop(out)) {};
+
+        _aligned_free(buffer);
+    }
+
+
+    static size_t next_index(size_t arg)
+    {
+        size_t ret = arg + 1;
+#if 0
+        while (unlikely(ret >= max_size))
+#else
+        while (ret >= max_size)
+#endif
+            ret -= max_size;
+        return ret;
+    }
+
+    bool push(T const & t)
+    {
+        const size_t write_index = write_index_.load(memory_order_relaxed);  // only written from push thread
+        const size_t next = next_index(write_index);
+
+        if (next == read_index_.load(memory_order_acquire))
+            return false; /* ringbuffer is full */
+
+        new (buffer + write_index) T(t); // copy-construct
+
+        write_index_.store(next, memory_order_release);
+
+        return true;
+    }
+
+    bool pop (T & ret)
+    {
+        const size_t write_index = write_index_.load(memory_order_acquire);
+        const size_t read_index  = read_index_.load(memory_order_relaxed); // only written from pop thread
+        if (empty(write_index, read_index))
+            return false;
+
+        ret = buffer[read_index];
+        buffer[read_index].~T();
+
+        size_t next = next_index(read_index);
+        read_index_.store(next, memory_order_release);
+        return true;
+    }
+
+    template <typename Functor>
+    bool consume_one(Functor & f)
+    {
+        const size_t write_index = write_index_.load(memory_order_acquire);
+        const size_t read_index  = read_index_.load(memory_order_relaxed); // only written from pop thread
+        if (empty(write_index, read_index))
+            return false;
+
+        f(buffer[read_index]);
+        buffer[read_index].~T();
+
+        size_t next = next_index(read_index);
+        read_index_.store(next, memory_order_release);
+        return true;
+    }
+
+public:
+    /** reset the ringbuffer
+     *
+     * \note Not thread-safe
+     * */
+    void reset(void)
+    {
+        write_index_.store(0, memory_order_relaxed);
+        read_index_.store(0, memory_order_release);
+    }
+
+    /** Check if the ringbuffer is empty
+     *
+     * \return true, if the ringbuffer is empty, false otherwise
+     * \note Due to the concurrent nature of the ringbuffer the result may be inaccurate.
+     * */
+    bool empty(void)
+    {
+        return empty(write_index_.load(memory_order_relaxed), read_index_.load(memory_order_relaxed));
+    }
+
+    /**
+     * \return true, if implementation is lock-free.
+     *
+     * */
+    bool is_lock_free(void) const
+    {
+        return write_index_.is_lock_free() && read_index_.is_lock_free();
+    }
+
+private:
+    bool empty(size_t write_index, size_t read_index)
+    {
+        return write_index == read_index;
+    }
+};

From 84b33d2ddb684b96b1abdcade29757f99a76497a Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Fri, 20 Mar 2015 23:19:44 +0100
Subject: [PATCH 08/11] gsdx-queue: plug the new queue as a drop-in
 replacement for the previous boost queue

---
 plugins/GSdx/GSThread_CXX11.h | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/plugins/GSdx/GSThread_CXX11.h b/plugins/GSdx/GSThread_CXX11.h
index 4cf7264d34..da112d70a8 100644
--- a/plugins/GSdx/GSThread_CXX11.h
+++ b/plugins/GSdx/GSThread_CXX11.h
@@ -22,7 +22,12 @@
 #pragma once
 
 #include "GSdx.h"
+#define BOOST_STAND_ALONE
+#ifdef BOOST_STAND_ALONE
+#include "boost_spsc_queue.hpp"
+#else
 #include <boost/lockfree/spsc_queue.hpp>
+#endif
 
 class IGSThread
 {
@@ -97,7 +102,7 @@ public:
 	virtual int GetPixels(bool reset) = 0;
 };
 
-// This queue doesn't lock any thread. It would be nicer for 2c/4c CPU.
+// This queue doesn't reserve any thread. It would be nicer for 2c/4c CPU.
 // pros: no hard limit on thread numbers
 // cons: less performance by thread
 template<class T> class GSJobQueue : public IGSJobQueue<T>
@@ -105,7 +110,11 @@ template<class T> class GSJobQueue : public IGSJobQueue<T>
 protected:
 	std::atomic<int16_t> m_count;
 	std::atomic<bool> m_exit;
-	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<256> > m_queue;
+#ifdef BOOST_STAND_ALONE
+	ringbuffer_base<T, 256> m_queue;
+#else
+	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<255> > m_queue;
+#endif
 
 	std::mutex m_lock;
 	std::condition_variable m_empty;
@@ -148,7 +157,7 @@ public:
 	}
 
 	virtual ~GSJobQueue() {
-		m_exit = true;
+		m_exit.store(true, memory_order_release);
 		m_notempty.notify_one();
 		this->CloseThread();
 	}
@@ -189,7 +198,7 @@ public:
 };
 
 
-// This queue locks 'only' RENDERING threads mostly the same performance as above if the CPU is fast enough
+// This queue reserves 'only' RENDERING threads mostly the same performance as a no reservation queue if the CPU is fast enough
 // pros: nearly best fps by thread
 // cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU.
 template<class T> class GSJobQueueSpin : public IGSJobQueue<T>
@@ -197,7 +206,11 @@ template<class T> class GSJobQueueSpin : public IGSJobQueue<T>
 protected:
 	std::atomic<int16_t> m_count;
 	std::atomic<bool> m_exit;
-	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<256> > m_queue;
+#ifdef BOOST_STAND_ALONE
+	ringbuffer_base<T, 256> m_queue;
+#else
+	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<255> > m_queue;
+#endif
 
 	std::mutex m_lock;
 	std::condition_variable m_empty;
@@ -239,7 +252,7 @@ public:
 	};
 
 	virtual ~GSJobQueueSpin() {
-		m_exit = true;
+		m_exit.store(true, memory_order_release);
 		this->CloseThread();
 	}
 
@@ -267,14 +280,12 @@ public:
 		ASSERT(m_count == 0);
 	}
 
-	virtual void Process(T& item) = 0;
-
 	void operator() (T& item) {
 		this->Process(item);
 	}
 };
 
-// This queue locks RENDERING threads + GS threads onto dedicated CPU
+// This queue reserves RENDERING threads + GS threads onto dedicated CPU
 // pros: best fps by thread
 // cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU.
 #if 0
@@ -284,7 +295,7 @@ template<class T> class GSJobQueue : public IGSJobQueue<T>
 protected:
 	std::atomic<int16_t> m_count;
 	std::atomic<bool> m_exit;
-	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<256> > m_queue;
+	boost::lockfree::spsc_queue<T, boost::lockfree::capacity<255> > m_queue;
 
 	void ThreadProc() {
 		while (true) {

From d91e989abbba0e6c5eed3aa04fbedafe442b2391 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Sat, 21 Mar 2015 15:09:58 +0100
Subject: [PATCH 09/11] gsdx-queue: pass shared_ptr by reference

It avoids atomic +1/-1 of the reference counter

The counter is still incremented when the ptr is copied into the queue
---
 plugins/GSdx/GSRasterizer.cpp | 4 ++--
 plugins/GSdx/GSRasterizer.h   | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp
index e957b7ae76..58c89bfd77 100644
--- a/plugins/GSdx/GSRasterizer.cpp
+++ b/plugins/GSdx/GSRasterizer.cpp
@@ -104,7 +104,7 @@ int GSRasterizer::FindMyNextScanline(int top) const
 	return top;
 }
 
-void GSRasterizer::Queue(shared_ptr<GSRasterizerData> data)
+void GSRasterizer::Queue(const shared_ptr<GSRasterizerData>& data)
 {
 	Draw(data.get());
 }
@@ -1155,7 +1155,7 @@ GSRasterizerList::~GSRasterizerList()
 	_aligned_free(m_scanline);
 }
 
-void GSRasterizerList::Queue(shared_ptr<GSRasterizerData> data)
+void GSRasterizerList::Queue(const shared_ptr<GSRasterizerData>& data)
 {
 	GSVector4i r = data->bbox.rintersect(data->scissor);
 
diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h
index 8fcc66255c..3cf60fb3e5 100644
--- a/plugins/GSdx/GSRasterizer.h
+++ b/plugins/GSdx/GSRasterizer.h
@@ -119,7 +119,7 @@ class IRasterizer : public GSAlignedClass<32>
 public:
 	virtual ~IRasterizer() {}
 
-	virtual void Queue(shared_ptr<GSRasterizerData> data) = 0;
+	virtual void Queue(const shared_ptr<GSRasterizerData>& data) = 0;
 	virtual void Sync() = 0;
 	virtual bool IsSynced() const = 0;
 	virtual int GetPixels(bool reset = true) = 0;
@@ -174,7 +174,7 @@ public:
 
 	// IRasterizer
 
-	void Queue(shared_ptr<GSRasterizerData> data);
+	void Queue(const shared_ptr<GSRasterizerData>& data);
 	void Sync() {}
 	bool IsSynced() const {return true;}
 	int GetPixels(bool reset);
@@ -259,7 +259,7 @@ public:
 
 	// IRasterizer
 
-	void Queue(shared_ptr<GSRasterizerData> data);
+	void Queue(const shared_ptr<GSRasterizerData>& data);
 	void Sync();
 	bool IsSynced() const;
 	int GetPixels(bool reset);

From fa243afbab1069542b5f0396a4756a7831894658 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Wed, 4 Mar 2015 09:41:02 +0100
Subject: [PATCH 10/11] gsdx SW: enable new queue && C++11 on linux/MSVC 2012+

---
 plugins/GSdx/stdafx.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h
index d21c72716e..5726d9c623 100644
--- a/plugins/GSdx/stdafx.h
+++ b/plugins/GSdx/stdafx.h
@@ -60,6 +60,12 @@
 
 #endif
 
+// Require at least Visual Studio 2012
+#if defined(__linux__) || (defined(_MSC_VER) && (_MSC_VER >= 1700))
+#define _CX11_
+#define ENABLE_BOOST // queue is from boost but it doesn't require a full boost install
+#endif
+
 // put these into vc9/common7/ide/usertype.dat to have them highlighted
 
 typedef unsigned char uint8;

From e605ed1d09f6fe45713aae776721a7d41a695f39 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut <gregory.hainaut@gmail.com>
Date: Fri, 27 Mar 2015 18:58:34 +0100
Subject: [PATCH 11/11] gsdx-queue: add a comment for the future

---
 plugins/GSdx/GSThread_CXX11.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/plugins/GSdx/GSThread_CXX11.h b/plugins/GSdx/GSThread_CXX11.h
index da112d70a8..317132f1ee 100644
--- a/plugins/GSdx/GSThread_CXX11.h
+++ b/plugins/GSdx/GSThread_CXX11.h
@@ -201,6 +201,11 @@ public:
 // This queue reserves 'only' RENDERING threads mostly the same performance as a no reservation queue if the CPU is fast enough
 // pros: nearly best fps by thread
 // cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU.
+// Note: I'm not sure of the source of the speedup
+//		1/ It could be related to less MT logic (lock, cond var)
+//		2/ But I highly suspect that waking up a thread is rather slow.  My guess
+//		is that low-power features (like C-states) increase latency. In this case
+//		the gain will be smaller if PCSX2 is running or on a limited-core CPU (<=4)
 template<class T> class GSJobQueueSpin : public IGSJobQueue<T>
 {
 protected: