From 8deee6afbc564d7c7d87d7e7d650be7ff70fab94 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Tue, 3 Mar 2015 10:00:43 +0100 Subject: [PATCH 01/11] gsdx: include some C++11 define for later --- plugins/GSdx/GSThread.h | 3 --- plugins/GSdx/stdafx.h | 8 ++++++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/plugins/GSdx/GSThread.h b/plugins/GSdx/GSThread.h index b97d34027e..2d10c68a77 100644 --- a/plugins/GSdx/GSThread.h +++ b/plugins/GSdx/GSThread.h @@ -152,9 +152,6 @@ public: #include #endif -#include -#include - class GSThread : public IGSThread { #ifdef _STD_THREAD_ diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index cda44482f1..d21c72716e 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -96,6 +96,14 @@ typedef uint32 uptr; #include #include #include +#ifdef _CX11_ +#include +#include +#endif +#if defined(__linux__) || defined(_CX11_) +#include +#include +#endif using namespace std; From 9ad5933120eab35286d3fe74239b0e3bdde9be71 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Tue, 3 Mar 2015 18:22:09 +0100 Subject: [PATCH 02/11] gsdx: Use composition instead of inheritance to support lock To ease update to C++11 --- plugins/GSdx/GSCapture.cpp | 6 +++--- plugins/GSdx/GSCapture.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/plugins/GSdx/GSCapture.cpp b/plugins/GSdx/GSCapture.cpp index 4595760945..904bfd5a0d 100644 --- a/plugins/GSdx/GSCapture.cpp +++ b/plugins/GSdx/GSCapture.cpp @@ -386,7 +386,7 @@ GSCapture::~GSCapture() bool GSCapture::BeginCapture(float fps) { - GSAutoLock lock(this); + GSAutoLock lock(&m_lock); ASSERT(fps != 0); @@ -481,7 +481,7 @@ bool GSCapture::BeginCapture(float fps) bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba) { - GSAutoLock lock(this); + GSAutoLock lock(&m_lock); if(bits == NULL || pitch == 0) { @@ -506,7 +506,7 @@ bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba) bool GSCapture::EndCapture() { - GSAutoLock lock(this); 
+ GSAutoLock lock(&m_lock); #ifdef _WINDOWS diff --git a/plugins/GSdx/GSCapture.h b/plugins/GSdx/GSCapture.h index 65125c05e8..eb6e45601c 100644 --- a/plugins/GSdx/GSCapture.h +++ b/plugins/GSdx/GSCapture.h @@ -28,8 +28,9 @@ #include "GSCaptureDlg.h" #endif -class GSCapture : protected GSCritSec +class GSCapture { + GSCritSec m_lock; bool m_capturing; GSVector2i m_size; From a75d78bd7e3784ad9aa8bbcd371bd39def29caa7 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Tue, 3 Mar 2015 18:29:21 +0100 Subject: [PATCH 03/11] gsdx: use standard lock_guard instead of GSAutoLock --- plugins/GSdx/GS.cpp | 4 ++++ plugins/GSdx/GSCapture.cpp | 12 ++++++++++++ plugins/GSdx/GSCapture.h | 6 ++++++ plugins/GSdx/GSLocalMemory.h | 2 ++ plugins/GSdx/GSRenderer.cpp | 4 ++++ plugins/GSdx/GSRenderer.h | 4 ++++ 6 files changed, 32 insertions(+) diff --git a/plugins/GSdx/GS.cpp b/plugins/GSdx/GS.cpp index 7cca083196..c09fb2596e 100644 --- a/plugins/GSdx/GS.cpp +++ b/plugins/GSdx/GS.cpp @@ -853,7 +853,11 @@ EXPORT_C GSgetTitleInfo2(char* dest, size_t length) if(s_gs->m_GStitleInfoBuffer[0]) { +#ifdef _CX11_ + std::lock_guard lock(s_gs->m_pGSsetTitle_Crit); +#else GSAutoLock lock(&s_gs->m_pGSsetTitle_Crit); +#endif s = format("GSdx | %s", s_gs->m_GStitleInfoBuffer); diff --git a/plugins/GSdx/GSCapture.cpp b/plugins/GSdx/GSCapture.cpp index 904bfd5a0d..d5c7b34615 100644 --- a/plugins/GSdx/GSCapture.cpp +++ b/plugins/GSdx/GSCapture.cpp @@ -386,7 +386,11 @@ GSCapture::~GSCapture() bool GSCapture::BeginCapture(float fps) { +#ifdef _CX11_ + std::lock_guard lock(m_lock); +#else GSAutoLock lock(&m_lock); +#endif ASSERT(fps != 0); @@ -481,7 +485,11 @@ bool GSCapture::BeginCapture(float fps) bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba) { +#ifdef _CX11_ + std::lock_guard lock(m_lock); +#else GSAutoLock lock(&m_lock); +#endif if(bits == NULL || pitch == 0) { @@ -506,7 +514,11 @@ bool GSCapture::DeliverFrame(const void* bits, int pitch, bool rgba) bool GSCapture::EndCapture() { 
+#ifdef _CX11_ + std::lock_guard lock(m_lock); +#else GSAutoLock lock(&m_lock); +#endif #ifdef _WINDOWS diff --git a/plugins/GSdx/GSCapture.h b/plugins/GSdx/GSCapture.h index eb6e45601c..f6a0d56c0e 100644 --- a/plugins/GSdx/GSCapture.h +++ b/plugins/GSdx/GSCapture.h @@ -22,7 +22,9 @@ #pragma once #include "GSVector.h" +#ifndef _CX11_ #include "GSThread.h" +#endif #ifdef _WINDOWS #include "GSCaptureDlg.h" @@ -30,7 +32,11 @@ class GSCapture { +#ifdef _CX11_ + std::mutex m_lock; +#else GSCritSec m_lock; +#endif bool m_capturing; GSVector2i m_size; diff --git a/plugins/GSdx/GSLocalMemory.h b/plugins/GSdx/GSLocalMemory.h index 70e171090f..591e654369 100644 --- a/plugins/GSdx/GSLocalMemory.h +++ b/plugins/GSdx/GSLocalMemory.h @@ -26,7 +26,9 @@ #include "GSVector.h" #include "GSBlock.h" #include "GSClut.h" +#ifndef _CX11_ #include "GSThread.h" +#endif class GSOffset : public GSAlignedClass<32> { diff --git a/plugins/GSdx/GSRenderer.cpp b/plugins/GSdx/GSRenderer.cpp index 876ba5fea4..eb23a9607d 100644 --- a/plugins/GSdx/GSRenderer.cpp +++ b/plugins/GSdx/GSRenderer.cpp @@ -406,7 +406,11 @@ void GSRenderer::VSync(int field) // be noticeable). Besides, these locks are extremely short -- overhead of conditional // is way more expensive than just waiting for the CriticalSection in 1 of 10,000,000 tries. 
--air +#ifdef _CX11_ + std::lock_guard lock(m_pGSsetTitle_Crit); +#else GSAutoLock lock(&m_pGSsetTitle_Crit); +#endif strncpy(m_GStitleInfoBuffer, s.c_str(), countof(m_GStitleInfoBuffer) - 1); diff --git a/plugins/GSdx/GSRenderer.h b/plugins/GSdx/GSRenderer.h index 0a68c16c3c..ad4eb22e64 100644 --- a/plugins/GSdx/GSRenderer.h +++ b/plugins/GSdx/GSRenderer.h @@ -78,7 +78,11 @@ public: virtual void EndCapture(); public: +#ifdef _CX11_ + std::mutex m_pGSsetTitle_Crit; +#else GSCritSec m_pGSsetTitle_Crit; +#endif char m_GStitleInfoBuffer[128]; }; From 96820614729ff832c6613883c954021031d33b26 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Tue, 3 Mar 2015 10:01:22 +0100 Subject: [PATCH 04/11] gsdx-queue:add a new job dispatcher queue based on boost and C++11 It is faster on linux, it requires less code, and it is "portable" It requires boost (only hpp files) + MSVC 2013 (for atomic) (seem doable by 2012 too) Actually there are several queues that either use spinlock or full sleep --- plugins/GSdx/GSRasterizer.h | 6 +- plugins/GSdx/GSThread.cpp | 6 + plugins/GSdx/GSThread_CXX11.h | 339 ++++++++++++++++++++++++++++++++++ 3 files changed, 350 insertions(+), 1 deletion(-) create mode 100644 plugins/GSdx/GSThread_CXX11.h diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 998a744512..bd86546ec0 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -24,9 +24,13 @@ #include "GS.h" #include "GSVertexSW.h" #include "GSFunctionMap.h" -#include "GSThread.h" #include "GSAlignedClass.h" #include "GSPerfMon.h" +#ifdef ENABLE_BOOST +#include "GSThread_CXX11.h" +#else +#include "GSThread.h" +#endif __aligned(class, 32) GSRasterizerData : public GSAlignedClass<32> { diff --git a/plugins/GSdx/GSThread.cpp b/plugins/GSdx/GSThread.cpp index 9e63c055ed..0b2588feda 100644 --- a/plugins/GSdx/GSThread.cpp +++ b/plugins/GSdx/GSThread.cpp @@ -20,10 +20,15 @@ */ #include "stdafx.h" +#ifdef ENABLE_BOOST +#include "GSThread_CXX11.h" +#else 
#include "GSThread.h" +#endif #ifdef _WINDOWS +#ifndef ENABLE_BOOST InitializeConditionVariablePtr pInitializeConditionVariable; WakeConditionVariablePtr pWakeConditionVariable; WakeAllConditionVariablePtr pWakeAllConditionVariable; @@ -65,6 +70,7 @@ public: }; static InitCondVar s_icv; +#endif #endif diff --git a/plugins/GSdx/GSThread_CXX11.h b/plugins/GSdx/GSThread_CXX11.h new file mode 100644 index 0000000000..33a87a4906 --- /dev/null +++ b/plugins/GSdx/GSThread_CXX11.h @@ -0,0 +1,339 @@ +/* + * Copyright (C) 2007-2009 Gabest + * http://www.gabest.org + * + * This Program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This Program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU Make; see the file COPYING. If not, write to + * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA USA. 
+ * http://www.gnu.org/copyleft/gpl.html + * + */ + +#pragma once + +#include "GSdx.h" +#include + +class IGSThread +{ +protected: + virtual void ThreadProc() = 0; +}; + +// let us use std::thread for now, comment out the definition to go back to pthread +// There are currently some bugs/limitations to std::thread (see various comment) +// For the moment let's keep pthread but uses new std object (mutex, cond_var) +//#define _STD_THREAD_ + +#ifdef _WINDOWS + +class GSThread : public IGSThread +{ + DWORD m_ThreadId; + HANDLE m_hThread; + + static DWORD WINAPI StaticThreadProc(void* lpParam); + +protected: + void CreateThread(); + void CloseThread(); + +public: + GSThread(); + virtual ~GSThread(); +}; + +#else + +#ifdef _STD_THREAD_ +#include +#else +#include +#endif + +class GSThread : public IGSThread +{ + #ifdef _STD_THREAD_ + std::thread *t; + #else + pthread_attr_t m_thread_attr; + pthread_t m_thread; + #endif + static void* StaticThreadProc(void* param); + +protected: + void CreateThread(); + void CloseThread(); + +public: + GSThread(); + virtual ~GSThread(); +}; + +#endif + +// Activate only a single define (From the lowest latency to better CPU usage) + +// This queue locks RENDERING threads + GS threads onto dedicated CPU +// pros: best fps by thread +// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU. +//#define NO_WAIT_BUT_CPU_INTENSIVE + +// This queue locks 'only' RENDERING threads mostly the same performance as above it the CPU is fast enough +// pros: nearly best fps by thread +// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU. +//#define WAIT_ON_GS_STILL_CPU_INTENSIVE + +// This queue doesn't lock any thread. It would be nicer for 2c/4c CPU. 
+// pros: no hard limit on thread numbers +// cons: less performance by thread +#define FULL_WAIT_LESS_CPU_INTENSIVE + +#if defined(FULL_WAIT_LESS_CPU_INTENSIVE) + +template class GSJobQueue : private GSThread +{ +protected: + std::atomic m_count; + std::atomic m_exit; + boost::lockfree::spsc_queue > m_queue; + + std::mutex m_lock; + std::condition_variable m_empty; + std::condition_variable m_notempty; + + void ThreadProc() { + std::unique_lock l(m_lock); + + while (true) { + + while (m_count == 0) { + if (m_exit.load(memory_order_acquire)) return; + m_notempty.wait(l); + } + + l.unlock(); + + int16_t consumed = 0; + for (int16_t nb = m_count; nb >= 0; nb--) { + if (m_queue.consume_one(*this)) + consumed++; + } + + l.lock(); + + m_count -= consumed; + + if (m_count <= 0) + m_empty.notify_one(); + + } + } + +public: + GSJobQueue() : + m_count(0), + m_exit(false) + { + CreateThread(); + }; + + virtual ~GSJobQueue() { + m_exit = true; + m_notempty.notify_one(); + CloseThread(); + } + + bool IsEmpty() const { + ASSERT(m_count >= 0); + + return m_count == 0; + } + + void Push(const T& item) { + while(!m_queue.push(item)) + std::this_thread::yield(); + + std::unique_lock l(m_lock); + + m_count++; + + l.unlock(); + + m_notempty.notify_one(); + } + + void Wait() { + if (m_count > 0) { + std::unique_lock l(m_lock); + while (m_count > 0) { + m_empty.wait(l); + } + } + + ASSERT(m_count == 0); + } + + virtual void Process(T& item) = 0; + + void operator()(T& item) { + Process(item); + } +}; + +#elif defined(WAIT_ON_GS_STILL_CPU_INTENSIVE) + +template class GSJobQueue : private GSThread +{ +protected: + std::atomic m_count; + std::atomic m_exit; + boost::lockfree::spsc_queue > m_queue; + + std::mutex m_lock; + std::condition_variable m_empty; + + void ThreadProc() { + std::unique_lock l(m_lock, defer_lock); + + while (true) { + + while (m_count == 0) { + if (m_exit.load(memory_order_acquire)) return; + std::this_thread::yield(); + } + + int16_t consumed = 0; + for (int16_t nb 
= m_count; nb >= 0; nb--) { + if (m_queue.consume_one(*this)) + consumed++; + } + + l.lock(); + + m_count -= consumed; + + l.unlock(); + + if (m_count <= 0) + m_empty.notify_one(); + + } + } + +public: + GSJobQueue() : + m_count(0), + m_exit(false) + { + CreateThread(); + }; + + virtual ~GSJobQueue() { + m_exit = true; + CloseThread(); + } + + bool IsEmpty() const { + ASSERT(m_count >= 0); + + return m_count == 0; + } + + void Push(const T& item) { + while(!m_queue.push(item)) + std::this_thread::yield(); + + m_count++; + } + + void Wait() { + if (m_count > 0) { + std::unique_lock l(m_lock); + while (m_count > 0) { + m_empty.wait(l); + } + } + + ASSERT(m_count == 0); + } + + virtual void Process(T& item) = 0; + + void operator()(T& item) { + Process(item); + } +}; + +#elif defined(NO_WAIT_BUT_CPU_INTENSIVE) + +template class GSJobQueue : private GSThread +{ +protected: + std::atomic m_count; + std::atomic m_exit; + boost::lockfree::spsc_queue > m_queue; + + void ThreadProc() { + while (true) { + while (m_count == 0) { + if (m_exit.load(memory_order_acquire)) return; + std::this_thread::yield(); + } + + m_count -= m_queue.consume_all(*this); + } + } + +public: + GSJobQueue() : + m_count(0), + m_exit(false) + { + CreateThread(); + }; + + virtual ~GSJobQueue() { + m_exit = true; + CloseThread(); + } + + bool IsEmpty() const { + ASSERT(m_count >= 0); + + return m_count == 0; + } + + void Push(const T& item) { + m_count++; + while(!m_queue.push(item)) + std::this_thread::yield(); + } + + void Wait() { + while (m_count > 0) + std::this_thread::yield(); + + ASSERT(m_count == 0); + } + + virtual void Process(T& item) = 0; + + void operator()(T& item) { + Process(item); + } +}; + +#else + #very bad +#endif From 0aac47ca59196512cab23046bb76cb28e717c7b6 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 13 Mar 2015 19:52:04 +0100 Subject: [PATCH 05/11] gsdx-queue: add a new option "spin_thread" to select the queue behavior at runtime If someone has a more elegant 
solution, feel free to share it spin_thread = 0 spin_thread = 1 // the faster but GS thread will never stop, very bad for laptop --- plugins/GSdx/GSRasterizer.cpp | 34 +++++++++++++++-- plugins/GSdx/GSRasterizer.h | 30 ++++++++++++++- plugins/GSdx/GSRendererSW.cpp | 3 +- plugins/GSdx/GSThread_CXX11.h | 70 +++++++++++++++++------------------ 4 files changed, 96 insertions(+), 41 deletions(-) diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index 3462ab4082..e957b7ae76 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -1147,7 +1147,7 @@ GSRasterizerList::GSRasterizerList(int threads, GSPerfMon* perfmon) GSRasterizerList::~GSRasterizerList() { - for(vector::iterator i = m_workers.begin(); i != m_workers.end(); i++) + for(auto i = m_workers.begin(); i != m_workers.end(); i++) { delete *i; } @@ -1210,13 +1210,13 @@ int GSRasterizerList::GetPixels(bool reset) // GSRasterizerList::GSWorker -GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r) +GSRasterizerList::GSWorker::GSWorker(GSRasterizer* r) : GSJobQueue >() , m_r(r) { } -GSRasterizerList::GSWorker::~GSWorker() +GSRasterizerList::GSWorker::~GSWorker() { Wait(); @@ -1228,7 +1228,33 @@ int GSRasterizerList::GSWorker::GetPixels(bool reset) return m_r->GetPixels(reset); } -void GSRasterizerList::GSWorker::Process(shared_ptr& item) +void GSRasterizerList::GSWorker::Process(shared_ptr& item) { m_r->Draw(item.get()); } + +// GSRasterizerList::GSWorkerSpin +#ifdef ENABLE_BOOST +GSRasterizerList::GSWorkerSpin::GSWorkerSpin(GSRasterizer* r) + : GSJobQueueSpin >() + , m_r(r) +{ +} + +GSRasterizerList::GSWorkerSpin::~GSWorkerSpin() +{ + Wait(); + + delete m_r; +} + +int GSRasterizerList::GSWorkerSpin::GetPixels(bool reset) +{ + return m_r->GetPixels(reset); +} + +void GSRasterizerList::GSWorkerSpin::Process(shared_ptr& item) +{ + m_r->Draw(item.get()); +} +#endif diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index bd86546ec0..8fcc66255c 100644 
--- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -199,8 +199,29 @@ protected: void Process(shared_ptr& item); }; +#ifdef ENABLE_BOOST + class GSWorkerSpin : public GSJobQueueSpin > + { + GSRasterizer* m_r; + + public: + GSWorkerSpin(GSRasterizer* r); + virtual ~GSWorkerSpin(); + + int GetPixels(bool reset); + + // GSJobQueue + + void Process(shared_ptr& item); + }; +#endif + GSPerfMon* m_perfmon; +#ifdef ENABLE_BOOST + vector > *> m_workers; +#else vector m_workers; +#endif uint8* m_scanline; GSRasterizerList(int threads, GSPerfMon* perfmon); @@ -208,7 +229,7 @@ protected: public: virtual ~GSRasterizerList(); - template static IRasterizer* Create(int threads, GSPerfMon* perfmon) + template static IRasterizer* Create(int threads, GSPerfMon* perfmon, bool spin_thread = false) { threads = std::max(threads, 0); @@ -222,7 +243,14 @@ public: for(int i = 0; i < threads; i++) { +#ifdef ENABLE_BOOST + if (spin_thread) + rl->m_workers.push_back(new GSWorkerSpin(new GSRasterizer(new DS(), i, threads, perfmon))); + else + rl->m_workers.push_back(new GSWorker(new GSRasterizer(new DS(), i, threads, perfmon))); +#else rl->m_workers.push_back(new GSWorker(new GSRasterizer(new DS(), i, threads, perfmon))); +#endif } return rl; diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 83fd2402f3..3dfa829c57 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -41,7 +41,8 @@ GSRendererSW::GSRendererSW(int threads) memset(m_texture, 0, sizeof(m_texture)); - m_rl = GSRasterizerList::Create(threads, &m_perfmon); + bool spin_thread = !!theApp.GetConfig("spin_thread", 0); + m_rl = GSRasterizerList::Create(threads, &m_perfmon, spin_thread); m_output = (uint8*)_aligned_malloc(1024 * 1024 * sizeof(uint32), 32); diff --git a/plugins/GSdx/GSThread_CXX11.h b/plugins/GSdx/GSThread_CXX11.h index 33a87a4906..4cf7264d34 100644 --- a/plugins/GSdx/GSThread_CXX11.h +++ b/plugins/GSdx/GSThread_CXX11.h @@ -82,26 +82,25 @@ public: 
#endif -// Activate only a single define (From the lowest latency to better CPU usage) +// To allow switching between queue dynamically +template class IGSJobQueue : public GSThread +{ +public: + IGSJobQueue() {} + virtual ~IGSJobQueue() {} -// This queue locks RENDERING threads + GS threads onto dedicated CPU -// pros: best fps by thread -// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU. -//#define NO_WAIT_BUT_CPU_INTENSIVE + virtual bool IsEmpty() const = 0; + virtual void Push(const T& item) = 0; + virtual void Wait() = 0; -// This queue locks 'only' RENDERING threads mostly the same performance as above it the CPU is fast enough -// pros: nearly best fps by thread -// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU. -//#define WAIT_ON_GS_STILL_CPU_INTENSIVE + virtual void Process(T& item) = 0; + virtual int GetPixels(bool reset) = 0; +}; // This queue doesn't lock any thread. It would be nicer for 2c/4c CPU. 
// pros: no hard limit on thread numbers // cons: less performance by thread -#define FULL_WAIT_LESS_CPU_INTENSIVE - -#if defined(FULL_WAIT_LESS_CPU_INTENSIVE) - -template class GSJobQueue : private GSThread +template class GSJobQueue : public IGSJobQueue { protected: std::atomic m_count; @@ -145,13 +144,13 @@ public: m_count(0), m_exit(false) { - CreateThread(); - }; + this->CreateThread(); + } virtual ~GSJobQueue() { m_exit = true; m_notempty.notify_one(); - CloseThread(); + this->CloseThread(); } bool IsEmpty() const { @@ -184,16 +183,16 @@ public: ASSERT(m_count == 0); } - virtual void Process(T& item) = 0; - - void operator()(T& item) { - Process(item); + void operator() (T& item) { + this->Process(item); } }; -#elif defined(WAIT_ON_GS_STILL_CPU_INTENSIVE) -template class GSJobQueue : private GSThread +// This queue locks 'only' RENDERING threads mostly the same performance as above if the CPU is fast enough +// pros: nearly best fps by thread +// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU. +template class GSJobQueueSpin : public IGSJobQueue { protected: std::atomic m_count; @@ -232,16 +231,16 @@ protected: } public: - GSJobQueue() : + GSJobQueueSpin() : m_count(0), m_exit(false) { - CreateThread(); + this->CreateThread(); }; - virtual ~GSJobQueue() { + virtual ~GSJobQueueSpin() { m_exit = true; - CloseThread(); + this->CloseThread(); } bool IsEmpty() const { @@ -270,14 +269,17 @@ public: virtual void Process(T& item) = 0; - void operator()(T& item) { - Process(item); + void operator() (T& item) { + this->Process(item); } }; -#elif defined(NO_WAIT_BUT_CPU_INTENSIVE) +// This queue locks RENDERING threads + GS threads onto dedicated CPU +// pros: best fps by thread +// cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU. 
+#if 0 -template class GSJobQueue : private GSThread +template class GSJobQueue : public IGSJobQueue { protected: std::atomic m_count; @@ -329,11 +331,9 @@ public: virtual void Process(T& item) = 0; - void operator()(T& item) { - Process(item); + void operator() (T& item) { + this->Process(item); } }; -#else - #very bad #endif From c9194301a0b02ce30fcc124310728a3e4d36efd0 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 13 Mar 2015 19:59:31 +0100 Subject: [PATCH 06/11] gsdx-queue: (linux) add a GUI option to select the queue --- plugins/GSdx/GSLinuxDialog.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/plugins/GSdx/GSLinuxDialog.cpp b/plugins/GSdx/GSLinuxDialog.cpp index ba652bd46d..1e9686d5fd 100644 --- a/plugins/GSdx/GSLinuxDialog.cpp +++ b/plugins/GSdx/GSLinuxDialog.cpp @@ -165,12 +165,12 @@ bool RunLinuxDialog() GtkWidget *fsaa_combo_box, *render_combo_box, *filter_combo_box; GtkWidget *shader, *shader_conf, *shader_label, *shader_conf_label; - GtkWidget *shadeboost_check, *paltex_check, *fba_check, *aa_check, *native_res_check, *stretch_hack_check, *fxaa_check, *shaderfx_check, *align_sprite_check; + GtkWidget *shadeboost_check, *paltex_check, *fba_check, *aa_check, *native_res_check, *fxaa_check, *shaderfx_check, *spin_thread_check; GtkWidget *sb_contrast, *sb_brightness, *sb_saturation; GtkWidget *resx_spin, *resy_spin; GtkWidget *hack_table, *hack_skipdraw_label, *hack_box, *hack_frame; - GtkWidget *hack_alpha_check, *hack_date_check, *hack_offset_check, *hack_skipdraw_spin, *hack_sprite_check, * hack_wild_check, *hack_enble_check, *hack_logz_check; + GtkWidget *hack_alpha_check, *hack_date_check, *hack_offset_check, *hack_skipdraw_spin, *hack_sprite_check, * hack_wild_check, *hack_enble_check, *hack_logz_check, *align_sprite_check, *stretch_hack_check; GtkWidget *hack_tco_label, *hack_tco_entry; GtkWidget *gl_box, *gl_frame, *gl_table; @@ -352,6 +352,7 @@ bool RunLinuxDialog() paltex_check = 
gtk_check_button_new_with_label("Allow 8 bits textures"); fba_check = gtk_check_button_new_with_label("Alpha correction (FBA)"); aa_check = gtk_check_button_new_with_label("Edge anti-aliasing (AA1)"); + spin_thread_check= gtk_check_button_new_with_label("Disable thread sleeping (6+ cores CPU)"); fxaa_check = gtk_check_button_new_with_label("Fxaa shader"); shaderfx_check = gtk_check_button_new_with_label("External shader"); @@ -360,6 +361,7 @@ bool RunLinuxDialog() gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(paltex_check), theApp.GetConfig("paltex", 0)); gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(fba_check), theApp.GetConfig("fba", 1)); gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(aa_check), theApp.GetConfig("aa1", 0)); + gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(spin_thread_check), theApp.GetConfig("spin_thread", 0)); gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(fxaa_check), theApp.GetConfig("fxaa", 0)); gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(shaderfx_check), theApp.GetConfig("shaderfx", 0)); gtk_toggle_button_set_active(GTK_TOGGLE_BUTTON(native_res_check), theApp.GetConfig("nativeres", 0)); @@ -414,6 +416,7 @@ bool RunLinuxDialog() gtk_container_add(GTK_CONTAINER(sw_box), threads_box); gtk_container_add(GTK_CONTAINER(sw_box), aa_check); + gtk_container_add(GTK_CONTAINER(sw_box), spin_thread_check); // Tables are strange. The numbers are for their position: left, right, top, bottom. 
gtk_table_attach_defaults(GTK_TABLE(shader_table), fxaa_check, 0, 1, 0, 1); @@ -544,6 +547,7 @@ override_GL_ARB_shading_language_420pack = -1 theApp.SetConfig("paltex", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(paltex_check))); theApp.SetConfig("fba", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(fba_check))); theApp.SetConfig("aa1", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(aa_check))); + theApp.SetConfig("spin_thread", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(spin_thread_check))); theApp.SetConfig("fxaa", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(fxaa_check))); theApp.SetConfig("shaderfx", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(shaderfx_check))); theApp.SetConfig("nativeres", (int)gtk_toggle_button_get_active(GTK_TOGGLE_BUTTON(native_res_check))); From 90794c302a6dc9c07525a283117c2e9a01adc6ef Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 21 Mar 2015 15:39:31 +0100 Subject: [PATCH 07/11] gsdx-queue: import spsc_queue of boost I remove 80% of the file to only keep the ring buffer core Same speed as boost but without the boost dependency --- plugins/GSdx/boost_spsc_queue.hpp | 177 ++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 plugins/GSdx/boost_spsc_queue.hpp diff --git a/plugins/GSdx/boost_spsc_queue.hpp b/plugins/GSdx/boost_spsc_queue.hpp new file mode 100644 index 0000000000..c1104a5de7 --- /dev/null +++ b/plugins/GSdx/boost_spsc_queue.hpp @@ -0,0 +1,177 @@ +// This version is a stripped down version of boost/lockfree/spsc_queue.hpp boost_spsc_queue.hpp +// Rational +// * Performance is better on linux than the standard std::queue +// * Performance in the same on windows +// => 100-200MB of dependency feel rather unfriendly + +// Potential optimization +// * plug condition variable into the queue directly to avoid redundant m_count + +// * Restore boost optimization +// => unlikely or replace it with a % (if size is 2^n) + + +// lock-free 
single-producer/single-consumer ringbuffer +// this algorithm is implemented in various projects (linux kernel) +// +// Copyright (C) 2009-2013 Tim Blechmann +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +// Boost Software License - Version 1.0 - August 17th, 2003 +// +// Permission is hereby granted, free of charge, to any person or organization +// obtaining a copy of the software and accompanying documentation covered by +// this license (the "Software") to use, reproduce, display, distribute, +// execute, and transmit the Software, and to prepare derivative works of the +// Software, and to permit third-parties to whom the Software is furnished to +// do so, all subject to the following: +// +// The copyright notices in the Software and this entire statement, including +// the above license grant, this restriction and the following disclaimer, +// must be included in all copies of the Software, in whole or in part, and +// all derivative works of the Software, unless such copies or derivative +// works are solely in the form of machine-executable object code generated by +// a source language processor. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +// DEALINGS IN THE SOFTWARE. 
+ + +template +class ringbuffer_base +{ + static const int padding_size = 64 - sizeof(size_t); + + atomic write_index_; + char padding1[padding_size]; /* force read_index and write_index to different cache lines */ + atomic read_index_; + + T *buffer; + + ringbuffer_base(ringbuffer_base const &) = delete; + ringbuffer_base(ringbuffer_base &&) = delete; + const ringbuffer_base& operator=( const ringbuffer_base& ) = delete; + +public: + ringbuffer_base(void): + write_index_(0), read_index_(0) + { + // Use dynamically allocation here with no T object dependency + // Otherwise the ringbuffer_base destructor will call the destructor + // of T which crash if T is a (invalid) shared_ptr. + // + // Note another solution will be to create a char buffer as union of T + buffer = (T*)_aligned_malloc(sizeof(T)*max_size, 32); + } + + ~ringbuffer_base(void) { + // destroy all remaining items + T out; + while (pop(out)) {}; + + _aligned_free(buffer); + } + + + static size_t next_index(size_t arg) + { + size_t ret = arg + 1; +#if 0 + while (unlikely(ret >= max_size)) +#else + while (ret >= max_size) +#endif + ret -= max_size; + return ret; + } + + bool push(T const & t) + { + const size_t write_index = write_index_.load(memory_order_relaxed); // only written from push thread + const size_t next = next_index(write_index); + + if (next == read_index_.load(memory_order_acquire)) + return false; /* ringbuffer is full */ + + new (buffer + write_index) T(t); // copy-construct + + write_index_.store(next, memory_order_release); + + return true; + } + + bool pop (T & ret) + { + const size_t write_index = write_index_.load(memory_order_acquire); + const size_t read_index = read_index_.load(memory_order_relaxed); // only written from pop thread + if (empty(write_index, read_index)) + return false; + + ret = buffer[read_index]; + buffer[read_index].~T(); + + size_t next = next_index(read_index); + read_index_.store(next, memory_order_release); + return true; + } + + template + bool 
consume_one(Functor & f) + { + const size_t write_index = write_index_.load(memory_order_acquire); + const size_t read_index = read_index_.load(memory_order_relaxed); // only written from pop thread + if (empty(write_index, read_index)) + return false; + + f(buffer[read_index]); + buffer[read_index].~T(); + + size_t next = next_index(read_index); + read_index_.store(next, memory_order_release); + return true; + } + +public: + /** reset the ringbuffer + * + * \note Not thread-safe + * */ + void reset(void) + { + write_index_.store(0, memory_order_relaxed); + read_index_.store(0, memory_order_release); + } + + /** Check if the ringbuffer is empty + * + * \return true, if the ringbuffer is empty, false otherwise + * \note Due to the concurrent nature of the ringbuffer the result may be inaccurate. + * */ + bool empty(void) + { + return empty(write_index_.load(memory_order_relaxed), read_index_.load(memory_order_relaxed)); + } + + /** + * \return true, if implementation is lock-free. + * + * */ + bool is_lock_free(void) const + { + return write_index_.is_lock_free() && read_index_.is_lock_free(); + } + +private: + bool empty(size_t write_index, size_t read_index) + { + return write_index == read_index; + } +}; From 84b33d2ddb684b96b1abdcade29757f99a76497a Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 20 Mar 2015 23:19:44 +0100 Subject: [PATCH 08/11] gsdx-queue: plug the new queue as a drop-off replacement of previous boost queue --- plugins/GSdx/GSThread_CXX11.h | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/plugins/GSdx/GSThread_CXX11.h b/plugins/GSdx/GSThread_CXX11.h index 4cf7264d34..da112d70a8 100644 --- a/plugins/GSdx/GSThread_CXX11.h +++ b/plugins/GSdx/GSThread_CXX11.h @@ -22,7 +22,12 @@ #pragma once #include "GSdx.h" +#define BOOST_STAND_ALONE +#ifdef BOOST_STAND_ALONE +#include "boost_spsc_queue.hpp" +#else #include +#endif class IGSThread { @@ -97,7 +102,7 @@ public: virtual int GetPixels(bool 
reset) = 0; }; -// This queue doesn't lock any thread. It would be nicer for 2c/4c CPU. +// This queue doesn't reserve any thread. It would be nicer for 2c/4c CPU. // pros: no hard limit on thread numbers // cons: less performance by thread template class GSJobQueue : public IGSJobQueue @@ -105,7 +110,11 @@ template class GSJobQueue : public IGSJobQueue protected: std::atomic m_count; std::atomic m_exit; - boost::lockfree::spsc_queue > m_queue; +#ifdef BOOST_STAND_ALONE + ringbuffer_base m_queue; +#else + boost::lockfree::spsc_queue > m_queue; +#endif std::mutex m_lock; std::condition_variable m_empty; @@ -148,7 +157,7 @@ public: } virtual ~GSJobQueue() { - m_exit = true; + m_exit.store(true, memory_order_release); m_notempty.notify_one(); this->CloseThread(); } @@ -189,7 +198,7 @@ public: }; -// This queue locks 'only' RENDERING threads mostly the same performance as above if the CPU is fast enough +// This queue reserves 'only' RENDERING threads mostly the same performance as a no reservation queue if the CPU is fast enough // pros: nearly best fps by thread // cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU. 
template class GSJobQueueSpin : public IGSJobQueue @@ -197,7 +206,11 @@ template class GSJobQueueSpin : public IGSJobQueue protected: std::atomic m_count; std::atomic m_exit; - boost::lockfree::spsc_queue > m_queue; +#ifdef BOOST_STAND_ALONE + ringbuffer_base m_queue; +#else + boost::lockfree::spsc_queue > m_queue; +#endif std::mutex m_lock; std::condition_variable m_empty; @@ -239,7 +252,7 @@ public: }; virtual ~GSJobQueueSpin() { - m_exit = true; + m_exit.store(true, memory_order_release); this->CloseThread(); } @@ -267,14 +280,12 @@ public: ASSERT(m_count == 0); } - virtual void Process(T& item) = 0; - void operator() (T& item) { this->Process(item); } }; -// This queue locks RENDERING threads + GS threads onto dedicated CPU +// This queue reserves RENDERING threads + GS threads onto dedicated CPU // pros: best fps by thread // cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 8 cores CPU. #if 0 @@ -284,7 +295,7 @@ template class GSJobQueue : public IGSJobQueue protected: std::atomic m_count; std::atomic m_exit; - boost::lockfree::spsc_queue > m_queue; + boost::lockfree::spsc_queue > m_queue; void ThreadProc() { while (true) { From d91e989abbba0e6c5eed3aa04fbedafe442b2391 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 21 Mar 2015 15:09:58 +0100 Subject: [PATCH 09/11] gsdx-queue: pass shared_ptr by reference It avoids atomic +1/-1 of the reference counter The counter is still incremented when the ptr is copied into the queue --- plugins/GSdx/GSRasterizer.cpp | 4 ++-- plugins/GSdx/GSRasterizer.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/plugins/GSdx/GSRasterizer.cpp b/plugins/GSdx/GSRasterizer.cpp index e957b7ae76..58c89bfd77 100644 --- a/plugins/GSdx/GSRasterizer.cpp +++ b/plugins/GSdx/GSRasterizer.cpp @@ -104,7 +104,7 @@ int GSRasterizer::FindMyNextScanline(int top) const return top; } -void GSRasterizer::Queue(shared_ptr data) +void GSRasterizer::Queue(const shared_ptr& data) { Draw(data.get()); }
@@ -1155,7 +1155,7 @@ GSRasterizerList::~GSRasterizerList() _aligned_free(m_scanline); } -void GSRasterizerList::Queue(shared_ptr data) +void GSRasterizerList::Queue(const shared_ptr& data) { GSVector4i r = data->bbox.rintersect(data->scissor); diff --git a/plugins/GSdx/GSRasterizer.h b/plugins/GSdx/GSRasterizer.h index 8fcc66255c..3cf60fb3e5 100644 --- a/plugins/GSdx/GSRasterizer.h +++ b/plugins/GSdx/GSRasterizer.h @@ -119,7 +119,7 @@ class IRasterizer : public GSAlignedClass<32> public: virtual ~IRasterizer() {} - virtual void Queue(shared_ptr data) = 0; + virtual void Queue(const shared_ptr& data) = 0; virtual void Sync() = 0; virtual bool IsSynced() const = 0; virtual int GetPixels(bool reset = true) = 0; @@ -174,7 +174,7 @@ public: // IRasterizer - void Queue(shared_ptr data); + void Queue(const shared_ptr& data); void Sync() {} bool IsSynced() const {return true;} int GetPixels(bool reset); @@ -259,7 +259,7 @@ public: // IRasterizer - void Queue(shared_ptr data); + void Queue(const shared_ptr& data); void Sync(); bool IsSynced() const; int GetPixels(bool reset); From fa243afbab1069542b5f0396a4756a7831894658 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Wed, 4 Mar 2015 09:41:02 +0100 Subject: [PATCH 10/11] gsdx SW: enable new queue && C++11 on linux/MSVC 2012+ --- plugins/GSdx/stdafx.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/plugins/GSdx/stdafx.h b/plugins/GSdx/stdafx.h index d21c72716e..5726d9c623 100644 --- a/plugins/GSdx/stdafx.h +++ b/plugins/GSdx/stdafx.h @@ -60,6 +60,12 @@ #endif +// Require at least Visual Studio 2012 +#if defined(__linux__) || (defined(_MSC_VER) && (_MSC_VER >= 1700)) +#define _CX11_ +#define ENABLE_BOOST // queue is from boost but it doesn't require a full boost install +#endif + // put these into vc9/common7/ide/usertype.dat to have them highlighted typedef unsigned char uint8; From e605ed1d09f6fe45713aae776721a7d41a695f39 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 27 Mar 2015 18:58:34 +0100 
Subject: [PATCH 11/11] gsdx-queue: add a comment for the future --- plugins/GSdx/GSThread_CXX11.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/plugins/GSdx/GSThread_CXX11.h b/plugins/GSdx/GSThread_CXX11.h index da112d70a8..317132f1ee 100644 --- a/plugins/GSdx/GSThread_CXX11.h +++ b/plugins/GSdx/GSThread_CXX11.h @@ -201,6 +201,11 @@ public: // This queue reserves 'only' RENDERING threads mostly the same performance as a no reservation queue if the CPU is fast enough // pros: nearly best fps by thread // cons: requires (1 + eThreads) cores for GS emulation only ! Reserved to 6/8 cores CPU. +// Note: I'm not sure of the source of the speedup +// 1/ It could be related to less MT logic (lock, cond var) +// 2/ But I highly suspect that waking up thread is rather slow. My guess +// is that low power feature (like C state) increases latency. In this case +// gain will be smaller if PCSX2 is running or in limited core CPU (<=4) template class GSJobQueueSpin : public IGSJobQueue { protected: