From 6fb7beb58a265286f5ed53d96e0bc63ba89cddda Mon Sep 17 00:00:00 2001
From: Gregory Hainaut
Date: Sat, 2 Jul 2016 18:31:27 +0200
Subject: [PATCH 01/11] MTGS: use acquire/release semantics for atomic
 operations

* Avoid the generation of memory barriers (mfence)
* Based on the fact that the previous code used to work without any barrier

v2:
* keep "slow" and safe atomics on the reset path
* use a relaxed atomic for m_RingBufferIsBusy
  The signal doesn't carry a dependency with other loads/stores. Instead
  it is used as a hint to wake up the semaphore.

As an aside, there is a potential threading bug.

T1 does:
* wait on sema
* busy = true;
* while (!queue.empty) do work...
* busy = false;
* go back to waiting on sema

T2 does:
* post sema only if busy is false

If T1 stops after the while loop but before clearing busy, T2 won't post
the event. When T1 wakes up, it will block on the semaphore.
---
 pcsx2/MTGS.cpp | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp
index 2c0fdbadfc..ac645282fc 100644
--- a/pcsx2/MTGS.cpp
+++ b/pcsx2/MTGS.cpp
@@ -74,7 +74,7 @@ void SysMtgsThread::OnStart()
 m_ReadPos = 0;
 m_WritePos = 0;
- m_RingBufferIsBusy = false;
+ m_RingBufferIsBusy = false;
 m_packet_size = 0;
 m_packet_writepos = 0;
@@ -110,9 +110,9 @@ void SysMtgsThread::ResetGS()
 // * Signal a reset.
 // * clear the path and byRegs structs (used by GIFtagDummy)
- m_ReadPos = m_WritePos;
- m_QueuedFrameCount = 0;
- m_VsyncSignalListener = false;
+ m_ReadPos = m_WritePos;
+ m_QueuedFrameCount = 0;
+ m_VsyncSignalListener = 0;
 MTGS_LOG( "MTGS: Sending Reset..." );
 SendSimplePacket( GS_RINGTYPE_RESET, 0, 0, 0 );
@@ -163,7 +163,7 @@ void SysMtgsThread::PostVsyncStart()
 if ((m_QueuedFrameCount.fetch_add(1) < EmuConfig.GS.VsyncQueueSize) /*|| (!EmuConfig.GS.VsyncEnable && !EmuConfig.GS.FrameLimitEnable)*/) return;
- m_VsyncSignalListener = true;
+ m_VsyncSignalListener.store(true, std::memory_order_release);
 //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_ReadPos), m_WritePos );
 m_sem_Vsync.WaitNoCancel();
 }
@@ -247,18 +247,18 @@ struct RingBufferLock {
 : m_lock1(mtgs.m_mtx_RingBufferBusy),
 m_lock2(mtgs.m_mtx_RingBufferBusy2),
 m_mtgs(mtgs)
 {
- m_mtgs.m_RingBufferIsBusy = true;
+ m_mtgs.m_RingBufferIsBusy.store(true, std::memory_order_relaxed);
 }
 virtual ~RingBufferLock() throw()
 {
- m_mtgs.m_RingBufferIsBusy = false;
+ m_mtgs.m_RingBufferIsBusy.store(false, std::memory_order_relaxed);
 }
 void Acquire() {
 m_lock1.Acquire();
 m_lock2.Acquire();
- m_mtgs.m_RingBufferIsBusy = true;
+ m_mtgs.m_RingBufferIsBusy.store(true, std::memory_order_relaxed);
 }
 void Release() {
- m_mtgs.m_RingBufferIsBusy = false;
+ m_mtgs.m_RingBufferIsBusy.store(false, std::memory_order_relaxed);
 m_lock2.Release();
 m_lock1.Release();
 }
@@ -525,13 +525,13 @@ void SysMtgsThread::ExecuteTaskInThread()
 m_ReadPos = newringpos;
- if( m_SignalRingEnable )
+ if(m_SignalRingEnable.load(std::memory_order_acquire))
 {
 // The EEcore has requested a signal after some amount of processed data.
 if( m_SignalRingPosition.fetch_sub( ringposinc ) <= 0 )
 {
 // Make sure to post the signal after the m_ReadPos has been updated...
- m_SignalRingEnable = false;
+ m_SignalRingEnable.store(false, std::memory_order_release);
 m_sem_OnRingReset.Post();
 continue;
 }
 }
@@ -547,7 +547,7 @@ void SysMtgsThread::ExecuteTaskInThread()
 if( m_SignalRingEnable.exchange(false) )
 {
 //Console.Warning( "(MTGS Thread) Dangling RingSignal on empty buffer! 
signalpos=0x%06x", m_SignalRingPosition.exchange(0) ) ); - m_SignalRingPosition = 0; + m_SignalRingPosition.store(0, std::memory_order_release); m_sem_OnRingReset.Post(); } @@ -629,7 +629,7 @@ void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU) // For use in loops that wait on the GS thread to do certain things. void SysMtgsThread::SetEvent() { - if(!m_RingBufferIsBusy) + if(!m_RingBufferIsBusy.load(std::memory_order_relaxed)) m_sem_event.Post(); m_CopyDataTally = 0; @@ -655,11 +655,11 @@ void SysMtgsThread::SendDataPacket() m_WritePos = m_packet_writepos; - if( EmuConfig.GS.SynchronousMTGS ) + if(EmuConfig.GS.SynchronousMTGS) { WaitGS(); } - else if( !m_RingBufferIsBusy ) + else if(!m_RingBufferIsBusy.load(std::memory_order_relaxed)) { m_CopyDataTally += m_packet_size; if( m_CopyDataTally > 0x2000 ) SetEvent(); @@ -714,12 +714,12 @@ void SysMtgsThread::GenericStall( uint size ) if( somedone > 0x80 ) { pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" ); - m_SignalRingPosition = somedone; + m_SignalRingPosition.store(somedone, std::memory_order_release); //Console.WriteLn( Color_Blue, "(EEcore Sleep) PrepDataPacker \tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, writepos, m_SignalRingPosition ); while(true) { - m_SignalRingEnable = true; + m_SignalRingEnable.store(true, std::memory_order_release); SetEvent(); m_sem_OnRingReset.WaitWithoutYield(); readpos = volatize(m_ReadPos); @@ -814,7 +814,7 @@ void SysMtgsThread::SendSimpleGSPacket(MTGS_RingCommand type, u32 offset, u32 si SendSimplePacket(type, (int)offset, (int)size, (int)path); if(!EmuConfig.GS.SynchronousMTGS) { - if(!m_RingBufferIsBusy) { + if(!m_RingBufferIsBusy.load(std::memory_order_relaxed)) { m_CopyDataTally += size / 16; if (m_CopyDataTally > 0x2000) SetEvent(); } From 5913ff1bd8aa71a5c9e49e2961bc00b1b558cf67 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sun, 3 Jul 2016 10:27:53 +0200 Subject: [PATCH 02/11] MTGS: port read/write pointer to full atomic + optimize atomic access Read pointer is only written by the MTGS thread so it can be relaxed-read by the MTGS thread Write pointer is only written by the EE thread so it can be relaxed-read by the EE thread Remove volatile stuff --- pcsx2/GS.h | 5 ++-- pcsx2/MTGS.cpp | 80 ++++++++++++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 41 deletions(-) diff --git a/pcsx2/GS.h b/pcsx2/GS.h index 17a477a43b..3225b4f9fe 100644 --- a/pcsx2/GS.h +++ b/pcsx2/GS.h @@ -267,8 +267,9 @@ class SysMtgsThread : public SysThreadBase public: // note: when m_ReadPos == m_WritePos, the fifo is empty - __aligned(4) uint m_ReadPos; // cur pos gs is reading from - __aligned(4) uint m_WritePos; // cur pos ee thread is writing to + // Threading info: m_ReadPos is updated by the MTGS thread. m_WritePos is updated by the EE thread + std::atomic m_ReadPos; // cur pos gs is reading from + std::atomic m_WritePos; // cur pos ee thread is writing to std::atomic m_RingBufferIsBusy; std::atomic m_SignalRingEnable; diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index ac645282fc..fffdd86161 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -37,13 +37,6 @@ using namespace Threading; # define MTGS_LOG(...) do {} while (0) #endif -// forces the compiler to treat a non-volatile value as volatile. -// This allows us to declare the vars as non-volatile and only use -// them as volatile when appropriate (more optimized). 
- -#define volatize(x) (*reinterpret_cast(&(x))) - - // ===================================================================================================== // MTGS Threaded Class Implementation // ===================================================================================================== @@ -110,7 +103,7 @@ void SysMtgsThread::ResetGS() // * Signal a reset. // * clear the path and byRegs structs (used by GIFtagDummy) - m_ReadPos = m_WritePos; + m_ReadPos = m_WritePos.load(); m_QueuedFrameCount = 0; m_VsyncSignalListener = 0; @@ -164,7 +157,7 @@ void SysMtgsThread::PostVsyncStart() if ((m_QueuedFrameCount.fetch_add(1) < EmuConfig.GS.VsyncQueueSize) /*|| (!EmuConfig.GS.VsyncEnable && !EmuConfig.GS.FrameLimitEnable)*/) return; m_VsyncSignalListener.store(true, std::memory_order_release); - //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_ReadPos), m_WritePos ); + //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", m_ReadPos.load(), m_WritePos.load() ); m_sem_Vsync.WaitNoCancel(); } @@ -266,6 +259,9 @@ struct RingBufferLock { void SysMtgsThread::ExecuteTaskInThread() { + // Threading info: run in MTGS thread + // m_ReadPos is only update by the MTGS thread so it is safe to load it with a relaxed atomic + #ifdef RINGBUF_DEBUG_STACK PacketTagType prevCmd; #endif @@ -285,16 +281,18 @@ void SysMtgsThread::ExecuteTaskInThread() // note: m_ReadPos is intentionally not volatile, because it should only // ever be modified by this thread. - while( m_ReadPos != volatize(m_WritePos)) + while( m_ReadPos.load(std::memory_order_relaxed) != m_WritePos.load(std::memory_order_acquire)) { if (EmuConfig.GS.DisableOutput) { - m_ReadPos = m_WritePos; + m_ReadPos = m_WritePos.load(); continue; } - pxAssert( m_ReadPos < RingBufferSize ); + const unsigned int local_ReadPos = m_ReadPos.load(std::memory_order_relaxed); - const PacketTagType& tag = (PacketTagType&)RingBuffer[m_ReadPos]; + pxAssert( local_ReadPos < RingBufferSize ); + + const PacketTagType& tag = (PacketTagType&)RingBuffer[local_ReadPos]; u32 ringposinc = 1; #ifdef RINGBUF_DEBUG_STACK @@ -302,11 +300,11 @@ void SysMtgsThread::ExecuteTaskInThread() m_lock_Stack.Lock(); uptr stackpos = ringposStack.back(); - if( stackpos != m_ReadPos ) + if( stackpos != local_ReadPos ) { - Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, m_ReadPos, prevCmd.command ); + Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, local_ReadPos, prevCmd.command ); } - pxAssert( stackpos == m_ReadPos ); + pxAssert( stackpos == local_ReadPos ); prevCmd = tag; ringposStack.pop_back(); m_lock_Stack.Release(); @@ -317,7 +315,7 @@ void SysMtgsThread::ExecuteTaskInThread() #if COPY_GS_PACKET_TO_MTGS == 1 case GS_RINGTYPE_P1: { - uint datapos = (m_ReadPos+1) & RingBufferMask; + uint datapos = (local_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; const u128* data = &RingBuffer[datapos]; @@ -342,7 +340,7 @@ void SysMtgsThread::ExecuteTaskInThread() case GS_RINGTYPE_P2: { - uint datapos = (m_ReadPos+1) & RingBufferMask; + uint datapos = (local_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; const u128* data = &RingBuffer[datapos]; @@ -367,7 +365,7 @@ void SysMtgsThread::ExecuteTaskInThread() case GS_RINGTYPE_P3: { - uint datapos = (m_ReadPos+1) & RingBufferMask; + uint datapos = (local_ReadPos+1) & RingBufferMask; const int qsize = tag.data[0]; const u128* data = &RingBuffer[datapos]; @@ 
-429,7 +427,7 @@ void SysMtgsThread::ExecuteTaskInThread() // This seemingly obtuse system is needed in order to handle cases where the vsync data wraps // around the edge of the ringbuffer. If not for that I'd just use a struct. >_< - uint datapos = (m_ReadPos+1) & RingBufferMask; + uint datapos = (local_ReadPos+1) & RingBufferMask; MemCopy_WrappedSrc( RingBuffer.m_Ring, datapos, RingBufferSize, (u128*)RingBuffer.Regs, 0xf ); u32* remainder = (u32*)&RingBuffer[datapos]; @@ -504,9 +502,9 @@ void SysMtgsThread::ExecuteTaskInThread() #ifdef PCSX2_DEVBUILD default: - Console.Error("GSThreadProc, bad packet (%x) at m_ReadPos: %x, m_WritePos: %x", tag.command, m_ReadPos, m_WritePos); + Console.Error("GSThreadProc, bad packet (%x) at m_ReadPos: %x, m_WritePos: %x", tag.command, local_ReadPos, m_WritePos.load()); pxFail( "Bad packet encountered in the MTGS Ringbuffer." ); - m_ReadPos = m_WritePos; + m_ReadPos.store(m_WritePos.load(std::memory_order_acquire), std::memory_order_release); continue; #else // Optimized performance in non-Dev builds. @@ -516,14 +514,14 @@ void SysMtgsThread::ExecuteTaskInThread() } } - uint newringpos = (m_ReadPos + ringposinc) & RingBufferMask; + uint newringpos = (m_ReadPos.load(std::memory_order_relaxed) + ringposinc) & RingBufferMask; if( EmuConfig.GS.SynchronousMTGS ) { pxAssert( m_WritePos == newringpos ); } - m_ReadPos = newringpos; + m_ReadPos.store(newringpos, std::memory_order_release); if(m_SignalRingEnable.load(std::memory_order_acquire)) { @@ -599,14 +597,17 @@ void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU) Gif_Path& path = gifUnit.gifPath[GIF_PATH_1]; u32 startP1Packs = weakWait ? path.GetPendingGSPackets() : 0; - if (isMTVU || volatize(m_ReadPos) != m_WritePos) { + // Both m_ReadPos and m_WritePos can be relaxed as we only want to test if the queue is empty but + // we don't want to access the content of the queue + + if (isMTVU || m_ReadPos.load(std::memory_order_relaxed) != m_WritePos.load(std::memory_order_relaxed)) { SetEvent(); RethrowException(); for(;;) { if (weakWait) m_mtx_RingBufferBusy2.Wait(); else m_mtx_RingBufferBusy .Wait(); RethrowException(); - if(!isMTVU && volatize(m_ReadPos) == m_WritePos) break; + if(!isMTVU && m_ReadPos.load(std::memory_order_relaxed) == m_WritePos.load(std::memory_order_relaxed)) break; u32 curP1Packs = weakWait ? path.GetPendingGSPackets() : 0; if (weakWait && ((startP1Packs-curP1Packs) || !curP1Packs)) break; // On weakWait we will stop waiting on the MTGS thread if the @@ -653,7 +654,7 @@ void SysMtgsThread::SendDataPacket() PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos]; tag.data[0] = actualSize; - m_WritePos = m_packet_writepos; + m_WritePos.store(m_packet_writepos, std::memory_order_release); if(EmuConfig.GS.SynchronousMTGS) { @@ -675,7 +676,7 @@ void SysMtgsThread::GenericStall( uint size ) // Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need // to use volatile reads here. We do cache it though, since we know it never changes, // except for calls to RingbufferRestert() -- handled below. - const uint writepos = m_WritePos; + const uint writepos = m_WritePos.load(std::memory_order_relaxed); // Sanity checks! (within the confines of our ringbuffer please!) 
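[Editor's note] One step in GenericStall() deserves spelling out: the acquire load of m_ReadPos pairs with the MTGS thread's release store after it finishes consuming a slot, so the EE thread can only overwrite a slot once the consumer's reads of it are complete. A sketch, with hypothetical names:

    #include <atomic>

    // Sketch (hypothetical names): why the producer's stall loop loads the
    // read index with acquire. The consumer releases a slot only after its
    // last read of it; the acquire load makes that read happen-before any
    // later overwrite by the producer.
    std::atomic<unsigned> read_pos{0};
    unsigned ring[256];

    void consumer_finish_slot(unsigned rp) {
        unsigned value = ring[rp];                 // (1) last read of the slot
        (void)value;
        read_pos.store((rp + 1) & 255,
                       std::memory_order_release); // (2) slot is free
    }

    void producer_reuse_slot(unsigned wp, unsigned data) {
        // (3) wait until slot wp is free: acquire pairs with (2)
        while (((wp + 1) & 255) == read_pos.load(std::memory_order_acquire))
            ;                                      // spin (GenericStall sleeps instead)
        ring[wp] = data;                           // (4) (1) happens-before (4)
    }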
pxAssert( size < RingBufferSize ); @@ -686,7 +687,7 @@ void SysMtgsThread::GenericStall( uint size ) // But if not then we need to make sure the readpos is outside the scope of // the block about to be written (writepos + size) - uint readpos = volatize(m_ReadPos); + uint readpos = m_ReadPos.load(std::memory_order_acquire); uint freeroom; if (writepos < readpos) @@ -722,7 +723,7 @@ void SysMtgsThread::GenericStall( uint size ) m_SignalRingEnable.store(true, std::memory_order_release); SetEvent(); m_sem_OnRingReset.WaitWithoutYield(); - readpos = volatize(m_ReadPos); + readpos = m_ReadPos.load(std::memory_order_acquire); //Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos ); if (writepos < readpos) @@ -741,7 +742,7 @@ void SysMtgsThread::GenericStall( uint size ) SetEvent(); while(true) { SpinWait(); - readpos = volatize(m_ReadPos); + readpos = m_ReadPos.load(std::memory_order_acquire); if (writepos < readpos) freeroom = readpos - writepos; @@ -762,12 +763,13 @@ void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size ) // Command qword: Low word is the command, and the high word is the packet // length in SIMDs (128 bits). + const unsigned int local_WritePos = m_WritePos.load(std::memory_order_relaxed); - PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; + PacketTagType& tag = (PacketTagType&)RingBuffer[local_WritePos]; tag.command = cmd; tag.data[0] = m_packet_size; - m_packet_startpos = m_WritePos; - m_packet_writepos = (m_WritePos + 1) & RingBufferMask; + m_packet_startpos = local_WritePos; + m_packet_writepos = (local_WritePos + 1) & RingBufferMask; } // Returns the amount of giftag data processed (in simd128 values). @@ -784,9 +786,9 @@ void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size ) __fi void SysMtgsThread::_FinishSimplePacket() { - uint future_writepos = (m_WritePos+1) & RingBufferMask; - pxAssert( future_writepos != volatize(m_ReadPos) ); - m_WritePos = future_writepos; + uint future_writepos = (m_WritePos.load(std::memory_order_relaxed) +1) & RingBufferMask; + pxAssert( future_writepos != m_ReadPos.load(std::memory_order_acquire) ); + m_WritePos.store(future_writepos, std::memory_order_release); if( EmuConfig.GS.SynchronousMTGS ) WaitGS(); @@ -799,7 +801,7 @@ void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data //ScopedLock locker( m_PacketLocker ); GenericStall(1); - PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; + PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos.load(std::memory_order_relaxed)]; tag.command = type; tag.data[0] = data0; @@ -826,7 +828,7 @@ void SysMtgsThread::SendPointerPacket( MTGS_RingCommand type, u32 data0, void* d //ScopedLock locker( m_PacketLocker ); GenericStall(1); - PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos]; + PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos.load(std::memory_order_relaxed)]; tag.command = type; tag.data[0] = data0; From 4cdf05efacd81b00f91b02bd4cd916bad65e292a Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sun, 3 Jul 2016 13:33:51 +0200 Subject: [PATCH 03/11] MTGS: use private member for RingBufferLock Easier to understand the purpose of the 2nd lock --- pcsx2/MTGS.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index fffdd86161..944fcac620 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -231,11 +231,13 @@ void SysMtgsThread::OpenPlugin() GSsetGameCRC( ElfCRC, 0 ); } -struct RingBufferLock { +class RingBufferLock 
{ ScopedLock m_lock1; ScopedLock m_lock2; SysMtgsThread& m_mtgs; + public: + RingBufferLock(SysMtgsThread& mtgs) : m_lock1(mtgs.m_mtx_RingBufferBusy), m_lock2(mtgs.m_mtx_RingBufferBusy2), @@ -255,6 +257,12 @@ struct RingBufferLock { m_lock2.Release(); m_lock1.Release(); } + void PartialAcquire() { + m_lock2.Acquire(); + } + void PartialRelease() { + m_lock2.Release(); + } }; void SysMtgsThread::ExecuteTaskInThread() @@ -400,10 +408,10 @@ void SysMtgsThread::ExecuteTaskInThread() case GS_RINGTYPE_MTVU_GSPACKET: { MTVU_LOG("MTGS - Waiting on semaXGkick!"); vu1Thread.KickStart(true); - busy.m_lock2.Release(); + busy.PartialRelease(); // Wait for MTVU to complete vu1 program vu1Thread.semaXGkick.WaitWithoutYield(); - busy.m_lock2.Acquire(); + busy.PartialAcquire(); Gif_Path& path = gifUnit.gifPath[GIF_PATH_1]; GS_Packet gsPack = path.GetGSPacketMTVU(); // Get vu1 program's xgkick packet(s) if (gsPack.size) GSgifTransfer((u32*)&path.buffer[gsPack.offset], gsPack.size/16); From 3b4c357aaad8f6cdaf019094e8248e68963d6cf7 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Thu, 14 Jul 2016 17:52:59 +0200 Subject: [PATCH 04/11] MTGS: avoid a potential very rare deadlock --- pcsx2/MTGS.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 944fcac620..29978bf4c1 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -158,6 +158,13 @@ void SysMtgsThread::PostVsyncStart() m_VsyncSignalListener.store(true, std::memory_order_release); //Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", m_ReadPos.load(), m_WritePos.load() ); + + // We will wait a vsync event from the MTGS ring. If the ring is already purged, the event will never come ! + // To avoid this potential deadlock, ring must be wake up after m_VsyncSignalListener + // Note: potentially we can also miss the previous wake up if we optimize away the post just before the release of busy signal of the ring + // So let's ensure the ring doesn't sleep + m_sem_event.Post(); + m_sem_Vsync.WaitNoCancel(); } From ca4692179622d407d5bf35c7e9341051574b2535 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 2 Jul 2016 18:56:21 +0200 Subject: [PATCH 05/11] MTVU: use acquire/release semantics for atomic operation * Avoid the generation of memory barrier (mfence) * Based on the fact that it used to work on previous code without any barrier v2: * keep basic code in reset path * use relaxed access for isBusy. 
The variable doesn't carry load/store dependency but is instead an hint to optimize semaphore post --- common/include/Utilities/Threading.h | 8 ++++---- pcsx2/MTVU.cpp | 10 +++++----- pcsx2/MTVU.h | 4 ++-- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/common/include/Utilities/Threading.h b/common/include/Utilities/Threading.h index 26967e5a8a..f25610c003 100644 --- a/common/include/Utilities/Threading.h +++ b/common/include/Utilities/Threading.h @@ -399,17 +399,17 @@ namespace Threading ScopedLockBool(Mutex& mutexToLock, std::atomic& isLockedBool) : m_lock(mutexToLock), m_bool(isLockedBool) { - m_bool = m_lock.IsLocked(); + m_bool.store(m_lock.IsLocked(), std::memory_order_relaxed); } virtual ~ScopedLockBool() throw() { - m_bool = false; + m_bool.store(false, std::memory_order_relaxed); } void Acquire() { m_lock.Acquire(); - m_bool = m_lock.IsLocked(); + m_bool.store(m_lock.IsLocked(), std::memory_order_relaxed); } void Release() { - m_bool = false; + m_bool.store(false, std::memory_order_relaxed); m_lock.Release(); } }; diff --git a/pcsx2/MTVU.cpp b/pcsx2/MTVU.cpp index e049ba3df7..7a2fd4a7be 100644 --- a/pcsx2/MTVU.cpp +++ b/pcsx2/MTVU.cpp @@ -75,11 +75,11 @@ void VU_Thread::Reset() { ScopedLock lock(mtxBusy); - read_pos = 0; write_pos = 0; write_offset = 0; vuCycleIdx = 0; - isBusy = false; + read_pos = 0; + isBusy = false; memzero(vif); memzero(vifRegs); memzero(vuCycles); @@ -202,7 +202,7 @@ __fi u32* VU_Thread::GetWritePtr() __fi void VU_Thread::incReadPos(s32 offset) { // Offset in u32 sizes - read_pos = (read_pos + offset) & buffer_mask; + read_pos.store((read_pos.load(std::memory_order_relaxed) + offset) & buffer_mask, std::memory_order_release); } __fi void VU_Thread::incWritePos() { // Adds write_offset @@ -272,12 +272,12 @@ u32 VU_Thread::Get_vuCycles() void VU_Thread::KickStart(bool forceKick) { if ((forceKick && !semaEvent.Count()) - || (!isBusy && GetReadPos() != write_pos)) semaEvent.Post(); + || (!isBusy.load(std::memory_order_relaxed) && GetReadPos() != write_pos)) semaEvent.Post(); } bool VU_Thread::IsDone() { - return !isBusy && GetReadPos() == GetWritePos(); + return !isBusy.load(std::memory_order_relaxed) && GetReadPos() == GetWritePos(); } void VU_Thread::WaitVU() diff --git a/pcsx2/MTVU.h b/pcsx2/MTVU.h index 29cc1fcf36..8db6cde720 100644 --- a/pcsx2/MTVU.h +++ b/pcsx2/MTVU.h @@ -30,8 +30,8 @@ class VU_Thread : public pxThread { static const s32 buffer_size = (_1mb * 16) / sizeof(s32); static const u32 buffer_mask = buffer_size - 1; __aligned(4) u32 buffer[buffer_size]; - __aligned(4) std::atomic read_pos; // Only modified by VU thread - __aligned(4) std::atomic isBusy; // Is thread processing data? + std::atomic read_pos; // Only modified by VU thread + std::atomic isBusy; // Is thread processing data? 
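[Editor's note] The relaxed isBusy flag in patches 01 and 05 is purely a heuristic to skip redundant semaphore posts; it never orders data. A stale read costs at most one extra post or one deferred wake-up — the lost wake-up window described in patch 01's message. A sketch, assuming C++20 semaphores in place of the project's own wrappers (all names hypothetical):

    #include <atomic>
    #include <semaphore>

    std::counting_semaphore<> semaEvent{0};
    std::atomic<bool> isBusy{false};
    std::atomic<int>  pending{0};   // stands in for the real emptiness test

    void worker_iteration() {
        semaEvent.acquire();                                 // wait sema
        isBusy.store(true, std::memory_order_relaxed);
        while (pending.load(std::memory_order_acquire) > 0)
            pending.fetch_sub(1, std::memory_order_acq_rel); // "do work"
        // Patch 01's lost wake-up window sits between the loop above and
        // the store below.
        isBusy.store(false, std::memory_order_relaxed);
    }

    void kick_start() { // producer side
        pending.fetch_add(1, std::memory_order_release);
        if (!isBusy.load(std::memory_order_relaxed))
            semaEvent.release(); // a spurious post is harmless; a skipped
    }                            // one only delays wake-up to the next kick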
__aligned(4) s32 write_pos; // Only modified by EE thread __aligned(4) s32 write_offset; // Only modified by EE thread __aligned(4) Mutex mtxBusy; From 3f0655c8210eb32630d37ea3cb947e6afb0be605 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sun, 3 Jul 2016 10:54:12 +0200 Subject: [PATCH 06/11] MTVU: port write pointer to atomic and optimize atomic access Write pointer can be relaxed-read from the EE thread Read pointer can be relaxed-read from the VU thread Warning: AtomicExchange calls were replaced by release-store However AtomicExchange generates a memory barrier (at least on linux) So either it is useless and it will be faster, or it will explode ;) --- pcsx2/MTVU.cpp | 34 +++++++++++++++++----------------- pcsx2/MTVU.h | 2 +- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/pcsx2/MTVU.cpp b/pcsx2/MTVU.cpp index 7a2fd4a7be..875e38a672 100644 --- a/pcsx2/MTVU.cpp +++ b/pcsx2/MTVU.cpp @@ -75,11 +75,11 @@ void VU_Thread::Reset() { ScopedLock lock(mtxBusy); - write_pos = 0; write_offset = 0; vuCycleIdx = 0; read_pos = 0; isBusy = false; + write_pos = 0; memzero(vif); memzero(vifRegs); memzero(vuCycles); @@ -97,7 +97,7 @@ void VU_Thread::ExecuteRingBuffer() for(;;) { semaEvent.WaitWithoutYield(); ScopedLockBool lock(mtxBusy, isBusy); - while (read_pos != GetWritePos()) { + while (read_pos.load(std::memory_order_relaxed) != GetWritePos()) { u32 tag = Read(); switch (tag) { case MTVU_VU_EXECUTE: { @@ -137,12 +137,12 @@ void VU_Thread::ExecuteRingBuffer() Read(&vif.tag, vif_copy_size); ReadRegs(&vifRegs); u32 size = Read(); - MTVU_Unpack(&buffer[read_pos], vifRegs); + MTVU_Unpack(&buffer[read_pos.load(std::memory_order_relaxed)], vifRegs); incReadPos(size_u32(size)); break; } case MTVU_NULL_PACKET: - read_pos = 0; + read_pos.store(0, std::memory_order_release); break; jNO_DEFAULT; } @@ -156,8 +156,8 @@ __ri void VU_Thread::WaitOnSize(s32 size) { for(;;) { s32 readPos = GetReadPos(); - if (readPos <= write_pos) break; // MTVU is reading in back of write_pos - if (readPos > write_pos + size) break; // Enough free front space + if (readPos <= write_pos.load(std::memory_order_relaxed)) break; // MTVU is reading in back of write_pos + if (readPos > write_pos.load(std::memory_order_relaxed) + size) break; // Enough free front space if (1) { // Let MTVU run to free up buffer space KickStart(); if (IsDevBuild) DevCon.WriteLn("WaitOnSize()"); @@ -174,12 +174,12 @@ void VU_Thread::ReserveSpace(s32 size) pxAssert(size < buffer_size); pxAssert(size > 0); pxAssert(write_offset == 0); - if (write_pos + size > buffer_size) { + if (write_pos.load(std::memory_order_relaxed) + size > buffer_size) { pxAssert(write_pos > 0); WaitOnSize(1); // Size of MTVU_NULL_PACKET Write(MTVU_NULL_PACKET); write_offset = 0; - AtomicExchange(volatize(write_pos), 0); + write_pos.store(0, std::memory_order_release); } WaitOnSize(size); } @@ -187,17 +187,17 @@ void VU_Thread::ReserveSpace(s32 size) // Use this when reading read_pos from ee thread __fi s32 VU_Thread::GetReadPos() { - return read_pos.load(); + return read_pos.load(std::memory_order_acquire); } // Use this when reading write_pos from vu thread __fi s32 VU_Thread::GetWritePos() { - return AtomicRead(volatize(write_pos)); + return write_pos.load(std::memory_order_acquire); } // Gets the effective write pointer after adding write_offset __fi u32* VU_Thread::GetWritePtr() { - return &buffer[(write_pos + write_offset) & buffer_mask]; + return &buffer[(write_pos.load(std::memory_order_relaxed) + write_offset) & buffer_mask]; } __fi void 
VU_Thread::incReadPos(s32 offset) @@ -206,29 +206,29 @@ __fi void VU_Thread::incReadPos(s32 offset) } __fi void VU_Thread::incWritePos() { // Adds write_offset - s32 temp = (write_pos + write_offset) & buffer_mask; + s32 temp = (write_pos.load(std::memory_order_relaxed) + write_offset) & buffer_mask; write_offset = 0; - AtomicExchange(volatize(write_pos), temp); + write_pos.store(temp, std::memory_order_release); if (MTVU_ALWAYS_KICK) KickStart(); if (MTVU_SYNC_MODE) WaitVU(); } __fi u32 VU_Thread::Read() { - u32 ret = buffer[read_pos]; + u32 ret = buffer[read_pos.load(std::memory_order_relaxed)]; incReadPos(1); return ret; } __fi void VU_Thread::Read(void* dest, u32 size) { - memcpy(dest, &buffer[read_pos], size); + memcpy(dest, &buffer[read_pos.load(std::memory_order_relaxed)], size); incReadPos(size_u32(size)); } __fi void VU_Thread::ReadRegs(VIFregisters* dest) { - VIFregistersMTVU* src = (VIFregistersMTVU*)&buffer[read_pos]; + VIFregistersMTVU* src = (VIFregistersMTVU*)&buffer[read_pos.load(std::memory_order_relaxed)]; dest->cycle = src->cycle; dest->mode = src->mode; dest->num = src->num; @@ -272,7 +272,7 @@ u32 VU_Thread::Get_vuCycles() void VU_Thread::KickStart(bool forceKick) { if ((forceKick && !semaEvent.Count()) - || (!isBusy.load(std::memory_order_relaxed) && GetReadPos() != write_pos)) semaEvent.Post(); + || (!isBusy.load(std::memory_order_relaxed) && GetReadPos() != write_pos.load(std::memory_order_relaxed))) semaEvent.Post(); } bool VU_Thread::IsDone() diff --git a/pcsx2/MTVU.h b/pcsx2/MTVU.h index 8db6cde720..e6e045c04b 100644 --- a/pcsx2/MTVU.h +++ b/pcsx2/MTVU.h @@ -32,7 +32,7 @@ class VU_Thread : public pxThread { __aligned(4) u32 buffer[buffer_size]; std::atomic read_pos; // Only modified by VU thread std::atomic isBusy; // Is thread processing data? 
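[Editor's note] Patch 06's commit message notes that AtomicExchange emitted a full barrier. The replacement relies on publication ordering only, which a release store provides; on x86 it compiles to a plain mov. A sketch with hypothetical names:

    #include <atomic>

    std::atomic<int> write_pos{0};  // hypothetical stand-in for the MTVU ring index
    unsigned buffer[1024];

    // EE thread: write the payload first, then publish the new index.
    void publish(int pos, int count, const unsigned* src) {
        for (int i = 0; i < count; ++i)
            buffer[(pos + i) & 1023] = src[i];     // plain stores
        // Before: AtomicExchange(write_pos, ...) -> lock xchg, a full barrier.
        // After: a release store is enough to order the payload stores
        // before the index update; on x86 it compiles to a plain mov.
        write_pos.store((pos + count) & 1023, std::memory_order_release);
    }

    // VU thread: the acquire load pairs with the release store above, so
    // the payload is visible once the new index is observed.
    int get_write_pos() {
        return write_pos.load(std::memory_order_acquire);
    }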
- __aligned(4) s32 write_pos; // Only modified by EE thread + std::atomic write_pos; // Only modified by EE thread __aligned(4) s32 write_offset; // Only modified by EE thread __aligned(4) Mutex mtxBusy; __aligned(4) Semaphore semaEvent; From 567976e822fb26e2ad75ae299962632546b54399 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sun, 3 Jul 2016 11:12:35 +0200 Subject: [PATCH 07/11] MTVU: port vuCycles to std::atomic V2: use relaxed order as the variable doesn't carry load/store dependency It is only used as a counter for the speed hack --- pcsx2/MTVU.cpp | 16 +++++++++++----- pcsx2/MTVU.h | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pcsx2/MTVU.cpp b/pcsx2/MTVU.cpp index 875e38a672..ee85811ea8 100644 --- a/pcsx2/MTVU.cpp +++ b/pcsx2/MTVU.cpp @@ -52,7 +52,10 @@ void SaveStateBase::mtvuFreeze() FreezeTag("MTVU"); pxAssert(vu1Thread.IsDone()); if (!IsSaving()) vu1Thread.Reset(); - Freeze(vu1Thread.vuCycles); + for (size_t i = 0; i < 4; ++i) { + unsigned int v = vu1Thread.vuCycles[i].load(); + Freeze(v); + } Freeze(vu1Thread.vuCycleIdx); } @@ -82,7 +85,8 @@ void VU_Thread::Reset() write_pos = 0; memzero(vif); memzero(vifRegs); - memzero(vuCycles); + for (size_t i = 0; i < 4; ++i) + vu1Thread.vuCycles[i] = 0; } void VU_Thread::ExecuteTaskInThread() @@ -109,7 +113,7 @@ void VU_Thread::ExecuteRingBuffer() vuCPU->Execute(vu1RunCycles); gifUnit.gifPath[GIF_PATH_1].FinishGSPacketMTVU(); semaXGkick.Post(); // Tell MTGS a path1 packet is complete - AtomicExchange(vuCycles[vuCycleIdx], vuRegs.cycle); + vuCycles[vuCycleIdx].store(vuRegs.cycle, std::memory_order_relaxed); vuCycleIdx = (vuCycleIdx + 1) & 3; break; } @@ -265,8 +269,10 @@ __fi void VU_Thread::WriteRegs(VIFregisters* src) // Used for vu cycle stealing hack u32 VU_Thread::Get_vuCycles() { - return (AtomicRead(vuCycles[0]) + AtomicRead(vuCycles[1]) - + AtomicRead(vuCycles[2]) + AtomicRead(vuCycles[3])) >> 2; + return (vuCycles[0].load(std::memory_order_relaxed) + + vuCycles[1].load(std::memory_order_relaxed) + + vuCycles[2].load(std::memory_order_relaxed) + + vuCycles[3].load(std::memory_order_relaxed)) >> 2; } void VU_Thread::KickStart(bool forceKick) diff --git a/pcsx2/MTVU.h b/pcsx2/MTVU.h index e6e045c04b..71d16c5ce9 100644 --- a/pcsx2/MTVU.h +++ b/pcsx2/MTVU.h @@ -43,7 +43,7 @@ public: __aligned16 vifStruct vif; __aligned16 VIFregisters vifRegs; __aligned(4) Semaphore semaXGkick; - __aligned(4) u32 vuCycles[4]; // Used for VU cycle stealing hack + __aligned(4) std::atomic vuCycles[4]; // Used for VU cycle stealing hack __aligned(4) u32 vuCycleIdx; // Used for VU cycle stealing hack VU_Thread(BaseVUmicroCPU*& _vuCPU, VURegs& _vuRegs); From 086dfc8a14bda0b56d0024086a77b0e3f4211e0b Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Sat, 2 Jul 2016 19:43:56 +0200 Subject: [PATCH 08/11] gsdx sw: use acquire/release semantics for atomic operation * Avoid the generation of memory barrier (mfence) --- plugins/GSdx/GSTextureSW.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/GSdx/GSTextureSW.cpp b/plugins/GSdx/GSTextureSW.cpp index 2a848e72ec..6e02f59da6 100644 --- a/plugins/GSdx/GSTextureSW.cpp +++ b/plugins/GSdx/GSTextureSW.cpp @@ -25,7 +25,7 @@ GSTextureSW::GSTextureSW(int type, int width, int height) { - m_mapped.clear(); + m_mapped.clear(std::memory_order_release); m_size = GSVector2i(width, height); m_type = type; m_format = 0; @@ -68,7 +68,7 @@ bool GSTextureSW::Map(GSMap& m, const GSVector4i* r) if(m_data != NULL && r2.left >= 0 && r2.right <= m_size.x && r2.top >= 0 && 
r2.bottom <= m_size.y) { - if (!m_mapped.test_and_set()) + if (!m_mapped.test_and_set(std::memory_order_acquire)) { m.bits = (uint8*)m_data + m_pitch * r2.top + (r2.left << 2); m.pitch = m_pitch; @@ -82,7 +82,7 @@ bool GSTextureSW::Map(GSMap& m, const GSVector4i* r) void GSTextureSW::Unmap() { - m_mapped.clear(); + m_mapped.clear(std::memory_order_release); } bool GSTextureSW::Save(const string& fn, bool user_image, bool dds) From 812e41d578f6371cf83660b9ec4c3c738ce9045c Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Wed, 6 Jul 2016 20:15:22 +0200 Subject: [PATCH 09/11] common: relax atomic of m_IsBeingDeleted Avoid the memory fence in the constructor --- common/src/Utilities/wxHelpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/src/Utilities/wxHelpers.cpp b/common/src/Utilities/wxHelpers.cpp index 8654db4edd..1cfea691db 100644 --- a/common/src/Utilities/wxHelpers.cpp +++ b/common/src/Utilities/wxHelpers.cpp @@ -66,7 +66,7 @@ BaseDeletableObject::BaseDeletableObject() //pxAssertDev( _CrtIsValidHeapPointer( this ), "BaseDeletableObject types cannot be created on the stack or as temporaries!" ); #endif - m_IsBeingDeleted = false; + m_IsBeingDeleted.store(false, std::memory_order_relaxed); } BaseDeletableObject::~BaseDeletableObject() throw() From 10ea05bc6f576ee3a336fad7f694850165c9e8e1 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 8 Jul 2016 18:11:51 +0200 Subject: [PATCH 10/11] common: remove old atomic wrapper Use cross-platform std::atomic instead --- common/include/Utilities/Threading.h | 17 +---------------- common/src/Utilities/Mutex.cpp | 2 +- common/src/Utilities/ThreadTools.cpp | 21 --------------------- common/src/Utilities/Windows/WinThreads.cpp | 5 ----- pcsx2/MTVU.cpp | 1 - 5 files changed, 2 insertions(+), 44 deletions(-) diff --git a/common/include/Utilities/Threading.h b/common/include/Utilities/Threading.h index f25610c003..69d119d543 100644 --- a/common/include/Utilities/Threading.h +++ b/common/include/Utilities/Threading.h @@ -159,9 +159,6 @@ namespace Threading // For use in spin/wait loops. extern void SpinWait(); - // Use prior to committing data to another thread - extern void StoreFence(); - // Optional implementation to enable hires thread/process scheduler for the operating system. // Needed by Windows, but might not be relevant to other platforms. extern void EnableHiresScheduler(); @@ -170,18 +167,6 @@ namespace Threading // sleeps the current thread for the given number of milliseconds. extern void Sleep( int ms ); -// -------------------------------------------------------------------------------------- -// AtomicExchange / AtomicIncrement -// -------------------------------------------------------------------------------------- -// Our fundamental interlocking functions. All other useful interlocks can be derived -// from these little beasties! (these are all implemented internally using cross-platform -// implementations of _InterlockedExchange and such) - - extern u32 AtomicRead( volatile u32& Target ); - extern s32 AtomicRead( volatile s32& Target ); - extern u32 AtomicExchange( volatile u32& Target, u32 value ); - extern s32 AtomicExchange( volatile s32& Target, s32 value ); - // pthread Cond is an evil api that is not suited for Pcsx2 needs. // Let's not use it. Use mutexes and semaphores instead to create waits. (Air) #if 0 @@ -309,7 +294,7 @@ namespace Threading // will be automatically released on any return or exit from the function. 
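[Editor's note] Patch 08's GSTextureSW change uses std::atomic_flag as a tiny try-lock: test_and_set(acquire) in Map() pairs with clear(release) in Unmap(), so writes made while mapped are visible to the next winner of the flag. A minimal sketch (hypothetical names):

    #include <atomic>

    std::atomic_flag mapped = ATOMIC_FLAG_INIT;
    unsigned char pixels[64 * 64 * 4];

    bool try_map(unsigned char*& out) {
        // acquire: if we win the flag we also see every write the previous
        // holder made before its clear(release).
        if (mapped.test_and_set(std::memory_order_acquire))
            return false;               // still mapped elsewhere
        out = pixels;
        return true;
    }

    void unmap() {
        // release: publish the writes done while mapped to the next
        // successful test_and_set(acquire).
        mapped.clear(std::memory_order_release);
    }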
// // Const qualification note: - // ScopedLock takes const instances of the mutex, even though the mutex is modified + // ScopedLock takes const instances of the mutex, even though the mutex is modified // by locking and unlocking. Two rationales: // // 1) when designing classes with accessors (GetString, GetValue, etc) that need mutexes, diff --git a/common/src/Utilities/Mutex.cpp b/common/src/Utilities/Mutex.cpp index 246387f805..7171f6f620 100644 --- a/common/src/Utilities/Mutex.cpp +++ b/common/src/Utilities/Mutex.cpp @@ -327,7 +327,7 @@ void Threading::ScopedLock::AssignAndLock( const Mutex* locker ) if( !m_lock ) return; m_IsLocked = true; - m_lock->Acquire(); + m_lock->Acquire(); } void Threading::ScopedLock::Assign( const Mutex& locker ) diff --git a/common/src/Utilities/ThreadTools.cpp b/common/src/Utilities/ThreadTools.cpp index 6a5919ee39..75c53bda43 100644 --- a/common/src/Utilities/ThreadTools.cpp +++ b/common/src/Utilities/ThreadTools.cpp @@ -782,27 +782,6 @@ void Threading::WaitEvent::Wait() } #endif -// -------------------------------------------------------------------------------------- -// InterlockedExchanges / AtomicExchanges (PCSX2's Helper versions) -// -------------------------------------------------------------------------------------- -// define some overloads for InterlockedExchanges for commonly used types, like u32 and s32. -// Note: For all of these atomic operations below to be atomic, the variables need to be 4-byte -// aligned. Read: http://msdn.microsoft.com/en-us/library/ms684122%28v=vs.85%29.aspx - -__fi u32 Threading::AtomicRead(volatile u32& Target) { - return Target; // Properly-aligned 32-bit reads are atomic -} -__fi s32 Threading::AtomicRead(volatile s32& Target) { - return Target; // Properly-aligned 32-bit reads are atomic -} - -__fi u32 Threading::AtomicExchange(volatile u32& Target, u32 value ) { - return _InterlockedExchange( (volatile vol_t*)&Target, value ); -} -__fi s32 Threading::AtomicExchange( volatile s32& Target, s32 value ) { - return _InterlockedExchange( (volatile vol_t*)&Target, value ); -} - // -------------------------------------------------------------------------------------- // BaseThreadError // -------------------------------------------------------------------------------------- diff --git a/common/src/Utilities/Windows/WinThreads.cpp b/common/src/Utilities/Windows/WinThreads.cpp index d33dd45b41..e6cd326a67 100644 --- a/common/src/Utilities/Windows/WinThreads.cpp +++ b/common/src/Utilities/Windows/WinThreads.cpp @@ -36,11 +36,6 @@ __fi void Threading::SpinWait() __asm pause; } -__fi void Threading::StoreFence() -{ - __asm sfence; -} - __fi void Threading::EnableHiresScheduler() { // This improves accuracy of Sleep() by some amount, and only adds a negligible amount of diff --git a/pcsx2/MTVU.cpp b/pcsx2/MTVU.cpp index ee85811ea8..c270656864 100644 --- a/pcsx2/MTVU.cpp +++ b/pcsx2/MTVU.cpp @@ -21,7 +21,6 @@ __aligned16 VU_Thread vu1Thread(CpuVU1, VU1); -#define volatize(x) (*reinterpret_cast(&(x))) #define size_u32(x) (((u32)x+3u)>>2) // Rounds up a size in bytes for size in u32's #define MTVU_ALWAYS_KICK 0 #define MTVU_SYNC_MODE 0 From f76bf9dddca0099136bc4e13cbc8330ae33a86f5 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Thu, 14 Jul 2016 19:41:21 +0200 Subject: [PATCH 11/11] gsdx: dump and log EE texture read It gives a visual opportunity to detect a bad read of the texture cache --- plugins/GSdx/GSRendererHW.cpp | 19 ++++++++++--------- plugins/GSdx/GSRendererSW.cpp | 28 +++++++++++++++------------- 
plugins/GSdx/GSState.cpp | 14 ++++++++++++-- plugins/GSdx/GSState.h | 1 + 4 files changed, 38 insertions(+), 24 deletions(-) diff --git a/plugins/GSdx/GSRendererHW.cpp b/plugins/GSdx/GSRendererHW.cpp index 4b2af8f387..4081c84de4 100644 --- a/plugins/GSdx/GSRendererHW.cpp +++ b/plugins/GSdx/GSRendererHW.cpp @@ -53,6 +53,7 @@ GSRendererHW::GSRendererHW(GSTextureCache* tc) m_userhacks_align_sprite_X = 0; } + m_dump_root = root_hw; } void GSRendererHW::SetScaling() @@ -217,7 +218,7 @@ GSTexture* GSRendererHW::GetOutput(int i, int& y_offset) { if(s_savef && s_n >= s_saven) { - t->Save(root_hw + format("%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)TEX0.TBP0, (int)TEX0.PSM)); + t->Save(m_dump_root + format("%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)TEX0.TBP0, (int)TEX0.PSM)); } } @@ -517,8 +518,8 @@ void GSRendererHW::Draw() // Dump Register state s = format("%05d_context.txt", s_n); - m_env.Dump(root_hw+s); - m_context->Dump(root_hw+s); + m_env.Dump(m_dump_root+s); + m_context->Dump(m_dump_root+s); } if(s_savet && s_n >= s_saven && tex) @@ -529,13 +530,13 @@ void GSRendererHW::Draw() (int)context->CLAMP.MINU, (int)context->CLAMP.MAXU, (int)context->CLAMP.MINV, (int)context->CLAMP.MAXV); - tex->m_texture->Save(root_hw+s, false, true); + tex->m_texture->Save(m_dump_root+s, false, true); if(tex->m_palette) { s = format("%05d_f%lld_tpx_%05x_%d.dds", s_n, frame, context->TEX0.CBP, context->TEX0.CPSM); - tex->m_palette->Save(root_hw+s, false, true); + tex->m_palette->Save(m_dump_root+s, false, true); } } @@ -546,7 +547,7 @@ void GSRendererHW::Draw() s = format("%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, context->FRAME.Block(), context->FRAME.PSM); if (rt) - rt->m_texture->Save(root_hw+s); + rt->m_texture->Save(m_dump_root+s); } if(s_savez && s_n >= s_saven) @@ -554,7 +555,7 @@ void GSRendererHW::Draw() s = format("%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, context->ZBUF.Block(), context->ZBUF.PSM); if (ds_tex) - ds_tex->Save(root_hw+s); + ds_tex->Save(m_dump_root+s); } s_n++; @@ -706,7 +707,7 @@ void GSRendererHW::Draw() s = format("%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, context->FRAME.Block(), context->FRAME.PSM); if (rt) - rt->m_texture->Save(root_hw+s); + rt->m_texture->Save(m_dump_root+s); } if(s_savez && s_n >= s_saven) @@ -714,7 +715,7 @@ void GSRendererHW::Draw() s = format("%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, context->ZBUF.Block(), context->ZBUF.PSM); if (ds_tex) - ds_tex->Save(root_hw+s); + ds_tex->Save(m_dump_root+s); } s_n++; diff --git a/plugins/GSdx/GSRendererSW.cpp b/plugins/GSdx/GSRendererSW.cpp index 025a4b7431..d81dd86be1 100644 --- a/plugins/GSdx/GSRendererSW.cpp +++ b/plugins/GSdx/GSRendererSW.cpp @@ -62,6 +62,8 @@ GSRendererSW::GSRendererSW(int threads) InitCVB(GS_LINE_CLASS); InitCVB(GS_TRIANGLE_CLASS); InitCVB(GS_SPRITE_CLASS); + + m_dump_root = root_sw; } GSRendererSW::~GSRendererSW() @@ -258,7 +260,7 @@ GSTexture* GSRendererSW::GetOutput(int i, int& y_offset) { if(s_savef && s_n >= s_saven) { - m_texture[i]->Save(root_sw + format("%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM)); + m_texture[i]->Save(m_dump_root + format("%05d_f%lld_fr%d_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), i, (int)DISPFB.Block(), (int)DISPFB.PSM)); } s_n++; @@ -531,8 +533,8 @@ void GSRendererSW::Draw() // Dump Register state s = format("%05d_context.txt", s_n); - m_env.Dump(root_sw+s); - m_context->Dump(root_sw+s); + m_env.Dump(m_dump_root+s); + m_context->Dump(m_dump_root+s); } if(s_savet && s_n >= s_saven && 
PRIM->TME) @@ -540,11 +542,11 @@ void GSRendererSW::Draw() if (texture_shuffle) { // Dump the RT in 32 bits format. It helps to debug texture shuffle effect s = format("%05d_f%lld_tex_%05x_32bits.bmp", s_n, frame, (int)m_context->TEX0.TBP0); - m_mem.SaveBMP(root_sw+s, m_context->TEX0.TBP0, m_context->TEX0.TBW, 0, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH); + m_mem.SaveBMP(m_dump_root+s, m_context->TEX0.TBP0, m_context->TEX0.TBW, 0, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH); } s = format("%05d_f%lld_tex_%05x_%d.bmp", s_n, frame, (int)m_context->TEX0.TBP0, (int)m_context->TEX0.PSM); - m_mem.SaveBMP(root_sw+s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH); + m_mem.SaveBMP(m_dump_root+s, m_context->TEX0.TBP0, m_context->TEX0.TBW, m_context->TEX0.PSM, 1 << m_context->TEX0.TW, 1 << m_context->TEX0.TH); } s_n++; @@ -555,18 +557,18 @@ void GSRendererSW::Draw() if (texture_shuffle) { // Dump the RT in 32 bits format. It helps to debug texture shuffle effect s = format("%05d_f%lld_rt0_%05x_32bits.bmp", s_n, frame, m_context->FRAME.Block()); - m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, 0, GetFrameRect().width(), 512); + m_mem.SaveBMP(m_dump_root+s, m_context->FRAME.Block(), m_context->FRAME.FBW, 0, GetFrameRect().width(), 512); } s = format("%05d_f%lld_rt0_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM); - m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + m_mem.SaveBMP(m_dump_root+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); } if(s_savez && s_n >= s_saven) { s = format("%05d_f%lld_rz0_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM); - m_mem.SaveBMP(root_sw+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + m_mem.SaveBMP(m_dump_root+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); } s_n++; @@ -580,18 +582,18 @@ void GSRendererSW::Draw() if (texture_shuffle) { // Dump the RT in 32 bits format. 
It helps to debug texture shuffle effect s = format("%05d_f%lld_rt1_%05x_32bits.bmp", s_n, frame, m_context->FRAME.Block()); - m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, 0, GetFrameRect().width(), 512); + m_mem.SaveBMP(m_dump_root+s, m_context->FRAME.Block(), m_context->FRAME.FBW, 0, GetFrameRect().width(), 512); } s = format("%05d_f%lld_rt1_%05x_%d.bmp", s_n, frame, m_context->FRAME.Block(), m_context->FRAME.PSM); - m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + m_mem.SaveBMP(m_dump_root+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); } if(s_savez && s_n >= s_saven) { s = format("%05d_f%lld_rz1_%05x_%d.bmp", s_n, frame, m_context->ZBUF.Block(), m_context->ZBUF.PSM); - m_mem.SaveBMP(root_sw+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + m_mem.SaveBMP(m_dump_root+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); } s_n++; @@ -685,14 +687,14 @@ void GSRendererSW::Sync(int reason) { s = format("%05d_f%lld_rt1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->FRAME.Block(), m_context->FRAME.PSM); - m_mem.SaveBMP(root_sw+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); + m_mem.SaveBMP(m_dump_root+s, m_context->FRAME.Block(), m_context->FRAME.FBW, m_context->FRAME.PSM, GetFrameRect().width(), 512); } if(s_savez) { s = format("%05d_f%lld_zb1_%05x_%d.bmp", s_n, m_perfmon.GetFrame(), m_context->ZBUF.Block(), m_context->ZBUF.PSM); - m_mem.SaveBMP(root_sw+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); + m_mem.SaveBMP(m_dump_root+s, m_context->ZBUF.Block(), m_context->FRAME.FBW, m_context->ZBUF.PSM, GetFrameRect().width(), 512); } } diff --git a/plugins/GSdx/GSState.cpp b/plugins/GSdx/GSState.cpp index bf6be8b72e..4c7b7550d2 100644 --- a/plugins/GSdx/GSState.cpp +++ b/plugins/GSdx/GSState.cpp @@ -56,6 +56,7 @@ GSState::GSState() s_savef = theApp.GetConfigB("savef"); s_saven = theApp.GetConfigI("saven"); s_savel = theApp.GetConfigI("savel"); + m_dump_root = ""; #if defined(__unix__) if (s_dump) { GSmkdir("/tmp/GS_HW_dump"); @@ -1663,8 +1664,10 @@ void GSState::Read(uint8* mem, int len) int sy = m_env.TRXPOS.SSAY; int w = m_env.TRXREG.RRW; int h = m_env.TRXREG.RRH; + GSVector4i r(sx, sy, sx + w, sy + h); - // printf("Read len=%d SBP=%05x SBW=%d SPSM=%d SSAX=%d SSAY=%d RRW=%d RRH=%d\n", len, (int)m_env.BITBLTBUF.SBP, (int)m_env.BITBLTBUF.SBW, (int)m_env.BITBLTBUF.SPSM, sx, sy, w, h); + GL_CACHE("Read! 
len=%d SBP=%05x SBW=%d SPSM=%d SSAX=%d SSAY=%d RRW=%d RRH=%d", + len, (int)m_env.BITBLTBUF.SBP, (int)m_env.BITBLTBUF.SBW, (int)m_env.BITBLTBUF.SPSM, sx, sy, w, h); if(!m_tr.Update(w, h, GSLocalMemory::m_psm[m_env.BITBLTBUF.SPSM].trbpp, len)) { @@ -1675,11 +1678,18 @@ void GSState::Read(uint8* mem, int len) { if(m_tr.x == sx && m_tr.y == sy) { - InvalidateLocalMem(m_env.BITBLTBUF, GSVector4i(sx, sy, sx + w, sy + h)); + InvalidateLocalMem(m_env.BITBLTBUF, r); } } m_mem.ReadImageX(m_tr.x, m_tr.y, mem, len, m_env.BITBLTBUF, m_env.TRXPOS, m_env.TRXREG); + + if(s_dump && s_save && s_n >= s_saven) { + string s= m_dump_root + format("%05d_read_%05x_%d_%d_%d_%d_%d_%d.bmp", + s_n, (int)m_env.BITBLTBUF.SBP, (int)m_env.BITBLTBUF.SBW, (int)m_env.BITBLTBUF.SPSM, + r.left, r.top, r.right, r.bottom); + m_mem.SaveBMP(s, m_env.BITBLTBUF.SBP, m_env.BITBLTBUF.SBW, m_env.BITBLTBUF.SPSM, r.right, r.bottom); + } } void GSState::Move() diff --git a/plugins/GSdx/GSState.h b/plugins/GSdx/GSState.h index c2078a0062..01dc2bc272 100644 --- a/plugins/GSdx/GSState.h +++ b/plugins/GSdx/GSState.h @@ -229,6 +229,7 @@ public: bool s_savef; int s_saven; int s_savel; + string m_dump_root; public: GSState();
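[Editor's note] To close, the ordering that patch 04 enforces in PostVsyncStart() can be sketched as follows. The MTGS side is reconstructed for illustration and is an assumption (only the EE side appears in the patch); C++20 semaphores stand in for the project's wrappers:

    #include <atomic>
    #include <semaphore>

    std::atomic<bool> vsync_listener{false};
    std::counting_semaphore<> sem_event{0};  // wakes the MTGS ring consumer
    std::counting_semaphore<> sem_vsync{0};  // wakes the sleeping EE core

    void ee_wait_for_vsync() {
        vsync_listener.store(true, std::memory_order_release);
        // Patch 04: kick the ring *after* raising the flag. If the ring had
        // already drained and gone to sleep, no further event would arrive
        // and the acquire below would block forever.
        sem_event.release();
        sem_vsync.acquire();
    }

    void mtgs_after_vsync_packet() { // assumed consumer side
        if (vsync_listener.exchange(false))
            sem_vsync.release();     // wake the EE core exactly once
    }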