Merge pull request #1444 from PCSX2/atomic-relax

Atomic relax
2016-07-22 18:36:02 +02:00 · 2016-07-22 18:36:02 +02:00 · 7d35e15fea
parent 0f62bccb0b f76bf9dddc
commit 7d35e15fea
10 changed files with 125 additions and 143 deletions
--- a/common/include/Utilities/Threading.h
+++ b/common/include/Utilities/Threading.h
@ -159,9 +159,6 @@ namespace Threading
 	// For use in spin/wait loops.
 	extern void SpinWait();

-	// Use prior to committing data to another thread
-	extern void StoreFence();
-
 	// Optional implementation to enable hires thread/process scheduler for the operating system.
 	// Needed by Windows, but might not be relevant to other platforms.
 	extern void EnableHiresScheduler();
@ -170,18 +167,6 @@ namespace Threading
 	// sleeps the current thread for the given number of milliseconds.
 	extern void Sleep( int ms );

-// --------------------------------------------------------------------------------------
-//  AtomicExchange / AtomicIncrement
-// --------------------------------------------------------------------------------------
-// Our fundamental interlocking functions.  All other useful interlocks can be derived
-// from these little beasties!  (these are all implemented internally using cross-platform
-// implementations of _InterlockedExchange and such)
-
-	extern u32 AtomicRead( volatile u32& Target );
-	extern s32 AtomicRead( volatile s32& Target );
-	extern u32 AtomicExchange( volatile u32& Target, u32 value );
-	extern s32 AtomicExchange( volatile s32& Target, s32 value );
-
 	// pthread Cond is an evil api that is not suited for Pcsx2 needs.
 	// Let's not use it. Use mutexes and semaphores instead to create waits. (Air)
 #if 0
@ -399,17 +384,17 @@ namespace Threading
 		ScopedLockBool(Mutex& mutexToLock, std::atomic<bool>& isLockedBool)
 			: m_lock(mutexToLock),
 			  m_bool(isLockedBool) {
-			m_bool = m_lock.IsLocked();
+			m_bool.store(m_lock.IsLocked(), std::memory_order_relaxed);
 		}
 		virtual ~ScopedLockBool() throw() {
-			m_bool = false;
+			m_bool.store(false, std::memory_order_relaxed);
 		}
 		void Acquire() {
 			m_lock.Acquire();
-			m_bool = m_lock.IsLocked();
+			m_bool.store(m_lock.IsLocked(), std::memory_order_relaxed);
 		}
 		void Release() {
-			m_bool = false;
+			m_bool.store(false, std::memory_order_relaxed);
 			m_lock.Release();
 		}
 	};
--- a/common/src/Utilities/ThreadTools.cpp
+++ b/common/src/Utilities/ThreadTools.cpp
@ -782,27 +782,6 @@ void Threading::WaitEvent::Wait()
 }
 #endif

-// --------------------------------------------------------------------------------------
-//  InterlockedExchanges / AtomicExchanges (PCSX2's Helper versions)
-// --------------------------------------------------------------------------------------
-// define some overloads for InterlockedExchanges for commonly used types, like u32 and s32.
-// Note: For all of these atomic operations below to be atomic, the variables need to be 4-byte
-// aligned. Read: http://msdn.microsoft.com/en-us/library/ms684122%28v=vs.85%29.aspx
-
-__fi u32 Threading::AtomicRead(volatile u32& Target) {
-	return Target; // Properly-aligned 32-bit reads are atomic
-}
-__fi s32 Threading::AtomicRead(volatile s32& Target) {
-	return Target; // Properly-aligned 32-bit reads are atomic
-}
-
-__fi u32 Threading::AtomicExchange(volatile u32& Target, u32 value ) {
-	return _InterlockedExchange( (volatile vol_t*)&Target, value );
-}
-__fi s32 Threading::AtomicExchange( volatile s32& Target, s32 value ) {
-	return _InterlockedExchange( (volatile vol_t*)&Target, value );
-}
-
 // --------------------------------------------------------------------------------------
 //  BaseThreadError
 // --------------------------------------------------------------------------------------
--- a/common/src/Utilities/Windows/WinThreads.cpp
+++ b/common/src/Utilities/Windows/WinThreads.cpp
@ -36,11 +36,6 @@ __fi void Threading::SpinWait()
 	__asm pause;
 }

-__fi void Threading::StoreFence()
-{
-	__asm sfence;
-}
-
 __fi void Threading::EnableHiresScheduler()
 {
 	// This improves accuracy of Sleep() by some amount, and only adds a negligible amount of
--- a/common/src/Utilities/wxHelpers.cpp
+++ b/common/src/Utilities/wxHelpers.cpp
@ -66,7 +66,7 @@ BaseDeletableObject::BaseDeletableObject()
 	//pxAssertDev( _CrtIsValidHeapPointer( this ), "BaseDeletableObject types cannot be created on the stack or as temporaries!" );
 	#endif

-	m_IsBeingDeleted = false;
+	m_IsBeingDeleted.store(false, std::memory_order_relaxed);
 }

 BaseDeletableObject::~BaseDeletableObject() throw()
--- a/pcsx2/GS.h
+++ b/pcsx2/GS.h
@ -275,8 +275,9 @@ class SysMtgsThread : public SysThreadBase

 public:
 	// note: when m_ReadPos == m_WritePos, the fifo is empty
-	__aligned(4) uint m_ReadPos;	// cur pos gs is reading from
-	__aligned(4) uint m_WritePos;	// cur pos ee thread is writing to
+	// Threading info: m_ReadPos is updated by the MTGS thread. m_WritePos is updated by the EE thread
+	std::atomic<unsigned int> m_ReadPos;  // cur pos gs is reading from
+	std::atomic<unsigned int> m_WritePos; // cur pos ee thread is writing to

 	std::atomic<bool>	m_RingBufferIsBusy;
 	std::atomic<bool>	m_SignalRingEnable;
--- a/pcsx2/MTGS.cpp
+++ b/pcsx2/MTGS.cpp
@ -37,13 +37,6 @@ using namespace Threading;
 #	define MTGS_LOG(...) do {} while (0)
 #endif

-// forces the compiler to treat a non-volatile value as volatile.
-// This allows us to declare the vars as non-volatile and only use
-// them as volatile when appropriate (more optimized).
-
-#define volatize(x) (*reinterpret_cast<volatile uint*>(&(x)))
-
-
 // =====================================================================================================
 //  MTGS Threaded Class Implementation
 // =====================================================================================================
@ -110,9 +103,9 @@ void SysMtgsThread::ResetGS()
 	//  * Signal a reset.
 	//  * clear the path and byRegs structs (used by GIFtagDummy)

-	m_ReadPos = m_WritePos;
+	m_ReadPos             = m_WritePos.load();
 	m_QueuedFrameCount    = 0;
-	m_VsyncSignalListener = false;
+	m_VsyncSignalListener = 0;

 	MTGS_LOG( "MTGS: Sending Reset..." );
 	SendSimplePacket( GS_RINGTYPE_RESET, 0, 0, 0 );
@ -163,8 +156,15 @@ void SysMtgsThread::PostVsyncStart()

 	if ((m_QueuedFrameCount.fetch_add(1) < EmuConfig.GS.VsyncQueueSize) /*|| (!EmuConfig.GS.VsyncEnable && !EmuConfig.GS.FrameLimitEnable)*/) return;

-	m_VsyncSignalListener = true;
-	//Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", volatize(m_ReadPos), m_WritePos );
+	m_VsyncSignalListener.store(true, std::memory_order_release);
+	//Console.WriteLn( Color_Blue, "(EEcore Sleep) Vsync\t\tringpos=0x%06x, writepos=0x%06x", m_ReadPos.load(), m_WritePos.load() );
+
+	// We are about to wait for a vsync event from the MTGS ring. If the ring is already purged, the event will never come!
+	// To avoid this potential deadlock, the ring must be woken up after m_VsyncSignalListener is set.
+	// Note: we can potentially also miss the previous wake-up if the post is optimized away just before the ring releases its busy signal.
+	// So let's ensure the ring doesn't sleep.
+	m_sem_event.Post();
+
 	m_sem_Vsync.WaitNoCancel();
 }

@ -238,34 +238,45 @@ void SysMtgsThread::OpenPlugin()
 	GSsetGameCRC( ElfCRC, 0 );
 }

-struct RingBufferLock {
+class RingBufferLock {
 	ScopedLock     m_lock1;
 	ScopedLock     m_lock2;
 	SysMtgsThread& m_mtgs;

+	public:
+
 	RingBufferLock(SysMtgsThread& mtgs)
 		: m_lock1(mtgs.m_mtx_RingBufferBusy),
 		  m_lock2(mtgs.m_mtx_RingBufferBusy2),
 		  m_mtgs(mtgs) {
-		m_mtgs.m_RingBufferIsBusy = true;
+		m_mtgs.m_RingBufferIsBusy.store(true, std::memory_order_relaxed);
 	}
 	virtual ~RingBufferLock() throw() {
-		m_mtgs.m_RingBufferIsBusy = false;
+		m_mtgs.m_RingBufferIsBusy.store(false, std::memory_order_relaxed);
 	}
 	void Acquire() {
 		m_lock1.Acquire();
 		m_lock2.Acquire();
-		m_mtgs.m_RingBufferIsBusy = true;
+		m_mtgs.m_RingBufferIsBusy.store(true, std::memory_order_relaxed);
 	}
 	void Release() {
-		m_mtgs.m_RingBufferIsBusy = false;
+		m_mtgs.m_RingBufferIsBusy.store(false, std::memory_order_relaxed);
 		m_lock2.Release();
 		m_lock1.Release();
 	}
+	void PartialAcquire() {
+		m_lock2.Acquire();
+	}
+	void PartialRelease() {
+		m_lock2.Release();
+	}
 };

 void SysMtgsThread::ExecuteTaskInThread()
 {
+	// Threading info: runs in the MTGS thread.
+	// m_ReadPos is only updated by the MTGS thread, so it is safe to load it with relaxed ordering.
+
 #ifdef RINGBUF_DEBUG_STACK
 	PacketTagType prevCmd;
 #endif
@ -285,16 +296,18 @@ void SysMtgsThread::ExecuteTaskInThread()

 		// note: m_ReadPos is intentionally not volatile, because it should only
 		// ever be modified by this thread.
-		while( m_ReadPos != volatize(m_WritePos))
+		while( m_ReadPos.load(std::memory_order_relaxed) != m_WritePos.load(std::memory_order_acquire))
 		{
 			if (EmuConfig.GS.DisableOutput) {
-				m_ReadPos = m_WritePos;
+				m_ReadPos = m_WritePos.load();
 				continue;
 			}

-			pxAssert( m_ReadPos < RingBufferSize );
+			const unsigned int local_ReadPos = m_ReadPos.load(std::memory_order_relaxed);

-			const PacketTagType& tag = (PacketTagType&)RingBuffer[m_ReadPos];
+			pxAssert( local_ReadPos < RingBufferSize );
+
+			const PacketTagType& tag = (PacketTagType&)RingBuffer[local_ReadPos];
 			u32 ringposinc = 1;

 #ifdef RINGBUF_DEBUG_STACK
@ -302,11 +315,11 @@ void SysMtgsThread::ExecuteTaskInThread()

 			m_lock_Stack.Lock();
 			uptr stackpos = ringposStack.back();
-			if( stackpos != m_ReadPos )
+			if( stackpos != local_ReadPos )
 			{
-				Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, m_ReadPos, prevCmd.command );
+				Console.Error( "MTGS Ringbuffer Critical Failure ---> %x to %x (prevCmd: %x)\n", stackpos, local_ReadPos, prevCmd.command );
 			}
-			pxAssert( stackpos == m_ReadPos );
+			pxAssert( stackpos == local_ReadPos );
 			prevCmd = tag;
 			ringposStack.pop_back();
 			m_lock_Stack.Release();
@ -317,7 +330,7 @@ void SysMtgsThread::ExecuteTaskInThread()
 #if COPY_GS_PACKET_TO_MTGS == 1
 				case GS_RINGTYPE_P1:
 				{
-					uint datapos = (m_ReadPos+1) & RingBufferMask;
+					uint datapos = (local_ReadPos+1) & RingBufferMask;
 					const int qsize = tag.data[0];
 					const u128* data = &RingBuffer[datapos];

@ -342,7 +355,7 @@ void SysMtgsThread::ExecuteTaskInThread()

 				case GS_RINGTYPE_P2:
 				{
-					uint datapos = (m_ReadPos+1) & RingBufferMask;
+					uint datapos = (local_ReadPos+1) & RingBufferMask;
 					const int qsize = tag.data[0];
 					const u128* data = &RingBuffer[datapos];

@ -367,7 +380,7 @@ void SysMtgsThread::ExecuteTaskInThread()

 				case GS_RINGTYPE_P3:
 				{
-					uint datapos = (m_ReadPos+1) & RingBufferMask;
+					uint datapos = (local_ReadPos+1) & RingBufferMask;
 					const int qsize = tag.data[0];
 					const u128* data = &RingBuffer[datapos];

@ -402,10 +415,10 @@ void SysMtgsThread::ExecuteTaskInThread()
 				case GS_RINGTYPE_MTVU_GSPACKET: {
 					MTVU_LOG("MTGS - Waiting on semaXGkick!");
 					vu1Thread.KickStart(true);
-					busy.m_lock2.Release();
+					busy.PartialRelease();
 					// Wait for MTVU to complete vu1 program
 					vu1Thread.semaXGkick.WaitWithoutYield();
-					busy.m_lock2.Acquire();
+					busy.PartialAcquire();
 					Gif_Path& path   = gifUnit.gifPath[GIF_PATH_1];
 					GS_Packet gsPack = path.GetGSPacketMTVU(); // Get vu1 program's xgkick packet(s)
 					if (gsPack.size) GSgifTransfer((u32*)&path.buffer[gsPack.offset], gsPack.size/16);
@ -429,7 +442,7 @@ void SysMtgsThread::ExecuteTaskInThread()
 							// This seemingly obtuse system is needed in order to handle cases where the vsync data wraps
 							// around the edge of the ringbuffer.  If not for that I'd just use a struct. >_<

-							uint datapos = (m_ReadPos+1) & RingBufferMask;
+							uint datapos = (local_ReadPos+1) & RingBufferMask;
 							MemCopy_WrappedSrc( RingBuffer.m_Ring, datapos, RingBufferSize, (u128*)RingBuffer.Regs, 0xf );

 							u32* remainder = (u32*)&RingBuffer[datapos];
@ -504,9 +517,9 @@ void SysMtgsThread::ExecuteTaskInThread()

 #ifdef PCSX2_DEVBUILD
 						default:
-							Console.Error("GSThreadProc, bad packet (%x) at m_ReadPos: %x, m_WritePos: %x", tag.command, m_ReadPos, m_WritePos);
+							Console.Error("GSThreadProc, bad packet (%x) at m_ReadPos: %x, m_WritePos: %x", tag.command, local_ReadPos, m_WritePos.load());
 							pxFail( "Bad packet encountered in the MTGS Ringbuffer." );
-							m_ReadPos = m_WritePos;
+							m_ReadPos.store(m_WritePos.load(std::memory_order_acquire), std::memory_order_release);
 						continue;
 #else
 						// Optimized performance in non-Dev builds.
@ -516,22 +529,22 @@ void SysMtgsThread::ExecuteTaskInThread()
 				}
 			}

-			uint newringpos = (m_ReadPos + ringposinc) & RingBufferMask;
+			uint newringpos = (m_ReadPos.load(std::memory_order_relaxed) + ringposinc) & RingBufferMask;

 			if( EmuConfig.GS.SynchronousMTGS )
 			{
 				pxAssert( m_WritePos == newringpos );
 			}

-			m_ReadPos = newringpos;
+			m_ReadPos.store(newringpos, std::memory_order_release);

-			if( m_SignalRingEnable )
+			if(m_SignalRingEnable.load(std::memory_order_acquire))
 			{
 				// The EEcore has requested a signal after some amount of processed data.
 				if( m_SignalRingPosition.fetch_sub( ringposinc ) <= 0 )
 				{
 					// Make sure to post the signal after the m_ReadPos has been updated...
-					m_SignalRingEnable = false;
+					m_SignalRingEnable.store(false, std::memory_order_release);
 					m_sem_OnRingReset.Post();
 					continue;
 				}
@ -547,7 +560,7 @@ void SysMtgsThread::ExecuteTaskInThread()
 		if( m_SignalRingEnable.exchange(false) )
 		{
 			//Console.Warning( "(MTGS Thread) Dangling RingSignal on empty buffer!  signalpos=0x%06x", m_SignalRingPosition.exchange(0) ) );
-			m_SignalRingPosition = 0;
+			m_SignalRingPosition.store(0, std::memory_order_release);
 			m_sem_OnRingReset.Post();
 		}

@ -599,14 +612,17 @@ void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU)
 	Gif_Path&   path = gifUnit.gifPath[GIF_PATH_1];
 	u32 startP1Packs = weakWait ? path.GetPendingGSPackets() : 0;

-	if (isMTVU || volatize(m_ReadPos) != m_WritePos) {
+	// Both m_ReadPos and m_WritePos can be loaded with relaxed ordering here: we only want to test
+	// whether the queue is empty, and we don't access the content of the queue.
+
+	if (isMTVU || m_ReadPos.load(std::memory_order_relaxed) != m_WritePos.load(std::memory_order_relaxed)) {
 		SetEvent();
 		RethrowException();
 		for(;;) {
 			if (weakWait) m_mtx_RingBufferBusy2.Wait();
 			else          m_mtx_RingBufferBusy .Wait();
 			RethrowException();
-			if(!isMTVU && volatize(m_ReadPos) == m_WritePos) break;
+			if(!isMTVU && m_ReadPos.load(std::memory_order_relaxed) == m_WritePos.load(std::memory_order_relaxed)) break;
 			u32 curP1Packs = weakWait ? path.GetPendingGSPackets() : 0;
 			if (weakWait && ((startP1Packs-curP1Packs) || !curP1Packs)) break;
 			// On weakWait we will stop waiting on the MTGS thread if the
@ -629,7 +645,7 @@ void SysMtgsThread::WaitGS(bool syncRegs, bool weakWait, bool isMTVU)
 // For use in loops that wait on the GS thread to do certain things.
 void SysMtgsThread::SetEvent()
 {
-	if(!m_RingBufferIsBusy)
+	if(!m_RingBufferIsBusy.load(std::memory_order_relaxed))
 		m_sem_event.Post();

 	m_CopyDataTally = 0;
@ -653,13 +669,13 @@ void SysMtgsThread::SendDataPacket()
 	PacketTagType& tag = (PacketTagType&)RingBuffer[m_packet_startpos];
 	tag.data[0] = actualSize;

-	m_WritePos = m_packet_writepos;
+	m_WritePos.store(m_packet_writepos, std::memory_order_release);

 	if(EmuConfig.GS.SynchronousMTGS)
 	{
 		WaitGS();
 	}
-	else if( !m_RingBufferIsBusy )
+	else if(!m_RingBufferIsBusy.load(std::memory_order_relaxed))
 	{
 		m_CopyDataTally += m_packet_size;
 		if( m_CopyDataTally > 0x2000 ) SetEvent();
@ -675,7 +691,7 @@ void SysMtgsThread::GenericStall( uint size )
 	// Note on volatiles: m_WritePos is not modified by the GS thread, so there's no need
 	// to use volatile reads here.  We do cache it though, since we know it never changes,
 	// except for calls to RingbufferRestert() -- handled below.
-	const uint writepos = m_WritePos;
+	const uint writepos = m_WritePos.load(std::memory_order_relaxed);

 	// Sanity checks! (within the confines of our ringbuffer please!)
 	pxAssert( size < RingBufferSize );
@ -686,7 +702,7 @@ void SysMtgsThread::GenericStall( uint size )
 	// But if not then we need to make sure the readpos is outside the scope of
 	// the block about to be written (writepos + size)

-	uint readpos = volatize(m_ReadPos);
+	uint readpos = m_ReadPos.load(std::memory_order_acquire);
 	uint freeroom;

 	if (writepos < readpos)
@ -714,15 +730,15 @@ void SysMtgsThread::GenericStall( uint size )
 		if( somedone > 0x80 )
 		{
 			pxAssertDev( m_SignalRingEnable == 0, "MTGS Thread Synchronization Error" );
-			m_SignalRingPosition = somedone;
+			m_SignalRingPosition.store(somedone, std::memory_order_release);

 			//Console.WriteLn( Color_Blue, "(EEcore Sleep) PrepDataPacker \tringpos=0x%06x, writepos=0x%06x, signalpos=0x%06x", readpos, writepos, m_SignalRingPosition );

 			while(true) {
-				m_SignalRingEnable = true;
+				m_SignalRingEnable.store(true, std::memory_order_release);
 				SetEvent();
 				m_sem_OnRingReset.WaitWithoutYield();
-				readpos = volatize(m_ReadPos);
+				readpos = m_ReadPos.load(std::memory_order_acquire);
 				//Console.WriteLn( Color_Blue, "(EEcore Awake) Report!\tringpos=0x%06x", readpos );

 				if (writepos < readpos)
@ -741,7 +757,7 @@ void SysMtgsThread::GenericStall( uint size )
 			SetEvent();
 			while(true) {
 				SpinWait();
-				readpos = volatize(m_ReadPos);
+				readpos = m_ReadPos.load(std::memory_order_acquire);

 				if (writepos < readpos)
 					freeroom = readpos - writepos;
@ -762,12 +778,13 @@ void SysMtgsThread::PrepDataPacket( MTGS_RingCommand cmd, u32 size )

 	// Command qword: Low word is the command, and the high word is the packet
 	// length in SIMDs (128 bits).
+	const unsigned int local_WritePos = m_WritePos.load(std::memory_order_relaxed);

-	PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos];
+	PacketTagType& tag = (PacketTagType&)RingBuffer[local_WritePos];
 	tag.command = cmd;
 	tag.data[0] = m_packet_size;
-	m_packet_startpos = m_WritePos;
-	m_packet_writepos = (m_WritePos + 1) & RingBufferMask;
+	m_packet_startpos = local_WritePos;
+	m_packet_writepos = (local_WritePos + 1) & RingBufferMask;
 }

 // Returns the amount of giftag data processed (in simd128 values).
@ -784,9 +801,9 @@ void SysMtgsThread::PrepDataPacket( GIF_PATH pathidx, u32 size )

 __fi void SysMtgsThread::_FinishSimplePacket()
 {
-	uint future_writepos = (m_WritePos+1) & RingBufferMask;
-	pxAssert( future_writepos != volatize(m_ReadPos) );
-	m_WritePos = future_writepos;
+	uint future_writepos = (m_WritePos.load(std::memory_order_relaxed) +1) & RingBufferMask;
+	pxAssert( future_writepos != m_ReadPos.load(std::memory_order_acquire) );
+	m_WritePos.store(future_writepos, std::memory_order_release);

 	if( EmuConfig.GS.SynchronousMTGS )
 		WaitGS();
@ -799,7 +816,7 @@ void SysMtgsThread::SendSimplePacket( MTGS_RingCommand type, int data0, int data
 	//ScopedLock locker( m_PacketLocker );

 	GenericStall(1);
-	PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos];
+	PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos.load(std::memory_order_relaxed)];

 	tag.command = type;
 	tag.data[0] = data0;
@ -814,7 +831,7 @@ void SysMtgsThread::SendSimpleGSPacket(MTGS_RingCommand type, u32 offset, u32 si
 	SendSimplePacket(type, (int)offset, (int)size, (int)path);

 	if(!EmuConfig.GS.SynchronousMTGS) {
-		if(!m_RingBufferIsBusy) {
+		if(!m_RingBufferIsBusy.load(std::memory_order_relaxed)) {
 			m_CopyDataTally += size / 16;
 			if (m_CopyDataTally > 0x2000) SetEvent();
 		}
@ -826,7 +843,7 @@ void SysMtgsThread::SendPointerPacket( MTGS_RingCommand type, u32 data0, void* d
 	//ScopedLock locker( m_PacketLocker );

 	GenericStall(1);
-	PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos];
+	PacketTagType& tag = (PacketTagType&)RingBuffer[m_WritePos.load(std::memory_order_relaxed)];

 	tag.command = type;
 	tag.data[0] = data0;
--- a/pcsx2/MTVU.cpp
+++ b/pcsx2/MTVU.cpp
@ -21,7 +21,6 @@

 __aligned16 VU_Thread vu1Thread(CpuVU1, VU1);

-#define volatize(x) (*reinterpret_cast<volatile uint*>(&(x)))
 #define size_u32(x) (((u32)x+3u)>>2) // Rounds up a size in bytes for size in u32's
 #define MTVU_ALWAYS_KICK 0
 #define MTVU_SYNC_MODE   0
@ -52,7 +51,13 @@ void SaveStateBase::mtvuFreeze()
 	FreezeTag("MTVU");
 	pxAssert(vu1Thread.IsDone());
 	if (!IsSaving()) vu1Thread.Reset();
-	Freeze(vu1Thread.vuCycles);
+	// vuCycles now holds atomics, so each counter is frozen through a plain temporary.
+	// NOTE(review): Freeze() presumably fills its argument when a state is being loaded
+	// (the removed line passed the array itself), so the value must be stored back then.
+	for (size_t i = 0; i < 4; ++i) {
+		unsigned int v = vu1Thread.vuCycles[i].load();
+		Freeze(v);
+		if (!IsSaving()) vu1Thread.vuCycles[i].store(v);
+	}
 	Freeze(vu1Thread.vuCycleIdx);
 }

@ -75,14 +77,16 @@ void VU_Thread::Reset()
 {
 	ScopedLock lock(mtxBusy);

-	read_pos     = 0;
-	write_pos    = 0;
 	write_offset = 0;
 	vuCycleIdx   = 0;
+	read_pos     = 0;
 	isBusy       = false;
+	write_pos    = 0;
 	memzero(vif);
 	memzero(vifRegs);
-	memzero(vuCycles);
+	// Clear this instance's own cycle counters (member access, not the vu1Thread global).
+	for (size_t i = 0; i < 4; ++i)
+		vuCycles[i] = 0;
 }

 void VU_Thread::ExecuteTaskInThread()
@ -97,7 +100,7 @@ void VU_Thread::ExecuteRingBuffer()
 	for(;;) {
 		semaEvent.WaitWithoutYield();
 		ScopedLockBool lock(mtxBusy, isBusy);
-		while (read_pos != GetWritePos()) {
+		while (read_pos.load(std::memory_order_relaxed) != GetWritePos()) {
 			u32 tag = Read();
 			switch (tag) {
 				case MTVU_VU_EXECUTE: {
@ -109,7 +112,7 @@ void VU_Thread::ExecuteRingBuffer()
 					vuCPU->Execute(vu1RunCycles);
 					gifUnit.gifPath[GIF_PATH_1].FinishGSPacketMTVU();
 					semaXGkick.Post(); // Tell MTGS a path1 packet is complete
-					AtomicExchange(vuCycles[vuCycleIdx], vuRegs.cycle);
+					vuCycles[vuCycleIdx].store(vuRegs.cycle, std::memory_order_relaxed);
 					vuCycleIdx  = (vuCycleIdx + 1) & 3;
 					break;
 				}
@ -137,12 +140,12 @@ void VU_Thread::ExecuteRingBuffer()
 					Read(&vif.tag, vif_copy_size);
 					ReadRegs(&vifRegs);
 					u32 size = Read();
-					MTVU_Unpack(&buffer[read_pos], vifRegs);
+					MTVU_Unpack(&buffer[read_pos.load(std::memory_order_relaxed)], vifRegs);
 					incReadPos(size_u32(size));
 					break;
 				}
 				case MTVU_NULL_PACKET:
-					read_pos = 0;
+					read_pos.store(0, std::memory_order_release);
 					break;
 				jNO_DEFAULT;
 			}
@ -156,8 +159,8 @@ __ri void VU_Thread::WaitOnSize(s32 size)
 {
 	for(;;) {
 		s32 readPos  = GetReadPos();
-		if (readPos <= write_pos) break; // MTVU is reading in back of write_pos
-		if (readPos >  write_pos + size) break; // Enough free front space
+		if (readPos <= write_pos.load(std::memory_order_relaxed)) break; // MTVU is reading in back of write_pos
+		if (readPos >  write_pos.load(std::memory_order_relaxed) + size) break; // Enough free front space
 		if (1) { // Let MTVU run to free up buffer space
 			KickStart();
 			if (IsDevBuild) DevCon.WriteLn("WaitOnSize()");
@ -174,12 +177,12 @@ void VU_Thread::ReserveSpace(s32 size)
 	pxAssert(size      < buffer_size);
 	pxAssert(size > 0);
 	pxAssert(write_offset == 0);
-	if (write_pos + size > buffer_size) {
+	if (write_pos.load(std::memory_order_relaxed) + size > buffer_size) {
 		pxAssert(write_pos > 0);
 		WaitOnSize(1); // Size of MTVU_NULL_PACKET
 		Write(MTVU_NULL_PACKET);
 		write_offset = 0;
-		AtomicExchange(volatize(write_pos), 0);
+		write_pos.store(0, std::memory_order_release);
 	}
 	WaitOnSize(size);
 }
@ -187,48 +190,48 @@ void VU_Thread::ReserveSpace(s32 size)
 // Use this when reading read_pos from ee thread
 __fi s32 VU_Thread::GetReadPos()
 {
-	return read_pos.load();
+	return read_pos.load(std::memory_order_acquire);
 }
 // Use this when reading write_pos from vu thread
 __fi s32 VU_Thread::GetWritePos()
 {
-	return AtomicRead(volatize(write_pos));
+	return write_pos.load(std::memory_order_acquire);
 }
 // Gets the effective write pointer after adding write_offset
 __fi u32* VU_Thread::GetWritePtr()
 {
-	return &buffer[(write_pos + write_offset) & buffer_mask];
+	return &buffer[(write_pos.load(std::memory_order_relaxed) + write_offset) & buffer_mask];
 }

 __fi void VU_Thread::incReadPos(s32 offset)
 { // Offset in u32 sizes
-	read_pos = (read_pos + offset) & buffer_mask;
+	read_pos.store((read_pos.load(std::memory_order_relaxed) + offset) & buffer_mask, std::memory_order_release);
 }
 __fi void VU_Thread::incWritePos()
 { // Adds write_offset
-	s32 temp = (write_pos + write_offset) & buffer_mask;
+	s32 temp = (write_pos.load(std::memory_order_relaxed) + write_offset) & buffer_mask;
 	write_offset = 0;
-	AtomicExchange(volatize(write_pos), temp);
+	write_pos.store(temp, std::memory_order_release);
 	if (MTVU_ALWAYS_KICK) KickStart();
 	if (MTVU_SYNC_MODE)   WaitVU();
 }

 __fi u32 VU_Thread::Read()
 {
-	u32 ret = buffer[read_pos];
+	u32 ret = buffer[read_pos.load(std::memory_order_relaxed)];
 	incReadPos(1);
 	return ret;
 }

 __fi void VU_Thread::Read(void* dest, u32 size)
 {
-	memcpy(dest, &buffer[read_pos], size);
+	memcpy(dest, &buffer[read_pos.load(std::memory_order_relaxed)], size);
 	incReadPos(size_u32(size));
 }

 __fi void VU_Thread::ReadRegs(VIFregisters* dest)
 {
-	VIFregistersMTVU* src = (VIFregistersMTVU*)&buffer[read_pos];
+	VIFregistersMTVU* src = (VIFregistersMTVU*)&buffer[read_pos.load(std::memory_order_relaxed)];
 	dest->cycle = src->cycle;
 	dest->mode = src->mode;
 	dest->num = src->num;
@ -265,19 +268,21 @@ __fi void VU_Thread::WriteRegs(VIFregisters* src)
 // Used for vu cycle stealing hack
 u32 VU_Thread::Get_vuCycles()
 {
-	return (AtomicRead(vuCycles[0]) + AtomicRead(vuCycles[1])
-		  + AtomicRead(vuCycles[2]) + AtomicRead(vuCycles[3])) >> 2;
+	return (vuCycles[0].load(std::memory_order_relaxed) +
+			vuCycles[1].load(std::memory_order_relaxed) +
+			vuCycles[2].load(std::memory_order_relaxed) +
+			vuCycles[3].load(std::memory_order_relaxed)) >> 2;
 }

 void VU_Thread::KickStart(bool forceKick)
 {
 	if ((forceKick && !semaEvent.Count())
-	|| (!isBusy && GetReadPos() != write_pos)) semaEvent.Post();
+	|| (!isBusy.load(std::memory_order_relaxed) && GetReadPos() != write_pos.load(std::memory_order_relaxed))) semaEvent.Post();
 }

 bool VU_Thread::IsDone()
 {
-	return !isBusy && GetReadPos() == GetWritePos();
+	return !isBusy.load(std::memory_order_relaxed) && GetReadPos() == GetWritePos();
 }

 void VU_Thread::WaitVU()
--- a/pcsx2/MTVU.h
+++ b/pcsx2/MTVU.h
@ -30,9 +30,9 @@ class VU_Thread : public pxThread {
 	static const s32 buffer_size = (_1mb * 16) / sizeof(s32);
 	static const u32 buffer_mask = buffer_size - 1;
 	__aligned(4) u32 buffer[buffer_size];
-	__aligned(4) std::atomic<int> read_pos; // Only modified by VU thread
-	__aligned(4) std::atomic<bool> isBusy;   // Is thread processing data?
-	__aligned(4) s32  write_pos;    // Only modified by EE thread
+	std::atomic<int> read_pos; // Only modified by VU thread
+	std::atomic<bool> isBusy;   // Is thread processing data?
+	std::atomic<int> write_pos;    // Only modified by EE thread
 	__aligned(4) s32  write_offset; // Only modified by EE thread
 	__aligned(4) Mutex     mtxBusy;
 	__aligned(4) Semaphore semaEvent;
@ -43,7 +43,7 @@ public:
 	__aligned16  vifStruct        vif;
 	__aligned16  VIFregisters     vifRegs;
 	__aligned(4) Semaphore semaXGkick;
-	__aligned(4) u32 vuCycles[4]; // Used for VU cycle stealing hack
+	__aligned(4) std::atomic<unsigned int> vuCycles[4]; // Used for VU cycle stealing hack
 	__aligned(4) u32 vuCycleIdx;  // Used for VU cycle stealing hack

 	VU_Thread(BaseVUmicroCPU*& _vuCPU, VURegs& _vuRegs);
--- a/plugins/GSdx/GSTextureSW.cpp
+++ b/plugins/GSdx/GSTextureSW.cpp
@ -25,7 +25,7 @@

 GSTextureSW::GSTextureSW(int type, int width, int height)
 {
-	m_mapped.clear();
+	m_mapped.clear(std::memory_order_release);
 	m_size = GSVector2i(width, height);
 	m_type = type;
 	m_format = 0;
@ -68,7 +68,7 @@ bool GSTextureSW::Map(GSMap& m, const GSVector4i* r)

 	if(m_data != NULL && r2.left >= 0 && r2.right <= m_size.x && r2.top >= 0 && r2.bottom <= m_size.y)
 	{
-		if (!m_mapped.test_and_set())
+		if (!m_mapped.test_and_set(std::memory_order_acquire))
 		{
 			m.bits = (uint8*)m_data + m_pitch * r2.top + (r2.left << 2);
 			m.pitch = m_pitch;
@ -82,7 +82,7 @@ bool GSTextureSW::Map(GSMap& m, const GSVector4i* r)

 void GSTextureSW::Unmap()
 {
-	m_mapped.clear();
+	m_mapped.clear(std::memory_order_release);
 }

 bool GSTextureSW::Save(const string& fn, bool user_image, bool dds)