From d3fd0eddbbe423edddc1abb6489ccc7369c8b892 Mon Sep 17 00:00:00 2001
From: degasus <wickmarkus@web.de>
Date: Sun, 26 Jan 2014 10:55:10 +0100
Subject: [PATCH] OSX: don't avoid unsync mapping on nvidia gpus just because
 the windows driver doesn't like it

OS X ships its own NVIDIA driver, so it does not share the performance issues of NVIDIA's closed-source Linux and Windows drivers. NVIDIA GPUs on OS X will therefore now use the MapAndSync backend, like all other OS X drivers.

fixes issue 6596

I've also cleaned up the if/else block selecting the best backend a bit.
---
 .../Core/VideoBackends/OGL/StreamBuffer.cpp   | 52 ++++++++++++-------
 Source/Core/VideoCommon/DriverDetails.cpp     |  2 +
 Source/Core/VideoCommon/DriverDetails.h       | 11 ++++
 3 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp
index 71bfa404f6..f21ab60a50 100644
--- a/Source/Core/VideoBackends/OGL/StreamBuffer.cpp
+++ b/Source/Core/VideoBackends/OGL/StreamBuffer.cpp
@@ -152,7 +152,7 @@ public:
 			m_iterator = 0;
 		}
 		u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
-			GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
+			GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
 		return std::make_pair(pointer, m_iterator);
 	}
 
@@ -187,7 +187,7 @@ public:
 		Align(stride);
 		AllocMemory(size);
 		u8* pointer = (u8*)glMapBufferRange(m_buffertype, m_iterator, size,
-			GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_RANGE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
+			GL_MAP_WRITE_BIT | GL_MAP_FLUSH_EXPLICIT_BIT | GL_MAP_UNSYNCHRONIZED_BIT);
 		return std::make_pair(pointer, m_iterator);
 	}
 
@@ -346,24 +346,40 @@ public:
 // choose best streaming library based on the supported extensions and known issues
 StreamBuffer* StreamBuffer::Create(u32 type, size_t size)
 {
-	bool nvidia = !strcmp(g_ogl_config.gl_vendor, "NVIDIA Corporation");
+	// without basevertex support, only streaming methods which upload everything to zero work fine:
+	if(!g_ogl_config.bSupportsGLBaseVertex)
+	{
+		if(!DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM))
+			return new BufferSubData(type, size);
 
-	if (g_ogl_config.bSupportsGLBufferStorage &&
-		!(DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) && type == GL_ARRAY_BUFFER))
-		return new BufferStorage(type, size);
-	else if(!g_ogl_config.bSupportsGLBaseVertex && !DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTREAM))
-		return new BufferSubData(type, size);
-	else if(!g_ogl_config.bSupportsGLBaseVertex)
+		// BufferData is by far the worst way, only use it if needed
 		return new BufferData(type, size);
-	else if(g_ogl_config.bSupportsGLSync && g_ogl_config.bSupportsGLPinnedMemory &&
-		!(DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) && type == GL_ELEMENT_ARRAY_BUFFER))
-		return new PinnedMemory(type, size);
-	else if(nvidia)
-		return new BufferSubData(type, size);
-	else if(g_ogl_config.bSupportsGLSync)
-		return new MapAndSync(type, size);
-	else
-		return new MapAndOrphan(type, size);
+	}
+
+	// Prefer the syncing buffers over the orphaning one
+	if(g_ogl_config.bSupportsGLSync)
+	{
+		// try to use buffer storage whenever possible
+		if (g_ogl_config.bSupportsGLBufferStorage &&
+			!(DriverDetails::HasBug(DriverDetails::BUG_BROKENBUFFERSTORAGE) && type == GL_ARRAY_BUFFER))
+			return new BufferStorage(type, size);
+
+		// pinned memory is almost as good
+		if(g_ogl_config.bSupportsGLPinnedMemory &&
+			!(DriverDetails::HasBug(DriverDetails::BUG_BROKENPINNEDMEMORY) && type == GL_ELEMENT_ARRAY_BUFFER))
+			return new PinnedMemory(type, size);
+
+		// don't fall back to MapAnd* on drivers flagged with BUG_BROKENUNSYNCMAPPING, use BufferSubData instead
+		if(DriverDetails::HasBug(DriverDetails::BUG_BROKENUNSYNCMAPPING))
+			return new BufferSubData(type, size);
+
+		// mapping fallback (bSupportsGLSync was already checked by the enclosing if, so the check below is always true)
+		if(g_ogl_config.bSupportsGLSync)
+			return new MapAndSync(type, size);
+	}
+
+	// default fallback, should work everywhere, but isn't the best way to do this job
+	return new MapAndOrphan(type, size);
 }
 
 }
diff --git a/Source/Core/VideoCommon/DriverDetails.cpp b/Source/Core/VideoCommon/DriverDetails.cpp
index 3f90b2f4af..37b5c3cb3b 100644
--- a/Source/Core/VideoCommon/DriverDetails.cpp
+++ b/Source/Core/VideoCommon/DriverDetails.cpp
@@ -55,6 +55,8 @@ namespace DriverDetails
 		{OS_WINDOWS,VENDOR_NVIDIA,   DRIVER_NVIDIA,       -1, BUG_BROKENBUFFERSTORAGE, -1.0, 33220.0, true},
 		{OS_LINUX,  VENDOR_NVIDIA,   DRIVER_NVIDIA,       -1, BUG_BROKENBUFFERSTORAGE, -1.0, 33138.0, true},
 		{OS_OSX,    VENDOR_INTEL,    DRIVER_INTEL,      3000, BUG_PRIMITIVERESTART,    -1.0, -1.0, true},
+		{OS_WINDOWS,VENDOR_NVIDIA,   DRIVER_NVIDIA,       -1, BUG_BROKENUNSYNCMAPPING, -1.0, -1.0, true},
+		{OS_LINUX,  VENDOR_NVIDIA,   DRIVER_NVIDIA,       -1, BUG_BROKENUNSYNCMAPPING, -1.0, -1.0, true},
 	};
 
 	std::map<Bug, BugInfo> m_bugs;
diff --git a/Source/Core/VideoCommon/DriverDetails.h b/Source/Core/VideoCommon/DriverDetails.h
index 384e372fb3..68c31cc932 100644
--- a/Source/Core/VideoCommon/DriverDetails.h
+++ b/Source/Core/VideoCommon/DriverDetails.h
@@ -154,6 +154,17 @@ namespace DriverDetails
 		// The drivers on OS X has broken primitive restart.
 		// Intel HD 4000 series isn't affected by the bug
 		BUG_PRIMITIVERESTART,
+		// Bug: unsync mapping doesn't work fine
+		// Affected devices: nvidia driver
+		// Started Version: -1
+		// Ended Version: -1
+		// The nvidia driver (both windows + linux) doesn't like unsync mapping performance-wise.
+		// Because of their threaded behavior, they seem not to handle unsync mapping completely unsynchronized;
+		// in fact, they serialize the driver, which adds a much bigger overhead.
+		// Workaround: Use BufferSubData
+		// TODO: some windows AMD driver/gpu combinations also seem to be affected,
+		//       but as they all support pinned memory, it doesn't matter
+		BUG_BROKENUNSYNCMAPPING,
 	};
 
 	// Initializes our internal vendor, device family, and driver version