From cf02893d2290bd1a16fa7592c2df3a373f3b6888 Mon Sep 17 00:00:00 2001
From: TellowKrinkle <tellowkrinkle@gmail.com>
Date: Sun, 14 Feb 2021 05:32:01 -0600
Subject: [PATCH] macOS: faster semaphores

mach semaphore is kernel-only, and doesn't have a userspace fast path
---
 common/Darwin/DarwinSemaphore.cpp | 33 +++++++++++++++++++------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/common/Darwin/DarwinSemaphore.cpp b/common/Darwin/DarwinSemaphore.cpp
index d4124c3b40..4040690d6d 100644
--- a/common/Darwin/DarwinSemaphore.cpp
+++ b/common/Darwin/DarwinSemaphore.cpp
@@ -62,13 +62,12 @@ Threading::Semaphore::Semaphore()
 	// other platforms explicitly make a thread-private (unshared) semaphore
 	// here. But it seems Mach doesn't support that.
 	MACH_CHECK(semaphore_create(mach_task_self(), (semaphore_t*)&m_sema, SYNC_POLICY_FIFO, 0));
-	__atomic_store_n(&m_counter, 0, __ATOMIC_SEQ_CST);
+	__atomic_store_n(&m_counter, 0, __ATOMIC_RELEASE);
 }
 
 Threading::Semaphore::~Semaphore()
 {
 	MACH_CHECK(semaphore_destroy(mach_task_self(), (semaphore_t)m_sema));
-	__atomic_store_n(&m_counter, 0, __ATOMIC_SEQ_CST);
 }
 
 void Threading::Semaphore::Reset()
@@ -80,24 +79,23 @@ void Threading::Semaphore::Reset()
 
 void Threading::Semaphore::Post()
 {
-	MACH_CHECK(semaphore_signal(m_sema));
-	__atomic_add_fetch(&m_counter, 1, __ATOMIC_SEQ_CST);
+	if (__atomic_fetch_add(&m_counter, 1, __ATOMIC_RELEASE) < 0)
+		MACH_CHECK(semaphore_signal(m_sema));
 }
 
 void Threading::Semaphore::Post(int multiple)
 {
 	for (int i = 0; i < multiple; ++i)
 	{
-		MACH_CHECK(semaphore_signal(m_sema));
+		Post();
 	}
-	__atomic_add_fetch(&m_counter, multiple, __ATOMIC_SEQ_CST);
 }
 
 void Threading::Semaphore::WaitWithoutYield()
 {
 	pxAssertMsg(!wxThread::IsMain(), "Unyielding semaphore wait issued from the main/gui thread.  Please use Wait() instead.");
-	MACH_CHECK(semaphore_wait(m_sema));
-	__atomic_sub_fetch(&m_counter, 1, __ATOMIC_SEQ_CST);
+	if (__atomic_sub_fetch(&m_counter, 1, __ATOMIC_ACQUIRE) < 0)
+		MACH_CHECK(semaphore_wait(m_sema));
 }
 
 bool Threading::Semaphore::WaitWithoutYield(const wxTimeSpan& timeout)
@@ -109,6 +107,9 @@ bool Threading::Semaphore::WaitWithoutYield(const wxTimeSpan& timeout)
 	// semaphore_timedwait() is the way it's used in Grand Central Dispatch,
 	// which is open-source.
 
+	if (__atomic_sub_fetch(&m_counter, 1, __ATOMIC_ACQUIRE) >= 0)
+		return true;
+
 	// on x86 platforms, mach_absolute_time() returns nanoseconds
 	// TODO(aktau): on iOS a scale value from mach_timebase_info will be necessary
 	u64 const kOneThousand = 1000;
@@ -122,7 +123,8 @@ bool Threading::Semaphore::WaitWithoutYield(const wxTimeSpan& timeout)
 		if (now > deadline)
 		{
 			// timed out by definition
-			return false;
+			kr = KERN_OPERATION_TIMED_OUT;
+			break;
 		}
 
 		u64 timeleft = deadline - now;
@@ -141,15 +143,20 @@ bool Threading::Semaphore::WaitWithoutYield(const wxTimeSpan& timeout)
 
 	if (kr == KERN_OPERATION_TIMED_OUT)
 	{
-		return false;
+		int orig = __atomic_load_n(&m_counter, __ATOMIC_RELAXED);
+		while (orig < 0)
+		{
+			if (__atomic_compare_exchange_n(&m_counter, &orig, orig + 1, true, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
+				return false;
+		}
+		// Semaphore was signalled between our wait expiring and now, keep kernel sema in sync
+		kr = semaphore_wait(m_sema);
 	}
 
 	// while it's entirely possible to have KERN_FAILURE here, we should
 	// probably assert so we can study and correct the actual error here
 	// (the thread dying while someone is wainting for it).
 	MACH_CHECK(kr);
-
-	__atomic_sub_fetch(&m_counter, 1, __ATOMIC_SEQ_CST);
 	return true;
 }
 
@@ -250,6 +257,6 @@ void Threading::Semaphore::WaitNoCancel(const wxTimeSpan& timeout)
 
 int Threading::Semaphore::Count()
 {
-	return __atomic_load_n(&m_counter, __ATOMIC_SEQ_CST);
+	return __atomic_load_n(&m_counter, __ATOMIC_RELAXED);
 }
 #endif