From 08f7a28920860e220c4f21e516e81f5588d732fe Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sun, 14 Aug 2022 08:59:11 -0700
Subject: [PATCH] Alternative mutex

---
 src/xenia/apu/xma_context.cc            | 10 ++--
 src/xenia/apu/xma_context.h             |  2 +-
 src/xenia/base/mutex.cc                 | 69 ++++++++++++++++++++++++-
 src/xenia/base/mutex.h                  | 59 +++++++++++++++++----
 src/xenia/base/threading_timer_queue.cc |  2 +-
 src/xenia/cpu/mmio_handler.h            |  2 +-
 src/xenia/cpu/ppc/ppc_context.h         |  4 +-
 src/xenia/gpu/primitive_processor.h     |  2 +-
 src/xenia/gpu/shared_memory.h           |  6 +--
 src/xenia/gpu/texture_cache.cc          | 12 ++---
 src/xenia/gpu/texture_cache.h           | 20 +++----
 src/xenia/memory.cc                     |  8 +--
 src/xenia/memory.h                      | 10 ++--
 13 files changed, 152 insertions(+), 54 deletions(-)

diff --git a/src/xenia/apu/xma_context.cc b/src/xenia/apu/xma_context.cc
index 217d96442..7024db5a8 100644
--- a/src/xenia/apu/xma_context.cc
+++ b/src/xenia/apu/xma_context.cc
@@ -91,7 +91,7 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
 }
 
 bool XmaContext::Work() {
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
   if (!is_allocated() || !is_enabled()) {
     return false;
   }
@@ -106,7 +106,7 @@ bool XmaContext::Work() {
 }
 
 void XmaContext::Enable() {
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
 
   auto context_ptr = memory()->TranslateVirtual(guest_ptr());
   XMA_CONTEXT_DATA data(context_ptr);
@@ -134,7 +134,7 @@ bool XmaContext::Block(bool poll) {
 }
 
 void XmaContext::Clear() {
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
   XELOGAPU("XmaContext: reset context {}", id());
 
   auto context_ptr = memory()->TranslateVirtual(guest_ptr());
@@ -151,14 +151,14 @@ void XmaContext::Clear() {
 }
 
 void XmaContext::Disable() {
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
   XELOGAPU("XmaContext: disabling context {}", id());
   set_is_enabled(false);
 }
 
 void XmaContext::Release() {
   // Lock it in case the decoder thread is working on it now.
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
   assert_true(is_allocated_ == true);
 
   set_is_allocated(false);
diff --git a/src/xenia/apu/xma_context.h b/src/xenia/apu/xma_context.h
index 1ffdf9dba..a348c7836 100644
--- a/src/xenia/apu/xma_context.h
+++ b/src/xenia/apu/xma_context.h
@@ -200,7 +200,7 @@ class XmaContext {
 
   uint32_t id_ = 0;
   uint32_t guest_ptr_ = 0;
-  std::mutex lock_;
+  xe_mutex lock_;
   bool is_allocated_ = false;
   bool is_enabled_ = false;
   // bool is_dirty_ = true;
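
Because xe_mutex keeps the lock()/unlock()/try_lock() surface of std::mutex,
the call sites above only change the std::lock_guard template argument. A
minimal sketch of the drop-in pattern (the function and variable here are
illustrative, not part of the patch):

    #include <mutex>

    #include "xenia/base/mutex.h"

    static xe::xe_mutex example_lock;  // stands in for XmaContext::lock_

    void ExampleGuardedWork() {
      // Works because xe_mutex models Lockable; when the fast Win32 path is
      // compiled out, xe_mutex is simply an alias for std::mutex.
      std::lock_guard<xe::xe_mutex> guard(example_lock);
      // ... decoder state mutation goes here ...
    }
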
diff --git a/src/xenia/base/mutex.cc b/src/xenia/base/mutex.cc
index 322985594..762f05490 100644
--- a/src/xenia/base/mutex.cc
+++ b/src/xenia/base/mutex.cc
@@ -8,11 +8,76 @@
  */
 
 #include "xenia/base/mutex.h"
+#if XE_PLATFORM_WIN32 == 1
+#include "xenia/base/platform_win.h"
+#endif
+
 namespace xe {
+#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
+// The default spin count EnterCriticalSection uses on Windows is insane:
+// 0x20007D0i64 (33556432 times!). When a lock is highly contended,
+// performance degrades sharply on some processors.
+#define XE_CRIT_SPINCOUNT 128
+/*
+chrispy: todo: if a thread exits before releasing the global mutex, we need to
+detect this and release the mutex. One way to do this is by using FlsAlloc and
+a PFLS_CALLBACK_FUNCTION, which gets called with the fiber-local data when a
+thread exits.
+*/
+thread_local unsigned global_mutex_depth = 0;
+static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
+  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
+}
+
+xe_global_mutex::xe_global_mutex() {
+  InitializeCriticalSectionAndSpinCount(global_critical_section(this),
+                                        XE_CRIT_SPINCOUNT);
+}
+xe_global_mutex::~xe_global_mutex() {
+  DeleteCriticalSection(global_critical_section(this));
+}
+void xe_global_mutex::lock() {
+  // Enter the critical section only on the outermost lock; recursive locks
+  // on the same thread just increment the thread-local depth.
+  if (!global_mutex_depth) {
+    EnterCriticalSection(global_critical_section(this));
+  }
+  global_mutex_depth++;
+}
+void xe_global_mutex::unlock() {
+  if (--global_mutex_depth == 0) {
+    LeaveCriticalSection(global_critical_section(this));
+  }
+}
+bool xe_global_mutex::try_lock() {
+  if (global_mutex_depth) {
+    ++global_mutex_depth;
+    return true;
+  } else {
+    BOOL success = TryEnterCriticalSection(global_critical_section(this));
+    if (success) {
+      ++global_mutex_depth;
+    }
+    return success;
+  }
+}
+
+CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
+  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
+}
+xe_fast_mutex::xe_fast_mutex() {
+  InitializeCriticalSectionAndSpinCount(fast_crit(this), XE_CRIT_SPINCOUNT);
+}
+xe_fast_mutex::~xe_fast_mutex() { DeleteCriticalSection(fast_crit(this)); }
+
+void xe_fast_mutex::lock() { EnterCriticalSection(fast_crit(this)); }
+void xe_fast_mutex::unlock() { LeaveCriticalSection(fast_crit(this)); }
+bool xe_fast_mutex::try_lock() {
+  return TryEnterCriticalSection(fast_crit(this));
+}
+#endif
 // chrispy: moved this out of body of function to eliminate the initialization
 // guards
-static std::recursive_mutex global_mutex;
-std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
+static global_mutex_type global_mutex;
+global_mutex_type& global_critical_region::mutex() { return global_mutex; }
 
 }  // namespace xe
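
The todo above (threads exiting while still holding the global mutex) could
look roughly like the following. This is only a sketch of the
FlsAlloc/PFLS_CALLBACK_FUNCTION idea from the comment; every name is
hypothetical, and it assumes the single static global_mutex instance defined
at the bottom of this file:

    // Hypothetical sketch, not part of this patch.
    static xe::xe_global_mutex* g_tracked_mutex = nullptr;

    static VOID NTAPI ReleaseAbandonedGlobalMutex(PVOID fls_data) {
      // Windows runs FLS callbacks on thread exit with the slot's last
      // non-null value; lock()/unlock() would mirror global_mutex_depth into
      // the slot via FlsSetValue.
      uintptr_t depth = reinterpret_cast<uintptr_t>(fls_data);
      if (depth && g_tracked_mutex) {
        // The critical section is physically entered only on the outermost
        // lock(), so a single leave releases it for other threads.
        LeaveCriticalSection(
            reinterpret_cast<CRITICAL_SECTION*>(g_tracked_mutex));
      }
    }

    static DWORD g_mutex_fls_slot = FlsAlloc(ReleaseAbandonedGlobalMutex);
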
diff --git a/src/xenia/base/mutex.h b/src/xenia/base/mutex.h
index 71d6bd26e..e93f71e1b 100644
--- a/src/xenia/base/mutex.h
+++ b/src/xenia/base/mutex.h
@@ -9,11 +9,50 @@
 
 #ifndef XENIA_BASE_MUTEX_H_
 #define XENIA_BASE_MUTEX_H_
-
 #include <mutex>
+#include "platform.h"
+//#define XE_ENABLE_FAST_WIN32_MUTEX 1
 
 namespace xe {
+#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
+/*
+  Must conform to BasicLockable
+  (https://en.cppreference.com/w/cpp/named_req/BasicLockable) as well as
+  Lockable (https://en.cppreference.com/w/cpp/named_req/Lockable).
+
+  This emulates a recursive mutex, but with far less overhead.
+*/
+class alignas(64) xe_global_mutex {
+  char detail[64];
+
+ public:
+  xe_global_mutex();
+  ~xe_global_mutex();
+
+  void lock();
+  void unlock();
+  bool try_lock();
+};
+using global_mutex_type = xe_global_mutex;
+
+class alignas(64) xe_fast_mutex {
+  char detail[64];
+
+ public:
+  xe_fast_mutex();
+  ~xe_fast_mutex();
+
+  void lock();
+  void unlock();
+  bool try_lock();
+};
+using xe_mutex = xe_fast_mutex;
+#else
+using global_mutex_type = std::recursive_mutex;
+using xe_mutex = std::mutex;
+#endif
+using global_unique_lock_type = std::unique_lock<global_mutex_type>;
 // The global critical region mutex singleton.
 // This must guard any operation that may suspend threads or be sensitive to
 // being suspended such as global table locks and such.
@@ -54,30 +93,30 @@ namespace xe {
 // };
 class global_critical_region {
  public:
-  static std::recursive_mutex& mutex();
+  static global_mutex_type& mutex();
 
   // Acquires a lock on the global critical section.
   // Use this when keeping an instance is not possible. Otherwise, prefer
   // to keep an instance of global_critical_region near the members requiring
   // it to keep things readable.
-  static std::unique_lock<std::recursive_mutex> AcquireDirect() {
-    return std::unique_lock<std::recursive_mutex>(mutex());
+  static global_unique_lock_type AcquireDirect() {
+    return global_unique_lock_type(mutex());
   }
 
   // Acquires a lock on the global critical section.
-  inline std::unique_lock<std::recursive_mutex> Acquire() {
-    return std::unique_lock<std::recursive_mutex>(mutex());
+  inline global_unique_lock_type Acquire() {
+    return global_unique_lock_type(mutex());
   }
 
   // Acquires a deferred lock on the global critical section.
-  inline std::unique_lock<std::recursive_mutex> AcquireDeferred() {
-    return std::unique_lock<std::recursive_mutex>(mutex(), std::defer_lock);
+  inline global_unique_lock_type AcquireDeferred() {
+    return global_unique_lock_type(mutex(), std::defer_lock);
   }
 
   // Tries to acquire a lock on the global critical section.
   // Check owns_lock() to see if the lock was successfully acquired.
-  inline std::unique_lock<std::recursive_mutex> TryAcquire() {
-    return std::unique_lock<std::recursive_mutex>(mutex(), std::try_to_lock);
+  inline global_unique_lock_type TryAcquire() {
+    return global_unique_lock_type(mutex(), std::try_to_lock);
   }
 };
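
global_unique_lock_type keeps the std::unique_lock interface, so the four
acquire helpers behave exactly as before; only the mutex type behind them
changes when XE_ENABLE_FAST_WIN32_MUTEX is defined. A small usage sketch
(the function is hypothetical):

    #include "xenia/base/mutex.h"

    void ExampleSuspendSensitiveWork() {
      xe::global_critical_region global_critical_region_;
      {
        // Blocks until the global critical section is held; released at }.
        auto global_lock = global_critical_region_.Acquire();
        // ... mutate state that must not race thread suspension ...
      }
      // Non-blocking attempt; owns_lock() reports whether it succeeded.
      auto maybe_lock = global_critical_region_.TryAcquire();
      if (maybe_lock.owns_lock()) {
        // ... opportunistic cleanup ...
      }
    }
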
diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc
index 7d6e612cf..8e19b50dd 100644
--- a/src/xenia/base/threading_timer_queue.cc
+++ b/src/xenia/base/threading_timer_queue.cc
@@ -36,7 +36,7 @@ using WaitItem = TimerQueueWaitItem;
 edit: actually had to change it back; when i was testing it only worked
 because i fixed disruptorplus' code to compile (it gives wrong args to
 condition_variable::wait_until) but now it builds
 */
-using WaitStrat = dp::spin_wait_strategy;  // dp::blocking_wait_strategy;
+using WaitStrat = dp::blocking_wait_strategy;
 
 class TimerQueue {
  public:
diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h
index d9f6dc04c..4fc281134 100644
--- a/src/xenia/cpu/mmio_handler.h
+++ b/src/xenia/cpu/mmio_handler.h
@@ -48,7 +48,7 @@ class MMIOHandler {
   typedef uint32_t (*HostToGuestVirtual)(const void* context,
                                          const void* host_address);
   typedef bool (*AccessViolationCallback)(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+      global_unique_lock_type global_lock_locked_once,  // not passed by const reference like the others?
       void* context, void* host_address, bool is_write);
 
   // access_violation_callback is called with global_critical_region locked once
diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h
index a9c0c8ed1..09205850b 100644
--- a/src/xenia/cpu/ppc/ppc_context.h
+++ b/src/xenia/cpu/ppc/ppc_context.h
@@ -15,7 +15,7 @@
 #include <string>
 
 #include "xenia/base/vec128.h"
-
+#include "xenia/base/mutex.h"
 namespace xe {
 namespace cpu {
 class Processor;
@@ -405,7 +405,7 @@ typedef struct alignas(64) PPCContext_s {
 
   // Global interrupt lock, held while interrupts are disabled or interrupts are
   // executing. This is shared among all threads and comes from the processor.
-  std::recursive_mutex* global_mutex;
+  global_mutex_type* global_mutex;
 
   // Used to shuttle data into externs. Contents volatile.
   uint64_t scratch;
diff --git a/src/xenia/gpu/primitive_processor.h b/src/xenia/gpu/primitive_processor.h
index 6a77a3d0f..aac84885d 100644
--- a/src/xenia/gpu/primitive_processor.h
+++ b/src/xenia/gpu/primitive_processor.h
@@ -883,7 +883,7 @@ class PrimitiveProcessor {
   // Must be called in a global critical region.
   void UpdateCacheBucketsNonEmptyL2(
       uint32_t bucket_index_div_64,
-      [[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
+      [[maybe_unused]] const global_unique_lock_type&
           global_lock) {
     uint64_t& cache_buckets_non_empty_l2_ref =
         cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];
diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h
index 8fdb1af45..63cc380d0 100644
--- a/src/xenia/gpu/shared_memory.h
+++ b/src/xenia/gpu/shared_memory.h
@@ -35,7 +35,7 @@ class SharedMemory {
   virtual void SetSystemPageBlocksValidWithGpuDataWritten();
 
   typedef void (*GlobalWatchCallback)(
-      const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+      const global_unique_lock_type& global_lock, void* context,
       uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
   typedef void* GlobalWatchHandle;
   // Registers a callback invoked when something is invalidated in the GPU
@@ -49,8 +49,8 @@ class SharedMemory {
   GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback,
                                         void* callback_context);
   void UnregisterGlobalWatch(GlobalWatchHandle handle);
-  typedef void (*WatchCallback)(
-      const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+  typedef void (*WatchCallback)(const global_unique_lock_type& global_lock,
+                                void* context,
       void* data, uint64_t argument, bool invalidated_by_gpu);
   typedef void* WatchHandle;
   // Registers a callback invoked when the specified memory range is invalidated
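
All of the watch-callback signatures above pass the already-held lock as a
const reference: the callee gets compile-time evidence that the global
critical region is held, but cannot release it. A conforming callback might
look like this (hypothetical function, shaped like
SharedMemory::GlobalWatchCallback):

    #include <cstdint>

    #include "xenia/base/mutex.h"

    // Hypothetical watcher; the const& parameter proves the caller holds the
    // global critical region, so shared tables may be read safely here.
    static void OnGuestRangeInvalidated(
        const xe::global_unique_lock_type& global_lock, void* context,
        uint32_t address_first, uint32_t address_last,
        bool invalidated_by_gpu) {
      // ... drop cached data overlapping [address_first, address_last] ...
    }
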
diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc
index 18fac01d9..05f6e2090 100644
--- a/src/xenia/gpu/texture_cache.cc
+++ b/src/xenia/gpu/texture_cache.cc
@@ -507,7 +507,7 @@ TextureCache::Texture::~Texture() {
 }
 
 void TextureCache::Texture::MakeUpToDateAndWatch(
-    const std::unique_lock<std::recursive_mutex>& global_lock) {
+    const global_unique_lock_type& global_lock) {
   SharedMemory& shared_memory = texture_cache().shared_memory();
   if (base_outdated_) {
     assert_not_zero(GetGuestBaseSize());
@@ -552,7 +552,7 @@ void TextureCache::Texture::MarkAsUsed() {
 }
 
 void TextureCache::Texture::WatchCallback(
-    [[maybe_unused]] const std::unique_lock<std::recursive_mutex>& global_lock,
+    [[maybe_unused]] const global_unique_lock_type& global_lock,
     bool is_mip) {
   if (is_mip) {
     assert_not_zero(GetGuestMipsSize());
@@ -565,8 +565,8 @@ void TextureCache::Texture::WatchCallback(
   }
 }
 
-void TextureCache::WatchCallback(
-    const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
+                                 void* context,
     void* data, uint64_t argument, bool invalidated_by_gpu) {
   Texture& texture = *static_cast<Texture*>(context);
   texture.WatchCallback(global_lock, argument != 0);
@@ -902,7 +902,7 @@ bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled,
 }
 
 void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
-    const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+    const global_unique_lock_type& global_lock, void* context,
     uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
   TextureCache* texture_cache = reinterpret_cast<TextureCache*>(context);
   texture_cache->ScaledResolveGlobalWatchCallback(
@@ -910,7 +910,7 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
 }
 
 void TextureCache::ScaledResolveGlobalWatchCallback(
-    const std::unique_lock<std::recursive_mutex>& global_lock,
+    const global_unique_lock_type& global_lock,
     uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
   assert_true(IsDrawResolutionScaled());
   if (invalidated_by_gpu) {
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index 0f19ba6f9..cb690a286 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -230,19 +230,15 @@ class TextureCache {
     }
     bool IsResolved() const { return base_resolved_ || mips_resolved_; }
 
-    bool base_outdated(
-        const std::unique_lock<std::recursive_mutex>& global_lock) const {
+    bool base_outdated(const global_unique_lock_type& global_lock) const {
      return base_outdated_;
     }
-    bool mips_outdated(
-        const std::unique_lock<std::recursive_mutex>& global_lock) const {
+    bool mips_outdated(const global_unique_lock_type& global_lock) const {
      return mips_outdated_;
     }
 
-    void MakeUpToDateAndWatch(
-        const std::unique_lock<std::recursive_mutex>& global_lock);
+    void MakeUpToDateAndWatch(const global_unique_lock_type& global_lock);
 
-    void WatchCallback(
-        const std::unique_lock<std::recursive_mutex>& global_lock, bool is_mip);
+    void WatchCallback(const global_unique_lock_type& global_lock, bool is_mip);
 
     // For LRU caching - updates the last usage frame and moves the texture to
     // the end of the usage queue. Must be called any time the texture is
@@ -579,8 +575,8 @@ class TextureCache {
   void UpdateTexturesTotalHostMemoryUsage(uint64_t add, uint64_t subtract);
 
   // Shared memory callback for texture data invalidation.
-  static void WatchCallback(
-      const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+  static void WatchCallback(const global_unique_lock_type& global_lock,
+                            void* context,
       void* data, uint64_t argument, bool invalidated_by_gpu);
 
   // Checks if there are any pages that contain scaled resolve data within the
@@ -589,10 +585,10 @@
   // Global shared memory invalidation callback for invalidating scaled resolved
   // texture data.
   static void ScaledResolveGlobalWatchCallbackThunk(
-      const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+      const global_unique_lock_type& global_lock, void* context,
       uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
   void ScaledResolveGlobalWatchCallback(
-      const std::unique_lock<std::recursive_mutex>& global_lock,
+      const global_unique_lock_type& global_lock,
      uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
 
   const RegisterFile& register_file_;
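
base_outdated() and mips_outdated() never read their lock argument (hence
[[maybe_unused]] at the definitions); the parameter exists purely so that
callers must already hold the global critical region. The idiom reduces to
this (hypothetical class, not from the patch):

    #include "xenia/base/mutex.h"

    class ExampleGuardedFlag {
     public:
      // Callable only by code that already owns a global critical region
      // lock object; the parameter itself is never used.
      bool dirty(
          [[maybe_unused]] const xe::global_unique_lock_type& held) const {
        return dirty_;
      }

     private:
      bool dirty_ = false;
    };
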
diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc
index 388fefc62..f94b0c469 100644
--- a/src/xenia/memory.cc
+++ b/src/xenia/memory.cc
@@ -465,7 +465,7 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
 }
 
 bool Memory::AccessViolationCallback(
-    std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+    global_unique_lock_type global_lock_locked_once,
     void* host_address, bool is_write) {
   // Access via physical_membase_ is special, when need to bypass everything
   // (for instance, for a data provider to actually write the data) so only
@@ -493,14 +493,14 @@ bool Memory::AccessViolationCallback(
 }
 
 bool Memory::AccessViolationCallbackThunk(
-    std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+    global_unique_lock_type global_lock_locked_once,
     void* context, void* host_address, bool is_write) {
   return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
       std::move(global_lock_locked_once), host_address, is_write);
 }
 
 bool Memory::TriggerPhysicalMemoryCallbacks(
-    std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+    global_unique_lock_type global_lock_locked_once,
     uint32_t virtual_address, uint32_t length, bool is_write,
     bool unwatch_exact_range, bool unprotect) {
   BaseHeap* heap = LookupHeap(virtual_address);
@@ -1711,7 +1711,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
 }
 
 bool PhysicalHeap::TriggerCallbacks(
-    std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+    global_unique_lock_type global_lock_locked_once,
     uint32_t virtual_address, uint32_t length, bool is_write,
     bool unwatch_exact_range, bool unprotect) {
   // TODO(Triang3l): Support read watches.
diff --git a/src/xenia/memory.h b/src/xenia/memory.h
index ed313a26d..ebbc814e6 100644
--- a/src/xenia/memory.h
+++ b/src/xenia/memory.h
@@ -271,8 +271,7 @@ class PhysicalHeap : public BaseHeap {
                                  bool enable_invalidation_notifications,
                                  bool enable_data_providers);
   // Returns true if any page in the range was watched.
-  bool TriggerCallbacks(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+  bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
                         uint32_t virtual_address, uint32_t length,
                         bool is_write, bool unwatch_exact_range,
                         bool unprotect = true);
@@ -459,7 +458,7 @@ class Memory {
   // TODO(Triang3l): Implement data providers - this is why locking depth of 1
   // will be required in the future.
   bool TriggerPhysicalMemoryCallbacks(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+      global_unique_lock_type global_lock_locked_once,
       uint32_t virtual_address, uint32_t length, bool is_write,
       bool unwatch_exact_range, bool unprotect = true);
 
@@ -508,11 +507,10 @@ class Memory {
   static uint32_t HostToGuestVirtualThunk(const void* context,
                                           const void* host_address);
 
-  bool AccessViolationCallback(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+  bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
                                void* host_address, bool is_write);
   static bool AccessViolationCallbackThunk(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+      global_unique_lock_type global_lock_locked_once,
       void* context, void* host_address, bool is_write);
 
   std::filesystem::path file_name_;
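
Unlike the const& watch callbacks, the access-violation and trigger paths
take the lock by value and std::move it down the chain: each callee owns the
lock and may legitimately unlock around slow work (see the data-provider
"locking depth of 1" todo above). Reduced to its essentials, with
hypothetical names and plain std::unique_lock so the sketch is
self-contained:

    #include <mutex>
    #include <utility>

    using example_lock_type = std::unique_lock<std::recursive_mutex>;

    static bool HandleViolation(example_lock_type lock_held_once,
                                void* host_address, bool is_write) {
      // Owns the lock: may unlock()/lock() around a slow data provider.
      return is_write;
    }

    static bool HandleViolationThunk(example_lock_type lock_held_once,
                                     void* context, void* host_address,
                                     bool is_write) {
      // unique_lock is move-only, so ownership is forwarded, never copied.
      return HandleViolation(std::move(lock_held_once), host_address,
                             is_write);
    }
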