Alternative mutex

This commit is contained in:
chss95cs@gmail.com 2022-08-14 08:59:11 -07:00
parent 6bc3191b97
commit 08f7a28920
13 changed files with 152 additions and 54 deletions

View File

@ -91,7 +91,7 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
}
bool XmaContext::Work() {
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
if (!is_allocated() || !is_enabled()) {
return false;
}
@ -106,7 +106,7 @@ bool XmaContext::Work() {
}
void XmaContext::Enable() {
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
auto context_ptr = memory()->TranslateVirtual(guest_ptr());
XMA_CONTEXT_DATA data(context_ptr);
@ -134,7 +134,7 @@ bool XmaContext::Block(bool poll) {
}
void XmaContext::Clear() {
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
XELOGAPU("XmaContext: reset context {}", id());
auto context_ptr = memory()->TranslateVirtual(guest_ptr());
@ -151,14 +151,14 @@ void XmaContext::Clear() {
}
void XmaContext::Disable() {
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
XELOGAPU("XmaContext: disabling context {}", id());
set_is_enabled(false);
}
void XmaContext::Release() {
// Lock it in case the decoder thread is working on it now.
std::lock_guard<std::mutex> lock(lock_);
std::lock_guard<xe_mutex> lock(lock_);
assert_true(is_allocated_ == true);
set_is_allocated(false);

View File

@ -200,7 +200,7 @@ class XmaContext {
uint32_t id_ = 0;
uint32_t guest_ptr_ = 0;
std::mutex lock_;
xe_mutex lock_;
bool is_allocated_ = false;
bool is_enabled_ = false;
// bool is_dirty_ = true;
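
A minimal sketch (editor's addition, not part of the commit) of why the XmaContext swap is a drop-in change: std::lock_guard only requires the BasicLockable interface (lock()/unlock()), which both std::mutex and xe::xe_mutex provide, so the guarded bodies stay untouched. The ExampleContext type below is hypothetical.

#include <mutex>

#include "xenia/base/mutex.h"

// Hypothetical holder mirroring the XmaContext pattern above.
struct ExampleContext {
  xe::xe_mutex lock_;
  bool enabled_ = false;

  void Enable() {
    // Identical call site to the std::mutex version; only the template
    // argument changed.
    std::lock_guard<xe::xe_mutex> lock(lock_);
    enabled_ = true;
  }
};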

View File

@ -8,11 +8,76 @@
*/
#include "xenia/base/mutex.h"
#if XE_PLATFORM_WIN32 == 1
#include "xenia/base/platform_win.h"
#endif
namespace xe {
#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
// The default spin count for EnterCriticalSection is insane on Windows: 0x20007D0i64 (33556432 times!!)
// When a lock is highly contended, performance degrades sharply on some processors.
#define XE_CRIT_SPINCOUNT 128
/*
chrispy: todo: if a thread exits before releasing the global mutex, we need to
detect that and release the mutex. One way to do this is FlsAlloc with a
PFLS_CALLBACK_FUNCTION, which gets called with the fiber-local data when a
thread exits (sketched below, after this file's diff).
*/
thread_local unsigned global_mutex_depth = 0;
static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}
xe_global_mutex::xe_global_mutex() {
InitializeCriticalSectionAndSpinCount(global_critical_section(this),
XE_CRIT_SPINCOUNT);
}
xe_global_mutex::~xe_global_mutex() {
DeleteCriticalSection(global_critical_section(this));
}
void xe_global_mutex::lock() {
if (global_mutex_depth) {
} else {
EnterCriticalSection(global_critical_section(this));
}
global_mutex_depth++;
}
void xe_global_mutex::unlock() {
if (--global_mutex_depth == 0) {
LeaveCriticalSection(global_critical_section(this));
}
}
bool xe_global_mutex::try_lock() {
if (global_mutex_depth) {
++global_mutex_depth;
return true;
} else {
BOOL success = TryEnterCriticalSection(global_critical_section(this));
if (success) {
++global_mutex_depth;
}
return success;
}
}
CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
return reinterpret_cast<CRITICAL_SECTION*>(mutex);
}
xe_fast_mutex::xe_fast_mutex() {
InitializeCriticalSectionAndSpinCount(fast_crit(this), XE_CRIT_SPINCOUNT);
}
xe_fast_mutex::~xe_fast_mutex() { DeleteCriticalSection(fast_crit(this)); }
void xe_fast_mutex::lock() { EnterCriticalSection(fast_crit(this)); }
void xe_fast_mutex::unlock() { LeaveCriticalSection(fast_crit(this)); }
bool xe_fast_mutex::try_lock() {
return TryEnterCriticalSection(fast_crit(this));
}
#endif
// chrispy: moved this out of the function body to eliminate the
// initialization guards
static std::recursive_mutex global_mutex;
std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
static global_mutex_type global_mutex;
global_mutex_type& global_critical_region::mutex() { return global_mutex; }
} // namespace xe
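
The todo comment near the top of this file mentions FlsAlloc and PFLS_CALLBACK_FUNCTION as a way to release the global mutex if a thread exits while still holding it. The sketch below is an editor's illustration of that idea, not part of the commit: it ignores recursion depth, and the names g_global_mutex_fls_slot, InstallGlobalMutexCleanup and NoteGlobalMutexHeld are hypothetical.

#include <windows.h>

#include "xenia/base/mutex.h"

static DWORD g_global_mutex_fls_slot = FLS_OUT_OF_INDEXES;

// Called by the OS with the slot's stored value when the owning thread exits
// (or its fiber is deleted) while the value is non-null.
static VOID NTAPI ReleaseGlobalMutexOnThreadExit(PVOID fls_data) {
  auto* mutex = static_cast<xe::xe_global_mutex*>(fls_data);
  if (mutex) {
    mutex->unlock();
  }
}

void InstallGlobalMutexCleanup() {
  g_global_mutex_fls_slot = FlsAlloc(ReleaseGlobalMutexOnThreadExit);
}

// Would be called right after xe_global_mutex::lock(); clearing the slot in
// unlock() via FlsSetValue(g_global_mutex_fls_slot, nullptr) keeps the
// callback from firing for threads that released the mutex normally.
void NoteGlobalMutexHeld(xe::xe_global_mutex* mutex) {
  FlsSetValue(g_global_mutex_fls_slot, mutex);
}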

View File

@ -9,11 +9,50 @@
#ifndef XENIA_BASE_MUTEX_H_
#define XENIA_BASE_MUTEX_H_
#include <mutex>
#include "platform.h"
//#define XE_ENABLE_FAST_WIN32_MUTEX 1
namespace xe {
#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
/*
Must conform to BasicLockable
(https://en.cppreference.com/w/cpp/named_req/BasicLockable) as well as
Lockable (https://en.cppreference.com/w/cpp/named_req/Lockable).
This emulates a recursive mutex, but with far less overhead.
*/
class alignas(64) xe_global_mutex {
char detail[64];
public:
xe_global_mutex();
~xe_global_mutex();
void lock();
void unlock();
bool try_lock();
};
using global_mutex_type = xe_global_mutex;
class alignas(64) xe_fast_mutex {
char detail[64];
public:
xe_fast_mutex();
~xe_fast_mutex();
void lock();
void unlock();
bool try_lock();
};
using xe_mutex = xe_fast_mutex;
#else
using global_mutex_type = std::recursive_mutex;
using xe_mutex = std::mutex;
#endif
using global_unique_lock_type = std::unique_lock<global_mutex_type>;
// The global critical region mutex singleton.
// This must guard any operation that may suspend threads or be sensitive to
// being suspended such as global table locks and such.
@ -54,30 +93,30 @@ namespace xe {
// };
class global_critical_region {
public:
static std::recursive_mutex& mutex();
static global_mutex_type& mutex();
// Acquires a lock on the global critical section.
// Use this when keeping an instance is not possible. Otherwise, prefer
// to keep an instance of global_critical_region near the members requiring
// it to keep things readable.
static std::unique_lock<std::recursive_mutex> AcquireDirect() {
return std::unique_lock<std::recursive_mutex>(mutex());
static global_unique_lock_type AcquireDirect() {
return global_unique_lock_type(mutex());
}
// Acquires a lock on the global critical section.
inline std::unique_lock<std::recursive_mutex> Acquire() {
return std::unique_lock<std::recursive_mutex>(mutex());
inline global_unique_lock_type Acquire() {
return global_unique_lock_type(mutex());
}
// Acquires a deferred lock on the global critical section.
inline std::unique_lock<std::recursive_mutex> AcquireDeferred() {
return std::unique_lock<std::recursive_mutex>(mutex(), std::defer_lock);
inline global_unique_lock_type AcquireDeferred() {
return global_unique_lock_type(mutex(), std::defer_lock);
}
// Tries to acquire a lock on the global critical section.
// Check owns_lock() to see if the lock was successfully acquired.
inline std::unique_lock<std::recursive_mutex> TryAcquire() {
return std::unique_lock<std::recursive_mutex>(mutex(), std::try_to_lock);
inline global_unique_lock_type TryAcquire() {
return global_unique_lock_type(mutex(), std::try_to_lock);
}
};
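
For reference, a short usage sketch (editor's addition, not from the diff): call sites that go through global_critical_region::Acquire() and the global_unique_lock_type alias are unaffected by the switch, because they never name std::recursive_mutex directly. ExampleTable below is hypothetical.

#include <cstdint>
#include <vector>

#include "xenia/base/mutex.h"

class ExampleTable {
 public:
  void Insert(uint32_t key) {
    // Resolves to std::unique_lock<xe_global_mutex> on Win32 builds with the
    // fast mutex enabled, and std::unique_lock<std::recursive_mutex> otherwise.
    auto global_lock = global_critical_region_.Acquire();
    keys_.push_back(key);
  }

 private:
  xe::global_critical_region global_critical_region_;
  std::vector<uint32_t> keys_;
};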

View File

@ -36,7 +36,7 @@ using WaitItem = TimerQueueWaitItem;
edit: actually had to change it back; when I was testing, it only worked because I had fixed disruptorplus' code to compile (it passes the wrong arguments to condition_variable::wait_until), but it builds now
*/
using WaitStrat = dp::spin_wait_strategy; //dp::blocking_wait_strategy;
using WaitStrat = dp::blocking_wait_strategy;
class TimerQueue {
public:

View File

@ -48,7 +48,7 @@ class MMIOHandler {
typedef uint32_t (*HostToGuestVirtual)(const void* context,
const void* host_address);
typedef bool (*AccessViolationCallback)(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,  // not passed by reference with const like the others?
void* context, void* host_address, bool is_write);
// access_violation_callback is called with global_critical_region locked once
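
The question in the added comment above points at a real distinction: this callback receives the lock by value, so ownership of the locked state transfers into the callee, which may unlock it early; the watch callbacks only observe the lock through a const reference. A generic sketch of that by-value transfer (editor's addition, using plain std::mutex rather than the project's global_unique_lock_type):

#include <mutex>
#include <utility>

static std::mutex example_mutex;

// Takes ownership of the locked state; free to unlock before heavy work.
static void HandleFault(std::unique_lock<std::mutex> lock_once) {
  lock_once.unlock();
  // ... do work that must not run under the lock ...
}

static void OnFault() {
  std::unique_lock<std::mutex> lock(example_mutex);
  HandleFault(std::move(lock));  // same shape as AccessViolationCallbackThunk
}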

View File

@ -15,7 +15,7 @@
#include <string>
#include "xenia/base/vec128.h"
#include "xenia/base/mutex.h"
namespace xe {
namespace cpu {
class Processor;
@ -405,7 +405,7 @@ typedef struct alignas(64) PPCContext_s {
// Global interrupt lock, held while interrupts are disabled or interrupts are
// executing. This is shared among all threads and comes from the processor.
std::recursive_mutex* global_mutex;
global_mutex_type* global_mutex;
// Used to shuttle data into externs. Contents volatile.
uint64_t scratch;

View File

@ -883,7 +883,7 @@ class PrimitiveProcessor {
// Must be called in a global critical region.
void UpdateCacheBucketsNonEmptyL2(
uint32_t bucket_index_div_64,
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
[[maybe_unused]] const global_unique_lock_type&
global_lock) {
uint64_t& cache_buckets_non_empty_l2_ref =
cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];

View File

@ -35,7 +35,7 @@ class SharedMemory {
virtual void SetSystemPageBlocksValidWithGpuDataWritten();
typedef void (*GlobalWatchCallback)(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
const global_unique_lock_type& global_lock, void* context,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
typedef void* GlobalWatchHandle;
// Registers a callback invoked when something is invalidated in the GPU
@ -49,8 +49,8 @@ class SharedMemory {
GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback,
void* callback_context);
void UnregisterGlobalWatch(GlobalWatchHandle handle);
typedef void (*WatchCallback)(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
typedef void (*WatchCallback)(const global_unique_lock_type& global_lock,
void* context,
void* data, uint64_t argument, bool invalidated_by_gpu);
typedef void* WatchHandle;
// Registers a callback invoked when the specified memory range is invalidated

View File

@ -507,7 +507,7 @@ TextureCache::Texture::~Texture() {
}
void TextureCache::Texture::MakeUpToDateAndWatch(
const std::unique_lock<std::recursive_mutex>& global_lock) {
const global_unique_lock_type& global_lock) {
SharedMemory& shared_memory = texture_cache().shared_memory();
if (base_outdated_) {
assert_not_zero(GetGuestBaseSize());
@ -552,7 +552,7 @@ void TextureCache::Texture::MarkAsUsed() {
}
void TextureCache::Texture::WatchCallback(
[[maybe_unused]] const std::unique_lock<std::recursive_mutex>& global_lock,
[[maybe_unused]] const global_unique_lock_type& global_lock,
bool is_mip) {
if (is_mip) {
assert_not_zero(GetGuestMipsSize());
@ -565,8 +565,8 @@ void TextureCache::Texture::WatchCallback(
}
}
void TextureCache::WatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
void* context,
void* data, uint64_t argument, bool invalidated_by_gpu) {
Texture& texture = *static_cast<Texture*>(context);
texture.WatchCallback(global_lock, argument != 0);
@ -902,7 +902,7 @@ bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled,
}
void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
const global_unique_lock_type& global_lock, void* context,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
TextureCache* texture_cache = reinterpret_cast<TextureCache*>(context);
texture_cache->ScaledResolveGlobalWatchCallback(
@ -910,7 +910,7 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
}
void TextureCache::ScaledResolveGlobalWatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock,
const global_unique_lock_type& global_lock,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
assert_true(IsDrawResolutionScaled());
if (invalidated_by_gpu) {

View File

@ -230,19 +230,15 @@ class TextureCache {
}
bool IsResolved() const { return base_resolved_ || mips_resolved_; }
bool base_outdated(
const std::unique_lock<std::recursive_mutex>& global_lock) const {
bool base_outdated(const global_unique_lock_type& global_lock) const {
return base_outdated_;
}
bool mips_outdated(
const std::unique_lock<std::recursive_mutex>& global_lock) const {
bool mips_outdated(const global_unique_lock_type& global_lock) const {
return mips_outdated_;
}
void MakeUpToDateAndWatch(
const std::unique_lock<std::recursive_mutex>& global_lock);
void MakeUpToDateAndWatch(const global_unique_lock_type& global_lock);
void WatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock, bool is_mip);
void WatchCallback(const global_unique_lock_type& global_lock, bool is_mip);
// For LRU caching - updates the last usage frame and moves the texture to
// the end of the usage queue. Must be called any time the texture is
@ -579,8 +575,8 @@ class TextureCache {
void UpdateTexturesTotalHostMemoryUsage(uint64_t add, uint64_t subtract);
// Shared memory callback for texture data invalidation.
static void WatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
static void WatchCallback(const global_unique_lock_type& global_lock,
void* context,
void* data, uint64_t argument, bool invalidated_by_gpu);
// Checks if there are any pages that contain scaled resolve data within the
@ -589,10 +585,10 @@ class TextureCache {
// Global shared memory invalidation callback for invalidating scaled resolved
// texture data.
static void ScaledResolveGlobalWatchCallbackThunk(
const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
const global_unique_lock_type& global_lock, void* context,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
void ScaledResolveGlobalWatchCallback(
const std::unique_lock<std::recursive_mutex>& global_lock,
const global_unique_lock_type& global_lock,
uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
const RegisterFile& register_file_;

View File

@ -465,7 +465,7 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
}
bool Memory::AccessViolationCallback(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
void* host_address, bool is_write) {
// Access via physical_membase_ is special, used when we need to bypass everything
// (for instance, for a data provider to actually write the data) so only
@ -493,14 +493,14 @@ bool Memory::AccessViolationCallback(
}
bool Memory::AccessViolationCallbackThunk(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
void* context, void* host_address, bool is_write) {
return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
std::move(global_lock_locked_once), host_address, is_write);
}
bool Memory::TriggerPhysicalMemoryCallbacks(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect) {
BaseHeap* heap = LookupHeap(virtual_address);
@ -1711,7 +1711,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
}
bool PhysicalHeap::TriggerCallbacks(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect) {
// TODO(Triang3l): Support read watches.

View File

@ -271,8 +271,7 @@ class PhysicalHeap : public BaseHeap {
bool enable_invalidation_notifications,
bool enable_data_providers);
// Returns true if any page in the range was watched.
bool TriggerCallbacks(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect = true);
@ -459,7 +458,7 @@ class Memory {
// TODO(Triang3l): Implement data providers - this is why locking depth of 1
// will be required in the future.
bool TriggerPhysicalMemoryCallbacks(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
uint32_t virtual_address, uint32_t length, bool is_write,
bool unwatch_exact_range, bool unprotect = true);
@ -508,11 +507,10 @@ class Memory {
static uint32_t HostToGuestVirtualThunk(const void* context,
const void* host_address);
bool AccessViolationCallback(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
void* host_address, bool is_write);
static bool AccessViolationCallbackThunk(
std::unique_lock<std::recursive_mutex> global_lock_locked_once,
global_unique_lock_type global_lock_locked_once,
void* context, void* host_address, bool is_write);
std::filesystem::path file_name_;