From 08f7a28920860e220c4f21e516e81f5588d732fe Mon Sep 17 00:00:00 2001
From: "chss95cs@gmail.com" <chss95cs@gmail.com>
Date: Sun, 14 Aug 2022 08:59:11 -0700
Subject: [PATCH] Alternative mutex

---
 src/xenia/apu/xma_context.cc            | 10 ++--
 src/xenia/apu/xma_context.h             |  2 +-
 src/xenia/base/mutex.cc                 | 69 ++++++++++++++++++++++++-
 src/xenia/base/mutex.h                  | 59 +++++++++++++++++----
 src/xenia/base/threading_timer_queue.cc |  2 +-
 src/xenia/cpu/mmio_handler.h            |  2 +-
 src/xenia/cpu/ppc/ppc_context.h         |  4 +-
 src/xenia/gpu/primitive_processor.h     |  2 +-
 src/xenia/gpu/shared_memory.h           |  6 +--
 src/xenia/gpu/texture_cache.cc          | 12 ++---
 src/xenia/gpu/texture_cache.h           | 20 +++----
 src/xenia/memory.cc                     |  8 +--
 src/xenia/memory.h                      | 10 ++--
 13 files changed, 152 insertions(+), 54 deletions(-)

diff --git a/src/xenia/apu/xma_context.cc b/src/xenia/apu/xma_context.cc
index 217d96442..7024db5a8 100644
--- a/src/xenia/apu/xma_context.cc
+++ b/src/xenia/apu/xma_context.cc
@@ -91,7 +91,7 @@ int XmaContext::Setup(uint32_t id, Memory* memory, uint32_t guest_ptr) {
 }
 
 bool XmaContext::Work() {
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
   if (!is_allocated() || !is_enabled()) {
     return false;
   }
@@ -106,7 +106,7 @@ bool XmaContext::Work() {
 }
 
 void XmaContext::Enable() {
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
 
   auto context_ptr = memory()->TranslateVirtual(guest_ptr());
   XMA_CONTEXT_DATA data(context_ptr);
@@ -134,7 +134,7 @@ bool XmaContext::Block(bool poll) {
 }
 
 void XmaContext::Clear() {
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
   XELOGAPU("XmaContext: reset context {}", id());
 
   auto context_ptr = memory()->TranslateVirtual(guest_ptr());
@@ -151,14 +151,14 @@ void XmaContext::Clear() {
 }
 
 void XmaContext::Disable() {
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
   XELOGAPU("XmaContext: disabling context {}", id());
   set_is_enabled(false);
 }
 
 void XmaContext::Release() {
   // Lock it in case the decoder thread is working on it now.
-  std::lock_guard<std::mutex> lock(lock_);
+  std::lock_guard<xe_mutex> lock(lock_);
   assert_true(is_allocated_ == true);
 
   set_is_allocated(false);
diff --git a/src/xenia/apu/xma_context.h b/src/xenia/apu/xma_context.h
index 1ffdf9dba..a348c7836 100644
--- a/src/xenia/apu/xma_context.h
+++ b/src/xenia/apu/xma_context.h
@@ -200,7 +200,7 @@ class XmaContext {
 
   uint32_t id_ = 0;
   uint32_t guest_ptr_ = 0;
-  std::mutex lock_;
+  xe_mutex lock_;
   bool is_allocated_ = false;
   bool is_enabled_ = false;
   // bool is_dirty_ = true;
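
Because xe_mutex keeps the lock()/unlock()/try_lock() surface of std::mutex,
the call sites above only change the std::lock_guard template argument. A
minimal sketch of the drop-in pattern (the function and variable here are
illustrative, not part of the patch):

    #include <mutex>

    #include "xenia/base/mutex.h"

    static xe::xe_mutex example_lock;  // stands in for XmaContext::lock_

    void ExampleGuardedWork() {
      // Works because xe_mutex models Lockable; when the fast Win32 path is
      // compiled out, xe_mutex is simply an alias for std::mutex.
      std::lock_guard<xe::xe_mutex> guard(example_lock);
      // ... decoder state mutation goes here ...
    }
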
diff --git a/src/xenia/base/mutex.cc b/src/xenia/base/mutex.cc
index 322985594..762f05490 100644
--- a/src/xenia/base/mutex.cc
+++ b/src/xenia/base/mutex.cc
@@ -8,11 +8,76 @@
  */
 
 #include "xenia/base/mutex.h"
+#if XE_PLATFORM_WIN32 == 1
+#include "xenia/base/platform_win.h"
+#endif
+
 namespace xe {
+#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
+// The default spin count EnterCriticalSection uses on Windows is insane:
+// 0x20007D0i64 (33556432 times!). When a lock is highly contended,
+// performance degrades sharply on some processors.
+#define XE_CRIT_SPINCOUNT 128
+/*
+chrispy: todo: if a thread exits before releasing the global mutex, we need to
+detect this and release the mutex. One way to do this is by using FlsAlloc and
+a PFLS_CALLBACK_FUNCTION, which gets called with the fiber-local data when a
+thread exits.
+*/
+thread_local unsigned global_mutex_depth = 0;
+static CRITICAL_SECTION* global_critical_section(xe_global_mutex* mutex) {
+  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
+}
+
+xe_global_mutex::xe_global_mutex() {
+  InitializeCriticalSectionAndSpinCount(global_critical_section(this),
+                                        XE_CRIT_SPINCOUNT);
+}
+xe_global_mutex::~xe_global_mutex() {
+  DeleteCriticalSection(global_critical_section(this));
+}
+void xe_global_mutex::lock() {
+  // Enter the critical section only on the outermost lock; recursive locks
+  // on the same thread just increment the thread-local depth.
+  if (!global_mutex_depth) {
+    EnterCriticalSection(global_critical_section(this));
+  }
+  global_mutex_depth++;
+}
+void xe_global_mutex::unlock() {
+  if (--global_mutex_depth == 0) {
+    LeaveCriticalSection(global_critical_section(this));
+  }
+}
+bool xe_global_mutex::try_lock() {
+  if (global_mutex_depth) {
+    ++global_mutex_depth;
+    return true;
+  } else {
+    BOOL success = TryEnterCriticalSection(global_critical_section(this));
+    if (success) {
+      ++global_mutex_depth;
+    }
+    return success;
+  }
+}
+
+CRITICAL_SECTION* fast_crit(xe_fast_mutex* mutex) {
+  return reinterpret_cast<CRITICAL_SECTION*>(mutex);
+}
+xe_fast_mutex::xe_fast_mutex() {
+  InitializeCriticalSectionAndSpinCount(fast_crit(this), XE_CRIT_SPINCOUNT);
+}
+xe_fast_mutex::~xe_fast_mutex() { DeleteCriticalSection(fast_crit(this)); }
+
+void xe_fast_mutex::lock() { EnterCriticalSection(fast_crit(this)); }
+void xe_fast_mutex::unlock() { LeaveCriticalSection(fast_crit(this)); }
+bool xe_fast_mutex::try_lock() {
+  return TryEnterCriticalSection(fast_crit(this));
+}
+#endif
 // chrispy: moved this out of body of function to eliminate the initialization
 // guards
-static std::recursive_mutex global_mutex;
-std::recursive_mutex& global_critical_region::mutex() { return global_mutex; }
+static global_mutex_type global_mutex;
+global_mutex_type& global_critical_region::mutex() { return global_mutex; }
 
 }  // namespace xe
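
The todo above (threads exiting while still holding the global mutex) could
look roughly like the following. This is only a sketch of the
FlsAlloc/PFLS_CALLBACK_FUNCTION idea from the comment; every name is
hypothetical, and it assumes the single static global_mutex instance defined
at the bottom of this file:

    // Hypothetical sketch, not part of this patch.
    static xe::xe_global_mutex* g_tracked_mutex = nullptr;

    static VOID NTAPI ReleaseAbandonedGlobalMutex(PVOID fls_data) {
      // Windows runs FLS callbacks on thread exit with the slot's last
      // non-null value; lock()/unlock() would mirror global_mutex_depth into
      // the slot via FlsSetValue.
      uintptr_t depth = reinterpret_cast<uintptr_t>(fls_data);
      if (depth && g_tracked_mutex) {
        // The critical section is physically entered only on the outermost
        // lock(), so a single leave releases it for other threads.
        LeaveCriticalSection(
            reinterpret_cast<CRITICAL_SECTION*>(g_tracked_mutex));
      }
    }

    static DWORD g_mutex_fls_slot = FlsAlloc(ReleaseAbandonedGlobalMutex);
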
diff --git a/src/xenia/base/mutex.h b/src/xenia/base/mutex.h
index 71d6bd26e..e93f71e1b 100644
--- a/src/xenia/base/mutex.h
+++ b/src/xenia/base/mutex.h
@@ -9,11 +9,50 @@
 
 #ifndef XENIA_BASE_MUTEX_H_
 #define XENIA_BASE_MUTEX_H_
-
 #include <mutex>
+#include "platform.h"
+//#define XE_ENABLE_FAST_WIN32_MUTEX 1
 
 namespace xe {
+#if XE_PLATFORM_WIN32 == 1 && XE_ENABLE_FAST_WIN32_MUTEX == 1
+/*
+  Must conform to BasicLockable
+  (https://en.cppreference.com/w/cpp/named_req/BasicLockable) as well as
+  Lockable (https://en.cppreference.com/w/cpp/named_req/Lockable).
+
+  This emulates a recursive mutex, but with far less overhead.
+*/
+class alignas(64) xe_global_mutex {
+  char detail[64];
+
+ public:
+  xe_global_mutex();
+  ~xe_global_mutex();
+
+  void lock();
+  void unlock();
+  bool try_lock();
+};
+using global_mutex_type = xe_global_mutex;
+
+class alignas(64) xe_fast_mutex {
+  char detail[64];
+
+ public:
+  xe_fast_mutex();
+  ~xe_fast_mutex();
+
+  void lock();
+  void unlock();
+  bool try_lock();
+};
+using xe_mutex = xe_fast_mutex;
+#else
+using global_mutex_type = std::recursive_mutex;
+using xe_mutex = std::mutex;
+#endif
+using global_unique_lock_type = std::unique_lock<global_mutex_type>;
 // The global critical region mutex singleton.
 // This must guard any operation that may suspend threads or be sensitive to
 // being suspended such as global table locks and such.
@@ -54,30 +93,30 @@ namespace xe {
 // };
 class global_critical_region {
  public:
-  static std::recursive_mutex& mutex();
+  static global_mutex_type& mutex();
 
   // Acquires a lock on the global critical section.
   // Use this when keeping an instance is not possible. Otherwise, prefer
   // to keep an instance of global_critical_region near the members requiring
   // it to keep things readable.
-  static std::unique_lock<std::recursive_mutex> AcquireDirect() {
-    return std::unique_lock<std::recursive_mutex>(mutex());
+  static global_unique_lock_type AcquireDirect() {
+    return global_unique_lock_type(mutex());
   }
 
   // Acquires a lock on the global critical section.
-  inline std::unique_lock<std::recursive_mutex> Acquire() {
-    return std::unique_lock<std::recursive_mutex>(mutex());
+  inline global_unique_lock_type Acquire() {
+    return global_unique_lock_type(mutex());
   }
 
   // Acquires a deferred lock on the global critical section.
-  inline std::unique_lock<std::recursive_mutex> AcquireDeferred() {
-    return std::unique_lock<std::recursive_mutex>(mutex(), std::defer_lock);
+  inline global_unique_lock_type AcquireDeferred() {
+    return global_unique_lock_type(mutex(), std::defer_lock);
   }
 
   // Tries to acquire a lock on the global critical section.
   // Check owns_lock() to see if the lock was successfully acquired.
-  inline std::unique_lock<std::recursive_mutex> TryAcquire() {
-    return std::unique_lock<std::recursive_mutex>(mutex(), std::try_to_lock);
+  inline global_unique_lock_type TryAcquire() {
+    return global_unique_lock_type(mutex(), std::try_to_lock);
   }
 };
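
global_unique_lock_type keeps the std::unique_lock interface, so the four
acquire helpers behave exactly as before; only the mutex type behind them
changes when XE_ENABLE_FAST_WIN32_MUTEX is defined. A small usage sketch
(the function is hypothetical):

    #include "xenia/base/mutex.h"

    void ExampleSuspendSensitiveWork() {
      xe::global_critical_region global_critical_region_;
      {
        // Blocks until the global critical section is held; released at }.
        auto global_lock = global_critical_region_.Acquire();
        // ... mutate state that must not race thread suspension ...
      }
      // Non-blocking attempt; owns_lock() reports whether it succeeded.
      auto maybe_lock = global_critical_region_.TryAcquire();
      if (maybe_lock.owns_lock()) {
        // ... opportunistic cleanup ...
      }
    }
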
diff --git a/src/xenia/base/threading_timer_queue.cc b/src/xenia/base/threading_timer_queue.cc
index 7d6e612cf..8e19b50dd 100644
--- a/src/xenia/base/threading_timer_queue.cc
+++ b/src/xenia/base/threading_timer_queue.cc
@@ -36,7 +36,7 @@ using WaitItem = TimerQueueWaitItem;
 edit: actually had to change it back; when i was testing it only worked
 because i fixed disruptorplus' code to compile (it gives wrong args to
 condition_variable::wait_until) but now it builds
 */
-using WaitStrat = dp::spin_wait_strategy;  // dp::blocking_wait_strategy;
+using WaitStrat = dp::blocking_wait_strategy;
 
 class TimerQueue {
  public:
diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h
index d9f6dc04c..4fc281134 100644
--- a/src/xenia/cpu/mmio_handler.h
+++ b/src/xenia/cpu/mmio_handler.h
@@ -48,7 +48,7 @@ class MMIOHandler {
   typedef uint32_t (*HostToGuestVirtual)(const void* context,
                                          const void* host_address);
   typedef bool (*AccessViolationCallback)(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+      global_unique_lock_type global_lock_locked_once,  // not passed by const reference like the others?
       void* context, void* host_address, bool is_write);
 
   // access_violation_callback is called with global_critical_region locked once
diff --git a/src/xenia/cpu/ppc/ppc_context.h b/src/xenia/cpu/ppc/ppc_context.h
index a9c0c8ed1..09205850b 100644
--- a/src/xenia/cpu/ppc/ppc_context.h
+++ b/src/xenia/cpu/ppc/ppc_context.h
@@ -15,7 +15,7 @@
 #include <string>
 
 #include "xenia/base/vec128.h"
-
+#include "xenia/base/mutex.h"
 namespace xe {
 namespace cpu {
 class Processor;
@@ -405,7 +405,7 @@ typedef struct alignas(64) PPCContext_s {
 
   // Global interrupt lock, held while interrupts are disabled or interrupts are
   // executing. This is shared among all threads and comes from the processor.
-  std::recursive_mutex* global_mutex;
+  global_mutex_type* global_mutex;
 
   // Used to shuttle data into externs. Contents volatile.
   uint64_t scratch;
diff --git a/src/xenia/gpu/primitive_processor.h b/src/xenia/gpu/primitive_processor.h
index 6a77a3d0f..aac84885d 100644
--- a/src/xenia/gpu/primitive_processor.h
+++ b/src/xenia/gpu/primitive_processor.h
@@ -883,7 +883,7 @@ class PrimitiveProcessor {
   // Must be called in a global critical region.
   void UpdateCacheBucketsNonEmptyL2(
       uint32_t bucket_index_div_64,
-      [[maybe_unused]] const std::unique_lock<std::recursive_mutex>&
+      [[maybe_unused]] const global_unique_lock_type&
           global_lock) {
     uint64_t& cache_buckets_non_empty_l2_ref =
         cache_buckets_non_empty_l2_[bucket_index_div_64 >> 6];
diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h
index 8fdb1af45..63cc380d0 100644
--- a/src/xenia/gpu/shared_memory.h
+++ b/src/xenia/gpu/shared_memory.h
@@ -35,7 +35,7 @@ class SharedMemory {
   virtual void SetSystemPageBlocksValidWithGpuDataWritten();
 
   typedef void (*GlobalWatchCallback)(
-      const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+      const global_unique_lock_type& global_lock, void* context,
       uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
   typedef void* GlobalWatchHandle;
   // Registers a callback invoked when something is invalidated in the GPU
@@ -49,8 +49,8 @@ class SharedMemory {
   GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback,
                                         void* callback_context);
   void UnregisterGlobalWatch(GlobalWatchHandle handle);
-  typedef void (*WatchCallback)(
-      const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+  typedef void (*WatchCallback)(const global_unique_lock_type& global_lock,
+                                void* context,
       void* data, uint64_t argument, bool invalidated_by_gpu);
   typedef void* WatchHandle;
   // Registers a callback invoked when the specified memory range is invalidated
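
All of the watch-callback signatures above pass the already-held lock as a
const reference: the callee gets compile-time evidence that the global
critical region is held, but cannot release it. A conforming callback might
look like this (hypothetical function, shaped like
SharedMemory::GlobalWatchCallback):

    #include <cstdint>

    #include "xenia/base/mutex.h"

    // Hypothetical watcher; the const& parameter proves the caller holds the
    // global critical region, so shared tables may be read safely here.
    static void OnGuestRangeInvalidated(
        const xe::global_unique_lock_type& global_lock, void* context,
        uint32_t address_first, uint32_t address_last,
        bool invalidated_by_gpu) {
      // ... drop cached data overlapping [address_first, address_last] ...
    }
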
diff --git a/src/xenia/gpu/texture_cache.cc b/src/xenia/gpu/texture_cache.cc
index 18fac01d9..05f6e2090 100644
--- a/src/xenia/gpu/texture_cache.cc
+++ b/src/xenia/gpu/texture_cache.cc
@@ -507,7 +507,7 @@ TextureCache::Texture::~Texture() {
 }
 
 void TextureCache::Texture::MakeUpToDateAndWatch(
-    const std::unique_lock<std::recursive_mutex>& global_lock) {
+    const global_unique_lock_type& global_lock) {
   SharedMemory& shared_memory = texture_cache().shared_memory();
   if (base_outdated_) {
     assert_not_zero(GetGuestBaseSize());
@@ -552,7 +552,7 @@ void TextureCache::Texture::MarkAsUsed() {
 }
 
 void TextureCache::Texture::WatchCallback(
-    [[maybe_unused]] const std::unique_lock<std::recursive_mutex>& global_lock,
+    [[maybe_unused]] const global_unique_lock_type& global_lock,
     bool is_mip) {
   if (is_mip) {
     assert_not_zero(GetGuestMipsSize());
@@ -565,8 +565,8 @@ void TextureCache::Texture::WatchCallback(
   }
 }
 
-void TextureCache::WatchCallback(
-    const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+void TextureCache::WatchCallback(const global_unique_lock_type& global_lock,
+                                 void* context,
     void* data, uint64_t argument, bool invalidated_by_gpu) {
   Texture& texture = *static_cast<Texture*>(context);
   texture.WatchCallback(global_lock, argument != 0);
@@ -902,7 +902,7 @@ bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled,
 }
 
 void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
-    const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+    const global_unique_lock_type& global_lock, void* context,
     uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
   TextureCache* texture_cache = reinterpret_cast<TextureCache*>(context);
   texture_cache->ScaledResolveGlobalWatchCallback(
@@ -910,7 +910,7 @@ void TextureCache::ScaledResolveGlobalWatchCallbackThunk(
 }
 
 void TextureCache::ScaledResolveGlobalWatchCallback(
-    const std::unique_lock<std::recursive_mutex>& global_lock,
+    const global_unique_lock_type& global_lock,
     uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu) {
   assert_true(IsDrawResolutionScaled());
   if (invalidated_by_gpu) {
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index 0f19ba6f9..cb690a286 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -230,19 +230,15 @@ class TextureCache {
     }
     bool IsResolved() const { return base_resolved_ || mips_resolved_; }
 
-    bool base_outdated(
-        const std::unique_lock<std::recursive_mutex>& global_lock) const {
+    bool base_outdated(const global_unique_lock_type& global_lock) const {
      return base_outdated_;
     }
-    bool mips_outdated(
-        const std::unique_lock<std::recursive_mutex>& global_lock) const {
+    bool mips_outdated(const global_unique_lock_type& global_lock) const {
      return mips_outdated_;
     }
 
-    void MakeUpToDateAndWatch(
-        const std::unique_lock<std::recursive_mutex>& global_lock);
+    void MakeUpToDateAndWatch(const global_unique_lock_type& global_lock);
 
-    void WatchCallback(
-        const std::unique_lock<std::recursive_mutex>& global_lock, bool is_mip);
+    void WatchCallback(const global_unique_lock_type& global_lock, bool is_mip);
 
     // For LRU caching - updates the last usage frame and moves the texture to
     // the end of the usage queue. Must be called any time the texture is
@@ -579,8 +575,8 @@ class TextureCache {
   void UpdateTexturesTotalHostMemoryUsage(uint64_t add, uint64_t subtract);
 
   // Shared memory callback for texture data invalidation.
-  static void WatchCallback(
-      const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+  static void WatchCallback(const global_unique_lock_type& global_lock,
+                            void* context,
       void* data, uint64_t argument, bool invalidated_by_gpu);
 
   // Checks if there are any pages that contain scaled resolve data within the
@@ -589,10 +585,10 @@
   // Global shared memory invalidation callback for invalidating scaled resolved
   // texture data.
   static void ScaledResolveGlobalWatchCallbackThunk(
-      const std::unique_lock<std::recursive_mutex>& global_lock, void* context,
+      const global_unique_lock_type& global_lock, void* context,
       uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
   void ScaledResolveGlobalWatchCallback(
-      const std::unique_lock<std::recursive_mutex>& global_lock,
+      const global_unique_lock_type& global_lock,
      uint32_t address_first, uint32_t address_last, bool invalidated_by_gpu);
 
   const RegisterFile& register_file_;
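
base_outdated() and mips_outdated() never read their lock argument (hence
[[maybe_unused]] at the definitions); the parameter exists purely so that
callers must already hold the global critical region. The idiom reduces to
this (hypothetical class, not from the patch):

    #include "xenia/base/mutex.h"

    class ExampleGuardedFlag {
     public:
      // Callable only by code that already owns a global critical region
      // lock object; the parameter itself is never used.
      bool dirty(
          [[maybe_unused]] const xe::global_unique_lock_type& held) const {
        return dirty_;
      }

     private:
      bool dirty_ = false;
    };
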
diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc
index 388fefc62..f94b0c469 100644
--- a/src/xenia/memory.cc
+++ b/src/xenia/memory.cc
@@ -465,7 +465,7 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) {
 }
 
 bool Memory::AccessViolationCallback(
-    std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+    global_unique_lock_type global_lock_locked_once,
     void* host_address, bool is_write) {
   // Access via physical_membase_ is special, when need to bypass everything
   // (for instance, for a data provider to actually write the data) so only
@@ -493,14 +493,14 @@ bool Memory::AccessViolationCallback(
 }
 
 bool Memory::AccessViolationCallbackThunk(
-    std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+    global_unique_lock_type global_lock_locked_once,
     void* context, void* host_address, bool is_write) {
   return reinterpret_cast<Memory*>(context)->AccessViolationCallback(
       std::move(global_lock_locked_once), host_address, is_write);
 }
 
 bool Memory::TriggerPhysicalMemoryCallbacks(
-    std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+    global_unique_lock_type global_lock_locked_once,
     uint32_t virtual_address, uint32_t length, bool is_write,
     bool unwatch_exact_range, bool unprotect) {
   BaseHeap* heap = LookupHeap(virtual_address);
@@ -1711,7 +1711,7 @@ void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address,
 }
 
 bool PhysicalHeap::TriggerCallbacks(
-    std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+    global_unique_lock_type global_lock_locked_once,
     uint32_t virtual_address, uint32_t length, bool is_write,
     bool unwatch_exact_range, bool unprotect) {
   // TODO(Triang3l): Support read watches.
diff --git a/src/xenia/memory.h b/src/xenia/memory.h
index ed313a26d..ebbc814e6 100644
--- a/src/xenia/memory.h
+++ b/src/xenia/memory.h
@@ -271,8 +271,7 @@ class PhysicalHeap : public BaseHeap {
                                  bool enable_invalidation_notifications,
                                  bool enable_data_providers);
   // Returns true if any page in the range was watched.
-  bool TriggerCallbacks(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+  bool TriggerCallbacks(global_unique_lock_type global_lock_locked_once,
                         uint32_t virtual_address, uint32_t length,
                         bool is_write, bool unwatch_exact_range,
                         bool unprotect = true);
@@ -459,7 +458,7 @@ class Memory {
   // TODO(Triang3l): Implement data providers - this is why locking depth of 1
   // will be required in the future.
   bool TriggerPhysicalMemoryCallbacks(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+      global_unique_lock_type global_lock_locked_once,
       uint32_t virtual_address, uint32_t length, bool is_write,
       bool unwatch_exact_range, bool unprotect = true);
 
@@ -508,11 +507,10 @@ class Memory {
   static uint32_t HostToGuestVirtualThunk(const void* context,
                                           const void* host_address);
 
-  bool AccessViolationCallback(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+  bool AccessViolationCallback(global_unique_lock_type global_lock_locked_once,
                                void* host_address, bool is_write);
   static bool AccessViolationCallbackThunk(
-      std::unique_lock<std::recursive_mutex> global_lock_locked_once,
+      global_unique_lock_type global_lock_locked_once,
       void* context, void* host_address, bool is_write);
 
   std::filesystem::path file_name_;
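
Unlike the const& watch callbacks, the access-violation and trigger paths
take the lock by value and std::move it down the chain: each callee owns the
lock and may legitimately unlock around slow work (see the data-provider
"locking depth of 1" todo above). Reduced to its essentials, with
hypothetical names and plain std::unique_lock so the sketch is
self-contained:

    #include <mutex>
    #include <utility>

    using example_lock_type = std::unique_lock<std::recursive_mutex>;

    static bool HandleViolation(example_lock_type lock_held_once,
                                void* host_address, bool is_write) {
      // Owns the lock: may unlock()/lock() around a slow data provider.
      return is_write;
    }

    static bool HandleViolationThunk(example_lock_type lock_held_once,
                                     void* context, void* host_address,
                                     bool is_write) {
      // unique_lock is move-only, so ownership is forwarded, never copied.
      return HandleViolation(std::move(lock_held_once), host_address,
                             is_write);
    }
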