diff --git a/src/core/gpu_backend.cpp b/src/core/gpu_backend.cpp
index b614a226b..2f6e6b00a 100644
--- a/src/core/gpu_backend.cpp
+++ b/src/core/gpu_backend.cpp
@@ -53,8 +53,13 @@ struct Stats : Counters
 
 struct ALIGN_TO_CACHE_LINE CPUThreadState
 {
+  static constexpr u32 WAIT_NONE = 0;
+  static constexpr u32 WAIT_CPU_THREAD_WAITING = 1;
+  static constexpr u32 WAIT_GPU_THREAD_SIGNALING = 2;
+  static constexpr u32 WAIT_GPU_THREAD_POSTED = 3;
+
   std::atomic<u32> queued_frames;
-  std::atomic_bool waiting_for_gpu_thread;
+  std::atomic<u32> wait_state;
   Threading::KernelSemaphore gpu_thread_wait;
 };
 
@@ -289,7 +294,7 @@ bool GPUBackend::BeginQueueFrame()
   if (g_settings.gpu_max_queued_frames > 0)
     DEV_LOG("<-- {} queued frames, {} max, blocking CPU thread", queued_frames, g_settings.gpu_max_queued_frames);
 
-  s_cpu_thread_state.waiting_for_gpu_thread.store(true, std::memory_order_release);
+  s_cpu_thread_state.wait_state.store(CPUThreadState::WAIT_CPU_THREAD_WAITING, std::memory_order_release);
   return true;
 }
 
@@ -300,9 +305,9 @@ void GPUBackend::WaitForOneQueuedFrame()
 {
   // It's possible that the GPU thread has already signaled the semaphore.
   // If so, then we still need to drain it, otherwise waits in the future will return prematurely.
-  bool expected = true;
-  if (s_cpu_thread_state.waiting_for_gpu_thread.compare_exchange_strong(expected, false, std::memory_order_acq_rel,
-                                                                        std::memory_order_relaxed))
+  u32 expected = CPUThreadState::WAIT_GPU_THREAD_SIGNALING;
+  if (s_cpu_thread_state.wait_state.compare_exchange_strong(expected, CPUThreadState::WAIT_NONE,
+                                                            std::memory_order_acq_rel, std::memory_order_acquire))
   {
     return;
   }
@@ -310,6 +315,11 @@
   s_cpu_thread_state.gpu_thread_wait.Wait();
 
+  // Depending on where the GPU thread is, now we can either be in WAIT_GPU_THREAD_SIGNALING or WAIT_GPU_THREAD_POSTED
+  // state. We want to clear the flag here regardless, so a store-release is fine. Because the GPU thread has a
+  // compare-exchange on WAIT_GPU_THREAD_SIGNALING, it can't "overwrite" the value we store here.
+  s_cpu_thread_state.wait_state.store(CPUThreadState::WAIT_NONE, std::memory_order_release);
+
   // Sanity check: queued frames should be in range now. If they're not, we fucked up the semaphore.
   Assert(s_cpu_thread_state.queued_frames.load(std::memory_order_acquire) <= g_settings.gpu_max_queued_frames);
 }
 
@@ -323,14 +333,22 @@ void GPUBackend::ReleaseQueuedFrame()
 {
   s_cpu_thread_state.queued_frames.fetch_sub(1, std::memory_order_acq_rel);
 
-  bool expected = true;
-  if (s_cpu_thread_state.waiting_for_gpu_thread.compare_exchange_strong(expected, false, std::memory_order_acq_rel,
-                                                                        std::memory_order_relaxed))
+  // We need two states here in case we get preempted in between the compare_exchange_strong() and Post().
+  // This means that we will only release the semaphore once the CPU is guaranteed to be in a waiting state,
+  // and ensure that we don't post twice if the CPU thread lags and we process 2 frames before it wakes up.
+  u32 expected = CPUThreadState::WAIT_CPU_THREAD_WAITING;
+  if (s_cpu_thread_state.wait_state.compare_exchange_strong(expected, CPUThreadState::WAIT_GPU_THREAD_SIGNALING,
+                                                            std::memory_order_acq_rel, std::memory_order_acquire))
   {
     if (g_settings.gpu_max_queued_frames > 0)
       DEV_LOG("--> Unblocking CPU thread");
 
     s_cpu_thread_state.gpu_thread_wait.Post();
+
+    // This needs to be a compare_exchange, because the CPU thread can clear the flag before we execute this line.
+    expected = CPUThreadState::WAIT_GPU_THREAD_SIGNALING;
+    s_cpu_thread_state.wait_state.compare_exchange_strong(expected, CPUThreadState::WAIT_GPU_THREAD_POSTED,
+                                                          std::memory_order_acq_rel, std::memory_order_acquire);
   }
 }
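
For reference, below is a minimal, self-contained C++20 sketch of the four-state handshake the diff introduces, with the surrounding frame-pacing code stripped away. This is not the DuckStation source: std::counting_semaphore stands in for Threading::KernelSemaphore, MAX_QUEUED_FRAMES for g_settings.gpu_max_queued_frames, and the cpu_*/gpu_* function names are invented for illustration; it only transcribes the protocol shown in the hunks above.

// Compile-only sketch of the CPU/GPU frame-queue handshake (names are illustrative).
#include <atomic>
#include <cassert>
#include <cstdint>
#include <semaphore>

namespace sketch {

using u32 = std::uint32_t;

constexpr u32 WAIT_NONE = 0;                 // no handshake in progress
constexpr u32 WAIT_CPU_THREAD_WAITING = 1;   // CPU thread announced it will block on the semaphore
constexpr u32 WAIT_GPU_THREAD_SIGNALING = 2; // GPU thread won the CAS and is posting the semaphore
constexpr u32 WAIT_GPU_THREAD_POSTED = 3;    // GPU thread finished posting before the CPU cleared the flag

constexpr u32 MAX_QUEUED_FRAMES = 2;

std::atomic<u32> queued_frames{0};
std::atomic<u32> wait_state{WAIT_NONE};
std::counting_semaphore<> gpu_thread_wait{0};

// CPU thread: returns true if the frame limit was exceeded and the caller should
// follow up with cpu_wait_for_one_queued_frame().
bool cpu_begin_queue_frame()
{
  const u32 queued = queued_frames.fetch_add(1, std::memory_order_acq_rel) + 1;
  if (queued <= MAX_QUEUED_FRAMES)
    return false;

  wait_state.store(WAIT_CPU_THREAD_WAITING, std::memory_order_release);
  return true;
}

// CPU thread: block until the GPU thread has released a frame.
void cpu_wait_for_one_queued_frame()
{
  // The GPU thread may already be mid-signal; in that case clear the flag and skip
  // the blocking wait (mirrors the early-out in WaitForOneQueuedFrame above).
  u32 expected = WAIT_GPU_THREAD_SIGNALING;
  if (wait_state.compare_exchange_strong(expected, WAIT_NONE, std::memory_order_acq_rel,
                                         std::memory_order_acquire))
    return;

  gpu_thread_wait.acquire();

  // The GPU thread is now in SIGNALING or POSTED; clear the flag either way.
  wait_state.store(WAIT_NONE, std::memory_order_release);
  assert(queued_frames.load(std::memory_order_acquire) <= MAX_QUEUED_FRAMES);
}

// GPU thread: called after a queued frame has been consumed.
void gpu_release_queued_frame()
{
  queued_frames.fetch_sub(1, std::memory_order_acq_rel);

  // Only post once a waiter has announced itself; SIGNALING marks the post as in progress.
  u32 expected = WAIT_CPU_THREAD_WAITING;
  if (wait_state.compare_exchange_strong(expected, WAIT_GPU_THREAD_SIGNALING, std::memory_order_acq_rel,
                                         std::memory_order_acquire))
  {
    gpu_thread_wait.release();

    // CAS rather than a plain store: the CPU thread may already have cleared the flag to WAIT_NONE.
    expected = WAIT_GPU_THREAD_SIGNALING;
    wait_state.compare_exchange_strong(expected, WAIT_GPU_THREAD_POSTED, std::memory_order_acq_rel,
                                       std::memory_order_acquire);
  }
}

} // namespace sketch

As the diff's own comments note, the intermediate WAIT_GPU_THREAD_SIGNALING state is what makes the final transition safe to lose: if the CPU thread clears the flag to WAIT_NONE between Post() and the GPU thread's second compare-exchange, that compare-exchange simply fails instead of overwriting the CPU thread's store, which is why it cannot be a plain store.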