From 427dd45151eddc920d681c4711a3cdbd7e363f82 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sun, 17 Mar 2019 15:59:22 +1000 Subject: [PATCH 1/5] Vulkan: Simplify command buffer fence tracking --- .../Vulkan/CommandBufferManager.cpp | 122 ++++++++---------- .../Vulkan/CommandBufferManager.h | 34 ++--- .../Core/VideoBackends/Vulkan/PerfQuery.cpp | 39 +++--- Source/Core/VideoBackends/Vulkan/PerfQuery.h | 6 +- Source/Core/VideoBackends/Vulkan/Renderer.cpp | 12 +- .../VideoBackends/Vulkan/StreamBuffer.cpp | 58 +++++---- .../Core/VideoBackends/Vulkan/StreamBuffer.h | 4 +- .../Core/VideoBackends/Vulkan/VKTexture.cpp | 59 ++------- Source/Core/VideoBackends/Vulkan/VKTexture.h | 2 +- .../VideoBackends/Vulkan/VertexManager.cpp | 6 +- Source/Core/VideoBackends/Vulkan/main.cpp | 4 +- 11 files changed, 138 insertions(+), 208 deletions(-) diff --git a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp index 1f2fcd01c7..f00f6001cb 100644 --- a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp +++ b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp @@ -54,7 +54,6 @@ bool CommandBufferManager::CreateCommandBuffers() { resources.init_command_buffer_used = false; resources.semaphore_used = false; - resources.needs_fence_wait = false; VkCommandPoolCreateInfo pool_info = {VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, nullptr, 0, g_vulkan_context->GetGraphicsQueueFamilyIndex()}; @@ -211,43 +210,61 @@ void CommandBufferManager::WaitForWorkerThreadIdle() m_submit_semaphore.Post(); } -void CommandBufferManager::WaitForGPUIdle() +void CommandBufferManager::WaitForFenceCounter(u64 fence_counter) { - WaitForWorkerThreadIdle(); - vkDeviceWaitIdle(g_vulkan_context->GetDevice()); -} - -void CommandBufferManager::WaitForFence(VkFence fence) -{ - // Find the command buffer that this fence corresponds to. 
- u32 command_buffer_index = 0; - for (; command_buffer_index < static_cast(m_frame_resources.size()); command_buffer_index++) - { - if (m_frame_resources[command_buffer_index].fence == fence) - break; - } - ASSERT(command_buffer_index < m_frame_resources.size()); - - // Has this command buffer already been waited for? - if (!m_frame_resources[command_buffer_index].needs_fence_wait) + if (m_completed_fence_counter >= fence_counter) return; + // Find the first command buffer which covers this counter value. + u32 index = (m_current_frame + 1) % NUM_COMMAND_BUFFERS; + while (index != m_current_frame) + { + if (m_frame_resources[index].fence_counter >= fence_counter) + break; + + index = (index + 1) % NUM_COMMAND_BUFFERS; + } + + ASSERT(index != m_current_frame); + WaitForCommandBufferCompletion(index); +} + +void CommandBufferManager::WaitForCommandBufferCompletion(u32 index) +{ // Ensure this command buffer has been submitted. WaitForWorkerThreadIdle(); // Wait for this command buffer to be completed. - VkResult res = - vkWaitForFences(g_vulkan_context->GetDevice(), 1, - &m_frame_resources[command_buffer_index].fence, VK_TRUE, UINT64_MAX); + VkResult res = vkWaitForFences(g_vulkan_context->GetDevice(), 1, &m_frame_resources[index].fence, + VK_TRUE, UINT64_MAX); if (res != VK_SUCCESS) LOG_VULKAN_ERROR(res, "vkWaitForFences failed: "); - // Immediately fire callbacks and cleanups, since the commands has been completed. - m_frame_resources[command_buffer_index].needs_fence_wait = false; - OnCommandBufferExecuted(command_buffer_index); + // Clean up any resources for command buffers between the last known completed buffer and this + // now-completed command buffer. If we use >2 buffers, this may be more than one buffer. 
+ const u64 now_completed_counter = m_frame_resources[index].fence_counter; + u32 cleanup_index = (m_current_frame + 1) % NUM_COMMAND_BUFFERS; + while (cleanup_index != m_current_frame) + { + FrameResources& resources = m_frame_resources[cleanup_index]; + if (resources.fence_counter > now_completed_counter) + break; + + if (resources.fence_counter > m_completed_fence_counter) + { + for (auto& it : resources.cleanup_resources) + it(); + resources.cleanup_resources.clear(); + } + + cleanup_index = (cleanup_index + 1) % NUM_COMMAND_BUFFERS; + } + + m_completed_fence_counter = now_completed_counter; } void CommandBufferManager::SubmitCommandBuffer(bool submit_on_worker_thread, + bool wait_for_completion, VkSwapchainKHR present_swap_chain, uint32_t present_image_index) { @@ -263,16 +280,13 @@ void CommandBufferManager::SubmitCommandBuffer(bool submit_on_worker_thread, } } - // This command buffer now has commands, so can't be re-used without waiting. - resources.needs_fence_wait = true; - // Grab the semaphore before submitting command buffer either on-thread or off-thread. // This prevents a race from occurring where a second command buffer is executed // before the worker thread has woken and executed the first one yet. m_submit_semaphore.Wait(); // Submitting off-thread? - if (m_use_threaded_submission && submit_on_worker_thread) + if (m_use_threaded_submission && submit_on_worker_thread && !wait_for_completion) { // Push to the pending submit queue. { @@ -287,6 +301,8 @@ void CommandBufferManager::SubmitCommandBuffer(bool submit_on_worker_thread, { // Pass through to normal submission path. SubmitCommandBuffer(m_current_frame, present_swap_chain, present_image_index); + if (wait_for_completion) + WaitForCommandBufferCompletion(m_current_frame); } // Switch to next cmdbuffer. 
@@ -365,39 +381,15 @@ void CommandBufferManager::SubmitCommandBuffer(u32 command_buffer_index, m_submit_semaphore.Post(); } -void CommandBufferManager::OnCommandBufferExecuted(u32 index) -{ - FrameResources& resources = m_frame_resources[index]; - - // Fire fence tracking callbacks. - for (auto iter = m_fence_callbacks.begin(); iter != m_fence_callbacks.end();) - { - auto backup_iter = iter++; - backup_iter->second(resources.fence); - } - - // Clean up all objects pending destruction on this command buffer - for (auto& it : resources.cleanup_resources) - it(); - resources.cleanup_resources.clear(); -} - void CommandBufferManager::BeginCommandBuffer() { // Move to the next command buffer. - m_current_frame = (m_current_frame + 1) % NUM_COMMAND_BUFFERS; - FrameResources& resources = m_frame_resources[m_current_frame]; + const u32 next_buffer_index = (m_current_frame + 1) % NUM_COMMAND_BUFFERS; + FrameResources& resources = m_frame_resources[next_buffer_index]; // Wait for the GPU to finish with all resources for this command buffer. - if (resources.needs_fence_wait) - { - VkResult res = - vkWaitForFences(g_vulkan_context->GetDevice(), 1, &resources.fence, true, UINT64_MAX); - if (res != VK_SUCCESS) - LOG_VULKAN_ERROR(res, "vkWaitForFences failed: "); - - OnCommandBufferExecuted(m_current_frame); - } + if (resources.fence_counter > m_completed_fence_counter) + WaitForCommandBufferCompletion(next_buffer_index); // Reset fence to unsignaled before starting. 
VkResult res = vkResetFences(g_vulkan_context->GetDevice(), 1, &resources.fence); @@ -427,6 +419,8 @@ void CommandBufferManager::BeginCommandBuffer() // Reset upload command buffer state resources.init_command_buffer_used = false; resources.semaphore_used = false; + resources.fence_counter = m_next_fence_counter++; + m_current_frame = next_buffer_index; } void CommandBufferManager::DeferBufferDestruction(VkBuffer object) @@ -471,19 +465,5 @@ void CommandBufferManager::DeferImageViewDestruction(VkImageView object) [object]() { vkDestroyImageView(g_vulkan_context->GetDevice(), object, nullptr); }); } -void CommandBufferManager::AddFenceSignaledCallback(const void* key, FenceSignaledCallback callback) -{ - // Shouldn't be adding twice. - ASSERT(m_fence_callbacks.find(key) == m_fence_callbacks.end()); - m_fence_callbacks.emplace(key, std::move(callback)); -} - -void CommandBufferManager::RemoveFenceSignaledCallback(const void* key) -{ - auto iter = m_fence_callbacks.find(key); - ASSERT(iter != m_fence_callbacks.end()); - m_fence_callbacks.erase(iter); -} - std::unique_ptr g_command_buffer_mgr; } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.h b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.h index 9cfc50e287..abc49c0622 100644 --- a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.h +++ b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.h @@ -51,9 +51,15 @@ public: // Allocates a descriptors set from the pool reserved for the current frame. VkDescriptorSet AllocateDescriptorSet(VkDescriptorSetLayout set_layout); + // Fence "counters" are used to track which commands have been completed by the GPU. + // If the last completed fence counter is greater than or equal to N, it means that the work + // associated with counter N has been completed by the GPU. The value of N to associate with + // commands can be retrieved by calling GetCurrentFenceCounter(). 
+ u64 GetCompletedFenceCounter() const { return m_completed_fence_counter; } + // Gets the fence that will be signaled when the currently executing command buffer is // queued and executed. Do not wait for this fence before the buffer is executed. - VkFence GetCurrentCommandBufferFence() const { return m_frame_resources[m_current_frame].fence; } + u64 GetCurrentFenceCounter() const { return m_frame_resources[m_current_frame].fence_counter; } // Returns the semaphore for the current command buffer, which can be used to ensure the // swap chain image is ready before the command buffer executes. @@ -66,15 +72,11 @@ public: // Ensure that the worker thread has submitted any previous command buffers and is idle. void WaitForWorkerThreadIdle(); - // Ensure that the worker thread has both submitted all commands, and the GPU has caught up. - // Use with caution, huge performance penalty. - void WaitForGPUIdle(); - // Wait for a fence to be completed. // Also invokes callbacks for completion. - void WaitForFence(VkFence fence); + void WaitForFenceCounter(u64 fence_counter); - void SubmitCommandBuffer(bool submit_on_worker_thread, + void SubmitCommandBuffer(bool submit_on_worker_thread, bool wait_for_completion, VkSwapchainKHR present_swap_chain = VK_NULL_HANDLE, uint32_t present_image_index = 0xFFFFFFFF); @@ -90,25 +92,17 @@ public: void DeferImageDestruction(VkImage object); void DeferImageViewDestruction(VkImageView object); - // Instruct the manager to fire the specified callback when a fence is flagged to be signaled. - // This happens when command buffers are executed, and can be tested if signaled, which means - // that all commands up to the point when the callback was fired have completed. 
- using FenceSignaledCallback = std::function; - void AddFenceSignaledCallback(const void* key, FenceSignaledCallback callback); - void RemoveFenceSignaledCallback(const void* key); - private: bool CreateCommandBuffers(); void DestroyCommandBuffers(); bool CreateSubmitThread(); + void WaitForCommandBufferCompletion(u32 command_buffer_index); void SubmitCommandBuffer(u32 command_buffer_index, VkSwapchainKHR present_swap_chain, u32 present_image_index); void BeginCommandBuffer(); - void OnCommandBufferExecuted(u32 index); - struct FrameResources { // [0] - Init (upload) command buffer, [1] - draw command buffer @@ -117,19 +111,19 @@ private: VkDescriptorPool descriptor_pool = VK_NULL_HANDLE; VkFence fence = VK_NULL_HANDLE; VkSemaphore semaphore = VK_NULL_HANDLE; + u64 fence_counter = 0; bool init_command_buffer_used = false; bool semaphore_used = false; - bool needs_fence_wait = false; std::vector> cleanup_resources; }; + u64 m_next_fence_counter = 1; + u64 m_completed_fence_counter = 0; + std::array m_frame_resources; u32 m_current_frame; - // callbacks when a fence point is set - std::map m_fence_callbacks; - // Threaded command buffer execution // Semaphore determines when a command buffer can be queued Common::Semaphore m_submit_semaphore; diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp index 3a84f2e571..1934c78410 100644 --- a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp +++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp @@ -24,8 +24,6 @@ PerfQuery::PerfQuery() = default; PerfQuery::~PerfQuery() { - g_command_buffer_mgr->RemoveFenceSignaledCallback(this); - if (m_query_pool != VK_NULL_HANDLE) vkDestroyQueryPool(g_vulkan_context->GetDevice(), m_query_pool, nullptr); } @@ -49,9 +47,6 @@ bool PerfQuery::Initialize() return false; } - g_command_buffer_mgr->AddFenceSignaledCallback( - this, std::bind(&PerfQuery::OnFenceSignaled, this, std::placeholders::_1)); - return true; } @@ -113,7 +108,7 @@ void 
PerfQuery::ResetQuery() for (auto& entry : m_query_buffer) { - entry.pending_fence = VK_NULL_HANDLE; + entry.fence_counter = 0; entry.available = false; entry.active = false; } @@ -217,7 +212,7 @@ void PerfQuery::QueueCopyQueryResults(u32 start_index, u32 query_count) { u32 index = start_index + i; ActiveQuery& entry = m_query_buffer[index]; - entry.pending_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence(); + entry.fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter(); entry.available = true; entry.active = false; } @@ -261,8 +256,10 @@ void PerfQuery::FlushQueries() QueueCopyQueryResults(copy_start_index, copy_count); } -void PerfQuery::OnFenceSignaled(VkFence fence) +void PerfQuery::ProcessPendingResults() { + const u64 completed_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter(); + // Need to save these since ProcessResults will modify them. u32 query_read_pos = m_query_read_pos; u32 query_count = m_query_count; @@ -273,7 +270,7 @@ void PerfQuery::OnFenceSignaled(VkFence fence) for (u32 i = 0; i < query_count; i++) { u32 index = (query_read_pos + i) % PERF_QUERY_BUFFER_SIZE; - if (m_query_buffer[index].pending_fence != fence) + if (m_query_buffer[index].fence_counter > completed_fence_counter) { // These should be grouped together, at the end. break; @@ -314,8 +311,8 @@ void PerfQuery::ProcessResults(u32 start_index, u32 query_count) ActiveQuery& entry = m_query_buffer[index]; // Should have a fence associated with it (waiting for a result). - ASSERT(entry.pending_fence != VK_NULL_HANDLE); - entry.pending_fence = VK_NULL_HANDLE; + ASSERT(entry.fence_counter != 0); + entry.fence_counter = 0; entry.available = false; entry.active = false; @@ -340,9 +337,11 @@ void PerfQuery::NonBlockingPartialFlush() return; // Submit a command buffer in the background if the front query is not bound to one. - // Ideally this will complete before the buffer fills. 
- if (m_query_buffer[m_query_read_pos].pending_fence == VK_NULL_HANDLE) + ActiveQuery& entry = m_query_buffer[m_query_read_pos]; + if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter()) Renderer::GetInstance()->ExecuteCommandBuffer(true, false); + + ProcessPendingResults(); } void PerfQuery::BlockingPartialFlush() @@ -352,17 +351,9 @@ void PerfQuery::BlockingPartialFlush() // If the first pending query is needing command buffer execution, do that. ActiveQuery& entry = m_query_buffer[m_query_read_pos]; - if (entry.pending_fence == VK_NULL_HANDLE) - { - // This will callback OnCommandBufferQueued which will set the fence on the entry. - // We wait for completion, which will also call OnCommandBufferExecuted, and clear the fence. + if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter()) Renderer::GetInstance()->ExecuteCommandBuffer(false, true); - } - else - { - // The command buffer has been submitted, but is awaiting completion. - // Wait for the fence to complete, which will call OnCommandBufferExecuted. 
- g_command_buffer_mgr->WaitForFence(entry.pending_fence); - } + + ProcessPendingResults(); } } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.h b/Source/Core/VideoBackends/Vulkan/PerfQuery.h index 47ccf22a66..facbe0dc0f 100644 --- a/Source/Core/VideoBackends/Vulkan/PerfQuery.h +++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.h @@ -36,8 +36,8 @@ public: private: struct ActiveQuery { + u64 fence_counter; PerfQueryType query_type; - VkFence pending_fence; bool available; bool active; }; @@ -45,11 +45,9 @@ private: bool CreateQueryPool(); bool CreateReadbackBuffer(); void QueueCopyQueryResults(u32 start_index, u32 query_count); + void ProcessPendingResults(); void ProcessResults(u32 start_index, u32 query_count); - void OnCommandBufferQueued(VkCommandBuffer command_buffer, VkFence fence); - void OnFenceSignaled(VkFence fence); - void NonBlockingPartialFlush(); void BlockingPartialFlush(); diff --git a/Source/Core/VideoBackends/Vulkan/Renderer.cpp b/Source/Core/VideoBackends/Vulkan/Renderer.cpp index 8b560be15d..a4db679ecb 100644 --- a/Source/Core/VideoBackends/Vulkan/Renderer.cpp +++ b/Source/Core/VideoBackends/Vulkan/Renderer.cpp @@ -315,7 +315,7 @@ void Renderer::PresentBackbuffer() // Because this final command buffer is rendering to the swap chain, we need to wait for // the available semaphore to be signaled before executing the buffer. This final submission // can happen off-thread in the background while we're preparing the next frame. - g_command_buffer_mgr->SubmitCommandBuffer(true, m_swap_chain->GetSwapChain(), + g_command_buffer_mgr->SubmitCommandBuffer(true, false, m_swap_chain->GetSwapChain(), m_swap_chain->GetCurrentImageIndex()); // New cmdbuffer, so invalidate state. 
@@ -327,11 +327,7 @@ void Renderer::ExecuteCommandBuffer(bool submit_off_thread, bool wait_for_comple StateTracker::GetInstance()->EndRenderPass(); PerfQuery::GetInstance()->FlushQueries(); - // If we're waiting for completion, don't bother waking the worker thread. - const VkFence pending_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence(); - g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread && wait_for_completion); - if (wait_for_completion) - g_command_buffer_mgr->WaitForFence(pending_fence); + g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread, wait_for_completion); StateTracker::GetInstance()->InvalidateCachedState(); } @@ -550,10 +546,6 @@ void Renderer::UnbindTexture(const AbstractTexture* texture) void Renderer::ResetSamplerStates() { - // Ensure none of the sampler objects are in use. - // This assumes that none of the samplers are in use on the command list currently being recorded. - g_command_buffer_mgr->WaitForGPUIdle(); - // Invalidate all sampler states, next draw will re-initialize them. for (u32 i = 0; i < m_sampler_states.size(); i++) { diff --git a/Source/Core/VideoBackends/Vulkan/StreamBuffer.cpp b/Source/Core/VideoBackends/Vulkan/StreamBuffer.cpp index ea610f09cf..aa635e4d41 100644 --- a/Source/Core/VideoBackends/Vulkan/StreamBuffer.cpp +++ b/Source/Core/VideoBackends/Vulkan/StreamBuffer.cpp @@ -19,14 +19,10 @@ namespace Vulkan { StreamBuffer::StreamBuffer(VkBufferUsageFlags usage, u32 size) : m_usage(usage), m_size(size) { - g_command_buffer_mgr->AddFenceSignaledCallback( - this, std::bind(&StreamBuffer::OnFenceSignaled, this, std::placeholders::_1)); } StreamBuffer::~StreamBuffer() { - g_command_buffer_mgr->RemoveFenceSignaledCallback(this); - if (m_host_pointer) vkUnmapMemory(g_vulkan_context->GetDevice(), m_memory); @@ -189,8 +185,6 @@ bool StreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment) // Can we find a fence to wait on that will give us enough memory? 
if (WaitForClearSpace(required_bytes)) { - ASSERT(m_current_offset == m_current_gpu_position || - (m_current_offset + required_bytes) < m_current_gpu_position); m_current_offset = Common::AlignUp(m_current_offset, alignment); m_last_allocation_size = num_bytes; return true; @@ -225,36 +219,40 @@ void StreamBuffer::UpdateCurrentFencePosition() return; // Has the offset changed since the last fence? - const VkFence fence = g_command_buffer_mgr->GetCurrentCommandBufferFence(); - if (!m_tracked_fences.empty() && m_tracked_fences.back().first == fence) + const u64 counter = g_command_buffer_mgr->GetCurrentFenceCounter(); + if (!m_tracked_fences.empty() && m_tracked_fences.back().first == counter) { // Still haven't executed a command buffer, so just update the offset. m_tracked_fences.back().second = m_current_offset; return; } - m_tracked_fences.emplace_back(fence, m_current_offset); + // New buffer, so update the GPU position while we're at it. + UpdateGPUPosition(); + m_tracked_fences.emplace_back(counter, m_current_offset); } -void StreamBuffer::OnFenceSignaled(VkFence fence) +void StreamBuffer::UpdateGPUPosition() { - // Locate the entry for this fence (if any, we may have been forced to wait already) - auto iter = std::find_if(m_tracked_fences.begin(), m_tracked_fences.end(), - [fence](const auto& it) { return it.first == fence; }); + auto start = m_tracked_fences.begin(); + auto end = start; - if (iter != m_tracked_fences.end()) + const u64 completed_counter = g_command_buffer_mgr->GetCompletedFenceCounter(); + while (end != m_tracked_fences.end() && completed_counter >= end->first) { - // Update the GPU position, and remove any fences before this fence (since - // it is implied that they have been signaled as well, though the callback - // should have removed them already). 
- m_current_gpu_position = iter->second; - m_tracked_fences.erase(m_tracked_fences.begin(), ++iter); + m_current_gpu_position = end->second; + ++end; } + + if (start != end) + m_tracked_fences.erase(start, end); } bool StreamBuffer::WaitForClearSpace(u32 num_bytes) { u32 new_offset = 0; + u32 new_gpu_position = 0; + auto iter = m_tracked_fences.begin(); for (; iter != m_tracked_fences.end(); iter++) { @@ -265,20 +263,32 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes) u32 gpu_position = iter->second; if (m_current_offset == gpu_position) { - // Start at the start of the buffer again. new_offset = 0; + new_gpu_position = 0; break; } // Assuming that we wait for this fence, are we allocating in front of the GPU? if (m_current_offset > gpu_position) { + // This would suggest the GPU has now followed us and wrapped around, so we have from + // m_current_offset..m_size free, as well as 0..gpu_position. + const u32 remaining_space_after_offset = m_size - m_current_offset; + if (remaining_space_after_offset >= num_bytes) + { + // Switch to allocating in front of the GPU, using the remainder of the buffer. + new_offset = m_current_offset; + new_gpu_position = gpu_position; + break; + } + // We can wrap around to the start, behind the GPU, if there is enough space. // We use > here because otherwise we'd end up lining up with the GPU, and then the // allocator would assume that the GPU has consumed what we just wrote. if (gpu_position > num_bytes) { new_offset = 0; + new_gpu_position = gpu_position; break; } } @@ -292,6 +302,7 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes) { // Leave the offset as-is, but update the GPU position. new_offset = m_current_offset; + new_gpu_position = gpu_position; break; } } @@ -300,14 +311,17 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes) // Did any fences satisfy this condition? // Has the command buffer been executed yet? If not, the caller should execute it. 
if (iter == m_tracked_fences.end() || - iter->first == g_command_buffer_mgr->GetCurrentCommandBufferFence()) + iter->first == g_command_buffer_mgr->GetCurrentFenceCounter()) { return false; } // Wait until this fence is signaled. This will fire the callback, updating the GPU position. - g_command_buffer_mgr->WaitForFence(iter->first); + g_command_buffer_mgr->WaitForFenceCounter(iter->first); + m_tracked_fences.erase(m_tracked_fences.begin(), + m_current_offset == iter->second ? m_tracked_fences.end() : ++iter); m_current_offset = new_offset; + m_current_gpu_position = new_gpu_position; return true; } diff --git a/Source/Core/VideoBackends/Vulkan/StreamBuffer.h b/Source/Core/VideoBackends/Vulkan/StreamBuffer.h index b52ce6cd35..677313939a 100644 --- a/Source/Core/VideoBackends/Vulkan/StreamBuffer.h +++ b/Source/Core/VideoBackends/Vulkan/StreamBuffer.h @@ -34,7 +34,7 @@ public: private: bool AllocateBuffer(); void UpdateCurrentFencePosition(); - void OnFenceSignaled(VkFence fence); + void UpdateGPUPosition(); // Waits for as many fences as needed to allocate num_bytes bytes from the buffer. 
bool WaitForClearSpace(u32 num_bytes); @@ -50,7 +50,7 @@ private: u8* m_host_pointer = nullptr; // List of fences and the corresponding positions in the buffer - std::deque> m_tracked_fences; + std::deque> m_tracked_fences; bool m_coherent_mapping = false; }; diff --git a/Source/Core/VideoBackends/Vulkan/VKTexture.cpp b/Source/Core/VideoBackends/Vulkan/VKTexture.cpp index 202bbaad0f..27b079a779 100644 --- a/Source/Core/VideoBackends/Vulkan/VKTexture.cpp +++ b/Source/Core/VideoBackends/Vulkan/VKTexture.cpp @@ -674,11 +674,7 @@ VKStagingTexture::VKStagingTexture(StagingTextureType type, const TextureConfig& { } -VKStagingTexture::~VKStagingTexture() -{ - if (m_needs_flush) - VKStagingTexture::Flush(); -} +VKStagingTexture::~VKStagingTexture() = default; std::unique_ptr VKStagingTexture::Create(StagingTextureType type, const TextureConfig& config) @@ -739,14 +735,6 @@ void VKStagingTexture::CopyFromTexture(const AbstractTexture* src, ASSERT(dst_rect.left >= 0 && static_cast(dst_rect.right) <= m_config.width && dst_rect.top >= 0 && static_cast(dst_rect.bottom) <= m_config.height); - if (m_needs_flush) - { - // Drop copy before reusing it. 
- g_command_buffer_mgr->RemoveFenceSignaledCallback(this); - m_flush_fence = VK_NULL_HANDLE; - m_needs_flush = false; - } - StateTracker::GetInstance()->EndRenderPass(); VkImageLayout old_layout = src_tex->GetLayout(); @@ -773,16 +761,7 @@ void VKStagingTexture::CopyFromTexture(const AbstractTexture* src, src_tex->TransitionToLayout(g_command_buffer_mgr->GetCurrentCommandBuffer(), old_layout); m_needs_flush = true; - m_flush_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence(); - g_command_buffer_mgr->AddFenceSignaledCallback(this, [this](VkFence fence) { - if (m_flush_fence != fence) - return; - - m_flush_fence = VK_NULL_HANDLE; - m_needs_flush = false; - g_command_buffer_mgr->RemoveFenceSignaledCallback(this); - m_staging_buffer->InvalidateCPUCache(); - }); + m_flush_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter(); } void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle& src_rect, AbstractTexture* dst, @@ -798,14 +777,6 @@ void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle& src_rect, A ASSERT(dst_rect.left >= 0 && static_cast(dst_rect.right) <= dst_tex->GetWidth() && dst_rect.top >= 0 && static_cast(dst_rect.bottom) <= dst_tex->GetHeight()); - if (m_needs_flush) - { - // Drop copy before reusing it. - g_command_buffer_mgr->RemoveFenceSignaledCallback(this); - m_flush_fence = VK_NULL_HANDLE; - m_needs_flush = false; - } - // Flush caches before copying. 
m_staging_buffer->FlushCPUCache(); StateTracker::GetInstance()->EndRenderPass(); @@ -833,15 +804,7 @@ void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle& src_rect, A dst_tex->TransitionToLayout(g_command_buffer_mgr->GetCurrentCommandBuffer(), old_layout); m_needs_flush = true; - m_flush_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence(); - g_command_buffer_mgr->AddFenceSignaledCallback(this, [this](VkFence fence) { - if (m_flush_fence != fence) - return; - - m_flush_fence = VK_NULL_HANDLE; - m_needs_flush = false; - g_command_buffer_mgr->RemoveFenceSignaledCallback(this); - }); + m_flush_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter(); } bool VKStagingTexture::Map() @@ -860,25 +823,23 @@ void VKStagingTexture::Flush() if (!m_needs_flush) return; - // Either of the below two calls will cause the callback to fire. - g_command_buffer_mgr->RemoveFenceSignaledCallback(this); - if (m_flush_fence == g_command_buffer_mgr->GetCurrentCommandBufferFence()) + // Is this copy in the current command buffer? + if (g_command_buffer_mgr->GetCurrentFenceCounter() == m_flush_fence_counter) { - // The readback is in the current command buffer, and we must execute it. + // Execute the command buffer and wait for it to finish. Renderer::GetInstance()->ExecuteCommandBuffer(false, true); } else { - // WaitForFence should fire the callback. - g_command_buffer_mgr->WaitForFence(m_flush_fence); + // Wait for the GPU to finish with it. + g_command_buffer_mgr->WaitForFenceCounter(m_flush_fence_counter); } - DEBUG_ASSERT(m_flush_fence == VK_NULL_HANDLE); - m_needs_flush = false; - // For readback textures, invalidate the CPU cache as there is new data there. 
if (m_type == StagingTextureType::Readback || m_type == StagingTextureType::Mutable) m_staging_buffer->InvalidateCPUCache(); + + m_needs_flush = false; } VKFramebuffer::VKFramebuffer(VKTexture* color_attachment, VKTexture* depth_attachment, u32 width, diff --git a/Source/Core/VideoBackends/Vulkan/VKTexture.h b/Source/Core/VideoBackends/Vulkan/VKTexture.h index bab11ec108..2a194acc4d 100644 --- a/Source/Core/VideoBackends/Vulkan/VKTexture.h +++ b/Source/Core/VideoBackends/Vulkan/VKTexture.h @@ -104,7 +104,7 @@ private: std::unique_ptr buffer); std::unique_ptr m_staging_buffer; - VkFence m_flush_fence = VK_NULL_HANDLE; + u64 m_flush_fence_counter = 0; }; class VKFramebuffer final : public AbstractFramebuffer diff --git a/Source/Core/VideoBackends/Vulkan/VertexManager.cpp b/Source/Core/VideoBackends/Vulkan/VertexManager.cpp index 336d3ad480..144d3b3ab8 100644 --- a/Source/Core/VideoBackends/Vulkan/VertexManager.cpp +++ b/Source/Core/VideoBackends/Vulkan/VertexManager.cpp @@ -60,11 +60,11 @@ VertexManager::~VertexManager() bool VertexManager::Initialize() { m_vertex_stream_buffer = - StreamBuffer::Create(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VERTEX_STREAM_BUFFER_SIZE * 4); + StreamBuffer::Create(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VERTEX_STREAM_BUFFER_SIZE); m_index_stream_buffer = - StreamBuffer::Create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, INDEX_STREAM_BUFFER_SIZE * 4); + StreamBuffer::Create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, INDEX_STREAM_BUFFER_SIZE); m_uniform_stream_buffer = - StreamBuffer::Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, UNIFORM_STREAM_BUFFER_SIZE * 4); + StreamBuffer::Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, UNIFORM_STREAM_BUFFER_SIZE); if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_uniform_stream_buffer) { PanicAlert("Failed to allocate streaming buffers"); diff --git a/Source/Core/VideoBackends/Vulkan/main.cpp b/Source/Core/VideoBackends/Vulkan/main.cpp index 911f8d1991..07c24c9354 100644 --- a/Source/Core/VideoBackends/Vulkan/main.cpp +++ 
b/Source/Core/VideoBackends/Vulkan/main.cpp @@ -251,8 +251,8 @@ bool VideoBackend::Initialize(const WindowSystemInfo& wsi) void VideoBackend::Shutdown() { - if (g_command_buffer_mgr) - g_command_buffer_mgr->WaitForGPUIdle(); + if (g_vulkan_context) + vkDeviceWaitIdle(g_vulkan_context->GetDevice()); if (g_shader_cache) g_shader_cache->Shutdown(); From 087b11e780a89928ae4f6d2229eb3e79f4f0689b Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sun, 17 Mar 2019 16:41:50 +1000 Subject: [PATCH 2/5] TextureCacheBase: Fix possible crash on shutdown with deferred EFB copies --- Source/Core/VideoCommon/TextureCacheBase.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp index a5ff0081c5..a210be3bc3 100644 --- a/Source/Core/VideoCommon/TextureCacheBase.cpp +++ b/Source/Core/VideoCommon/TextureCacheBase.cpp @@ -96,6 +96,9 @@ TextureCacheBase::TextureCacheBase() TextureCacheBase::~TextureCacheBase() { + // Clear pending EFB copies first, so we don't try to flush them. 
+ m_pending_efb_copies.clear(); + HiresTexture::Shutdown(); Invalidate(); Common::FreeAlignedMemory(temp); From 23a655217c425fcb8bce2cf9c94e770f45f198c6 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sun, 17 Mar 2019 16:46:41 +1000 Subject: [PATCH 3/5] Vulkan: Fix validation layer error for unbound texture layouts --- Source/Core/VideoBackends/Vulkan/StateTracker.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/Source/Core/VideoBackends/Vulkan/StateTracker.cpp b/Source/Core/VideoBackends/Vulkan/StateTracker.cpp index 6bb1b8981d..e25147cbe1 100644 --- a/Source/Core/VideoBackends/Vulkan/StateTracker.cpp +++ b/Source/Core/VideoBackends/Vulkan/StateTracker.cpp @@ -62,11 +62,13 @@ bool StateTracker::Initialize() VKTexture::Create(TextureConfig(1, 1, 1, 1, 1, AbstractTextureFormat::RGBA8, 0)); if (!m_dummy_texture) return false; + m_dummy_texture->TransitionToLayout(g_command_buffer_mgr->GetCurrentInitCommandBuffer(), + VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL); // Initialize all samplers to point by default for (size_t i = 0; i < NUM_PIXEL_SHADER_SAMPLERS; i++) { - m_bindings.samplers[i].imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; + m_bindings.samplers[i].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; m_bindings.samplers[i].imageView = m_dummy_texture->GetView(); m_bindings.samplers[i].sampler = g_object_cache->GetPointSampler(); } @@ -223,14 +225,14 @@ void StateTracker::UnbindTexture(VkImageView view) if (it.imageView == view) { it.imageView = m_dummy_texture->GetView(); - it.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; + it.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; } } if (m_bindings.image_texture.imageView == view) { m_bindings.image_texture.imageView = m_dummy_texture->GetView(); - m_bindings.image_texture.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED; + m_bindings.image_texture.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL; } } From 6d40ea855345eed482be135359c61224704de436 Mon Sep 17 00:00:00 2001 From: Stenzek Date: 
Sun, 17 Mar 2019 16:55:32 +1000 Subject: [PATCH 4/5] Vulkan: Fix barrier validation layer errors for bounding box --- Source/Core/VideoBackends/Vulkan/BoundingBox.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Source/Core/VideoBackends/Vulkan/BoundingBox.cpp b/Source/Core/VideoBackends/Vulkan/BoundingBox.cpp index 9e7da60249..15c9403228 100644 --- a/Source/Core/VideoBackends/Vulkan/BoundingBox.cpp +++ b/Source/Core/VideoBackends/Vulkan/BoundingBox.cpp @@ -99,7 +99,7 @@ void BoundingBox::Flush() StagingBuffer::BufferMemoryBarrier( g_command_buffer_mgr->GetCurrentCommandBuffer(), m_gpu_buffer, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, 0, BUFFER_SIZE, - VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT); + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); } // We're now up-to-date. @@ -223,7 +223,7 @@ void BoundingBox::Readback() StagingBuffer::BufferMemoryBarrier( g_command_buffer_mgr->GetCurrentCommandBuffer(), m_gpu_buffer, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, 0, - BUFFER_SIZE, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); + BUFFER_SIZE, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); m_readback_buffer->PrepareForGPUWrite(g_command_buffer_mgr->GetCurrentCommandBuffer(), VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); @@ -237,7 +237,7 @@ void BoundingBox::Readback() StagingBuffer::BufferMemoryBarrier( g_command_buffer_mgr->GetCurrentCommandBuffer(), m_gpu_buffer, VK_ACCESS_TRANSFER_READ_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, 0, BUFFER_SIZE, - VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT); + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT); m_readback_buffer->FlushGPUCache(g_command_buffer_mgr->GetCurrentCommandBuffer(), VK_ACCESS_TRANSFER_WRITE_BIT, 
VK_PIPELINE_STAGE_TRANSFER_BIT); From 604ab67c7f28dc03a82ee7aa1bcc487d39ea6f8c Mon Sep 17 00:00:00 2001 From: Stenzek Date: Mon, 18 Mar 2019 00:09:26 +1000 Subject: [PATCH 5/5] Vulkan: Simplify perf queries using vkGetQueryPoolResults --- .../Core/VideoBackends/Vulkan/PerfQuery.cpp | 261 ++++-------------- Source/Core/VideoBackends/Vulkan/PerfQuery.h | 40 ++- Source/Core/VideoBackends/Vulkan/Renderer.cpp | 2 - 3 files changed, 73 insertions(+), 230 deletions(-) diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp index 1934c78410..f19b67d12b 100644 --- a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp +++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp @@ -14,7 +14,6 @@ #include "VideoBackends/Vulkan/CommandBufferManager.h" #include "VideoBackends/Vulkan/Renderer.h" -#include "VideoBackends/Vulkan/StagingBuffer.h" #include "VideoBackends/Vulkan/StateTracker.h" #include "VideoBackends/Vulkan/VulkanContext.h" @@ -28,11 +27,6 @@ PerfQuery::~PerfQuery() vkDestroyQueryPool(g_vulkan_context->GetDevice(), m_query_pool, nullptr); } -Vulkan::PerfQuery* PerfQuery::GetInstance() -{ - return static_cast(g_perf_query.get()); -} - bool PerfQuery::Initialize() { if (!CreateQueryPool()) @@ -41,47 +35,30 @@ bool PerfQuery::Initialize() return false; } - if (!CreateReadbackBuffer()) - { - PanicAlert("Failed to create readback buffer"); - return false; - } - return true; } void PerfQuery::EnableQuery(PerfQueryGroup type) { - // Have we used half of the query buffer already? - if (m_query_count > m_query_buffer.size() / 2) - NonBlockingPartialFlush(); - // Block if there are no free slots. - if (m_query_count == PERF_QUERY_BUFFER_SIZE) - { - // ERROR_LOG(VIDEO, "Flushed query buffer early!"); - BlockingPartialFlush(); - } + // Otherwise, try to keep half of them available. 
+ if (m_query_count > m_query_buffer.size() / 2) + PartialFlush(m_query_count == PERF_QUERY_BUFFER_SIZE); if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) { - u32 index = (m_query_read_pos + m_query_count) % PERF_QUERY_BUFFER_SIZE; - ActiveQuery& entry = m_query_buffer[index]; - ASSERT(!entry.active && !entry.available); - entry.active = true; - m_query_count++; - - DEBUG_LOG(VIDEO, "start query %u", index); + ActiveQuery& entry = m_query_buffer[m_query_next_pos]; + DEBUG_ASSERT(!entry.has_value); + entry.has_value = true; // Use precise queries if supported, otherwise boolean (which will be incorrect). - VkQueryControlFlags flags = 0; - if (g_vulkan_context->SupportsPreciseOcclusionQueries()) - flags = VK_QUERY_CONTROL_PRECISE_BIT; + VkQueryControlFlags flags = + g_vulkan_context->SupportsPreciseOcclusionQueries() ? VK_QUERY_CONTROL_PRECISE_BIT : 0; // Ensure the query starts within a render pass. - // TODO: Is this needed? StateTracker::GetInstance()->BeginRenderPass(); - vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index, flags); + vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos, + flags); } } @@ -89,16 +66,17 @@ void PerfQuery::DisableQuery(PerfQueryGroup type) { if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP) { - // DisableQuery should be called for each EnableQuery, so subtract one to get the previous one. 
- u32 index = (m_query_read_pos + m_query_count - 1) % PERF_QUERY_BUFFER_SIZE; - vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index); + vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos); + m_query_next_pos = (m_query_next_pos + 1) % PERF_QUERY_BUFFER_SIZE; + m_query_count++; } } void PerfQuery::ResetQuery() { m_query_count = 0; - m_query_read_pos = 0; + m_query_readback_pos = 0; + m_query_next_pos = 0; std::fill_n(m_results, ArraySize(m_results), 0); // Reset entire query pool, ensuring all queries are ready to write to. @@ -106,34 +84,20 @@ void PerfQuery::ResetQuery() vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, 0, PERF_QUERY_BUFFER_SIZE); - for (auto& entry : m_query_buffer) - { - entry.fence_counter = 0; - entry.available = false; - entry.active = false; - } + std::memset(m_query_buffer.data(), 0, sizeof(ActiveQuery) * m_query_buffer.size()); } u32 PerfQuery::GetQueryResult(PerfQueryType type) { u32 result = 0; - if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC) - { result = m_results[PQG_ZCOMP_ZCOMPLOC]; - } else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT) - { result = m_results[PQG_ZCOMP]; - } else if (type == PQ_BLEND_INPUT) - { result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC]; - } else if (type == PQ_EFB_COPY_CLOCKS) - { result = m_results[PQG_EFB_COPY_CLOCKS]; - } return result / 4; } @@ -141,7 +105,7 @@ u32 PerfQuery::GetQueryResult(PerfQueryType type) void PerfQuery::FlushResults() { while (!IsFlushed()) - BlockingPartialFlush(); + PartialFlush(true); } bool PerfQuery::IsFlushed() const @@ -170,190 +134,79 @@ bool PerfQuery::CreateQueryPool() return true; } -bool PerfQuery::CreateReadbackBuffer() +void PerfQuery::ReadbackQueries() { - m_readback_buffer = StagingBuffer::Create(STAGING_BUFFER_TYPE_READBACK, - PERF_QUERY_BUFFER_SIZE * sizeof(PerfQueryDataType), - VK_BUFFER_USAGE_TRANSFER_DST_BIT); 
- - // Leave the buffer persistently mapped, we invalidate it when we need to read. - if (!m_readback_buffer || !m_readback_buffer->Map()) - return false; - - return true; -} - -void PerfQuery::QueueCopyQueryResults(u32 start_index, u32 query_count) -{ - DEBUG_LOG(VIDEO, "queue copy of queries %u-%u", start_index, start_index + query_count - 1); - - // Transition buffer for GPU write - // TODO: Is this needed? - m_readback_buffer->PrepareForGPUWrite(g_command_buffer_mgr->GetCurrentCommandBuffer(), - VK_ACCESS_TRANSFER_WRITE_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT); - - // Copy from queries -> buffer - vkCmdCopyQueryPoolResults(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, - start_index, query_count, m_readback_buffer->GetBuffer(), - start_index * sizeof(PerfQueryDataType), sizeof(PerfQueryDataType), - VK_QUERY_RESULT_WAIT_BIT); - - // Prepare for host readback - m_readback_buffer->FlushGPUCache(g_command_buffer_mgr->GetCurrentCommandBuffer(), - VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT); - - // Reset queries so they're ready to use again - vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, start_index, - query_count); - - // Flag all queries as available, but with a fence that has to be completed first - for (u32 i = 0; i < query_count; i++) - { - u32 index = start_index + i; - ActiveQuery& entry = m_query_buffer[index]; - entry.fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter(); - entry.available = true; - entry.active = false; - } -} - -void PerfQuery::FlushQueries() -{ - // Flag all pending queries that aren't available as available after execution. - u32 copy_start_index = 0; - u32 copy_count = 0; - for (u32 i = 0; i < m_query_count; i++) - { - u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE; - ActiveQuery& entry = m_query_buffer[index]; - - // Skip already-copied queries (will happen if a flush hasn't occurred and - // a command buffer hasn't finished executing). 
- if (entry.available) - { - // These should be grouped together, and at the start. - ASSERT(copy_count == 0); - continue; - } - - // If this wrapped around, we need to flush the entries before the end of the buffer. - ASSERT(entry.active); - if (index < copy_start_index) - { - QueueCopyQueryResults(copy_start_index, copy_count); - copy_start_index = index; - copy_count = 0; - } - else if (copy_count == 0) - { - copy_start_index = index; - } - copy_count++; - } - - if (copy_count > 0) - QueueCopyQueryResults(copy_start_index, copy_count); -} - -void PerfQuery::ProcessPendingResults() -{ - const u64 completed_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter(); + const u64 completed_fence_counter = g_command_buffer_mgr->GetCompletedFenceCounter(); // Need to save these since ProcessResults will modify them. - u32 query_read_pos = m_query_read_pos; - u32 query_count = m_query_count; - - // Flush as many queries as are bound to this fence. - u32 flush_start_index = 0; - u32 flush_count = 0; - for (u32 i = 0; i < query_count; i++) + const u32 outstanding_queries = m_query_count; + u32 readback_count = 0; + for (u32 i = 0; i < outstanding_queries; i++) { - u32 index = (query_read_pos + i) % PERF_QUERY_BUFFER_SIZE; - if (m_query_buffer[index].fence_counter > completed_fence_counter) - { - // These should be grouped together, at the end. + u32 index = (m_query_readback_pos + readback_count) % PERF_QUERY_BUFFER_SIZE; + const ActiveQuery& entry = m_query_buffer[index]; + if (entry.fence_counter > completed_fence_counter) break; - } // If this wrapped around, we need to flush the entries before the end of the buffer. 
- if (index < flush_start_index) + if (index < m_query_readback_pos) { - ProcessResults(flush_start_index, flush_count); - flush_start_index = index; - flush_count = 0; + ReadbackQueries(readback_count); + DEBUG_ASSERT(m_query_readback_pos == 0); + readback_count = 0; } - else if (flush_count == 0) - { - flush_start_index = index; - } - flush_count++; + + readback_count++; } - if (flush_count > 0) - ProcessResults(flush_start_index, flush_count); + if (readback_count > 0) + ReadbackQueries(readback_count); } -void PerfQuery::ProcessResults(u32 start_index, u32 query_count) +void PerfQuery::ReadbackQueries(u32 query_count) { - // Invalidate CPU caches before reading back. - m_readback_buffer->InvalidateCPUCache(start_index * sizeof(PerfQueryDataType), - query_count * sizeof(PerfQueryDataType)); - // Should be at maximum query_count queries pending. - ASSERT(query_count <= m_query_count); - DEBUG_LOG(VIDEO, "process queries %u-%u", start_index, start_index + query_count - 1); + ASSERT(query_count <= m_query_count && + (m_query_readback_pos + query_count) <= PERF_QUERY_BUFFER_SIZE); + + // Read back from the GPU. + VkResult res = + vkGetQueryPoolResults(g_vulkan_context->GetDevice(), m_query_pool, m_query_readback_pos, + query_count, query_count * sizeof(PerfQueryDataType), + m_query_result_buffer.data(), sizeof(PerfQueryDataType), 0); + if (res != VK_SUCCESS) + LOG_VULKAN_ERROR(res, "vkGetQueryPoolResults failed: "); // Remove pending queries. for (u32 i = 0; i < query_count; i++) { - u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE; + u32 index = (m_query_readback_pos + i) % PERF_QUERY_BUFFER_SIZE; ActiveQuery& entry = m_query_buffer[index]; // Should have a fence associated with it (waiting for a result). - ASSERT(entry.fence_counter != 0); + DEBUG_ASSERT(entry.fence_counter != 0); entry.fence_counter = 0; - entry.available = false; - entry.active = false; - - // Grab result from readback buffer, it will already have been invalidated. 
- u32 result; - m_readback_buffer->Read(index * sizeof(PerfQueryDataType), &result, sizeof(result), false); - DEBUG_LOG(VIDEO, " query result %u", result); + entry.has_value = false; // NOTE: Reported pixel metrics should be referenced to native resolution m_results[entry.query_type] += - static_cast(static_cast(result) * EFB_WIDTH / g_renderer->GetTargetWidth() * - EFB_HEIGHT / g_renderer->GetTargetHeight()); + static_cast(static_cast(m_query_result_buffer[i]) * EFB_WIDTH / + g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight()); } - m_query_read_pos = (m_query_read_pos + query_count) % PERF_QUERY_BUFFER_SIZE; + m_query_readback_pos = (m_query_readback_pos + query_count) % PERF_QUERY_BUFFER_SIZE; m_query_count -= query_count; } -void PerfQuery::NonBlockingPartialFlush() +void PerfQuery::PartialFlush(bool blocking) { - if (IsFlushed()) - return; - // Submit a command buffer in the background if the front query is not bound to one. - ActiveQuery& entry = m_query_buffer[m_query_read_pos]; - if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter()) - Renderer::GetInstance()->ExecuteCommandBuffer(true, false); + if (blocking || m_query_buffer[m_query_readback_pos].fence_counter == + g_command_buffer_mgr->GetCurrentFenceCounter()) + { + Renderer::GetInstance()->ExecuteCommandBuffer(true, blocking); + } - ProcessPendingResults(); -} - -void PerfQuery::BlockingPartialFlush() -{ - if (IsFlushed()) - return; - - // If the first pending query is needing command buffer execution, do that. 
- ActiveQuery& entry = m_query_buffer[m_query_read_pos]; - if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter()) - Renderer::GetInstance()->ExecuteCommandBuffer(false, true); - - ProcessPendingResults(); + ReadbackQueries(); } } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.h b/Source/Core/VideoBackends/Vulkan/PerfQuery.h index facbe0dc0f..8ca91ac658 100644 --- a/Source/Core/VideoBackends/Vulkan/PerfQuery.h +++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.h @@ -13,18 +13,15 @@ namespace Vulkan { -class StagingBuffer; - class PerfQuery : public PerfQueryBase { public: PerfQuery(); ~PerfQuery(); - static PerfQuery* GetInstance(); + static PerfQuery* GetInstance() { return static_cast(g_perf_query.get()); } bool Initialize(); - void FlushQueries(); void EnableQuery(PerfQueryGroup type) override; void DisableQuery(PerfQueryGroup type) override; @@ -34,35 +31,30 @@ public: bool IsFlushed() const override; private: + // u32 is used for the sample counts. 
+ using PerfQueryDataType = u32; + + // when testing in SMS: 64 was too small, 128 was ok + // TODO: This should be size_t, but the base class uses u32s + static const u32 PERF_QUERY_BUFFER_SIZE = 512; + struct ActiveQuery { u64 fence_counter; PerfQueryType query_type; - bool available; - bool active; + bool has_value; }; bool CreateQueryPool(); - bool CreateReadbackBuffer(); - void QueueCopyQueryResults(u32 start_index, u32 query_count); - void ProcessPendingResults(); - void ProcessResults(u32 start_index, u32 query_count); + void ReadbackQueries(); + void ReadbackQueries(u32 query_count); + void PartialFlush(bool blocking); - void NonBlockingPartialFlush(); - void BlockingPartialFlush(); - - // when testing in SMS: 64 was too small, 128 was ok - // TODO: This should be size_t, but the base class uses u32s - using PerfQueryDataType = u32; - static const u32 PERF_QUERY_BUFFER_SIZE = 512; - std::array m_query_buffer = {}; - u32 m_query_read_pos = 0; - - // TODO: Investigate using pipeline statistics to implement other query types VkQueryPool m_query_pool = VK_NULL_HANDLE; - - // Buffer containing query results. Each query is a u32. - std::unique_ptr m_readback_buffer; + u32 m_query_readback_pos = 0; + u32 m_query_next_pos = 0; + std::array m_query_buffer = {}; + std::array m_query_result_buffer = {}; }; } // namespace Vulkan diff --git a/Source/Core/VideoBackends/Vulkan/Renderer.cpp b/Source/Core/VideoBackends/Vulkan/Renderer.cpp index a4db679ecb..fc07c4ce41 100644 --- a/Source/Core/VideoBackends/Vulkan/Renderer.cpp +++ b/Source/Core/VideoBackends/Vulkan/Renderer.cpp @@ -304,7 +304,6 @@ void Renderer::PresentBackbuffer() { // End drawing to backbuffer StateTracker::GetInstance()->EndRenderPass(); - PerfQuery::GetInstance()->FlushQueries(); // Transition the backbuffer to PRESENT_SRC to ensure all commands drawing // to it have finished before present. 
@@ -325,7 +324,6 @@ void Renderer::PresentBackbuffer() void Renderer::ExecuteCommandBuffer(bool submit_off_thread, bool wait_for_completion) { StateTracker::GetInstance()->EndRenderPass(); - PerfQuery::GetInstance()->FlushQueries(); g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread, wait_for_completion);