From 427dd45151eddc920d681c4711a3cdbd7e363f82 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Sun, 17 Mar 2019 15:59:22 +1000
Subject: [PATCH 1/5] Vulkan: Simplify command buffer fence tracking

---
 .../Vulkan/CommandBufferManager.cpp           | 122 ++++++++----------
 .../Vulkan/CommandBufferManager.h             |  34 ++---
 .../Core/VideoBackends/Vulkan/PerfQuery.cpp   |  39 +++---
 Source/Core/VideoBackends/Vulkan/PerfQuery.h  |   6 +-
 Source/Core/VideoBackends/Vulkan/Renderer.cpp |  12 +-
 .../VideoBackends/Vulkan/StreamBuffer.cpp     |  58 +++++----
 .../Core/VideoBackends/Vulkan/StreamBuffer.h  |   4 +-
 .../Core/VideoBackends/Vulkan/VKTexture.cpp   |  59 ++-------
 Source/Core/VideoBackends/Vulkan/VKTexture.h  |   2 +-
 .../VideoBackends/Vulkan/VertexManager.cpp    |   6 +-
 Source/Core/VideoBackends/Vulkan/main.cpp     |   4 +-
 11 files changed, 138 insertions(+), 208 deletions(-)
diff --git a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp
index 1f2fcd01c7..f00f6001cb 100644
--- a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp
+++ b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.cpp
@@ -54,7 +54,6 @@ bool CommandBufferManager::CreateCommandBuffers()
   {
     resources.init_command_buffer_used = false;
     resources.semaphore_used = false;
-    resources.needs_fence_wait = false;
 
     VkCommandPoolCreateInfo pool_info = {VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO, nullptr, 0,
                                          g_vulkan_context->GetGraphicsQueueFamilyIndex()};
@@ -211,43 +210,61 @@ void CommandBufferManager::WaitForWorkerThreadIdle()
   m_submit_semaphore.Post();
 }
 
-void CommandBufferManager::WaitForGPUIdle()
+void CommandBufferManager::WaitForFenceCounter(u64 fence_counter)
 {
-  WaitForWorkerThreadIdle();
-  vkDeviceWaitIdle(g_vulkan_context->GetDevice());
-}
-
-void CommandBufferManager::WaitForFence(VkFence fence)
-{
-  // Find the command buffer that this fence corresponds to.
-  u32 command_buffer_index = 0;
-  for (; command_buffer_index < static_cast<u32>(m_frame_resources.size()); command_buffer_index++)
-  {
-    if (m_frame_resources[command_buffer_index].fence == fence)
-      break;
-  }
-  ASSERT(command_buffer_index < m_frame_resources.size());
-
-  // Has this command buffer already been waited for?
-  if (!m_frame_resources[command_buffer_index].needs_fence_wait)
+  if (m_completed_fence_counter >= fence_counter)
     return;
 
+  // Find the first command buffer which covers this counter value.
+  u32 index = (m_current_frame + 1) % NUM_COMMAND_BUFFERS;
+  while (index != m_current_frame)
+  {
+    if (m_frame_resources[index].fence_counter >= fence_counter)
+      break;
+
+    index = (index + 1) % NUM_COMMAND_BUFFERS;
+  }
+
+  ASSERT(index != m_current_frame);
+  WaitForCommandBufferCompletion(index);
+}
+
+void CommandBufferManager::WaitForCommandBufferCompletion(u32 index)
+{
   // Ensure this command buffer has been submitted.
   WaitForWorkerThreadIdle();
 
   // Wait for this command buffer to be completed.
-  VkResult res =
-      vkWaitForFences(g_vulkan_context->GetDevice(), 1,
-                      &m_frame_resources[command_buffer_index].fence, VK_TRUE, UINT64_MAX);
+  VkResult res = vkWaitForFences(g_vulkan_context->GetDevice(), 1, &m_frame_resources[index].fence,
+                                 VK_TRUE, UINT64_MAX);
   if (res != VK_SUCCESS)
     LOG_VULKAN_ERROR(res, "vkWaitForFences failed: ");
 
-  // Immediately fire callbacks and cleanups, since the commands has been completed.
-  m_frame_resources[command_buffer_index].needs_fence_wait = false;
-  OnCommandBufferExecuted(command_buffer_index);
+  // Clean up any resources for command buffers between the last known completed buffer and this
+  // now-completed command buffer. If we use >2 buffers, this may be more than one buffer.
+  const u64 now_completed_counter = m_frame_resources[index].fence_counter;
+  u32 cleanup_index = (m_current_frame + 1) % NUM_COMMAND_BUFFERS;
+  while (cleanup_index != m_current_frame)
+  {
+    FrameResources& resources = m_frame_resources[cleanup_index];
+    if (resources.fence_counter > now_completed_counter)
+      break;
+
+    if (resources.fence_counter > m_completed_fence_counter)
+    {
+      for (auto& it : resources.cleanup_resources)
+        it();
+      resources.cleanup_resources.clear();
+    }
+
+    cleanup_index = (cleanup_index + 1) % NUM_COMMAND_BUFFERS;
+  }
+
+  m_completed_fence_counter = now_completed_counter;
 }
 
 void CommandBufferManager::SubmitCommandBuffer(bool submit_on_worker_thread,
+                                               bool wait_for_completion,
                                                VkSwapchainKHR present_swap_chain,
                                                uint32_t present_image_index)
 {
@@ -263,16 +280,13 @@ void CommandBufferManager::SubmitCommandBuffer(bool submit_on_worker_thread,
     }
   }
 
-  // This command buffer now has commands, so can't be re-used without waiting.
-  resources.needs_fence_wait = true;
-
   // Grab the semaphore before submitting command buffer either on-thread or off-thread.
   // This prevents a race from occurring where a second command buffer is executed
   // before the worker thread has woken and executed the first one yet.
   m_submit_semaphore.Wait();
 
   // Submitting off-thread?
-  if (m_use_threaded_submission && submit_on_worker_thread)
+  if (m_use_threaded_submission && submit_on_worker_thread && !wait_for_completion)
   {
     // Push to the pending submit queue.
     {
@@ -287,6 +301,8 @@ void CommandBufferManager::SubmitCommandBuffer(bool submit_on_worker_thread,
   {
     // Pass through to normal submission path.
     SubmitCommandBuffer(m_current_frame, present_swap_chain, present_image_index);
+    if (wait_for_completion)
+      WaitForCommandBufferCompletion(m_current_frame);
   }
 
   // Switch to next cmdbuffer.
@@ -365,39 +381,15 @@ void CommandBufferManager::SubmitCommandBuffer(u32 command_buffer_index,
   m_submit_semaphore.Post();
 }
 
-void CommandBufferManager::OnCommandBufferExecuted(u32 index)
-{
-  FrameResources& resources = m_frame_resources[index];
-
-  // Fire fence tracking callbacks.
-  for (auto iter = m_fence_callbacks.begin(); iter != m_fence_callbacks.end();)
-  {
-    auto backup_iter = iter++;
-    backup_iter->second(resources.fence);
-  }
-
-  // Clean up all objects pending destruction on this command buffer
-  for (auto& it : resources.cleanup_resources)
-    it();
-  resources.cleanup_resources.clear();
-}
-
 void CommandBufferManager::BeginCommandBuffer()
 {
   // Move to the next command buffer.
-  m_current_frame = (m_current_frame + 1) % NUM_COMMAND_BUFFERS;
-  FrameResources& resources = m_frame_resources[m_current_frame];
+  const u32 next_buffer_index = (m_current_frame + 1) % NUM_COMMAND_BUFFERS;
+  FrameResources& resources = m_frame_resources[next_buffer_index];
 
   // Wait for the GPU to finish with all resources for this command buffer.
-  if (resources.needs_fence_wait)
-  {
-    VkResult res =
-        vkWaitForFences(g_vulkan_context->GetDevice(), 1, &resources.fence, true, UINT64_MAX);
-    if (res != VK_SUCCESS)
-      LOG_VULKAN_ERROR(res, "vkWaitForFences failed: ");
-
-    OnCommandBufferExecuted(m_current_frame);
-  }
+  if (resources.fence_counter > m_completed_fence_counter)
+    WaitForCommandBufferCompletion(next_buffer_index);
 
   // Reset fence to unsignaled before starting.
   VkResult res = vkResetFences(g_vulkan_context->GetDevice(), 1, &resources.fence);
@@ -427,6 +419,8 @@ void CommandBufferManager::BeginCommandBuffer()
   // Reset upload command buffer state
   resources.init_command_buffer_used = false;
   resources.semaphore_used = false;
+  resources.fence_counter = m_next_fence_counter++;
+  m_current_frame = next_buffer_index;
 }
 
 void CommandBufferManager::DeferBufferDestruction(VkBuffer object)
@@ -471,19 +465,5 @@ void CommandBufferManager::DeferImageViewDestruction(VkImageView object)
       [object]() { vkDestroyImageView(g_vulkan_context->GetDevice(), object, nullptr); });
 }
 
-void CommandBufferManager::AddFenceSignaledCallback(const void* key, FenceSignaledCallback callback)
-{
-  // Shouldn't be adding twice.
-  ASSERT(m_fence_callbacks.find(key) == m_fence_callbacks.end());
-  m_fence_callbacks.emplace(key, std::move(callback));
-}
-
-void CommandBufferManager::RemoveFenceSignaledCallback(const void* key)
-{
-  auto iter = m_fence_callbacks.find(key);
-  ASSERT(iter != m_fence_callbacks.end());
-  m_fence_callbacks.erase(iter);
-}
-
 std::unique_ptr<CommandBufferManager> g_command_buffer_mgr;
 }  // namespace Vulkan
diff --git a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.h b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.h
index 9cfc50e287..abc49c0622 100644
--- a/Source/Core/VideoBackends/Vulkan/CommandBufferManager.h
+++ b/Source/Core/VideoBackends/Vulkan/CommandBufferManager.h
@@ -51,9 +51,15 @@ public:
   // Allocates a descriptors set from the pool reserved for the current frame.
   VkDescriptorSet AllocateDescriptorSet(VkDescriptorSetLayout set_layout);
 
+  // Fence "counters" are used to track which commands have been completed by the GPU.
+  // If the last completed fence counter is greater than or equal to N, it means that the work
+  // associated with counter N has been completed by the GPU. The value of N to associate with
+  // commands can be retrieved by calling GetCurrentFenceCounter().
+  u64 GetCompletedFenceCounter() const { return m_completed_fence_counter; }
+
   // Gets the fence that will be signaled when the currently executing command buffer is
   // queued and executed. Do not wait for this fence before the buffer is executed.
-  VkFence GetCurrentCommandBufferFence() const { return m_frame_resources[m_current_frame].fence; }
+  u64 GetCurrentFenceCounter() const { return m_frame_resources[m_current_frame].fence_counter; }
 
   // Returns the semaphore for the current command buffer, which can be used to ensure the
   // swap chain image is ready before the command buffer executes.
@@ -66,15 +72,11 @@ public:
   // Ensure that the worker thread has submitted any previous command buffers and is idle.
   void WaitForWorkerThreadIdle();
 
-  // Ensure that the worker thread has both submitted all commands, and the GPU has caught up.
-  // Use with caution, huge performance penalty.
-  void WaitForGPUIdle();
-
   // Wait for a fence to be completed.
   // Also invokes callbacks for completion.
-  void WaitForFence(VkFence fence);
+  void WaitForFenceCounter(u64 fence_counter);
 
-  void SubmitCommandBuffer(bool submit_on_worker_thread,
+  void SubmitCommandBuffer(bool submit_on_worker_thread, bool wait_for_completion,
                            VkSwapchainKHR present_swap_chain = VK_NULL_HANDLE,
                            uint32_t present_image_index = 0xFFFFFFFF);
 
@@ -90,25 +92,17 @@ public:
   void DeferImageDestruction(VkImage object);
   void DeferImageViewDestruction(VkImageView object);
 
-  // Instruct the manager to fire the specified callback when a fence is flagged to be signaled.
-  // This happens when command buffers are executed, and can be tested if signaled, which means
-  // that all commands up to the point when the callback was fired have completed.
-  using FenceSignaledCallback = std::function<void(VkFence)>;
-  void AddFenceSignaledCallback(const void* key, FenceSignaledCallback callback);
-  void RemoveFenceSignaledCallback(const void* key);
-
 private:
   bool CreateCommandBuffers();
   void DestroyCommandBuffers();
 
   bool CreateSubmitThread();
 
+  void WaitForCommandBufferCompletion(u32 command_buffer_index);
   void SubmitCommandBuffer(u32 command_buffer_index, VkSwapchainKHR present_swap_chain,
                            u32 present_image_index);
   void BeginCommandBuffer();
 
-  void OnCommandBufferExecuted(u32 index);
-
   struct FrameResources
   {
     // [0] - Init (upload) command buffer, [1] - draw command buffer
@@ -117,19 +111,19 @@ private:
     VkDescriptorPool descriptor_pool = VK_NULL_HANDLE;
     VkFence fence = VK_NULL_HANDLE;
     VkSemaphore semaphore = VK_NULL_HANDLE;
+    u64 fence_counter = 0;
     bool init_command_buffer_used = false;
     bool semaphore_used = false;
-    bool needs_fence_wait = false;
 
     std::vector<std::function<void()>> cleanup_resources;
   };
 
+  u64 m_next_fence_counter = 1;
+  u64 m_completed_fence_counter = 0;
+
   std::array<FrameResources, NUM_COMMAND_BUFFERS> m_frame_resources;
   u32 m_current_frame;
 
-  // callbacks when a fence point is set
-  std::map<const void*, FenceSignaledCallback> m_fence_callbacks;
-
   // Threaded command buffer execution
   // Semaphore determines when a command buffer can be queued
   Common::Semaphore m_submit_semaphore;
diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
index 3a84f2e571..1934c78410 100644
--- a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
+++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
@@ -24,8 +24,6 @@ PerfQuery::PerfQuery() = default;
 
 PerfQuery::~PerfQuery()
 {
-  g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
-
   if (m_query_pool != VK_NULL_HANDLE)
     vkDestroyQueryPool(g_vulkan_context->GetDevice(), m_query_pool, nullptr);
 }
@@ -49,9 +47,6 @@ bool PerfQuery::Initialize()
     return false;
   }
 
-  g_command_buffer_mgr->AddFenceSignaledCallback(
-      this, std::bind(&PerfQuery::OnFenceSignaled, this, std::placeholders::_1));
-
   return true;
 }
 
@@ -113,7 +108,7 @@ void PerfQuery::ResetQuery()
 
   for (auto& entry : m_query_buffer)
   {
-    entry.pending_fence = VK_NULL_HANDLE;
+    entry.fence_counter = 0;
     entry.available = false;
     entry.active = false;
   }
@@ -217,7 +212,7 @@ void PerfQuery::QueueCopyQueryResults(u32 start_index, u32 query_count)
   {
     u32 index = start_index + i;
     ActiveQuery& entry = m_query_buffer[index];
-    entry.pending_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
+    entry.fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
     entry.available = true;
     entry.active = false;
   }
@@ -261,8 +256,10 @@ void PerfQuery::FlushQueries()
     QueueCopyQueryResults(copy_start_index, copy_count);
 }
 
-void PerfQuery::OnFenceSignaled(VkFence fence)
+void PerfQuery::ProcessPendingResults()
 {
+  // Only queries whose command buffer the GPU has finished executing can be read back.
+  const u64 completed_fence_counter = g_command_buffer_mgr->GetCompletedFenceCounter();
   // Need to save these since ProcessResults will modify them.
   u32 query_read_pos = m_query_read_pos;
   u32 query_count = m_query_count;
@@ -273,7 +270,7 @@ void PerfQuery::OnFenceSignaled(VkFence fence)
   for (u32 i = 0; i < query_count; i++)
   {
     u32 index = (query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
-    if (m_query_buffer[index].pending_fence != fence)
+    if (m_query_buffer[index].fence_counter > completed_fence_counter)
     {
       // These should be grouped together, at the end.
       break;
@@ -314,8 +311,8 @@ void PerfQuery::ProcessResults(u32 start_index, u32 query_count)
     ActiveQuery& entry = m_query_buffer[index];
 
     // Should have a fence associated with it (waiting for a result).
-    ASSERT(entry.pending_fence != VK_NULL_HANDLE);
-    entry.pending_fence = VK_NULL_HANDLE;
+    ASSERT(entry.fence_counter != 0);
+    entry.fence_counter = 0;
     entry.available = false;
     entry.active = false;
 
@@ -340,9 +337,11 @@ void PerfQuery::NonBlockingPartialFlush()
     return;
 
   // Submit a command buffer in the background if the front query is not bound to one.
-  // Ideally this will complete before the buffer fills.
-  if (m_query_buffer[m_query_read_pos].pending_fence == VK_NULL_HANDLE)
+  ActiveQuery& entry = m_query_buffer[m_query_read_pos];
+  if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter())
     Renderer::GetInstance()->ExecuteCommandBuffer(true, false);
+
+  ProcessPendingResults();
 }
 
 void PerfQuery::BlockingPartialFlush()
@@ -352,17 +351,9 @@ void PerfQuery::BlockingPartialFlush()
 
   // If the first pending query is needing command buffer execution, do that.
   ActiveQuery& entry = m_query_buffer[m_query_read_pos];
-  if (entry.pending_fence == VK_NULL_HANDLE)
-  {
-    // This will callback OnCommandBufferQueued which will set the fence on the entry.
-    // We wait for completion, which will also call OnCommandBufferExecuted, and clear the fence.
+  if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter())
     Renderer::GetInstance()->ExecuteCommandBuffer(false, true);
-  }
-  else
-  {
-    // The command buffer has been submitted, but is awaiting completion.
-    // Wait for the fence to complete, which will call OnCommandBufferExecuted.
-    g_command_buffer_mgr->WaitForFence(entry.pending_fence);
-  }
+
+  ProcessPendingResults();
 }
 }  // namespace Vulkan
diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.h b/Source/Core/VideoBackends/Vulkan/PerfQuery.h
index 47ccf22a66..facbe0dc0f 100644
--- a/Source/Core/VideoBackends/Vulkan/PerfQuery.h
+++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.h
@@ -36,8 +36,8 @@ public:
 private:
   struct ActiveQuery
   {
+    u64 fence_counter;
     PerfQueryType query_type;
-    VkFence pending_fence;
     bool available;
     bool active;
   };
@@ -45,11 +45,9 @@ private:
   bool CreateQueryPool();
   bool CreateReadbackBuffer();
   void QueueCopyQueryResults(u32 start_index, u32 query_count);
+  void ProcessPendingResults();
   void ProcessResults(u32 start_index, u32 query_count);
 
-  void OnCommandBufferQueued(VkCommandBuffer command_buffer, VkFence fence);
-  void OnFenceSignaled(VkFence fence);
-
   void NonBlockingPartialFlush();
   void BlockingPartialFlush();
 
diff --git a/Source/Core/VideoBackends/Vulkan/Renderer.cpp b/Source/Core/VideoBackends/Vulkan/Renderer.cpp
index 8b560be15d..a4db679ecb 100644
--- a/Source/Core/VideoBackends/Vulkan/Renderer.cpp
+++ b/Source/Core/VideoBackends/Vulkan/Renderer.cpp
@@ -315,7 +315,7 @@ void Renderer::PresentBackbuffer()
   // Because this final command buffer is rendering to the swap chain, we need to wait for
   // the available semaphore to be signaled before executing the buffer. This final submission
   // can happen off-thread in the background while we're preparing the next frame.
-  g_command_buffer_mgr->SubmitCommandBuffer(true, m_swap_chain->GetSwapChain(),
+  g_command_buffer_mgr->SubmitCommandBuffer(true, false, m_swap_chain->GetSwapChain(),
                                             m_swap_chain->GetCurrentImageIndex());
 
   // New cmdbuffer, so invalidate state.
@@ -327,11 +327,7 @@ void Renderer::ExecuteCommandBuffer(bool submit_off_thread, bool wait_for_comple
   StateTracker::GetInstance()->EndRenderPass();
   PerfQuery::GetInstance()->FlushQueries();
 
-  // If we're waiting for completion, don't bother waking the worker thread.
-  const VkFence pending_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
-  g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread && wait_for_completion);
-  if (wait_for_completion)
-    g_command_buffer_mgr->WaitForFence(pending_fence);
+  g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread, wait_for_completion);
 
   StateTracker::GetInstance()->InvalidateCachedState();
 }
@@ -550,10 +546,6 @@ void Renderer::UnbindTexture(const AbstractTexture* texture)
 
 void Renderer::ResetSamplerStates()
 {
-  // Ensure none of the sampler objects are in use.
-  // This assumes that none of the samplers are in use on the command list currently being recorded.
-  g_command_buffer_mgr->WaitForGPUIdle();
-
   // Invalidate all sampler states, next draw will re-initialize them.
   for (u32 i = 0; i < m_sampler_states.size(); i++)
   {
diff --git a/Source/Core/VideoBackends/Vulkan/StreamBuffer.cpp b/Source/Core/VideoBackends/Vulkan/StreamBuffer.cpp
index ea610f09cf..aa635e4d41 100644
--- a/Source/Core/VideoBackends/Vulkan/StreamBuffer.cpp
+++ b/Source/Core/VideoBackends/Vulkan/StreamBuffer.cpp
@@ -19,14 +19,10 @@ namespace Vulkan
 {
 StreamBuffer::StreamBuffer(VkBufferUsageFlags usage, u32 size) : m_usage(usage), m_size(size)
 {
-  g_command_buffer_mgr->AddFenceSignaledCallback(
-      this, std::bind(&StreamBuffer::OnFenceSignaled, this, std::placeholders::_1));
 }
 
 StreamBuffer::~StreamBuffer()
 {
-  g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
-
   if (m_host_pointer)
     vkUnmapMemory(g_vulkan_context->GetDevice(), m_memory);
 
@@ -189,8 +185,6 @@ bool StreamBuffer::ReserveMemory(u32 num_bytes, u32 alignment)
   // Can we find a fence to wait on that will give us enough memory?
   if (WaitForClearSpace(required_bytes))
   {
-    ASSERT(m_current_offset == m_current_gpu_position ||
-           (m_current_offset + required_bytes) < m_current_gpu_position);
     m_current_offset = Common::AlignUp(m_current_offset, alignment);
     m_last_allocation_size = num_bytes;
     return true;
@@ -225,36 +219,40 @@ void StreamBuffer::UpdateCurrentFencePosition()
     return;
 
   // Has the offset changed since the last fence?
-  const VkFence fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
-  if (!m_tracked_fences.empty() && m_tracked_fences.back().first == fence)
+  const u64 counter = g_command_buffer_mgr->GetCurrentFenceCounter();
+  if (!m_tracked_fences.empty() && m_tracked_fences.back().first == counter)
   {
     // Still haven't executed a command buffer, so just update the offset.
     m_tracked_fences.back().second = m_current_offset;
     return;
   }
 
-  m_tracked_fences.emplace_back(fence, m_current_offset);
+  // New buffer, so update the GPU position while we're at it.
+  UpdateGPUPosition();
+  m_tracked_fences.emplace_back(counter, m_current_offset);
 }
 
-void StreamBuffer::OnFenceSignaled(VkFence fence)
+void StreamBuffer::UpdateGPUPosition()
 {
-  // Locate the entry for this fence (if any, we may have been forced to wait already)
-  auto iter = std::find_if(m_tracked_fences.begin(), m_tracked_fences.end(),
-                           [fence](const auto& it) { return it.first == fence; });
+  auto start = m_tracked_fences.begin();
+  auto end = start;
 
-  if (iter != m_tracked_fences.end())
+  const u64 completed_counter = g_command_buffer_mgr->GetCompletedFenceCounter();
+  while (end != m_tracked_fences.end() && completed_counter >= end->first)
   {
-    // Update the GPU position, and remove any fences before this fence (since
-    // it is implied that they have been signaled as well, though the callback
-    // should have removed them already).
-    m_current_gpu_position = iter->second;
-    m_tracked_fences.erase(m_tracked_fences.begin(), ++iter);
+    m_current_gpu_position = end->second;
+    ++end;
   }
+
+  if (start != end)
+    m_tracked_fences.erase(start, end);
 }
 
 bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
 {
   u32 new_offset = 0;
+  u32 new_gpu_position = 0;
+
   auto iter = m_tracked_fences.begin();
   for (; iter != m_tracked_fences.end(); iter++)
   {
@@ -265,20 +263,32 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
     u32 gpu_position = iter->second;
     if (m_current_offset == gpu_position)
     {
-      // Start at the start of the buffer again.
       new_offset = 0;
+      new_gpu_position = 0;
       break;
     }
 
     // Assuming that we wait for this fence, are we allocating in front of the GPU?
     if (m_current_offset > gpu_position)
     {
+      // This would suggest the GPU has now followed us and wrapped around, so we have from
+      // m_current_offset..m_size free, as well as 0..gpu_position.
+      const u32 remaining_space_after_offset = m_size - m_current_offset;
+      if (remaining_space_after_offset >= num_bytes)
+      {
+        // Switch to allocating in front of the GPU, using the remainder of the buffer.
+        new_offset = m_current_offset;
+        new_gpu_position = gpu_position;
+        break;
+      }
+
       // We can wrap around to the start, behind the GPU, if there is enough space.
       // We use > here because otherwise we'd end up lining up with the GPU, and then the
       // allocator would assume that the GPU has consumed what we just wrote.
       if (gpu_position > num_bytes)
       {
         new_offset = 0;
+        new_gpu_position = gpu_position;
         break;
       }
     }
@@ -292,6 +302,7 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
       {
         // Leave the offset as-is, but update the GPU position.
         new_offset = m_current_offset;
+        new_gpu_position = gpu_position;
         break;
       }
     }
@@ -300,14 +311,17 @@ bool StreamBuffer::WaitForClearSpace(u32 num_bytes)
   // Did any fences satisfy this condition?
   // Has the command buffer been executed yet? If not, the caller should execute it.
   if (iter == m_tracked_fences.end() ||
-      iter->first == g_command_buffer_mgr->GetCurrentCommandBufferFence())
+      iter->first == g_command_buffer_mgr->GetCurrentFenceCounter())
   {
     return false;
   }
 
   // Wait until this fence is signaled. This will fire the callback, updating the GPU position.
-  g_command_buffer_mgr->WaitForFence(iter->first);
+  g_command_buffer_mgr->WaitForFenceCounter(iter->first);
+  m_tracked_fences.erase(m_tracked_fences.begin(),
+                         m_current_offset == iter->second ? m_tracked_fences.end() : ++iter);
   m_current_offset = new_offset;
+  m_current_gpu_position = new_gpu_position;
   return true;
 }
 
diff --git a/Source/Core/VideoBackends/Vulkan/StreamBuffer.h b/Source/Core/VideoBackends/Vulkan/StreamBuffer.h
index b52ce6cd35..677313939a 100644
--- a/Source/Core/VideoBackends/Vulkan/StreamBuffer.h
+++ b/Source/Core/VideoBackends/Vulkan/StreamBuffer.h
@@ -34,7 +34,7 @@ public:
 private:
   bool AllocateBuffer();
   void UpdateCurrentFencePosition();
-  void OnFenceSignaled(VkFence fence);
+  void UpdateGPUPosition();
 
   // Waits for as many fences as needed to allocate num_bytes bytes from the buffer.
   bool WaitForClearSpace(u32 num_bytes);
@@ -50,7 +50,7 @@ private:
   u8* m_host_pointer = nullptr;
 
   // List of fences and the corresponding positions in the buffer
-  std::deque<std::pair<VkFence, u32>> m_tracked_fences;
+  std::deque<std::pair<u64, u32>> m_tracked_fences;
 
   bool m_coherent_mapping = false;
 };
diff --git a/Source/Core/VideoBackends/Vulkan/VKTexture.cpp b/Source/Core/VideoBackends/Vulkan/VKTexture.cpp
index 202bbaad0f..27b079a779 100644
--- a/Source/Core/VideoBackends/Vulkan/VKTexture.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VKTexture.cpp
@@ -674,11 +674,7 @@ VKStagingTexture::VKStagingTexture(StagingTextureType type, const TextureConfig&
 {
 }
 
-VKStagingTexture::~VKStagingTexture()
-{
-  if (m_needs_flush)
-    VKStagingTexture::Flush();
-}
+VKStagingTexture::~VKStagingTexture() = default;
 
 std::unique_ptr<VKStagingTexture> VKStagingTexture::Create(StagingTextureType type,
                                                            const TextureConfig& config)
@@ -739,14 +735,6 @@ void VKStagingTexture::CopyFromTexture(const AbstractTexture* src,
   ASSERT(dst_rect.left >= 0 && static_cast<u32>(dst_rect.right) <= m_config.width &&
          dst_rect.top >= 0 && static_cast<u32>(dst_rect.bottom) <= m_config.height);
 
-  if (m_needs_flush)
-  {
-    // Drop copy before reusing it.
-    g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
-    m_flush_fence = VK_NULL_HANDLE;
-    m_needs_flush = false;
-  }
-
   StateTracker::GetInstance()->EndRenderPass();
 
   VkImageLayout old_layout = src_tex->GetLayout();
@@ -773,16 +761,7 @@ void VKStagingTexture::CopyFromTexture(const AbstractTexture* src,
   src_tex->TransitionToLayout(g_command_buffer_mgr->GetCurrentCommandBuffer(), old_layout);
 
   m_needs_flush = true;
-  m_flush_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
-  g_command_buffer_mgr->AddFenceSignaledCallback(this, [this](VkFence fence) {
-    if (m_flush_fence != fence)
-      return;
-
-    m_flush_fence = VK_NULL_HANDLE;
-    m_needs_flush = false;
-    g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
-    m_staging_buffer->InvalidateCPUCache();
-  });
+  m_flush_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
 }
 
 void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle<int>& src_rect, AbstractTexture* dst,
@@ -798,14 +777,6 @@ void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle<int>& src_rect, A
   ASSERT(dst_rect.left >= 0 && static_cast<u32>(dst_rect.right) <= dst_tex->GetWidth() &&
          dst_rect.top >= 0 && static_cast<u32>(dst_rect.bottom) <= dst_tex->GetHeight());
 
-  if (m_needs_flush)
-  {
-    // Drop copy before reusing it.
-    g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
-    m_flush_fence = VK_NULL_HANDLE;
-    m_needs_flush = false;
-  }
-
   // Flush caches before copying.
   m_staging_buffer->FlushCPUCache();
   StateTracker::GetInstance()->EndRenderPass();
@@ -833,15 +804,7 @@ void VKStagingTexture::CopyToTexture(const MathUtil::Rectangle<int>& src_rect, A
   dst_tex->TransitionToLayout(g_command_buffer_mgr->GetCurrentCommandBuffer(), old_layout);
 
   m_needs_flush = true;
-  m_flush_fence = g_command_buffer_mgr->GetCurrentCommandBufferFence();
-  g_command_buffer_mgr->AddFenceSignaledCallback(this, [this](VkFence fence) {
-    if (m_flush_fence != fence)
-      return;
-
-    m_flush_fence = VK_NULL_HANDLE;
-    m_needs_flush = false;
-    g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
-  });
+  m_flush_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
 }
 
 bool VKStagingTexture::Map()
@@ -860,25 +823,23 @@ void VKStagingTexture::Flush()
   if (!m_needs_flush)
     return;
 
-  // Either of the below two calls will cause the callback to fire.
-  g_command_buffer_mgr->RemoveFenceSignaledCallback(this);
-  if (m_flush_fence == g_command_buffer_mgr->GetCurrentCommandBufferFence())
+  // Is this copy in the current command buffer?
+  if (g_command_buffer_mgr->GetCurrentFenceCounter() == m_flush_fence_counter)
   {
-    // The readback is in the current command buffer, and we must execute it.
+    // Execute the command buffer and wait for it to finish.
     Renderer::GetInstance()->ExecuteCommandBuffer(false, true);
   }
   else
   {
-    // WaitForFence should fire the callback.
-    g_command_buffer_mgr->WaitForFence(m_flush_fence);
+    // Wait for the GPU to finish with it.
+    g_command_buffer_mgr->WaitForFenceCounter(m_flush_fence_counter);
   }
 
-  DEBUG_ASSERT(m_flush_fence == VK_NULL_HANDLE);
-  m_needs_flush = false;
-
   // For readback textures, invalidate the CPU cache as there is new data there.
   if (m_type == StagingTextureType::Readback || m_type == StagingTextureType::Mutable)
     m_staging_buffer->InvalidateCPUCache();
+
+  m_needs_flush = false;
 }
 
 VKFramebuffer::VKFramebuffer(VKTexture* color_attachment, VKTexture* depth_attachment, u32 width,
diff --git a/Source/Core/VideoBackends/Vulkan/VKTexture.h b/Source/Core/VideoBackends/Vulkan/VKTexture.h
index bab11ec108..2a194acc4d 100644
--- a/Source/Core/VideoBackends/Vulkan/VKTexture.h
+++ b/Source/Core/VideoBackends/Vulkan/VKTexture.h
@@ -104,7 +104,7 @@ private:
                    std::unique_ptr<StagingBuffer> buffer);
 
   std::unique_ptr<StagingBuffer> m_staging_buffer;
-  VkFence m_flush_fence = VK_NULL_HANDLE;
+  u64 m_flush_fence_counter = 0;
 };
 
 class VKFramebuffer final : public AbstractFramebuffer
diff --git a/Source/Core/VideoBackends/Vulkan/VertexManager.cpp b/Source/Core/VideoBackends/Vulkan/VertexManager.cpp
index 336d3ad480..144d3b3ab8 100644
--- a/Source/Core/VideoBackends/Vulkan/VertexManager.cpp
+++ b/Source/Core/VideoBackends/Vulkan/VertexManager.cpp
@@ -60,11 +60,11 @@ VertexManager::~VertexManager()
 bool VertexManager::Initialize()
 {
   m_vertex_stream_buffer =
-      StreamBuffer::Create(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VERTEX_STREAM_BUFFER_SIZE * 4);
+      StreamBuffer::Create(VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, VERTEX_STREAM_BUFFER_SIZE);
   m_index_stream_buffer =
-      StreamBuffer::Create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, INDEX_STREAM_BUFFER_SIZE * 4);
+      StreamBuffer::Create(VK_BUFFER_USAGE_INDEX_BUFFER_BIT, INDEX_STREAM_BUFFER_SIZE);
   m_uniform_stream_buffer =
-      StreamBuffer::Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, UNIFORM_STREAM_BUFFER_SIZE * 4);
+      StreamBuffer::Create(VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, UNIFORM_STREAM_BUFFER_SIZE);
   if (!m_vertex_stream_buffer || !m_index_stream_buffer || !m_uniform_stream_buffer)
   {
     PanicAlert("Failed to allocate streaming buffers");
diff --git a/Source/Core/VideoBackends/Vulkan/main.cpp b/Source/Core/VideoBackends/Vulkan/main.cpp
index 911f8d1991..07c24c9354 100644
--- a/Source/Core/VideoBackends/Vulkan/main.cpp
+++ b/Source/Core/VideoBackends/Vulkan/main.cpp
@@ -251,8 +251,8 @@ bool VideoBackend::Initialize(const WindowSystemInfo& wsi)
 
 void VideoBackend::Shutdown()
 {
-  if (g_command_buffer_mgr)
-    g_command_buffer_mgr->WaitForGPUIdle();
+  if (g_vulkan_context)
+    vkDeviceWaitIdle(g_vulkan_context->GetDevice());
 
   if (g_shader_cache)
     g_shader_cache->Shutdown();

From 087b11e780a89928ae4f6d2229eb3e79f4f0689b Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Sun, 17 Mar 2019 16:41:50 +1000
Subject: [PATCH 2/5] TextureCacheBase: Fix possible crash on shutdown with
 deferred EFB copies

---
 Source/Core/VideoCommon/TextureCacheBase.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Source/Core/VideoCommon/TextureCacheBase.cpp b/Source/Core/VideoCommon/TextureCacheBase.cpp
index a5ff0081c5..a210be3bc3 100644
--- a/Source/Core/VideoCommon/TextureCacheBase.cpp
+++ b/Source/Core/VideoCommon/TextureCacheBase.cpp
@@ -96,6 +96,9 @@ TextureCacheBase::TextureCacheBase()
 
 TextureCacheBase::~TextureCacheBase()
 {
+  // Clear pending EFB copies first, so we don't try to flush them.
+  m_pending_efb_copies.clear();
+
   HiresTexture::Shutdown();
   Invalidate();
   Common::FreeAlignedMemory(temp);

From 23a655217c425fcb8bce2cf9c94e770f45f198c6 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Sun, 17 Mar 2019 16:46:41 +1000
Subject: [PATCH 3/5] Vulkan: Fix validation layer error for unbound texture
 layouts

---
 Source/Core/VideoBackends/Vulkan/StateTracker.cpp | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Source/Core/VideoBackends/Vulkan/StateTracker.cpp b/Source/Core/VideoBackends/Vulkan/StateTracker.cpp
index 6bb1b8981d..e25147cbe1 100644
--- a/Source/Core/VideoBackends/Vulkan/StateTracker.cpp
+++ b/Source/Core/VideoBackends/Vulkan/StateTracker.cpp
@@ -62,11 +62,13 @@ bool StateTracker::Initialize()
       VKTexture::Create(TextureConfig(1, 1, 1, 1, 1, AbstractTextureFormat::RGBA8, 0));
   if (!m_dummy_texture)
     return false;
+  m_dummy_texture->TransitionToLayout(g_command_buffer_mgr->GetCurrentInitCommandBuffer(),
+                                      VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL);
 
   // Initialize all samplers to point by default
   for (size_t i = 0; i < NUM_PIXEL_SHADER_SAMPLERS; i++)
   {
-    m_bindings.samplers[i].imageLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+    m_bindings.samplers[i].imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
     m_bindings.samplers[i].imageView = m_dummy_texture->GetView();
     m_bindings.samplers[i].sampler = g_object_cache->GetPointSampler();
   }
@@ -223,14 +225,14 @@ void StateTracker::UnbindTexture(VkImageView view)
     if (it.imageView == view)
     {
       it.imageView = m_dummy_texture->GetView();
-      it.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+      it.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
     }
   }
 
   if (m_bindings.image_texture.imageView == view)
   {
     m_bindings.image_texture.imageView = m_dummy_texture->GetView();
-    m_bindings.image_texture.imageLayout = VK_IMAGE_LAYOUT_UNDEFINED;
+    m_bindings.image_texture.imageLayout = VK_IMAGE_LAYOUT_SHADER_READ_ONLY_OPTIMAL;
   }
 }
 

From 6d40ea855345eed482be135359c61224704de436 Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Sun, 17 Mar 2019 16:55:32 +1000
Subject: [PATCH 4/5] Vulkan: Fix barrier validation layer errors for bounding
 box

---
 Source/Core/VideoBackends/Vulkan/BoundingBox.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Source/Core/VideoBackends/Vulkan/BoundingBox.cpp b/Source/Core/VideoBackends/Vulkan/BoundingBox.cpp
index 9e7da60249..15c9403228 100644
--- a/Source/Core/VideoBackends/Vulkan/BoundingBox.cpp
+++ b/Source/Core/VideoBackends/Vulkan/BoundingBox.cpp
@@ -99,7 +99,7 @@ void BoundingBox::Flush()
     StagingBuffer::BufferMemoryBarrier(
         g_command_buffer_mgr->GetCurrentCommandBuffer(), m_gpu_buffer, VK_ACCESS_TRANSFER_WRITE_BIT,
         VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, 0, BUFFER_SIZE,
-        VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
+        VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
   }
 
   // We're now up-to-date.
@@ -223,7 +223,7 @@ void BoundingBox::Readback()
   StagingBuffer::BufferMemoryBarrier(
       g_command_buffer_mgr->GetCurrentCommandBuffer(), m_gpu_buffer,
       VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT, 0,
-      BUFFER_SIZE, VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
+      BUFFER_SIZE, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
   m_readback_buffer->PrepareForGPUWrite(g_command_buffer_mgr->GetCurrentCommandBuffer(),
                                         VK_ACCESS_TRANSFER_WRITE_BIT,
                                         VK_PIPELINE_STAGE_TRANSFER_BIT);
@@ -237,7 +237,7 @@ void BoundingBox::Readback()
   StagingBuffer::BufferMemoryBarrier(
       g_command_buffer_mgr->GetCurrentCommandBuffer(), m_gpu_buffer, VK_ACCESS_TRANSFER_READ_BIT,
       VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT, 0, BUFFER_SIZE,
-      VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
+      VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT);
   m_readback_buffer->FlushGPUCache(g_command_buffer_mgr->GetCurrentCommandBuffer(),
                                    VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
 

From 604ab67c7f28dc03a82ee7aa1bcc487d39ea6f8c Mon Sep 17 00:00:00 2001
From: Stenzek <stenzek@gmail.com>
Date: Mon, 18 Mar 2019 00:09:26 +1000
Subject: [PATCH 5/5] Vulkan: Simplify perf queries using vkGetQueryPoolResults

---
 .../Core/VideoBackends/Vulkan/PerfQuery.cpp   | 261 ++++--------------
 Source/Core/VideoBackends/Vulkan/PerfQuery.h  |  40 ++-
 Source/Core/VideoBackends/Vulkan/Renderer.cpp |   2 -
 3 files changed, 73 insertions(+), 230 deletions(-)

diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
index 1934c78410..f19b67d12b 100644
--- a/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
+++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.cpp
@@ -14,7 +14,6 @@
 
 #include "VideoBackends/Vulkan/CommandBufferManager.h"
 #include "VideoBackends/Vulkan/Renderer.h"
-#include "VideoBackends/Vulkan/StagingBuffer.h"
 #include "VideoBackends/Vulkan/StateTracker.h"
 #include "VideoBackends/Vulkan/VulkanContext.h"
 
@@ -28,11 +27,6 @@ PerfQuery::~PerfQuery()
     vkDestroyQueryPool(g_vulkan_context->GetDevice(), m_query_pool, nullptr);
 }
 
-Vulkan::PerfQuery* PerfQuery::GetInstance()
-{
-  return static_cast<PerfQuery*>(g_perf_query.get());
-}
-
 bool PerfQuery::Initialize()
 {
   if (!CreateQueryPool())
@@ -41,47 +35,30 @@ bool PerfQuery::Initialize()
     return false;
   }
 
-  if (!CreateReadbackBuffer())
-  {
-    PanicAlert("Failed to create readback buffer");
-    return false;
-  }
-
   return true;
 }
 
 void PerfQuery::EnableQuery(PerfQueryGroup type)
 {
-  // Have we used half of the query buffer already?
-  if (m_query_count > m_query_buffer.size() / 2)
-    NonBlockingPartialFlush();
-
   // Block if there are no free slots.
-  if (m_query_count == PERF_QUERY_BUFFER_SIZE)
-  {
-    // ERROR_LOG(VIDEO, "Flushed query buffer early!");
-    BlockingPartialFlush();
-  }
+  // Otherwise, try to keep half of them available.
+  if (m_query_count > m_query_buffer.size() / 2)
+    PartialFlush(m_query_count == PERF_QUERY_BUFFER_SIZE);
 
   if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP)
   {
-    u32 index = (m_query_read_pos + m_query_count) % PERF_QUERY_BUFFER_SIZE;
-    ActiveQuery& entry = m_query_buffer[index];
-    ASSERT(!entry.active && !entry.available);
-    entry.active = true;
-    m_query_count++;
-
-    DEBUG_LOG(VIDEO, "start query %u", index);
+    ActiveQuery& entry = m_query_buffer[m_query_next_pos];
+    DEBUG_ASSERT(!entry.has_value);
+    entry.has_value = true;
 
     // Use precise queries if supported, otherwise boolean (which will be incorrect).
-    VkQueryControlFlags flags = 0;
-    if (g_vulkan_context->SupportsPreciseOcclusionQueries())
-      flags = VK_QUERY_CONTROL_PRECISE_BIT;
+    VkQueryControlFlags flags =
+        g_vulkan_context->SupportsPreciseOcclusionQueries() ? VK_QUERY_CONTROL_PRECISE_BIT : 0;
 
     // Ensure the query starts within a render pass.
-    // TODO: Is this needed?
     StateTracker::GetInstance()->BeginRenderPass();
-    vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index, flags);
+    vkCmdBeginQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos,
+                    flags);
   }
 }
 
@@ -89,16 +66,21 @@ void PerfQuery::DisableQuery(PerfQueryGroup type)
 {
   if (type == PQG_ZCOMP_ZCOMPLOC || type == PQG_ZCOMP)
   {
-    // DisableQuery should be called for each EnableQuery, so subtract one to get the previous one.
-    u32 index = (m_query_read_pos + m_query_count - 1) % PERF_QUERY_BUFFER_SIZE;
-    vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, index);
+    vkCmdEndQuery(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, m_query_next_pos);
+    // Record which command buffer holds this query. ReadbackQueries() and PartialFlush() compare
+    // fence_counter against the completed/current fence counters to decide when to read it back.
+    m_query_buffer[m_query_next_pos].fence_counter =
+        g_command_buffer_mgr->GetCurrentFenceCounter();
     m_query_next_pos = (m_query_next_pos + 1) % PERF_QUERY_BUFFER_SIZE;
     m_query_count++;
   }
 }
 
 void PerfQuery::ResetQuery()
 {
   m_query_count = 0;
-  m_query_read_pos = 0;
+  m_query_readback_pos = 0;
+  m_query_next_pos = 0;
   std::fill_n(m_results, ArraySize(m_results), 0);
 
   // Reset entire query pool, ensuring all queries are ready to write to.
@@ -106,34 +84,20 @@ void PerfQuery::ResetQuery()
   vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, 0,
                       PERF_QUERY_BUFFER_SIZE);
 
-  for (auto& entry : m_query_buffer)
-  {
-    entry.fence_counter = 0;
-    entry.available = false;
-    entry.active = false;
-  }
+  std::memset(m_query_buffer.data(), 0, sizeof(ActiveQuery) * m_query_buffer.size());
 }
 
 u32 PerfQuery::GetQueryResult(PerfQueryType type)
 {
   u32 result = 0;
-
   if (type == PQ_ZCOMP_INPUT_ZCOMPLOC || type == PQ_ZCOMP_OUTPUT_ZCOMPLOC)
-  {
     result = m_results[PQG_ZCOMP_ZCOMPLOC];
-  }
   else if (type == PQ_ZCOMP_INPUT || type == PQ_ZCOMP_OUTPUT)
-  {
     result = m_results[PQG_ZCOMP];
-  }
   else if (type == PQ_BLEND_INPUT)
-  {
     result = m_results[PQG_ZCOMP] + m_results[PQG_ZCOMP_ZCOMPLOC];
-  }
   else if (type == PQ_EFB_COPY_CLOCKS)
-  {
     result = m_results[PQG_EFB_COPY_CLOCKS];
-  }
 
   return result / 4;
 }
@@ -141,7 +105,7 @@ u32 PerfQuery::GetQueryResult(PerfQueryType type)
 void PerfQuery::FlushResults()
 {
   while (!IsFlushed())
-    BlockingPartialFlush();
+    PartialFlush(true);
 }
 
 bool PerfQuery::IsFlushed() const
@@ -170,190 +134,85 @@ bool PerfQuery::CreateQueryPool()
   return true;
 }
 
-bool PerfQuery::CreateReadbackBuffer()
+void PerfQuery::ReadbackQueries()
 {
-  m_readback_buffer = StagingBuffer::Create(STAGING_BUFFER_TYPE_READBACK,
-                                            PERF_QUERY_BUFFER_SIZE * sizeof(PerfQueryDataType),
-                                            VK_BUFFER_USAGE_TRANSFER_DST_BIT);
-
-  // Leave the buffer persistently mapped, we invalidate it when we need to read.
-  if (!m_readback_buffer || !m_readback_buffer->Map())
-    return false;
-
-  return true;
-}
-
-void PerfQuery::QueueCopyQueryResults(u32 start_index, u32 query_count)
-{
-  DEBUG_LOG(VIDEO, "queue copy of queries %u-%u", start_index, start_index + query_count - 1);
-
-  // Transition buffer for GPU write
-  // TODO: Is this needed?
-  m_readback_buffer->PrepareForGPUWrite(g_command_buffer_mgr->GetCurrentCommandBuffer(),
-                                        VK_ACCESS_TRANSFER_WRITE_BIT,
-                                        VK_PIPELINE_STAGE_TRANSFER_BIT);
-
-  // Copy from queries -> buffer
-  vkCmdCopyQueryPoolResults(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool,
-                            start_index, query_count, m_readback_buffer->GetBuffer(),
-                            start_index * sizeof(PerfQueryDataType), sizeof(PerfQueryDataType),
-                            VK_QUERY_RESULT_WAIT_BIT);
-
-  // Prepare for host readback
-  m_readback_buffer->FlushGPUCache(g_command_buffer_mgr->GetCurrentCommandBuffer(),
-                                   VK_ACCESS_TRANSFER_WRITE_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT);
-
-  // Reset queries so they're ready to use again
-  vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool, start_index,
-                      query_count);
-
-  // Flag all queries as available, but with a fence that has to be completed first
-  for (u32 i = 0; i < query_count; i++)
-  {
-    u32 index = start_index + i;
-    ActiveQuery& entry = m_query_buffer[index];
-    entry.fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
-    entry.available = true;
-    entry.active = false;
-  }
-}
-
-void PerfQuery::FlushQueries()
-{
-  // Flag all pending queries that aren't available as available after execution.
-  u32 copy_start_index = 0;
-  u32 copy_count = 0;
-  for (u32 i = 0; i < m_query_count; i++)
-  {
-    u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
-    ActiveQuery& entry = m_query_buffer[index];
-
-    // Skip already-copied queries (will happen if a flush hasn't occurred and
-    // a command buffer hasn't finished executing).
-    if (entry.available)
-    {
-      // These should be grouped together, and at the start.
-      ASSERT(copy_count == 0);
-      continue;
-    }
-
-    // If this wrapped around, we need to flush the entries before the end of the buffer.
-    ASSERT(entry.active);
-    if (index < copy_start_index)
-    {
-      QueueCopyQueryResults(copy_start_index, copy_count);
-      copy_start_index = index;
-      copy_count = 0;
-    }
-    else if (copy_count == 0)
-    {
-      copy_start_index = index;
-    }
-    copy_count++;
-  }
-
-  if (copy_count > 0)
-    QueueCopyQueryResults(copy_start_index, copy_count);
-}
-
-void PerfQuery::ProcessPendingResults()
-{
-  const u64 completed_fence_counter = g_command_buffer_mgr->GetCurrentFenceCounter();
+  const u64 completed_fence_counter = g_command_buffer_mgr->GetCompletedFenceCounter();
 
-  // Need to save these since ProcessResults will modify them.
-  u32 query_read_pos = m_query_read_pos;
-  u32 query_count = m_query_count;
-
-  // Flush as many queries as are bound to this fence.
-  u32 flush_start_index = 0;
-  u32 flush_count = 0;
-  for (u32 i = 0; i < query_count; i++)
+  // Snapshot the count, since ReadbackQueries(query_count) modifies m_query_count.
+  const u32 outstanding_queries = m_query_count;
+  u32 readback_count = 0;
+  for (u32 i = 0; i < outstanding_queries; i++)
   {
-    u32 index = (query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
-    if (m_query_buffer[index].fence_counter > completed_fence_counter)
-    {
-      // These should be grouped together, at the end.
+    u32 index = (m_query_readback_pos + readback_count) % PERF_QUERY_BUFFER_SIZE;
+    const ActiveQuery& entry = m_query_buffer[index];
+    if (entry.fence_counter > completed_fence_counter)
       break;
-    }
 
     // If this wrapped around, we need to flush the entries before the end of the buffer.
-    if (index < flush_start_index)
+    if (index < m_query_readback_pos)
     {
-      ProcessResults(flush_start_index, flush_count);
-      flush_start_index = index;
-      flush_count = 0;
+      ReadbackQueries(readback_count);
+      DEBUG_ASSERT(m_query_readback_pos == 0);
+      readback_count = 0;
     }
-    else if (flush_count == 0)
-    {
-      flush_start_index = index;
-    }
-    flush_count++;
+
+    readback_count++;
   }
 
-  if (flush_count > 0)
-    ProcessResults(flush_start_index, flush_count);
+  if (readback_count > 0)
+    ReadbackQueries(readback_count);
 }
 
-void PerfQuery::ProcessResults(u32 start_index, u32 query_count)
+void PerfQuery::ReadbackQueries(u32 query_count)
 {
-  // Invalidate CPU caches before reading back.
-  m_readback_buffer->InvalidateCPUCache(start_index * sizeof(PerfQueryDataType),
-                                        query_count * sizeof(PerfQueryDataType));
-
   // Should be at maximum query_count queries pending.
-  ASSERT(query_count <= m_query_count);
-  DEBUG_LOG(VIDEO, "process queries %u-%u", start_index, start_index + query_count - 1);
+  ASSERT(query_count <= m_query_count &&
+         (m_query_readback_pos + query_count) <= PERF_QUERY_BUFFER_SIZE);
+
+  // Read back from the GPU.
+  VkResult res =
+      vkGetQueryPoolResults(g_vulkan_context->GetDevice(), m_query_pool, m_query_readback_pos,
+                            query_count, query_count * sizeof(PerfQueryDataType),
+                            m_query_result_buffer.data(), sizeof(PerfQueryDataType), 0);
+  if (res != VK_SUCCESS)
+    LOG_VULKAN_ERROR(res, "vkGetQueryPoolResults failed: ");
+
+  // Slots are reused once read back, and a query must be reset before it can begin again.
+  // vkCmdResetQueryPool is not allowed inside a render pass, so end the current one first.
+  StateTracker::GetInstance()->EndRenderPass();
+  vkCmdResetQueryPool(g_command_buffer_mgr->GetCurrentCommandBuffer(), m_query_pool,
+                      m_query_readback_pos, query_count);
 
   // Remove pending queries.
   for (u32 i = 0; i < query_count; i++)
   {
-    u32 index = (m_query_read_pos + i) % PERF_QUERY_BUFFER_SIZE;
+    u32 index = (m_query_readback_pos + i) % PERF_QUERY_BUFFER_SIZE;
     ActiveQuery& entry = m_query_buffer[index];
 
     // Should have a fence associated with it (waiting for a result).
-    ASSERT(entry.fence_counter != 0);
+    DEBUG_ASSERT(entry.fence_counter != 0);
     entry.fence_counter = 0;
-    entry.available = false;
-    entry.active = false;
-
-    // Grab result from readback buffer, it will already have been invalidated.
-    u32 result;
-    m_readback_buffer->Read(index * sizeof(PerfQueryDataType), &result, sizeof(result), false);
-    DEBUG_LOG(VIDEO, "  query result %u", result);
+    entry.has_value = false;
 
     // NOTE: Reported pixel metrics should be referenced to native resolution
     m_results[entry.query_type] +=
-        static_cast<u32>(static_cast<u64>(result) * EFB_WIDTH / g_renderer->GetTargetWidth() *
-                         EFB_HEIGHT / g_renderer->GetTargetHeight());
+        static_cast<u32>(static_cast<u64>(m_query_result_buffer[i]) * EFB_WIDTH /
+                         g_renderer->GetTargetWidth() * EFB_HEIGHT / g_renderer->GetTargetHeight());
   }
 
-  m_query_read_pos = (m_query_read_pos + query_count) % PERF_QUERY_BUFFER_SIZE;
+  m_query_readback_pos = (m_query_readback_pos + query_count) % PERF_QUERY_BUFFER_SIZE;
   m_query_count -= query_count;
 }
 
-void PerfQuery::NonBlockingPartialFlush()
+void PerfQuery::PartialFlush(bool blocking)
 {
-  if (IsFlushed())
-    return;
-
-  // Submit a command buffer in the background if the front query is not bound to one.
-  ActiveQuery& entry = m_query_buffer[m_query_read_pos];
-  if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter())
-    Renderer::GetInstance()->ExecuteCommandBuffer(true, false);
+  // Kick the command buffer when blocking, or when the front query is still in the current one.
+  if (blocking || m_query_buffer[m_query_readback_pos].fence_counter ==
+                      g_command_buffer_mgr->GetCurrentFenceCounter())
+  {
+    Renderer::GetInstance()->ExecuteCommandBuffer(true, blocking);
+  }
 
-  ProcessPendingResults();
-}
-
-void PerfQuery::BlockingPartialFlush()
-{
-  if (IsFlushed())
-    return;
-
-  // If the first pending query is needing command buffer execution, do that.
-  ActiveQuery& entry = m_query_buffer[m_query_read_pos];
-  if (entry.fence_counter == g_command_buffer_mgr->GetCurrentFenceCounter())
-    Renderer::GetInstance()->ExecuteCommandBuffer(false, true);
-
-  ProcessPendingResults();
+  ReadbackQueries();
 }
 }  // namespace Vulkan
diff --git a/Source/Core/VideoBackends/Vulkan/PerfQuery.h b/Source/Core/VideoBackends/Vulkan/PerfQuery.h
index facbe0dc0f..8ca91ac658 100644
--- a/Source/Core/VideoBackends/Vulkan/PerfQuery.h
+++ b/Source/Core/VideoBackends/Vulkan/PerfQuery.h
@@ -13,18 +13,15 @@
 
 namespace Vulkan
 {
-class StagingBuffer;
-
 class PerfQuery : public PerfQueryBase
 {
 public:
   PerfQuery();
   ~PerfQuery();
 
-  static PerfQuery* GetInstance();
+  static PerfQuery* GetInstance() { return static_cast<PerfQuery*>(g_perf_query.get()); }
 
   bool Initialize();
-  void FlushQueries();
 
   void EnableQuery(PerfQueryGroup type) override;
   void DisableQuery(PerfQueryGroup type) override;
@@ -34,35 +31,30 @@ public:
   bool IsFlushed() const override;
 
 private:
+  // u32 is used for the sample counts.
+  using PerfQueryDataType = u32;
+
+  // when testing in SMS: 64 was too small, 128 was ok
+  // TODO: This should be size_t, but the base class uses u32s
+  static const u32 PERF_QUERY_BUFFER_SIZE = 512;
+
   struct ActiveQuery
   {
     u64 fence_counter;
     PerfQueryType query_type;
-    bool available;
-    bool active;
+    bool has_value;
   };
 
   bool CreateQueryPool();
-  bool CreateReadbackBuffer();
-  void QueueCopyQueryResults(u32 start_index, u32 query_count);
-  void ProcessPendingResults();
-  void ProcessResults(u32 start_index, u32 query_count);
+  void ReadbackQueries();
+  void ReadbackQueries(u32 query_count);
+  void PartialFlush(bool blocking);
 
-  void NonBlockingPartialFlush();
-  void BlockingPartialFlush();
-
-  // when testing in SMS: 64 was too small, 128 was ok
-  // TODO: This should be size_t, but the base class uses u32s
-  using PerfQueryDataType = u32;
-  static const u32 PERF_QUERY_BUFFER_SIZE = 512;
-  std::array<ActiveQuery, PERF_QUERY_BUFFER_SIZE> m_query_buffer = {};
-  u32 m_query_read_pos = 0;
-
-  // TODO: Investigate using pipeline statistics to implement other query types
   VkQueryPool m_query_pool = VK_NULL_HANDLE;
-
-  // Buffer containing query results. Each query is a u32.
-  std::unique_ptr<StagingBuffer> m_readback_buffer;
+  u32 m_query_readback_pos = 0;
+  u32 m_query_next_pos = 0;
+  std::array<ActiveQuery, PERF_QUERY_BUFFER_SIZE> m_query_buffer = {};
+  std::array<PerfQueryDataType, PERF_QUERY_BUFFER_SIZE> m_query_result_buffer = {};
 };
 
 }  // namespace Vulkan
diff --git a/Source/Core/VideoBackends/Vulkan/Renderer.cpp b/Source/Core/VideoBackends/Vulkan/Renderer.cpp
index a4db679ecb..fc07c4ce41 100644
--- a/Source/Core/VideoBackends/Vulkan/Renderer.cpp
+++ b/Source/Core/VideoBackends/Vulkan/Renderer.cpp
@@ -304,7 +304,6 @@ void Renderer::PresentBackbuffer()
 {
   // End drawing to backbuffer
   StateTracker::GetInstance()->EndRenderPass();
-  PerfQuery::GetInstance()->FlushQueries();
 
   // Transition the backbuffer to PRESENT_SRC to ensure all commands drawing
   // to it have finished before present.
@@ -325,7 +324,6 @@ void Renderer::PresentBackbuffer()
 void Renderer::ExecuteCommandBuffer(bool submit_off_thread, bool wait_for_completion)
 {
   StateTracker::GetInstance()->EndRenderPass();
-  PerfQuery::GetInstance()->FlushQueries();
 
   g_command_buffer_mgr->SubmitCommandBuffer(submit_off_thread, wait_for_completion);