vk: Solve GPU hang/reset due to waiting on events that are never signaled

- TODO: Some refactoring may be required to pair the primary and secondary CB and avoid such blunders
This commit is contained in:
kd-11 2021-03-10 00:46:03 +03:00 committed by kd-11
parent 3e8a00d264
commit 608f8de347
3 changed files with 62 additions and 6 deletions

View File

@ -30,6 +30,17 @@ namespace vk
{
for (auto&& job : m_event_queue.pop_all())
{
if (job->type == xqueue_event_type::barrier)
{
// Blocks the queue from progressing until the work items are actually submitted to the GPU
// Avoids spamming the GPU with event requests when the events have not even been submitted yet
while (job->completion_eid == m_submit_count.load())
{
thread_ctrl::wait_for(100);
}
continue;
}
vk::wait_for_event(job->queue1_signal.get(), GENERAL_WAIT_TIMEOUT);
job->queue2_signal->host_signal();
}
@ -62,7 +73,12 @@ namespace vk
{
auto ev1 = std::make_unique<event>(*get_current_renderer(), sync_domain::gpu);
auto ev2 = std::make_unique<event>(*get_current_renderer(), sync_domain::gpu);
m_events_pool.emplace_back(ev1, ev2, 0ull);
m_events_pool.emplace_back(ev1, ev2, 0ull, i);
}
for (usz i = 0; i < VK_MAX_ASYNC_COMPUTE_QUEUES; ++i)
{
m_barriers_pool.emplace_back(0ull, 0xFFFF0000 + i);
}
}
@ -80,6 +96,7 @@ namespace vk
ensure(sync_label->completion_eid <= vk::last_completed_event_id());
m_sync_label_debug_uid = sync_label->uid;
sync_label->queue1_signal->reset();
sync_label->queue2_signal->reset();
sync_label->completion_eid = vk::current_event_id();
@ -143,6 +160,11 @@ namespace vk
}
}
// 3. Insert a barrier for this CB. A job is about to be scheduled on it immediately.
auto barrier = &m_barriers_pool[m_next_cb_index];
barrier->completion_eid = m_submit_count;
m_event_queue.push(barrier);
m_next_cb_index++;
return m_current_cb;
}
@ -160,6 +182,11 @@ namespace vk
return std::exchange(m_sync_label, nullptr);
}
u64 AsyncTaskScheduler::get_primary_sync_label_debug_uid()
{
	// Returns the debug UID of the last sync label handed to the primary queue,
	// then invalidates the stored value (all-ones sentinel) so the same UID is
	// never reported twice.
	const u64 result = m_sync_label_debug_uid;
	m_sync_label_debug_uid = ~0ull;
	return result;
}
void AsyncTaskScheduler::flush(VkBool32 force_flush, VkSemaphore wait_semaphore, VkPipelineStageFlags wait_dst_stage_mask)
{
if (!m_current_cb)
@ -176,6 +203,9 @@ namespace vk
m_current_cb->end();
m_current_cb->submit(get_current_renderer()->get_transfer_queue(), wait_semaphore, VK_NULL_HANDLE, nullptr, wait_dst_stage_mask, force_flush);
m_submit_count++;
thread_ctrl::notify(g_fxo->get<async_scheduler_thread>());
m_last_used_cb = m_current_cb;
m_current_cb = nullptr;
m_sync_required = false;

View File

@ -9,15 +9,32 @@
namespace vk
{
// Kind of work item flowing through the cross-queue event queue.
enum class xqueue_event_type
{
// A label carries a queue1/queue2 event pair: the consumer waits on the
// queue1 signal and then host-signals the queue2 event.
label,
// A barrier carries no events; it stalls the consumer thread until the
// matching command buffer has actually been submitted (completion_eid is
// compared against the submit counter), preventing waits on events that
// were never recorded/submitted.
barrier
};
// One entry of the cross-queue synchronization queue.
// NOTE(review): this span was garbled by the diff scrape — it contained both the
// pre- and post-commit lines (a duplicate `completion_eid` member and an orphaned
// old two-argument constructor with no body), which is ill-formed C++. Below is the
// post-commit struct reconstructed from the added lines of the same hunk.
struct xqueue_event
{
	// Type: distinguishes a label (event pair) from a submission barrier.
	xqueue_event_type type;

	// Payload: only meaningful for label events; barriers leave these null.
	std::unique_ptr<event> queue1_signal;
	std::unique_ptr<event> queue2_signal;

	// Identifiers
	u64 completion_eid; // For barriers: the submit count this entry gates on
	u64 uid;            // Debug identifier (barriers use 0xFFFF0000 + index)

	// Barrier constructor — no event payload.
	xqueue_event(u64 eid, u64 _uid)
		: type(xqueue_event_type::barrier), completion_eid(eid), uid(_uid)
	{}

	// Label constructor — takes ownership of the trigger/payload event pair.
	xqueue_event(std::unique_ptr<event>& trigger, std::unique_ptr<event>& payload, u64 eid, u64 _uid)
		: type(xqueue_event_type::label), queue1_signal(std::move(trigger)), queue2_signal(std::move(payload)),
		  completion_eid(eid), uid(_uid)
	{}
};
@ -31,6 +48,8 @@ namespace vk
command_buffer* m_last_used_cb = nullptr;
command_buffer* m_current_cb = nullptr;
usz m_next_cb_index = 0;
std::vector<xqueue_event> m_barriers_pool;
atomic_t<u64> m_submit_count = 0;
// Scheduler
shared_mutex m_config_mutex;
@ -40,6 +59,7 @@ namespace vk
// Sync
event* m_sync_label = nullptr;
atomic_t<bool> m_sync_required = false;
u64 m_sync_label_debug_uid = 0;
static constexpr u32 events_pool_size = 16384;
std::vector<xqueue_event> m_events_pool;
@ -58,6 +78,7 @@ namespace vk
command_buffer* get_current();
event* get_primary_sync_label();
u64 get_primary_sync_label_debug_uid();
void flush(VkBool32 force_flush, VkSemaphore wait_semaphore = VK_NULL_HANDLE, VkPipelineStageFlags wait_dst_stage_mask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT);
void kill();

View File

@ -1,8 +1,9 @@
#pragma once
#include "VKAsyncScheduler.h"
#include "VKDMA.h"
#include "VKRenderTargets.h"
#include "VKResourceManager.h"
#include "VKDMA.h"
#include "vkutils/image_helpers.h"
#include "../Common/texture_cache.h"
@ -1062,6 +1063,10 @@ namespace vk
if (cmd.access_hint != vk::command_buffer::access_type_hint::all)
{
// Flush any pending async jobs in case of blockers
// TODO: Context-level manager should handle this logic
g_fxo->get<async_scheduler_thread>().flush(VK_TRUE);
// Primary access command queue, must restart it after
vk::fence submit_fence(*m_device);
cmd.submit(m_submit_queue, VK_NULL_HANDLE, VK_NULL_HANDLE, &submit_fence, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_TRUE);