rsx: Fix offloader deadlock

- Do not allow offloader to handle its own faults. Serialize them on RSX instead.
  This approach introduces a GPU race condition that should be avoided with improved synchronization.
- TODO: Use proper GPU-side synchronization to avoid this situation
Author: kd-11 (2019-08-25 18:47:49 +03:00), committed by kd-11
parent b70908c8f3
commit 9d981de96d
8 changed files with 183 additions and 72 deletions
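In outline, the fix turns a page fault raised on the offloader thread into a request that the RSX thread services from its local-task loop, instead of letting the offloader flush caches itself. Below is a minimal sketch of that handshake, with std::atomic standing in for RPCS3's atomic types; offloader_report_fault and rsx_drain_faults are illustrative names, not functions from the codebase.

#include <atomic>
#include <cstdint>

// Illustrative stand-ins for the real members (m_offloader_fault_range/cause
// and the flush_queue_state::deadlock bit in m_queue_status).
struct fault_info { std::uint32_t start = 0, length = 0; bool writing = false; };

static fault_info        pending_fault;
static std::atomic<bool> offloader_deadlock{ false };

// Offloader side: it cannot service its own fault, so it publishes the
// faulting range and parks until the RSX thread has dealt with it.
void offloader_report_fault(const fault_info& f)
{
    pending_fault = f;                                         // published before the flag is raised
    offloader_deadlock.store(true, std::memory_order_release);
    while (offloader_deadlock.load(std::memory_order_acquire))
    {
        // spin; the real code also sets dma_manager's mem-fault flag so
        // sync() callers do not wait on the parked offloader
    }
}

// RSX side: polled from do_local_task()/on_semaphore_acquire_wait().
void rsx_drain_faults()
{
    if (offloader_deadlock.load(std::memory_order_acquire))
    {
        // flush the command queue, invalidate pending_fault's range, then release
        offloader_deadlock.store(false, std::memory_order_release);
    }
}

The race the commit message warns about stems from the RSX thread invalidating the faulting range without waiting for GPU work that may still reference it (the diff itself notes possible graphics corruption from unsynchronized modification); the TODO is to replace this with GPU-side synchronization.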

GLGSRender.cpp

@@ -895,7 +895,6 @@ void GLGSRender::on_init_thread()
m_video_output_pass.create();
m_gl_texture_cache.initialize();
m_thread_id = std::this_thread::get_id();
if (!supports_native_ui)
{
@@ -1821,7 +1820,7 @@ void GLGSRender::flip(int buffer, bool emu_flip)
bool GLGSRender::on_access_violation(u32 address, bool is_writing)
{
const bool can_flush = (std::this_thread::get_id() == m_thread_id);
const bool can_flush = (std::this_thread::get_id() == m_rsx_thread);
const rsx::invalidation_cause cause =
is_writing ? (can_flush ? rsx::invalidation_cause::write : rsx::invalidation_cause::deferred_write)
: (can_flush ? rsx::invalidation_cause::read : rsx::invalidation_cause::deferred_read);
@@ -1848,14 +1847,13 @@ bool GLGSRender::on_access_violation(u32 address, bool is_writing)
return true;
}
void GLGSRender::on_invalidate_memory_range(const utils::address_range &range)
void GLGSRender::on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause)
{
//Discard all memory in that range without bothering with writeback (Force it for strict?)
gl::command_context cmd{ gl_state };
auto data = std::move(m_gl_texture_cache.invalidate_range(cmd, range, rsx::invalidation_cause::unmap));
auto data = std::move(m_gl_texture_cache.invalidate_range(cmd, range, cause));
AUDIT(data.empty());
if (data.violation_handled)
if (cause == rsx::invalidation_cause::unmap && data.violation_handled)
{
m_gl_texture_cache.purge_unreleased_sections();
{
@@ -1865,6 +1863,14 @@ void GLGSRender::on_invalidate_memory_range(const utils::address_range &range)
}
}
void GLGSRender::on_semaphore_acquire_wait()
{
if (!work_queue.empty())
{
do_local_task(rsx::FIFO_state::lock_wait);
}
}
void GLGSRender::do_local_task(rsx::FIFO_state state)
{
if (!work_queue.empty())

GLGSRender.h

@@ -327,8 +327,6 @@ private:
shared_mutex queue_guard;
std::list<work_item> work_queue;
std::thread::id m_thread_id;
GLProgramBuffer m_prog_buffer;
draw_context_t m_decompiler_context;
@@ -397,8 +395,9 @@ protected:
void do_local_task(rsx::FIFO_state state) override;
bool on_access_violation(u32 address, bool is_writing) override;
void on_invalidate_memory_range(const utils::address_range &range) override;
void on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause) override;
void notify_tile_unbound(u32 tile) override;
void on_semaphore_acquire_wait() override;
std::array<std::vector<gsl::byte>, 4> copy_render_targets_to_memory() override;
std::array<std::vector<gsl::byte>, 2> copy_depth_stencil_buffer_to_memory() override;

RSXOffload.cpp

@@ -3,6 +3,8 @@
#include "Common/BufferUtils.h"
#include "Emu/System.h"
#include "RSXOffload.h"
#include "RSXThread.h"
#include "rsx_utils.h"
#include <thread>
#include <atomic>
@@ -27,6 +29,9 @@ namespace rsx
return;
}
// Register thread id
m_thread_id = std::this_thread::get_id();
if (g_cfg.core.thread_scheduler_enabled)
{
thread_ctrl::set_thread_affinity_mask(thread_ctrl::get_affinity_mask(thread_class::rsx));
@@ -36,22 +41,21 @@
{
if (m_enqueued_count.load() != m_processed_count)
{
for (auto slice = m_work_queue.pop_all(); slice; slice.pop_front())
for (m_current_job = m_work_queue.pop_all(); m_current_job; m_current_job.pop_front())
{
auto task = *slice;
switch (task.type)
switch (m_current_job->type)
{
case raw_copy:
memcpy(task.dst, task.src, task.length);
memcpy(m_current_job->dst, m_current_job->src, m_current_job->length);
break;
case vector_copy:
memcpy(task.dst, task.opt_storage.data(), task.length);
memcpy(m_current_job->dst, m_current_job->opt_storage.data(), m_current_job->length);
break;
case index_emulate:
write_index_array_for_non_indexed_non_native_primitive_to_buffer(
reinterpret_cast<char*>(task.dst),
static_cast<rsx::primitive_type>(task.aux_param0),
task.length);
reinterpret_cast<char*>(m_current_job->dst),
static_cast<rsx::primitive_type>(m_current_job->aux_param0),
m_current_job->length);
break;
default:
ASSUME(0);
@@ -116,6 +120,11 @@ namespace rsx
}
// Synchronization
bool dma_manager::is_current_thread() const
{
return (std::this_thread::get_id() == m_thread_id);
}
void dma_manager::sync()
{
if (LIKELY(m_enqueued_count.load() == m_processed_count))
@@ -124,8 +133,25 @@ namespace rsx
return;
}
while (m_enqueued_count.load() != m_processed_count)
_mm_pause();
if (auto rsxthr = get_current_renderer(); rsxthr->is_current_thread())
{
if (m_mem_fault_flag)
{
// Abort if offloader is in recovery mode
return;
}
while (m_enqueued_count.load() != m_processed_count)
{
rsxthr->on_semaphore_acquire_wait();
_mm_pause();
}
}
else
{
while (m_enqueued_count.load() != m_processed_count)
_mm_pause();
}
}
void dma_manager::join()
@@ -133,4 +159,50 @@ namespace rsx
m_worker_state = thread_state::finished;
sync();
}
void dma_manager::set_mem_fault_flag()
{
verify("Access denied" HERE), is_current_thread();
m_mem_fault_flag.release(true);
}
void dma_manager::clear_mem_fault_flag()
{
verify("Access denied" HERE), is_current_thread();
m_mem_fault_flag.release(false);
}
// Fault recovery
utils::address_range dma_manager::get_fault_range(bool writing) const
{
verify(HERE), m_current_job;
void *address = nullptr;
u32 range = m_current_job->length;
switch (m_current_job->type)
{
case raw_copy:
address = (writing) ? m_current_job->dst : m_current_job->src;
break;
case vector_copy:
verify(HERE), writing;
address = m_current_job->dst;
break;
case index_emulate:
verify(HERE), writing;
address = m_current_job->dst;
range = get_index_count(static_cast<rsx::primitive_type>(m_current_job->aux_param0), m_current_job->length);
break;
default:
ASSUME(0);
fmt::throw_exception("Unreachable" HERE);
}
const uintptr_t addr = uintptr_t(address);
const uintptr_t base = uintptr_t(vm::g_base_addr);
verify(HERE), addr > base;
return utils::address_range::start_length(u32(addr - base), range);
}
}
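get_fault_range() above recovers the guest address of the faulting job from the host pointer stored in the work item by subtracting the base of the emulated memory mirror. A minimal sketch of that translation; host_to_guest and vm_base are illustrative stand-ins for the logic above and vm::g_base_addr:

#include <cstdint>

// Maps a host pointer inside the emulated memory mirror back to a 32-bit
// guest address, the same arithmetic get_fault_range() performs.
inline std::uint32_t host_to_guest(const void* host_ptr, const void* vm_base)
{
    const auto addr = reinterpret_cast<std::uintptr_t>(host_ptr);
    const auto base = reinterpret_cast<std::uintptr_t>(vm_base);
    // The real code verifies addr > base before converting.
    return static_cast<std::uint32_t>(addr - base);
}

For raw_copy jobs the faulting pointer is dst when writing and src otherwise; vector_copy and index_emulate jobs can only fault on their destination, and index_emulate derives the range length from the emulated index count rather than the stored length.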

RSXOffload.h

@@ -3,9 +3,11 @@
#include "Utilities/types.h"
#include "Utilities/lockless.h"
#include "Utilities/Thread.h"
#include "Utilities/address_range.h"
#include "gcm_enums.h"
#include <vector>
#include <thread>
namespace rsx
{
@@ -42,9 +44,12 @@ namespace rsx
};
lf_queue<transport_packet> m_work_queue;
lf_queue_slice<transport_packet> m_current_job;
atomic_t<u64> m_enqueued_count{ 0 };
volatile u64 m_processed_count = 0;
thread_state m_worker_state = thread_state::detached;
std::thread::id m_thread_id;
atomic_t<bool> m_mem_fault_flag{ false };
// TODO: Improved benchmarks here; value determined by profiling on a Ryzen CPU, rounded to the nearest 512 bytes
const u32 max_immediate_transfer_size = 3584;
@@ -63,8 +68,14 @@ namespace rsx
void emulate_as_indexed(void *dst, rsx::primitive_type primitive, u32 count);
// Synchronization
bool is_current_thread() const;
void sync();
void join();
void set_mem_fault_flag();
void clear_mem_fault_flag();
// Fault recovery
utils::address_range get_fault_range(bool writing) const;
};
extern dma_manager g_dma_manager;
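The header above declares the new fault-handling surface (is_current_thread(), set/clear_mem_fault_flag(), get_fault_range()); together with it, the sync() change in RSXOffload.cpp makes the wait depend on who is waiting: the RSX thread keeps pumping its local tasks, and aborts outright if the offloader is itself parked on a fault. A simplified model of that logic, with std::atomic standing in for atomic_t and pump_local_tasks for on_semaphore_acquire_wait()/do_local_task():

#include <atomic>
#include <cstdint>
#include <functional>

// Simplified model of dma_manager::sync() after this commit; not the real class.
struct dma_sync_model
{
    std::atomic<std::uint64_t> enqueued{ 0 };
    std::atomic<std::uint64_t> processed{ 0 };
    std::atomic<bool> mem_fault{ false };    // true while the offloader is parked on a fault

    void sync(bool is_rsx_thread, const std::function<void()>& pump_local_tasks)
    {
        if (is_rsx_thread)
        {
            if (mem_fault.load())
            {
                return;                      // offloader is waiting on us; spinning here would deadlock
            }
            while (enqueued.load() != processed.load())
            {
                pump_local_tasks();          // service faults/flush requests while waiting
            }
        }
        else
        {
            while (enqueued.load() != processed.load())
            {
                // plain spin for non-RSX callers (the real code uses _mm_pause())
            }
        }
    }
};

This is the other half of the deadlock fix: without the early return, the RSX thread could block in sync() waiting for an offloader that is itself blocked waiting for the RSX thread.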

RSXThread.cpp

@@ -917,27 +917,6 @@ namespace rsx
fmt::throw_exception("ill-formed draw command" HERE);
}
void thread::do_internal_task()
{
if (m_internal_tasks.empty())
{
std::this_thread::yield();
}
else
{
fmt::throw_exception("Disabled" HERE);
//std::lock_guard lock(m_mtx_task);
//internal_task_entry &front = m_internal_tasks.front();
//if (front.callback())
//{
// front.promise.set_value();
// m_internal_tasks.pop_front();
//}
}
}
void thread::do_local_task(FIFO_state state)
{
if (async_flip_requested & flip_request::emu_requested)
@@ -2465,7 +2444,7 @@ namespace rsx
if (!m_invalidated_memory_range.valid())
return;
on_invalidate_memory_range(m_invalidated_memory_range);
on_invalidate_memory_range(m_invalidated_memory_range, rsx::invalidation_cause::unmap);
m_invalidated_memory_range.invalidate();
}

RSXThread.h

@@ -12,6 +12,7 @@
#include "rsx_methods.h"
#include "rsx_utils.h"
#include "Overlays/overlays.h"
#include "Common/texture_cache_utils.h"
#include "Utilities/Thread.h"
#include "Utilities/geometry.h"
@@ -418,8 +419,8 @@ namespace rsx
protected:
std::thread::id m_rsx_thread;
atomic_t<bool> m_rsx_thread_exiting{true};
s32 m_return_addr{-1}, restore_ret{-1};
atomic_t<bool> m_rsx_thread_exiting{ true };
std::array<push_buffer_vertex_info, 16> vertex_push_buffers;
std::vector<u32> element_push_buffer;
@@ -433,6 +434,7 @@ namespace rsx
// FIFO
std::unique_ptr<FIFO::FIFO_control> fifo_ctrl;
FIFO::flattening_helper m_flattener;
s32 m_return_addr{ -1 }, restore_ret{ -1 };
// Occlusion query
bool zcull_surface_active = false;
@@ -605,7 +607,7 @@ namespace rsx
virtual void flip(int buffer, bool emu_flip = false) = 0;
virtual u64 timestamp();
virtual bool on_access_violation(u32 /*address*/, bool /*is_writing*/) { return false; }
virtual void on_invalidate_memory_range(const address_range & /*range*/) {}
virtual void on_invalidate_memory_range(const address_range & /*range*/, rsx::invalidation_cause) {}
virtual void notify_tile_unbound(u32 /*tile*/) {}
// zcull
@@ -661,18 +663,6 @@ namespace rsx
private:
shared_mutex m_mtx_task;
struct internal_task_entry
{
std::function<bool()> callback;
//std::promise<void> promise;
internal_task_entry(std::function<bool()> callback) : callback(std::move(callback))
{
}
};
std::deque<internal_task_entry> m_internal_tasks;
void do_internal_task();
void handle_emu_flip(u32 buffer);
void handle_invalidated_memory_range();
@@ -732,7 +722,7 @@ namespace rsx
/**
* Notify to check internal state during semaphore wait
*/
void on_semaphore_acquire_wait() { do_local_task(FIFO_state::lock_wait); }
virtual void on_semaphore_acquire_wait() {}
/**
* Copy rtt values to buffer.
@@ -767,7 +757,10 @@ namespace rsx
void pause();
void unpause();
//Get RSX approximate load in %
// Get RSX approximate load in %
u32 get_load();
// Returns true if the current thread is the active RSX thread
bool is_current_thread() const { return std::this_thread::get_id() == m_rsx_thread; }
};
}

VKGSRender.cpp

@@ -662,10 +662,29 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
if (result.num_flushable > 0)
{
const bool is_rsxthr = std::this_thread::get_id() == m_rsx_thread;
bool has_queue_ref = false;
if (rsx::g_dma_manager.is_current_thread())
{
// The offloader thread cannot handle flush requests
verify(HERE), m_queue_status.load() == flush_queue_state::ok;
if (!is_rsxthr)
m_offloader_fault_range = rsx::g_dma_manager.get_fault_range(is_writing);
m_offloader_fault_cause = (is_writing) ? rsx::invalidation_cause::write : rsx::invalidation_cause::read;
rsx::g_dma_manager.set_mem_fault_flag();
m_queue_status |= flush_queue_state::deadlock;
// Wait for deadlock to clear
while (m_queue_status & flush_queue_state::deadlock)
{
_mm_pause();
}
rsx::g_dma_manager.clear_mem_fault_flag();
return true;
}
bool has_queue_ref = false;
if (!is_current_thread())
{
//Always submit primary cb to ensure state consistency (flush pending changes such as image transitions)
vm::temporary_unlock();
@@ -703,14 +722,14 @@ bool VKGSRender::on_access_violation(u32 address, bool is_writing)
return true;
}
void VKGSRender::on_invalidate_memory_range(const utils::address_range &range)
void VKGSRender::on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause)
{
std::lock_guard lock(m_secondary_cb_guard);
auto data = std::move(m_texture_cache.invalidate_range(m_secondary_command_buffer, range, rsx::invalidation_cause::unmap));
auto data = std::move(m_texture_cache.invalidate_range(m_secondary_command_buffer, range, cause));
AUDIT(data.empty());
if (data.violation_handled)
if (cause == rsx::invalidation_cause::unmap && data.violation_handled)
{
m_texture_cache.purge_unreleased_sections();
{
@@ -720,6 +739,14 @@ void VKGSRender::on_invalidate_memory_range(const utils::address_range &range)
}
}
void VKGSRender::on_semaphore_acquire_wait()
{
if (m_flush_requests.pending() || m_queue_status & flush_queue_state::deadlock)
{
do_local_task(rsx::FIFO_state::lock_wait);
}
}
void VKGSRender::notify_tile_unbound(u32 tile)
{
//TODO: Handle texture writeback
@@ -2326,16 +2353,28 @@ void VKGSRender::frame_context_cleanup(frame_context_t *ctx, bool free_resources)
void VKGSRender::do_local_task(rsx::FIFO_state state)
{
if (m_queue_status & flush_queue_state::deadlock)
{
// Clear offloader deadlock
// NOTE: It is not possible to handle regular flush requests before this is cleared
// NOTE: This may cause graphics corruption due to unsynchronized modification
flush_command_queue();
on_invalidate_memory_range(m_offloader_fault_range, m_offloader_fault_cause);
m_queue_status.clear(flush_queue_state::deadlock);
}
if (m_flush_requests.pending())
{
std::lock_guard lock(m_flush_queue_mutex);
if (m_flush_queue_mutex.try_lock())
{
// TODO: Determine if a hard sync is necessary
// Pipeline barriers later may do a better job synchronizing than wholly stalling the pipeline
flush_command_queue();
//TODO: Determine if a hard sync is necessary
//Pipeline barriers later may do a better job synchronizing than wholly stalling the pipeline
flush_command_queue();
m_flush_requests.clear_pending_flag();
m_flush_requests.consumer_wait();
m_flush_requests.clear_pending_flag();
m_flush_requests.consumer_wait();
m_flush_queue_mutex.unlock();
}
}
else if (!in_begin_end && state != rsx::FIFO_state::lock_wait)
{

VKGSRender.h

@@ -294,6 +294,12 @@ struct flush_request_task
}
};
enum flush_queue_state : u32
{
ok = 0,
deadlock = 1
};
class VKGSRender : public GSRender, public ::rsx::reports::ZCULL_control
{
private:
@@ -404,6 +410,11 @@ private:
shared_mutex m_flush_queue_mutex;
flush_request_task m_flush_requests;
// Offloader thread deadlock recovery
rsx::atomic_bitmask_t<flush_queue_state> m_queue_status;
utils::address_range m_offloader_fault_range;
rsx::invalidation_cause m_offloader_fault_cause;
bool m_render_pass_open = false;
u64 m_current_renderpass_key = 0;
VkRenderPass m_cached_renderpass = VK_NULL_HANDLE;
@@ -488,7 +499,8 @@ protected:
void notify_tile_unbound(u32 tile) override;
bool on_access_violation(u32 address, bool is_writing) override;
void on_invalidate_memory_range(const utils::address_range &range) override;
void on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause) override;
void on_semaphore_acquire_wait() override;
bool on_decompiler_task() override;
};