From 2dce55d036ed982f74a7583993e6f2639fa4ae30 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Tue, 13 Mar 2018 16:34:31 +0300 Subject: [PATCH] rsx: ZCULL synchronization fixes - Track asynchronous operations in RSX core - Add read barriers to force pending writes to finish. Fixes zcull delay flicker in all UE3 titles without forcing hard stall - Increase zcull latency as all writes should be synchronized now --- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 5 +++ rpcs3/Emu/RSX/GL/GLGSRender.h | 1 + rpcs3/Emu/RSX/RSXThread.cpp | 76 ++++++++++++++++++++++++++------- rpcs3/Emu/RSX/RSXThread.h | 17 +++++--- rpcs3/Emu/RSX/gcm_printing.cpp | 2 +- rpcs3/Emu/RSX/rsx_methods.cpp | 10 ++++- 6 files changed, 87 insertions(+), 24 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 909ad66bdf..ed6990976e 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -1574,6 +1574,11 @@ void GLGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info* } } +void GLGSRender::discard_occlusion_query(rsx::reports::occlusion_query_info* query) +{ + glEndQuery(GL_ANY_SAMPLES_PASSED); +} + void GLGSRender::shell_do_cleanup() { //TODO: Key cleanup requests with UID to identify resources to remove diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h index c853d5fded..d2f9bf08bf 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.h +++ b/rpcs3/Emu/RSX/GL/GLGSRender.h @@ -367,6 +367,7 @@ public: void end_occlusion_query(rsx::reports::occlusion_query_info* query) override; bool check_occlusion_query_status(rsx::reports::occlusion_query_info* query) override; void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override; + void discard_occlusion_query(rsx::reports::occlusion_query_info* query) override; protected: void begin() override; diff --git a/rpcs3/Emu/RSX/RSXThread.cpp b/rpcs3/Emu/RSX/RSXThread.cpp index e5f8008569..ae44525774 100644 --- a/rpcs3/Emu/RSX/RSXThread.cpp +++ b/rpcs3/Emu/RSX/RSXThread.cpp @@ -2092,7 +2092,7 @@ namespace rsx //Reset zcull ctrl zcull_ctrl->set_active(this, false); - zcull_ctrl->clear(); + zcull_ctrl->clear(this); if (zcull_ctrl->has_pending()) { @@ -2142,7 +2142,7 @@ namespace rsx if (g_cfg.video.disable_zcull_queries) return; - zcull_ctrl->clear(); + zcull_ctrl->clear(this); } void thread::get_zcull_stats(u32 type, vm::addr_t sink) @@ -2153,18 +2153,13 @@ namespace rsx switch (type) { case CELL_GCM_ZPASS_PIXEL_CNT: - { - zcull_ctrl->read_report(this, sink, type); - return; - } case CELL_GCM_ZCULL_STATS: case CELL_GCM_ZCULL_STATS1: case CELL_GCM_ZCULL_STATS2: case CELL_GCM_ZCULL_STATS3: { - //TODO - value = (type != CELL_GCM_ZCULL_STATS3)? UINT16_MAX : 0; - break; + zcull_ctrl->read_report(this, sink, type); + return; } default: LOG_ERROR(RSX, "Unknown zcull stat type %d", type); @@ -2181,6 +2176,14 @@ namespace rsx void thread::sync() { zcull_ctrl->sync(this); + + _mm_mfence(); + verify (HERE), async_tasks_pending.load() == 0; + } + + void thread::read_barrier(u32 memory_address, u32 memory_range) + { + zcull_ctrl->read_barrier(this, memory_address, memory_range); } void thread::notify_zcull_info_changed() @@ -2328,6 +2331,7 @@ namespace rsx m_pending_writes.push_back({}); m_pending_writes.back().query = m_current_task; + ptimer->async_tasks_pending++; } else { @@ -2342,7 +2346,7 @@ namespace rsx void ZCULL_control::read_report(::rsx::thread* ptimer, vm::addr_t sink, u32 type) { - if (m_current_task) + if (m_current_task && type == CELL_GCM_ZPASS_PIXEL_CNT) { m_current_task->owned = true; end_occlusion_query(m_current_task); @@ -2384,6 +2388,8 @@ namespace rsx break; } + + ptimer->async_tasks_pending++; } void ZCULL_control::allocate_new_query(::rsx::thread* ptimer) @@ -2436,7 +2442,7 @@ namespace rsx } } - void ZCULL_control::clear() + void ZCULL_control::clear(class ::rsx::thread* ptimer) { if (!m_pending_writes.empty()) { @@ -2449,6 +2455,7 @@ namespace rsx discard_occlusion_query(It->query); It->query->pending = false; valid_size--; + ptimer->async_tasks_pending--; continue; } @@ -2470,9 +2477,27 @@ namespace rsx m_cycles_delay = max_zcull_cycles_delay; } - void ZCULL_control::write(vm::addr_t sink, u32 timestamp, u32 value) + void ZCULL_control::write(vm::addr_t sink, u32 timestamp, u32 type, u32 value) { verify(HERE), sink; + + switch (type) + { + case CELL_GCM_ZPASS_PIXEL_CNT: + value = value ? UINT16_MAX : 0; + break; + case CELL_GCM_ZCULL_STATS3: + value = value ? 0 : UINT16_MAX; + break; + case CELL_GCM_ZCULL_STATS2: + case CELL_GCM_ZCULL_STATS1: + case CELL_GCM_ZCULL_STATS: + default: + //Not implemented + value = UINT32_MAX; + break; + } + vm::ptr out = sink; out->value = value; out->timer = timestamp; @@ -2520,7 +2545,7 @@ namespace rsx if (!writer.forwarder) //No other queries in the chain, write result - write(writer.sink, ptimer->timestamp(), result ? UINT16_MAX : 0); + write(writer.sink, ptimer->timestamp(), writer.type, result); processed++; } @@ -2555,10 +2580,13 @@ namespace rsx else It = m_statistics_map.erase(It); } + + //Decrement jobs counter + ptimer->async_tasks_pending -= processed; } //Critical, since its likely a WAIT_FOR_IDLE type has been processed, all results are considered available - m_cycles_delay = 2; + m_cycles_delay = min_zcull_cycles_delay; } void ZCULL_control::update(::rsx::thread* ptimer) @@ -2644,7 +2672,7 @@ namespace rsx //only zpass supported right now if (!writer.forwarder) //No other queries in the chain, write result - write(writer.sink, ptimer->timestamp(), result ? UINT16_MAX : 0); + write(writer.sink, ptimer->timestamp(), writer.type, result); processed++; } @@ -2669,6 +2697,24 @@ namespace rsx { m_pending_writes.resize(0); } + + ptimer->async_tasks_pending -= processed; + } + } + + void ZCULL_control::read_barrier(::rsx::thread* ptimer, u32 memory_address, u32 memory_range) + { + if (m_pending_writes.empty()) + return; + + const auto memory_end = memory_address + memory_range; + for (const auto &writer : m_pending_writes) + { + if (writer.sink >= memory_address && writer.sink < memory_end) + { + sync(ptimer); + return; + } } } } diff --git a/rpcs3/Emu/RSX/RSXThread.h b/rpcs3/Emu/RSX/RSXThread.h index b82de85e06..73cab9c067 100644 --- a/rpcs3/Emu/RSX/RSXThread.h +++ b/rpcs3/Emu/RSX/RSXThread.h @@ -185,9 +185,8 @@ namespace rsx struct ZCULL_control { //Delay in 'cycles' before a report update operation is forced to retire - //Larger values might give more performance but some engines (UE3) dont seem to wait for results and will flicker - //TODO: Determine the real max delay in real hardware - const u32 max_zcull_cycles_delay = 10; + const u32 max_zcull_cycles_delay = 128; + const u32 min_zcull_cycles_delay = 16; //Number of occlusion query slots available. Real hardware actually has far fewer units before choking const u32 occlusion_query_count = 128; @@ -200,7 +199,7 @@ namespace rsx occlusion_query_info* m_current_task = nullptr; u32 m_statistics_tag_id = 0; u32 m_tsc = 0; - u32 m_cycles_delay = 10; + u32 m_cycles_delay = max_zcull_cycles_delay; std::vector m_pending_writes; std::unordered_map m_statistics_map; @@ -211,7 +210,7 @@ namespace rsx void set_enabled(class ::rsx::thread* ptimer, bool enabled); void set_active(class ::rsx::thread* ptimer, bool active); - void write(vm::addr_t sink, u32 timestamp, u32 value); + void write(vm::addr_t sink, u32 timestamp, u32 type, u32 value); //Read current zcull statistics into the address provided void read_report(class ::rsx::thread* ptimer, vm::addr_t sink, u32 type); @@ -220,11 +219,14 @@ namespace rsx void allocate_new_query(class ::rsx::thread* ptimer); //clears current stat block and increments stat_tag_id - void clear(); + void clear(class ::rsx::thread* ptimer); //forcefully flushes all void sync(class ::rsx::thread* ptimer); + //conditionally sync any pending writes if range overlaps + void read_barrier(class ::rsx::thread* ptimer, u32 memory_address, u32 memory_range); + //call once every 'tick' to update void update(class ::rsx::thread* ptimer); @@ -367,6 +369,8 @@ namespace rsx bool sync_point_request = false; bool in_begin_end = false; + atomic_t async_tasks_pending{ 0 }; + bool conditional_render_test_failed = false; bool conditional_render_enabled = false; bool zcull_stats_enabled = false; @@ -412,6 +416,7 @@ namespace rsx //sync void sync(); + void read_barrier(u32 memory_address, u32 memory_range); gsl::span get_raw_index_array(const std::vector >& draw_indexed_clause) const; gsl::span get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const std::vector>& vertex_ranges) const; diff --git a/rpcs3/Emu/RSX/gcm_printing.cpp b/rpcs3/Emu/RSX/gcm_printing.cpp index 980bf9f4bf..5d25ed0a2e 100644 --- a/rpcs3/Emu/RSX/gcm_printing.cpp +++ b/rpcs3/Emu/RSX/gcm_printing.cpp @@ -724,7 +724,7 @@ std::string rsx::get_method_name(const u32 id) return std::string("CELL_GCM_") + found->second; } - return fmt::format("Unknown/illegal method [0x%08x]", id); + return fmt::format("Unknown/illegal method [0x%08x]", id << 2); } // Various parameter pretty printing function diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index 3dc556b911..aea99d0c61 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -689,6 +689,9 @@ namespace rsx in_pitch = in_bpp * in_w; } + const auto read_address = get_address(src_offset, src_dma); + rsx->read_barrier(read_address, in_pitch * in_h); + if (dst_color_format != rsx::blit_engine::transfer_destination_format::r5g6b5 && dst_color_format != rsx::blit_engine::transfer_destination_format::a8r8g8b8) { @@ -933,7 +936,7 @@ namespace rsx namespace nv0039 { - void buffer_notify(thread*, u32, u32 arg) + void buffer_notify(thread *rsx, u32, u32 arg) { s32 in_pitch = method_registers.nv0039_input_pitch(); s32 out_pitch = method_registers.nv0039_output_pitch(); @@ -968,8 +971,11 @@ namespace rsx u32 dst_offset = method_registers.nv0039_output_offset(); u32 dst_dma = method_registers.nv0039_output_location(); + const auto read_address = get_address(src_offset, src_dma); + rsx->read_barrier(read_address, in_pitch * line_count); + u8 *dst = (u8*)vm::base(get_address(dst_offset, dst_dma)); - const u8 *src = (u8*)vm::base(get_address(src_offset, src_dma)); + const u8 *src = (u8*)vm::base(read_address); if (in_pitch == out_pitch && out_pitch == line_length) {