rsx: ZCULL synchronization fixes

- Track asynchronous operations in RSX core
- Add read barriers to force pending writes to finish.
  Fixes zcull delay flicker in all UE3 titles without forcing a hard stall.
- Increase zcull latency as all writes should be synchronized now
This commit is contained in:
kd-11 2018-03-13 16:34:31 +03:00
parent 315798b1f4
commit 2dce55d036
6 changed files with 87 additions and 24 deletions

View File

@ -1574,6 +1574,11 @@ void GLGSRender::get_occlusion_query_result(rsx::reports::occlusion_query_info*
}
}
//Discards an occlusion query by ending the GL_ANY_SAMPLES_PASSED query target.
//NOTE(review): the 'query' argument is not consulted; glEndQuery is issued unconditionally.
//Confirm that callers only discard a query that is currently open on this target.
void GLGSRender::discard_occlusion_query(rsx::reports::occlusion_query_info* query)
{
glEndQuery(GL_ANY_SAMPLES_PASSED);
}
void GLGSRender::shell_do_cleanup()
{
//TODO: Key cleanup requests with UID to identify resources to remove

View File

@ -367,6 +367,7 @@ public:
void end_occlusion_query(rsx::reports::occlusion_query_info* query) override;
bool check_occlusion_query_status(rsx::reports::occlusion_query_info* query) override;
void get_occlusion_query_result(rsx::reports::occlusion_query_info* query) override;
void discard_occlusion_query(rsx::reports::occlusion_query_info* query) override;
protected:
void begin() override;

View File

@ -2092,7 +2092,7 @@ namespace rsx
//Reset zcull ctrl
zcull_ctrl->set_active(this, false);
zcull_ctrl->clear();
zcull_ctrl->clear(this);
if (zcull_ctrl->has_pending())
{
@ -2142,7 +2142,7 @@ namespace rsx
if (g_cfg.video.disable_zcull_queries)
return;
zcull_ctrl->clear();
zcull_ctrl->clear(this);
}
void thread::get_zcull_stats(u32 type, vm::addr_t sink)
@ -2153,18 +2153,13 @@ namespace rsx
switch (type)
{
case CELL_GCM_ZPASS_PIXEL_CNT:
{
zcull_ctrl->read_report(this, sink, type);
return;
}
case CELL_GCM_ZCULL_STATS:
case CELL_GCM_ZCULL_STATS1:
case CELL_GCM_ZCULL_STATS2:
case CELL_GCM_ZCULL_STATS3:
{
//TODO
value = (type != CELL_GCM_ZCULL_STATS3)? UINT16_MAX : 0;
break;
zcull_ctrl->read_report(this, sink, type);
return;
}
default:
LOG_ERROR(RSX, "Unknown zcull stat type %d", type);
@ -2181,6 +2176,14 @@ namespace rsx
//Synchronizes the ZCULL controller, then checks that no asynchronous tasks remain pending
void thread::sync()
{
//Delegate to the ZCULL controller's sync routine
zcull_ctrl->sync(this);
//Full memory fence before the pending-task counter is read
_mm_mfence();
//The counter is expected to be back at zero once the sync above has completed
verify (HERE), async_tasks_pending.load() == 0;
}
//Read barrier entry point on the RSX thread.
//Forwards the range (start address and size) to the ZCULL controller's read_barrier,
//passing this thread as the owner.
void thread::read_barrier(u32 memory_address, u32 memory_range)
{
zcull_ctrl->read_barrier(this, memory_address, memory_range);
}
void thread::notify_zcull_info_changed()
@ -2328,6 +2331,7 @@ namespace rsx
m_pending_writes.push_back({});
m_pending_writes.back().query = m_current_task;
ptimer->async_tasks_pending++;
}
else
{
@ -2342,7 +2346,7 @@ namespace rsx
void ZCULL_control::read_report(::rsx::thread* ptimer, vm::addr_t sink, u32 type)
{
if (m_current_task)
if (m_current_task && type == CELL_GCM_ZPASS_PIXEL_CNT)
{
m_current_task->owned = true;
end_occlusion_query(m_current_task);
@ -2384,6 +2388,8 @@ namespace rsx
break;
}
ptimer->async_tasks_pending++;
}
void ZCULL_control::allocate_new_query(::rsx::thread* ptimer)
@ -2436,7 +2442,7 @@ namespace rsx
}
}
void ZCULL_control::clear()
void ZCULL_control::clear(class ::rsx::thread* ptimer)
{
if (!m_pending_writes.empty())
{
@ -2449,6 +2455,7 @@ namespace rsx
discard_occlusion_query(It->query);
It->query->pending = false;
valid_size--;
ptimer->async_tasks_pending--;
continue;
}
@ -2470,9 +2477,27 @@ namespace rsx
m_cycles_delay = max_zcull_cycles_delay;
}
void ZCULL_control::write(vm::addr_t sink, u32 timestamp, u32 value)
void ZCULL_control::write(vm::addr_t sink, u32 timestamp, u32 type, u32 value)
{
verify(HERE), sink;
switch (type)
{
case CELL_GCM_ZPASS_PIXEL_CNT:
value = value ? UINT16_MAX : 0;
break;
case CELL_GCM_ZCULL_STATS3:
value = value ? 0 : UINT16_MAX;
break;
case CELL_GCM_ZCULL_STATS2:
case CELL_GCM_ZCULL_STATS1:
case CELL_GCM_ZCULL_STATS:
default:
//Not implemented
value = UINT32_MAX;
break;
}
vm::ptr<CellGcmReportData> out = sink;
out->value = value;
out->timer = timestamp;
@ -2520,7 +2545,7 @@ namespace rsx
if (!writer.forwarder)
//No other queries in the chain, write result
write(writer.sink, ptimer->timestamp(), result ? UINT16_MAX : 0);
write(writer.sink, ptimer->timestamp(), writer.type, result);
processed++;
}
@ -2555,10 +2580,13 @@ namespace rsx
else
It = m_statistics_map.erase(It);
}
//Decrement jobs counter
ptimer->async_tasks_pending -= processed;
}
//Critical, since it's likely a WAIT_FOR_IDLE type has been processed, all results are considered available
m_cycles_delay = 2;
m_cycles_delay = min_zcull_cycles_delay;
}
void ZCULL_control::update(::rsx::thread* ptimer)
@ -2644,7 +2672,7 @@ namespace rsx
//only zpass supported right now
if (!writer.forwarder)
//No other queries in the chain, write result
write(writer.sink, ptimer->timestamp(), result ? UINT16_MAX : 0);
write(writer.sink, ptimer->timestamp(), writer.type, result);
processed++;
}
@ -2669,6 +2697,24 @@ namespace rsx
{
m_pending_writes.resize(0);
}
ptimer->async_tasks_pending -= processed;
}
}
//Conditionally flushes pending report writes ahead of a memory read.
//ptimer:         owning RSX thread, forwarded to sync()
//memory_address: first address of the range about to be read
//memory_range:   size of that range; a range of 0 never matches
//If any queued write targets an address inside the range, sync() is invoked once and the
//function returns; otherwise nothing is done.
void ZCULL_control::read_barrier(::rsx::thread* ptimer, u32 memory_address, u32 memory_range)
{
	if (m_pending_writes.empty())
		return;

	for (const auto &writer : m_pending_writes)
	{
		//Unsigned distance from the start of the range. Sinks below memory_address wrap to a
		//large value and fail the test, and no 'address + range' sum is formed, so a range
		//reaching past the top of the 32-bit address space cannot wrap and hide a pending write.
		const u32 sink_offset = static_cast<u32>(writer.sink) - memory_address;

		if (sink_offset < memory_range)
		{
			//sync() modifies m_pending_writes, so leave the loop immediately afterwards
			sync(ptimer);
			return;
		}
	}
}
}

View File

@ -185,9 +185,8 @@ namespace rsx
struct ZCULL_control
{
//Delay in 'cycles' before a report update operation is forced to retire
//Larger values might give more performance but some engines (UE3) don't seem to wait for results and will flicker
//TODO: Determine the real max delay in real hardware
const u32 max_zcull_cycles_delay = 10;
const u32 max_zcull_cycles_delay = 128;
const u32 min_zcull_cycles_delay = 16;
//Number of occlusion query slots available. Real hardware actually has far fewer units before choking
const u32 occlusion_query_count = 128;
@ -200,7 +199,7 @@ namespace rsx
occlusion_query_info* m_current_task = nullptr;
u32 m_statistics_tag_id = 0;
u32 m_tsc = 0;
u32 m_cycles_delay = 10;
u32 m_cycles_delay = max_zcull_cycles_delay;
std::vector<queued_report_write> m_pending_writes;
std::unordered_map<u32, u32> m_statistics_map;
@ -211,7 +210,7 @@ namespace rsx
void set_enabled(class ::rsx::thread* ptimer, bool enabled);
void set_active(class ::rsx::thread* ptimer, bool active);
void write(vm::addr_t sink, u32 timestamp, u32 value);
void write(vm::addr_t sink, u32 timestamp, u32 type, u32 value);
//Read current zcull statistics into the address provided
void read_report(class ::rsx::thread* ptimer, vm::addr_t sink, u32 type);
@ -220,11 +219,14 @@ namespace rsx
void allocate_new_query(class ::rsx::thread* ptimer);
//clears current stat block and increments stat_tag_id
void clear();
void clear(class ::rsx::thread* ptimer);
//forcefully flushes all
void sync(class ::rsx::thread* ptimer);
//conditionally sync any pending writes if range overlaps
void read_barrier(class ::rsx::thread* ptimer, u32 memory_address, u32 memory_range);
//call once every 'tick' to update
void update(class ::rsx::thread* ptimer);
@ -367,6 +369,8 @@ namespace rsx
bool sync_point_request = false;
bool in_begin_end = false;
atomic_t<s32> async_tasks_pending{ 0 };
bool conditional_render_test_failed = false;
bool conditional_render_enabled = false;
bool zcull_stats_enabled = false;
@ -412,6 +416,7 @@ namespace rsx
//sync
void sync();
void read_barrier(u32 memory_address, u32 memory_range);
gsl::span<const gsl::byte> get_raw_index_array(const std::vector<std::pair<u32, u32> >& draw_indexed_clause) const;
gsl::span<const gsl::byte> get_raw_vertex_buffer(const rsx::data_array_format_info&, u32 base_offset, const std::vector<std::pair<u32, u32>>& vertex_ranges) const;

View File

@ -724,7 +724,7 @@ std::string rsx::get_method_name(const u32 id)
return std::string("CELL_GCM_") + found->second;
}
return fmt::format("Unknown/illegal method [0x%08x]", id);
return fmt::format("Unknown/illegal method [0x%08x]", id << 2);
}
// Various parameter pretty printing function

View File

@ -689,6 +689,9 @@ namespace rsx
in_pitch = in_bpp * in_w;
}
const auto read_address = get_address(src_offset, src_dma);
rsx->read_barrier(read_address, in_pitch * in_h);
if (dst_color_format != rsx::blit_engine::transfer_destination_format::r5g6b5 &&
dst_color_format != rsx::blit_engine::transfer_destination_format::a8r8g8b8)
{
@ -933,7 +936,7 @@ namespace rsx
namespace nv0039
{
void buffer_notify(thread*, u32, u32 arg)
void buffer_notify(thread *rsx, u32, u32 arg)
{
s32 in_pitch = method_registers.nv0039_input_pitch();
s32 out_pitch = method_registers.nv0039_output_pitch();
@ -968,8 +971,11 @@ namespace rsx
u32 dst_offset = method_registers.nv0039_output_offset();
u32 dst_dma = method_registers.nv0039_output_location();
const auto read_address = get_address(src_offset, src_dma);
rsx->read_barrier(read_address, in_pitch * line_count);
u8 *dst = (u8*)vm::base(get_address(dst_offset, dst_dma));
const u8 *src = (u8*)vm::base(get_address(src_offset, src_dma));
const u8 *src = (u8*)vm::base(read_address);
if (in_pitch == out_pitch && out_pitch == line_length)
{