rsx/vk: Implement batched transform constant updates

This commit is contained in:
kd-11 2024-04-23 02:21:19 +03:00 committed by kd-11
parent ac6f77a744
commit a09111052a
10 changed files with 114 additions and 59 deletions

View File

@ -39,7 +39,7 @@ std::tuple<u32, u32, u32> write_index_array_data_to_buffer(std::span<std::byte>
void write_index_array_for_non_indexed_non_native_primitive_to_buffer(char* dst, rsx::primitive_type draw_mode, unsigned count);
// Copy and swap data in 32-bit units
extern void(*const copy_data_swap_u32)(u32*, const u32*, u32);
extern void(*const copy_data_swap_u32)(u32* dst, const u32* src, u32 count);
// Copy and swap data in 32-bit units, return true if changed
extern bool(*const copy_data_swap_u32_cmp)(u32*, const u32*, u32);
extern bool(*const copy_data_swap_u32_cmp)(u32* dst, const u32* src, u32 count);

View File

@ -15,7 +15,7 @@ namespace rsx
// virtual void begin() = 0;
// virtual void end() = 0;
// Patch transform constants
virtual void patch_transform_constants(context* ctx, u32 first_index, const std::span<u32>& data) {};
// Patch transform constants. Units are in 32x4 units
virtual void patch_transform_constants(context* /*ctx*/, u32 /*index*/, u32 /*count*/) {};
};
}

View File

@ -3,6 +3,7 @@
#include "Emu/RSX/rsx_methods.h" // FIXME
#include "Emu/RSX/rsx_utils.h"
#include "Emu/RSX/RSXThread.h"
#include "Emu/RSX/Common/BufferUtils.h"
#include "Emu/RSX/NV47/HW/context.h"
#include "Emu/RSX/NV47/HW/nv4097.h"
@ -115,33 +116,45 @@ namespace rsx
switch (barrier.type)
{
case primitive_restart_barrier:
{
break;
}
case index_base_modifier_barrier:
{
// Change index base offset
REGS(ctx)->decode(NV4097_SET_VERTEX_DATA_BASE_INDEX, barrier.arg0);
result |= index_base_changed;
break;
}
case vertex_base_modifier_barrier:
{
// Change vertex base offset
REGS(ctx)->decode(NV4097_SET_VERTEX_DATA_BASE_OFFSET, barrier.arg0);
result |= vertex_base_changed;
break;
}
case vertex_array_offset_modifier_barrier:
{
// Change vertex array offset
REGS(ctx)->decode(NV4097_SET_VERTEX_DATA_ARRAY_OFFSET + barrier.index, barrier.arg0);
result |= vertex_arrays_changed;
break;
}
case transform_constant_load_modifier_barrier:
{
// Change the transform load target. Does not change result mask.
REGS(ctx)->decode(NV4097_SET_TRANSFORM_PROGRAM_LOAD, barrier.arg0);
break;
}
case transform_constant_update_barrier:
{
// Update transform constants
// REGS(ctx)->decode(NV4097_SET_TRANSFORM_CONSTANT + barrier.index, barrier.arg); // This statement technically does the right thing but has no consequence other than wasting perf.
// FIXME: Batching
nv4097::set_transform_constant::decode_one(ctx, NV4097_SET_TRANSFORM_CONSTANT + barrier.index, barrier.arg0);
auto ptr = RSX(ctx)->fifo_ctrl->translate_address(barrier.arg0);
auto buffer = std::span<const u32>(static_cast<const u32*>(vm::base(ptr)), barrier.arg1);
nv4097::set_transform_constant::batch_decode(ctx, NV4097_SET_TRANSFORM_CONSTANT + barrier.index, buffer);
result |= transform_constants_changed;
break;
}
default:
fmt::throw_exception("Unreachable");
}

View File

@ -30,20 +30,22 @@ namespace rsx
REGS(ctx)->transform_constants[load + constant_id][subreg] = arg;
}
// Writes a contiguous run of transform constant words starting at method register 'reg'
// and notifies the backend about the range of constants that was touched.
//  ctx  - RSX context whose register state receives the data
//  reg  - First NV4097_SET_TRANSFORM_CONSTANT method register being written
//  args - Source words. Copied with copy_data_swap_u32 (copy and swap in 32-bit units)
void set_transform_constant::batch_decode(context* ctx, u32 reg, const std::span<const u32>& args)
{
    if (args.empty())
    {
        // Nothing was written. Do not report a patch for constants that were never modified,
        // a zero-length (or untouched) range is of no use to the backend.
        return;
    }

    const u32 word_count = ::size32(args);
    const u32 index = reg - NV4097_SET_TRANSFORM_CONSTANT;
    const u32 constant_id = index / 4;
    const u8 subreg = index % 4;
    const u32 load = REGS(ctx)->transform_constant_load();

    // The destination is addressed relative to the current transform constant load slot
    auto dst = &REGS(ctx)->transform_constants[load + constant_id][subreg];
    copy_data_swap_u32(dst, args.data(), word_count);

    // Notify the backend. The range is expressed in whole 4-word constants, so round the end up
    // to cover a partially written trailing constant.
    const u32 last_constant_id = (index + word_count + 3) / 4; // Aligned div
    RSX(ctx)->patch_transform_constants(ctx, load + constant_id, last_constant_id - constant_id);
}
void set_transform_constant::impl(context* ctx, u32 reg, u32 arg)
{
if (RSX(ctx)->in_begin_end && !REGS(ctx)->current_draw_clause.empty())
{
// Updating constants mid-draw is messy. Push attr barrier.
REGS(ctx)->current_draw_clause.insert_command_barrier(
rsx::transform_constant_update_barrier,
arg,
0,
reg - NV4097_SET_TRANSFORM_CONSTANT
);
return;
}
const u32 index = reg - NV4097_SET_TRANSFORM_CONSTANT;
const u32 constant_id = index / 4;
const u8 subreg = index % 4;
@ -73,6 +75,20 @@ namespace rsx
rcount = 0;
}
if (RSX(ctx)->in_begin_end && !REGS(ctx)->current_draw_clause.empty())
{
// Updating constants mid-draw is messy. Defer the writes
REGS(ctx)->current_draw_clause.insert_command_barrier(
rsx::transform_constant_update_barrier,
RSX(ctx)->fifo_ctrl->get_pos(),
rcount,
reg - NV4097_SET_TRANSFORM_CONSTANT
);
RSX(ctx)->fifo_ctrl->skip_methods(rcount - 1);
return;
}
const auto values = &REGS(ctx)->transform_constants[load + constant_id][subreg];
const auto fifo_span = RSX(ctx)->fifo_ctrl->get_current_arg_ptr();

View File

@ -6,6 +6,8 @@
#include "Emu/RSX/gcm_enums.h"
#include "Emu/RSX/NV47/FW/draw_call.inc.h"
#include <span>
namespace rsx
{
enum command_barrier_type : u32;
@ -201,6 +203,8 @@ namespace rsx
static void impl(context* ctx, u32 reg, u32 arg);
static void decode_one(context* ctx, u32 reg, u32 arg);
static void batch_decode(context* ctx, u32 reg, const std::span<const u32>& args);
};
struct set_transform_program

View File

@ -29,6 +29,11 @@ namespace rsx
m_iotable = &pctrl->iomap_table;
}
// Resolves 'address' through the IO map table attached to this FIFO controller
// and returns whatever the table's get_addr lookup yields for it.
u32 FIFO_control::translate_address(u32 address) const
{
    auto& io_table = *m_iotable;
    return io_table.get_addr(address);
}
void FIFO_control::sync_get() const
{
m_ctrl->get.release(m_internal_get);

View File

@ -151,6 +151,8 @@ namespace rsx
FIFO_control(rsx::thread* pctrl);
~FIFO_control() = default;
u32 translate_address(u32 addr) const;
std::pair<bool, u32> fetch_u32(u32 addr);
void invalidate_cache() { m_cache_size = 0; }

View File

@ -732,43 +732,6 @@ void VKGSRender::emit_geometry(u32 sub_index)
}
}
if (state_flags & rsx::transform_constants_changed)
{
auto allocate_mem = [&](usz size) -> std::pair<void*, usz>
{
vertex_scratchpad.resize(size);
return { vertex_scratchpad.data(), size };
};
rsx::io_buffer iobuf(allocate_mem);
upload_transform_constants(iobuf);
ensure(iobuf.size() >= m_vertex_constants_buffer_info.range);
vk::insert_buffer_memory_barrier(
*m_current_command_buffer,
m_vertex_constants_buffer_info.buffer,
m_vertex_constants_buffer_info.offset,
m_vertex_constants_buffer_info.range,
VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_MEMORY_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);
vkCmdUpdateBuffer(
*m_current_command_buffer,
m_vertex_constants_buffer_info.buffer,
m_vertex_constants_buffer_info.offset,
m_vertex_constants_buffer_info.range,
iobuf.data());
vk::insert_buffer_memory_barrier(
*m_current_command_buffer,
m_vertex_constants_buffer_info.buffer,
m_vertex_constants_buffer_info.offset,
m_vertex_constants_buffer_info.range,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT);
}
if ((state_flags & vertex_state_mask) && !m_vertex_layout.validate())
{
// No vertex inputs enabled

View File

@ -15,6 +15,7 @@
#include "vkutils/scratch.h"
#include "Emu/RSX/rsx_methods.h"
#include "Emu/RSX/NV47/HW/context_accessors.define.h"
#include "Emu/Memory/vm_locking.h"
#include "../Program/program_state_cache2.hpp"
@ -2354,9 +2355,60 @@ void VKGSRender::update_vertex_env(u32 id, const vk::vertex_upload_info& vertex_
m_vertex_layout_ring_info.unmap();
}
void VKGSRender::patch_transform_constants(rsx::context* ctx, u32 first_index, const std::span<u32>& data)
// Hot-patches transform constants in the vertex constants buffer mid-draw (instanced draw).
//  ctx   - RSX context supplying the current transform constant registers
//  index - First constant to patch
//  count - Number of constants to patch
// index and count are in units of one constant (16 bytes each, see the byte math below).
void VKGSRender::patch_transform_constants(rsx::context* ctx, u32 index, u32 count)
{
    if (!count)
    {
        // Nothing to patch. vkCmdUpdateBuffer requires a non-zero data size, so never record an empty update.
        return;
    }

    utils::address_range data_range;
    void* data_source = nullptr;

    if (!m_vertex_prog || m_vertex_prog->has_indexed_constants)
    {
        // We're working with a full range. We can do a direct patch in this case since no index translation is required.
        const auto byte_count = count * 16;
        const auto byte_offset = index * 16;

        data_range = utils::address_range::start_length(m_vertex_constants_buffer_info.offset + byte_offset, byte_count);
        data_source = &REGS(ctx)->transform_constants[index];
    }
    else
    {
        // Index-translated constants. This is a bit trickier: rebuild the whole constants block
        // and replace the full bound range. Use scratchpad to avoid UAF
        auto allocate_mem = [&](usz size) -> std::pair<void*, usz>
        {
            scratchpad.resize(size);
            return { scratchpad.data(), size };
        };

        rsx::io_buffer iobuf(allocate_mem);
        upload_transform_constants(iobuf);

        ensure(iobuf.size() >= m_vertex_constants_buffer_info.range);

        data_range = utils::address_range::start_length(m_vertex_constants_buffer_info.offset, m_vertex_constants_buffer_info.range);
        data_source = iobuf.data();
    }

    // Vertex shader reads of the range must complete before the transfer write
    vk::insert_buffer_memory_barrier(
        *m_current_command_buffer,
        m_vertex_constants_buffer_info.buffer,
        data_range.start,
        data_range.length(),
        VK_PIPELINE_STAGE_VERTEX_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
        VK_ACCESS_MEMORY_READ_BIT, VK_ACCESS_TRANSFER_WRITE_BIT);

    vkCmdUpdateBuffer(
        *m_current_command_buffer,
        m_vertex_constants_buffer_info.buffer,
        data_range.start,
        data_range.length(),
        data_source);

    // Make the transfer write visible to subsequent vertex shader reads
    vk::insert_buffer_memory_barrier(
        *m_current_command_buffer,
        m_vertex_constants_buffer_info.buffer,
        data_range.start,
        data_range.length(),
        VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_SHADER_BIT,
        VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_MEMORY_READ_BIT);
}
void VKGSRender::init_buffers(rsx::framebuffer_creation_context context, bool)

View File

@ -234,7 +234,7 @@ private:
VkDescriptorSet allocate_descriptor_set();
vk::vertex_upload_info upload_vertex_data();
rsx::simple_array<u8> vertex_scratchpad;
rsx::simple_array<u8> scratchpad;
bool load_program();
void load_program_env();
@ -277,7 +277,7 @@ public:
inline std::pair<volatile vk::host_data_t*, VkBuffer> map_host_object_data() { return { m_host_data_ptr, m_host_object_data->value }; }
// GRAPH backend
void patch_transform_constants(rsx::context* ctx, u32 first_index, const std::span<u32>& data) override;
void patch_transform_constants(rsx::context* ctx, u32 index, u32 count) override;
protected:
void clear_surface(u32 mask) override;