vulkan: Optimize vertex data upload

- Reuse buffer views as much as possible; vkCreateBufferView is slow on NV.
  Implemented as a large sliding window that is reusable until it is filled.
This commit is contained in:
kd-11 2018-02-21 20:50:27 +03:00
parent 01349b8cee
commit 8ccaabb502
8 changed files with 105 additions and 57 deletions

View File

@ -131,4 +131,9 @@ public:
else
fmt::throw_exception("m_put_pos == m_get_pos!" HERE);
}
//Returns the stored m_size value.
//NOTE(review): presumably the total size of the backing heap rather than the space currently in use - confirm against the allocator
size_t size() const
{
return m_size;
}
};

View File

@ -1034,7 +1034,7 @@ bool GLGSRender::check_program_state()
return (rsx::method_registers.shader_program_address() != 0);
}
void GLGSRender::load_program(const vertex_upload_info& upload_info)
void GLGSRender::load_program(const gl::vertex_upload_info& upload_info)
{
get_current_fragment_program(fs_sampler_state);
verify(HERE), current_fragment_program.valid;

View File

@ -21,6 +21,16 @@ namespace gl
using null_vertex_cache = vertex_cache;
using shader_cache = rsx::shaders_cache<void*, GLProgramBuffer>;
//Result of a vertex upload: returned by GLGSRender::set_vertex_buffer() and passed to GLGSRender::load_program().
//Initialised positionally as an aggregate, so the member order must not change.
struct vertex_upload_info
{
//Number of elements to draw
u32 vertex_draw_count;
//NOTE(review): presumably the number of vertices storage was allocated for - confirm against the upload code
u32 allocated_vertex_count;
//NOTE(review): presumably the index of the first vertex in the uploaded range - confirm
u32 vertex_index_base;
//Initialised to 0 by set_vertex_buffer(); presumably the offset of the persistent attribute mapping - TODO confirm
u32 persistent_mapping_offset;
//Initialised to 0 by set_vertex_buffer(); presumably the offset of the volatile attribute mapping - TODO confirm
u32 volatile_mapping_offset;
//Index type and offset in the index buffer; expected to be empty for non-indexed draws
std::optional<std::tuple<GLenum, u32> > index_info;
};
}
struct work_item
@ -255,16 +265,6 @@ struct driver_state
}
};
//Result of a vertex upload: returned by GLGSRender::set_vertex_buffer() and passed to GLGSRender::load_program().
//Initialised positionally as an aggregate, so the member order must not change.
struct vertex_upload_info
{
//Number of elements to draw
u32 vertex_draw_count;
//NOTE(review): presumably the number of vertices storage was allocated for - confirm against the upload code
u32 allocated_vertex_count;
//NOTE(review): presumably the index of the first vertex in the uploaded range - confirm
u32 vertex_index_base;
//Initialised to 0 by set_vertex_buffer(); presumably the offset of the persistent attribute mapping - TODO confirm
u32 persistent_mapping_offset;
//Initialised to 0 by set_vertex_buffer(); presumably the offset of the volatile attribute mapping - TODO confirm
u32 volatile_mapping_offset;
//Index type and offset in the index buffer; expected to be empty for non-indexed draws
std::optional<std::tuple<GLenum, u32> > index_info;
};
class GLGSRender : public GSRender
{
private:
@ -340,14 +340,14 @@ private:
driver_state gl_state;
// Returns the element count to draw and, for indexed draws, the index type and offset in the index buffer
vertex_upload_info set_vertex_buffer();
gl::vertex_upload_info set_vertex_buffer();
rsx::vertex_input_layout m_vertex_layout = {};
void clear_surface(u32 arg);
void init_buffers(rsx::framebuffer_creation_context context, bool skip_reading = false);
bool check_program_state();
void load_program(const vertex_upload_info& upload_info);
void load_program(const gl::vertex_upload_info& upload_info);
void update_draw_state();

View File

@ -180,7 +180,7 @@ namespace
};
}
vertex_upload_info GLGSRender::set_vertex_buffer()
gl::vertex_upload_info GLGSRender::set_vertex_buffer()
{
std::chrono::time_point<steady_clock> then = steady_clock::now();
@ -196,7 +196,7 @@ vertex_upload_info GLGSRender::set_vertex_buffer()
auto required = calculate_memory_requirements(m_vertex_layout, vertex_count);
std::pair<void*, u32> persistent_mapping = {}, volatile_mapping = {};
vertex_upload_info upload_info = { result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, 0u, 0u, result.index_info };
gl::vertex_upload_info upload_info = { result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, 0u, 0u, result.index_info };
if (required.first > 0)
{

View File

@ -666,6 +666,9 @@ VKGSRender::~VKGSRender()
vk::finalize_compiler_context();
m_prog_buffer->clear();
m_persistent_attribute_storage.reset();
m_volatile_attribute_storage.reset();
//Global resources
vk::destroy_global_resources();
@ -1209,10 +1212,12 @@ void VKGSRender::end()
//Load program
std::chrono::time_point<steady_clock> program_start = textures_end;
load_program(std::get<2>(upload_info), std::get<3>(upload_info));
load_program(upload_info);
m_program->bind_uniform(m_persistent_attribute_storage, "persistent_input_stream", m_current_frame->descriptor_set);
m_program->bind_uniform(m_volatile_attribute_storage, "volatile_input_stream", m_current_frame->descriptor_set);
VkBufferView persistent_buffer = m_persistent_attribute_storage ? m_persistent_attribute_storage->value : null_buffer_view->value;
VkBufferView volatile_buffer = m_volatile_attribute_storage ? m_volatile_attribute_storage->value : null_buffer_view->value;
m_program->bind_uniform(persistent_buffer, "persistent_input_stream", m_current_frame->descriptor_set);
m_program->bind_uniform(volatile_buffer, "volatile_input_stream", m_current_frame->descriptor_set);
std::chrono::time_point<steady_clock> program_stop = steady_clock::now();
m_setup_time += std::chrono::duration_cast<std::chrono::microseconds>(program_stop - program_start).count();
@ -1445,8 +1450,6 @@ void VKGSRender::end()
vkCmdClearAttachments(*m_current_command_buffer, static_cast<u32>(buffers_to_clear.size()), buffers_to_clear.data(), 1, &clear_rect);
}
std::optional<std::tuple<VkDeviceSize, VkIndexType> > index_info = std::get<4>(upload_info);
bool primitive_emulated = false;
vk::get_appropriate_topology(rsx::method_registers.current_draw_clause.primitive, primitive_emulated);
@ -1461,12 +1464,11 @@ void VKGSRender::end()
m_occlusion_map[m_active_query_info->driver_handle].command_buffer_to_wait = m_current_command_buffer;
}
if (!index_info)
if (!upload_info.index_info)
{
if (single_draw)
{
const auto vertex_count = std::get<1>(upload_info);
vkCmdDraw(*m_current_command_buffer, vertex_count, 1, 0, 0);
vkCmdDraw(*m_current_command_buffer, upload_info.vertex_draw_count, 1, 0, 0);
}
else
{
@ -1480,10 +1482,10 @@ void VKGSRender::end()
else
{
VkIndexType index_type;
u32 index_count = std::get<1>(upload_info);
const u32 index_count = upload_info.vertex_draw_count;
VkDeviceSize offset;
std::tie(offset, index_type) = index_info.value();
std::tie(offset, index_type) = upload_info.index_info.value();
vkCmdBindIndexBuffer(*m_current_command_buffer, m_index_buffer_ring_info.heap->value, offset, index_type);
if (single_draw)
@ -2160,7 +2162,7 @@ bool VKGSRender::check_program_status()
return (rsx::method_registers.shader_program_address() != 0);
}
void VKGSRender::load_program(u32 vertex_count, u32 vertex_base)
void VKGSRender::load_program(const vk::vertex_upload_info& vertex_info)
{
get_current_fragment_program(fs_sampler_state);
verify(HERE), current_fragment_program.valid;
@ -2343,11 +2345,13 @@ void VKGSRender::load_program(u32 vertex_count, u32 vertex_base)
fill_scale_offset_data(buf, false);
fill_user_clip_data(buf + 64);
*(reinterpret_cast<u32*>(buf + 128)) = rsx::method_registers.transform_branch_bits();
*(reinterpret_cast<u32*>(buf + 132)) = vertex_base;
*(reinterpret_cast<u32*>(buf + 132)) = vertex_info.vertex_index_base;
*(reinterpret_cast<f32*>(buf + 136)) = rsx::method_registers.point_size();
*(reinterpret_cast<f32*>(buf + 140)) = rsx::method_registers.clip_min();
*(reinterpret_cast<f32*>(buf + 144)) = rsx::method_registers.clip_max();
fill_vertex_layout_state(m_vertex_layout, vertex_count, reinterpret_cast<s32*>(buf + 160));
fill_vertex_layout_state(m_vertex_layout, vertex_info.allocated_vertex_count, reinterpret_cast<s32*>(buf + 160),
vertex_info.persistent_window_offset, vertex_info.volatile_window_offset);
//Vertex constants
buf = buf + 512;

View File

@ -23,6 +23,17 @@ namespace vk
using null_vertex_cache = vertex_cache;
using shader_cache = rsx::shaders_cache<vk::pipeline_props, VKProgramBuffer>;
//Result of VKGSRender::upload_vertex_data(), consumed by VKGSRender::end() and VKGSRender::load_program().
//Initialised positionally as an aggregate, so the member order must not change.
struct vertex_upload_info
{
//Native primitive topology reported by the upload step
VkPrimitiveTopology primitive;
//Vertex count passed to vkCmdDraw for non-indexed draws; used as the index count for indexed draws
u32 vertex_draw_count;
//Vertex count handed to fill_vertex_layout_state() by load_program()
u32 allocated_vertex_count;
//Base vertex index; load_program() writes it into the buffer it fills
u32 vertex_index_base;
//Offset of the persistent attribute data from the start of the persistent buffer view (0 when a new view was just created).
//NOTE(review): appears to stay UINT32_MAX when no persistent data was uploaded - confirm
u32 persistent_window_offset;
//Offset of the volatile attribute data from the start of the volatile buffer view (0 when a new view was just created).
//NOTE(review): appears to stay UINT32_MAX when no volatile data was uploaded - confirm
u32 volatile_window_offset;
//(offset in index buffer, index type) used to bind the index buffer; end() takes the non-indexed path when this is empty
std::optional<std::tuple<VkDeviceSize, VkIndexType>> index_info;
};
}
//Heap allocation sizes in MB
@ -262,8 +273,8 @@ private:
std::array<std::unique_ptr<vk::sampler>, rsx::limits::fragment_textures_count> fs_sampler_handles;
std::array<std::unique_ptr<vk::sampler>, rsx::limits::vertex_textures_count> vs_sampler_handles;
VkBufferView m_persistent_attribute_storage;
VkBufferView m_volatile_attribute_storage;
std::unique_ptr<vk::buffer_view> m_persistent_attribute_storage;
std::unique_ptr<vk::buffer_view> m_volatile_attribute_storage;
public:
//vk::fbo draw_fbo;
@ -379,11 +390,11 @@ private:
void check_heap_status();
/// returns primitive topology, index_count, allocated_verts, vertex_base_index, (offset in index buffer, index type)
std::tuple<VkPrimitiveTopology, u32, u32, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > > upload_vertex_data();
vk::vertex_upload_info upload_vertex_data();
public:
bool check_program_status();
void load_program(u32 vertex_count, u32 vertex_base);
void load_program(const vk::vertex_upload_info& vertex_info);
void init_buffers(rsx::framebuffer_creation_context context, bool skip_reading = false);
void read_buffers();
void write_buffers();

View File

@ -616,6 +616,25 @@ namespace vk
buffer_view(const buffer_view&) = delete;
buffer_view(buffer_view&&) = delete;
//Tests whether the block of 'size' units starting at 'address' fits inside this view (info.offset / info.range).
//On success, stores the distance of 'address' from the start of the view in 'offset' and returns true.
//On failure, returns false and leaves 'offset' untouched.
//'address' is taken by value, so a caller may pass the same variable for both 'address' and 'offset'.
bool in_range(u32 address, u32 size, u32& offset) const
{
	//Anything located before the start of the view can never be covered by it
	if (address < info.offset)
		return false;

	const u32 relative = address - static_cast<u32>(info.offset);

	//The start must lie inside the view and the view must still hold 'size' units beyond it
	if (relative > info.range || size > (info.range - relative))
		return false;

	offset = relative;
	return true;
}
private:
VkDevice m_device;
};

View File

@ -253,8 +253,7 @@ namespace
};
}
std::tuple<VkPrimitiveTopology, u32, u32, u32, std::optional<std::tuple<VkDeviceSize, VkIndexType> > >
VKGSRender::upload_vertex_data()
vk::vertex_upload_info VKGSRender::upload_vertex_data()
{
m_vertex_layout = analyse_inputs_interleaved();
@ -266,11 +265,9 @@ VKGSRender::upload_vertex_data()
//Do actual vertex upload
auto required = calculate_memory_requirements(m_vertex_layout, vertex_count);
u32 persistent_range_base = UINT32_MAX, volatile_range_base = UINT32_MAX;
size_t persistent_offset = UINT64_MAX, volatile_offset = UINT64_MAX;
m_persistent_attribute_storage = VK_NULL_HANDLE;
m_volatile_attribute_storage = VK_NULL_HANDLE;
if (required.first > 0)
{
//Check if cacheable
@ -287,8 +284,7 @@ VKGSRender::upload_vertex_data()
if (auto cached = m_vertex_cache->find_vertex_range(storage_address, VK_FORMAT_R8_UINT, required.first))
{
in_cache = true;
m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device,
m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, cached->offset_in_heap, required.first));
persistent_range_base = cached->offset_in_heap;
}
else
{
@ -299,8 +295,7 @@ VKGSRender::upload_vertex_data()
if (!in_cache)
{
persistent_offset = (u32)m_attrib_ring_info.alloc<256>(required.first);
m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device,
m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, persistent_offset, required.first));
persistent_range_base = (u32)persistent_offset;
if (to_store)
{
@ -308,25 +303,12 @@ VKGSRender::upload_vertex_data()
m_vertex_cache->store_range(storage_address, VK_FORMAT_R8_UINT, required.first, (u32)persistent_offset);
}
}
m_persistent_attribute_storage = m_current_frame->buffer_views_to_clean.back()->value;
}
else
{
m_persistent_attribute_storage = null_buffer_view->value;
}
if (required.second > 0)
{
volatile_offset = (u32)m_attrib_ring_info.alloc<256>(required.second);
m_current_frame->buffer_views_to_clean.push_back(std::make_unique<vk::buffer_view>(*m_device,
m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, volatile_offset, required.second));
m_volatile_attribute_storage = m_current_frame->buffer_views_to_clean.back()->value;
}
else
{
m_volatile_attribute_storage = null_buffer_view->value;
volatile_range_base = (u32)volatile_offset;
}
//Write all the data once if possible
@ -358,5 +340,32 @@ VKGSRender::upload_vertex_data()
}
}
return std::make_tuple(result.native_primitive_type, result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, result.index_info);
if (persistent_range_base != UINT32_MAX)
{
if (!m_persistent_attribute_storage || !m_persistent_attribute_storage->in_range(persistent_range_base, required.first, persistent_range_base))
{
if (m_persistent_attribute_storage)
m_current_frame->buffer_views_to_clean.push_back(std::move(m_persistent_attribute_storage));
//View 64M blocks at a time (different drivers will only allow a fixed viewable heap size, 64M should be safe)
const size_t view_size = (persistent_range_base + 0x4000000) > m_attrib_ring_info.size() ? m_attrib_ring_info.size() - persistent_range_base : 0x4000000;
m_persistent_attribute_storage = std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, persistent_range_base, view_size);
persistent_range_base = 0;
}
}
if (volatile_range_base != UINT32_MAX)
{
if (!m_volatile_attribute_storage || !m_volatile_attribute_storage->in_range(volatile_range_base, required.second, volatile_range_base))
{
if (m_volatile_attribute_storage)
m_current_frame->buffer_views_to_clean.push_back(std::move(m_volatile_attribute_storage));
const size_t view_size = (volatile_range_base + 0x4000000) > m_attrib_ring_info.size() ? m_attrib_ring_info.size() - volatile_range_base : 0x4000000;
m_volatile_attribute_storage = std::make_unique<vk::buffer_view>(*m_device, m_attrib_ring_info.heap->value, VK_FORMAT_R8_UINT, volatile_range_base, view_size);
volatile_range_base = 0;
}
}
return{ result.native_primitive_type, result.vertex_draw_count, result.allocated_vertex_count, result.vertex_index_base, persistent_range_base, volatile_range_base, result.index_info };
}