gl: use streaming buffers for uniform & elem buffer as well

gl: stream uniform data using stream buffer

gl: vertex streaming improvements and bugfixes

gl: add basic timing info check for profiling

gl: ebo streaming fixes and enhancements
This commit is contained in:
kd-11 2016-06-12 12:05:22 +03:00
parent ed94626411
commit 6761d4a30c
5 changed files with 150 additions and 101 deletions

View File

@ -7,6 +7,7 @@
#include "../Common/BufferUtils.h"
extern cfg::bool_entry g_cfg_rsx_debug_output;
extern cfg::bool_entry g_cfg_rsx_overlay;
#define DUMP_VERTEX_DATA 0
@ -70,6 +71,8 @@ void GLGSRender::begin()
init_buffers();
std::chrono::time_point<std::chrono::system_clock> then = std::chrono::system_clock::now();
u32 color_mask = rsx::method_registers[NV4097_SET_COLOR_MASK];
bool color_mask_b = !!(color_mask & 0xff);
bool color_mask_g = !!((color_mask >> 8) & 0xff);
@ -241,6 +244,10 @@ void GLGSRender::begin()
{
__glcheck glPrimitiveRestartIndex(rsx::method_registers[NV4097_SET_RESTART_INDEX]);
}
std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
m_begin_time += std::chrono::duration_cast<std::chrono::microseconds>(now - then).count();
m_draw_calls++;
}
namespace
@ -266,8 +273,6 @@ void GLGSRender::end()
return;
}
//LOG_NOTICE(Log::RSX, "draw()");
draw_fbo.bind();
m_program->use();
@ -292,13 +297,11 @@ void GLGSRender::end()
}
}
set_vertex_buffer();
u32 offset_in_index_buffer = set_vertex_buffer();
m_vao.bind();
std::chrono::time_point<std::chrono::system_clock> then = std::chrono::system_clock::now();
/**
* Validate fails if called right after linking a program because the VS and FS both use textures bound using different
* samplers. So far only sampler2D has been largely used, hiding the problem. This call shall also degrade performance further
* if used every draw call. Fixes shader validation issues on AMD.
*/
if (g_cfg_rsx_debug_output)
m_program->validate();
@ -307,19 +310,22 @@ void GLGSRender::end()
rsx::index_array_type indexed_type = rsx::to_index_array_type(rsx::method_registers[NV4097_SET_INDEX_ARRAY_DMA] >> 4);
if (indexed_type == rsx::index_array_type::u32)
__glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_INT, nullptr);
__glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_INT, (GLvoid *)(offset_in_index_buffer));
if (indexed_type == rsx::index_array_type::u16)
__glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_SHORT, nullptr);
__glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_SHORT, (GLvoid *)(offset_in_index_buffer));
}
else if (!is_primitive_native(draw_mode))
{
__glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_SHORT, nullptr);
__glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_SHORT, (GLvoid *)(offset_in_index_buffer));
}
else
{
draw_fbo.draw_arrays(draw_mode, vertex_draw_count);
}
std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
m_draw_time += std::chrono::duration_cast<std::chrono::microseconds>(now - then).count();
write_buffers();
rsx::thread::end();
@ -376,18 +382,6 @@ void GLGSRender::on_init_thread()
glEnable(GL_VERTEX_PROGRAM_POINT_SIZE);
m_vao.create();
m_vbo.create();
m_ebo.create();
m_scale_offset_buffer.create(32 * sizeof(float));
m_vertex_constants_buffer.create(512 * 4 * sizeof(float));
m_fragment_constants_buffer.create();
glBindBufferBase(GL_UNIFORM_BUFFER, 0, m_scale_offset_buffer.id());
glBindBufferBase(GL_UNIFORM_BUFFER, 1, m_vertex_constants_buffer.id());
glBindBufferBase(GL_UNIFORM_BUFFER, 2, m_fragment_constants_buffer.id());
m_vao.array_buffer = m_vbo;
m_vao.element_array_buffer = m_ebo;
for (gl::texture &tex : m_gl_attrib_buffers)
{
@ -395,7 +389,11 @@ void GLGSRender::on_init_thread()
tex.set_target(gl::texture::target::textureBuffer);
}
m_attrib_ring_buffer.reset(new gl::ring_buffer(16 * 0x100000));
m_attrib_ring_buffer.reset(new gl::ring_buffer(16 * 0x100000, gl::buffer::target::texture));
m_uniform_ring_buffer.reset(new gl::ring_buffer(16 * 0x100000, gl::buffer::target::uniform));
m_index_ring_buffer.reset(new gl::ring_buffer(0x100000, gl::buffer::target::element_array));
m_vao.element_array_buffer = m_index_ring_buffer->get_buffer();
m_gl_texture_cache.initialize_rtt_cache();
}
@ -414,30 +412,17 @@ void GLGSRender::on_exit()
if (m_flip_tex_color)
m_flip_tex_color.remove();
if (m_vbo)
m_vbo.remove();
if (m_ebo)
m_ebo.remove();
if (m_vao)
m_vao.remove();
if (m_scale_offset_buffer)
m_scale_offset_buffer.remove();
if (m_vertex_constants_buffer)
m_vertex_constants_buffer.remove();
if (m_fragment_constants_buffer)
m_fragment_constants_buffer.remove();
for (gl::texture &tex : m_gl_attrib_buffers)
{
tex.remove();
}
m_attrib_ring_buffer->destroy();
m_uniform_ring_buffer->destroy();
m_index_ring_buffer->destroy();
}
void nv4097_clear_surface(u32 arg, GLGSRender* renderer)
@ -570,32 +555,47 @@ bool GLGSRender::load_program()
(m_program.recreate() += { fp.compile(), vp.compile() }).make();
#endif
size_t max_buffer_sz =(size_t) m_vertex_constants_buffer.size();
size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program);
if (fragment_constants_sz > max_buffer_sz)
max_buffer_sz = fragment_constants_sz;
u32 fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program);
fragment_constants_sz = std::max(32U, fragment_constants_sz);
u32 max_buffer_sz = 8192 + 512 + fragment_constants_sz;
u32 is_alpha_tested = !!(rsx::method_registers[NV4097_SET_ALPHA_TEST_ENABLE]);
u8 alpha_ref_raw = (u8)(rsx::method_registers[NV4097_SET_ALPHA_REF] & 0xFF);
float alpha_ref = alpha_ref_raw / 255.f;
std::vector<u8> client_side_buf(max_buffer_sz);
u8 *buf;
u32 scale_offset_offset;
u32 vertex_constants_offset;
u32 fragment_constants_offset;
fill_scale_offset_data(client_side_buf.data(), false);
memcpy(client_side_buf.data() + 16 * sizeof(float), &rsx::method_registers[NV4097_SET_FOG_PARAMS], sizeof(float));
memcpy(client_side_buf.data() + 17 * sizeof(float), &rsx::method_registers[NV4097_SET_FOG_PARAMS + 1], sizeof(float));
memcpy(client_side_buf.data() + 18 * sizeof(float), &is_alpha_tested, sizeof(u32));
memcpy(client_side_buf.data() + 19 * sizeof(float), &alpha_ref, sizeof(float));
m_scale_offset_buffer.data(m_scale_offset_buffer.size(), nullptr);
m_scale_offset_buffer.sub_data(0, m_scale_offset_buffer.size(), client_side_buf.data());
m_uniform_ring_buffer->reserve_and_map(max_buffer_sz);
auto mapping = m_uniform_ring_buffer->alloc_from_reserve(512);
buf = static_cast<u8*>(mapping.first);
scale_offset_offset = mapping.second;
fill_vertex_program_constants_data(client_side_buf.data());
m_vertex_constants_buffer.data(m_vertex_constants_buffer.size(), nullptr);
m_vertex_constants_buffer.sub_data(0, m_vertex_constants_buffer.size(), client_side_buf.data());
fill_scale_offset_data(buf, false);
memcpy(buf + 16 * sizeof(float), &rsx::method_registers[NV4097_SET_FOG_PARAMS], sizeof(float));
memcpy(buf + 17 * sizeof(float), &rsx::method_registers[NV4097_SET_FOG_PARAMS + 1], sizeof(float));
memcpy(buf + 18 * sizeof(float), &is_alpha_tested, sizeof(u32));
memcpy(buf + 19 * sizeof(float), &alpha_ref, sizeof(float));
m_prog_buffer.fill_fragment_constans_buffer({ reinterpret_cast<float*>(client_side_buf.data()), gsl::narrow<int>(fragment_constants_sz) }, fragment_program);
m_fragment_constants_buffer.data(fragment_constants_sz, nullptr);
m_fragment_constants_buffer.sub_data(0, fragment_constants_sz, client_side_buf.data());
mapping = m_uniform_ring_buffer->alloc_from_reserve(512 * 16);
buf = static_cast<u8*>(mapping.first);
vertex_constants_offset = mapping.second;
fill_vertex_program_constants_data(buf);
mapping = m_uniform_ring_buffer->alloc_from_reserve(fragment_constants_sz);
buf = static_cast<u8*>(mapping.first);
fragment_constants_offset = mapping.second;
m_prog_buffer.fill_fragment_constans_buffer({ reinterpret_cast<float*>(buf), gsl::narrow<int>(fragment_constants_sz) }, fragment_program);
m_uniform_ring_buffer->unmap();
glBindBufferRange(GL_UNIFORM_BUFFER, 0, m_uniform_ring_buffer->get_buffer().id(), scale_offset_offset, 512);
glBindBufferRange(GL_UNIFORM_BUFFER, 1, m_uniform_ring_buffer->get_buffer().id(), vertex_constants_offset, 512 * 16);
glBindBufferRange(GL_UNIFORM_BUFFER, 2, m_uniform_ring_buffer->get_buffer().id(), fragment_constants_offset, fragment_constants_sz);
return true;
}
@ -715,6 +715,26 @@ void GLGSRender::flip(int buffer)
m_frame->flip(m_context);
if (g_cfg_rsx_overlay)
{
//TODO: Display overlay in a cross-platform manner
//Core context throws wgl font functions out of the window as they use display lists
//Only show debug info if the user really requests it
if (g_cfg_rsx_debug_output)
{
std::string message =
"draw_calls: " + std::to_string(m_draw_calls) + ", " + "draw_call_setup: " + std::to_string(m_begin_time) + "us, " + "vertex_upload_time: " + std::to_string(m_vertex_upload_time) + "us, " + "draw_call_execution: " + std::to_string(m_draw_time) + "us";
LOG_ERROR(RSX, message.c_str());
}
}
m_draw_calls = 0;
m_begin_time = 0;
m_draw_time = 0;
m_vertex_upload_time = 0;
for (auto &tex : m_rtts.invalidated_resources)
{
tex->remove();

View File

@ -29,6 +29,13 @@ private:
gl::texture m_gl_attrib_buffers[rsx::limits::vertex_count];
std::unique_ptr<gl::ring_buffer> m_attrib_ring_buffer;
std::unique_ptr<gl::ring_buffer> m_uniform_ring_buffer;
std::unique_ptr<gl::ring_buffer> m_index_ring_buffer;
u32 m_draw_calls = 0;
u32 m_begin_time = 0;
u32 m_draw_time = 0;
u32 m_vertex_upload_time = 0;
public:
gl::fbo draw_fbo;
@ -40,12 +47,7 @@ private:
gl::fbo m_flip_fbo;
gl::texture m_flip_tex_color;
gl::buffer m_scale_offset_buffer;
gl::buffer m_vertex_constants_buffer;
gl::buffer m_fragment_constants_buffer;
gl::buffer m_vbo;
gl::buffer m_ebo;
//vaos are mandatory for core profile
gl::vao m_vao;
public:
@ -54,7 +56,7 @@ public:
private:
static u32 enable(u32 enable, u32 cap);
static u32 enable(u32 enable, u32 cap, u32 index);
void set_vertex_buffer();
u32 set_vertex_buffer();
public:
bool load_program();

View File

@ -164,6 +164,7 @@ OPENGL_PROC(PFNGLGETINTEGER64VPROC, GetInteger64v);
OPENGL_PROC(PFNGLCHECKFRAMEBUFFERSTATUSPROC, CheckFramebufferStatus);
OPENGL_PROC(PFNGLMAPBUFFERRANGEPROC, MapBufferRange);
OPENGL_PROC(PFNGLBINDBUFFERRANGEPROC, BindBufferRange);
OPENGL_PROC(PFNGLBINDBUFFERBASEPROC, BindBufferBase);
//Texture Buffers

View File

@ -372,7 +372,9 @@ namespace gl
pixel_pack = GL_PIXEL_PACK_BUFFER,
pixel_unpack = GL_PIXEL_UNPACK_BUFFER,
array = GL_ARRAY_BUFFER,
element_array = GL_ELEMENT_ARRAY_BUFFER
element_array = GL_ELEMENT_ARRAY_BUFFER,
uniform = GL_UNIFORM_BUFFER,
texture = GL_TEXTURE_BUFFER
};
enum class access
{
@ -421,6 +423,8 @@ namespace gl
case target::pixel_unpack: pname = GL_PIXEL_UNPACK_BUFFER_BINDING; break;
case target::array: pname = GL_ARRAY_BUFFER_BINDING; break;
case target::element_array: pname = GL_ELEMENT_ARRAY_BUFFER_BINDING; break;
case target::uniform: pname = GL_UNIFORM_BUFFER_BINDING; break;
case target::texture: pname = GL_TEXTURE_BUFFER_BINDING; break;
}
glGetIntegerv(pname, &m_last_binding);
@ -465,6 +469,13 @@ namespace gl
data(size, data_);
}
// Create the buffer through the parameterless create(), record target_ as this
// buffer's target, then forward size/data_ to data().
//   target_ - buffer target stored into m_target
//   size    - size forwarded to data()
//   data_   - optional initial contents forwarded to data(); defaults to nullptr
void create(target target_, GLsizeiptr size, const void* data_ = nullptr)
{
create();
// m_target is assigned before data() because data() resolves the target it works on
// through current_target() (presumably backed by m_target -- TODO confirm).
m_target = target_;
data(size, data_);
}
void data(GLsizeiptr size, const void* data_ = nullptr)
{
target target_ = current_target();
@ -572,6 +583,7 @@ namespace gl
class ring_buffer
{
buffer storage_buffer;
buffer::target m_target;
u32 m_data_loc = 0;
u32 m_size;
@ -582,11 +594,12 @@ namespace gl
void *m_mapped_base = nullptr;
public:
ring_buffer(u32 initial_size)
ring_buffer(u32 initial_size, buffer::target target)
{
storage_buffer.create();
storage_buffer.data(initial_size);
m_size = initial_size;
m_target = target;
}
void destroy()
@ -598,13 +611,10 @@ namespace gl
{
size = (size + 255) & ~255;
//storage_buffer.bind(storage_buffer.current_target());
glBindBuffer(GL_TEXTURE_BUFFER, storage_buffer.id());
glBindBuffer((GLenum)m_target, storage_buffer.id());
u32 limit = m_data_loc + size;
if (limit > m_size)
{
//Orphan this buffer and have the driver allocate a new one instead of looping back to the front.
//Hopefully, the driver will track usage here and re-use if sync is not a problem
if (size > m_size)
m_size = size;
@ -612,7 +622,7 @@ namespace gl
m_data_loc = 0;
}
void *ptr = glMapBufferRange(GL_TEXTURE_BUFFER, m_data_loc, size, GL_MAP_WRITE_BIT|GL_MAP_INVALIDATE_RANGE_BIT|GL_MAP_UNSYNCHRONIZED_BIT);
void *ptr = glMapBufferRange((GLenum)m_target, m_data_loc, size, GL_MAP_WRITE_BIT|GL_MAP_INVALIDATE_RANGE_BIT|GL_MAP_UNSYNCHRONIZED_BIT);
u32 offset = m_data_loc;
m_data_loc += size;
return std::make_pair(ptr, offset);
@ -620,8 +630,7 @@ namespace gl
void unmap()
{
//storage_buffer.unmap();
glUnmapBuffer(GL_TEXTURE_BUFFER);
glUnmapBuffer((GLenum)m_target);
m_mapped_block_size = 0;
m_mapped_base = 0;
}
@ -638,21 +647,29 @@ namespace gl
std::pair<void*, u32> alloc_from_reserve(u32 size)
{
size = (size + 255) & ~255;
size = (size + 15) & ~15;
if (m_mapped_bytes_available < size || !m_mapped_base)
{
if (m_mapped_base)
{
//This doesn't really work for some reason, probably since the caller should bind the target
//before making this call as the block may be reallocated
LOG_ERROR(RSX, "reserved allocation exceeded. check for corruption!");
unmap();
}
reserve_and_map((size > 4096) ? size : 4096);
}
EXPECTS(m_mapped_bytes_available >= size);
void *ptr = (char*)m_mapped_base + m_mapped_reserve_offset;
u32 offset = m_mapped_reserve_offset + m_mapped_block_offset;
m_mapped_reserve_offset += size;
m_mapped_bytes_available -= size;
EXPECTS((offset & 15) == 0);
return std::make_pair(ptr, offset);
}

View File

@ -127,8 +127,8 @@ namespace
throw EXCEPTION("unknow vertex type");
}
// return vertex count and filled index array if primitive type is not native (empty array otherwise)
std::tuple<u32, std::vector<u8>> get_index_array_for_emulated_non_indexed_draw(const std::vector<std::pair<u32, u32>> &first_count_commands, rsx::primitive_type primitive_mode)
// return the emulated index count and the offset of the generated indices inside dst (primitive type must not be native)
std::tuple<u32, u32> get_index_array_for_emulated_non_indexed_draw(const std::vector<std::pair<u32, u32>> &first_count_commands, rsx::primitive_type primitive_mode, gl::ring_buffer &dst)
{
u32 vertex_draw_count = 0;
assert(!is_primitive_native(primitive_mode));
@ -138,9 +138,10 @@ namespace
vertex_draw_count += (u32)get_index_count(primitive_mode, pair.second);
}
std::vector<u8> vertex_index_array(vertex_draw_count * sizeof(u16));
u32 first = 0;
char* mapped_buffer = (char*)vertex_index_array.data();
auto mapping = dst.alloc_and_map(vertex_draw_count * sizeof(u16));
char *mapped_buffer = (char *)mapping.first;
for (const auto &pair : first_count_commands)
{
size_t element_count = get_index_count(primitive_mode, pair.second);
@ -149,16 +150,17 @@ namespace
first += pair.second;
}
return std::make_tuple(vertex_draw_count, vertex_index_array);
dst.unmap();
return std::make_tuple(vertex_draw_count, mapping.second);
}
}
void GLGSRender::set_vertex_buffer()
u32 GLGSRender::set_vertex_buffer()
{
//initialize vertex attributes
//merge all vertex arrays
//std::vector<u8> vertex_arrays_data;
std::chrono::time_point<std::chrono::system_clock> then = std::chrono::system_clock::now();
const std::string reg_table[] =
{
@ -171,18 +173,21 @@ void GLGSRender::set_vertex_buffer()
};
u32 input_mask = rsx::method_registers[NV4097_SET_VERTEX_ATTRIB_INPUT_MASK];
std::vector<u8> vertex_index_array;
vertex_draw_count = 0;
u32 min_index, max_index;
u32 min_index = 0, max_index = 0;
u32 max_vertex_attrib_size = 0;
u32 offset_in_index_buffer = 0;
vertex_draw_count = 0;
//place holder; replace with actual index buffer
gsl::span<gsl::byte> index_array;
for (u8 index = 0; index < rsx::limits::vertex_count; ++index)
{
if (vertex_arrays_info[index].size == 0)
continue;
max_vertex_attrib_size += (vertex_arrays_info[index].size << 2);
max_vertex_attrib_size += 16;
}
if (draw_command == rsx::draw_command::indexed)
@ -193,12 +198,19 @@ void GLGSRender::set_vertex_buffer()
{
vertex_draw_count += first_count.second;
}
// Index count
vertex_draw_count = (u32)get_index_count(draw_mode, gsl::narrow<int>(vertex_draw_count));
vertex_index_array.resize(vertex_draw_count * type_size);
u32 block_sz = vertex_draw_count * type_size;
gsl::span<gsl::byte> dst{ reinterpret_cast<gsl::byte*>(vertex_index_array.data()), gsl::narrow<u32>(vertex_index_array.size()) };
auto mapping = m_index_ring_buffer->alloc_and_map(block_sz);
void *ptr = mapping.first;
offset_in_index_buffer = mapping.second;
gsl::span<gsl::byte> dst{ reinterpret_cast<gsl::byte*>(ptr), gsl::narrow<u32>(block_sz) };
std::tie(min_index, max_index) = write_index_array_data_to_buffer(dst, type, draw_mode, first_count_commands);
m_index_ring_buffer->unmap();
}
if (draw_command == rsx::draw_command::inlined_array)
@ -270,7 +282,7 @@ void GLGSRender::set_vertex_buffer()
m_program->uniforms.texture(location, index + rsx::limits::textures_count, texture);
if (!is_primitive_native(draw_mode))
{
std::tie(vertex_draw_count, vertex_index_array) = get_index_array_for_emulated_non_indexed_draw({ { 0, vertex_draw_count } }, draw_mode);
std::tie(vertex_draw_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw({ { 0, vertex_draw_count } }, draw_mode, *m_index_ring_buffer);
}
}
}
@ -285,7 +297,8 @@ void GLGSRender::set_vertex_buffer()
if (draw_command == rsx::draw_command::array || draw_command == rsx::draw_command::indexed)
{
m_attrib_ring_buffer->reserve_and_map(vertex_draw_count * max_vertex_attrib_size);
u32 verts_allocated = std::max(vertex_draw_count, max_index + 1);
m_attrib_ring_buffer->reserve_and_map(verts_allocated * max_vertex_attrib_size);
for (int index = 0; index < rsx::limits::vertex_count; ++index)
{
@ -395,20 +408,16 @@ void GLGSRender::set_vertex_buffer()
continue;
}
}
if (draw_command == rsx::draw_command::array && !is_primitive_native(draw_mode))
{
std::tie(vertex_draw_count, vertex_index_array) = get_index_array_for_emulated_non_indexed_draw(first_count_commands, draw_mode);
std::tie(vertex_draw_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw(first_count_commands, draw_mode, *m_index_ring_buffer);
}
}
m_attrib_ring_buffer->unmap();
std::chrono::time_point<std::chrono::system_clock> now = std::chrono::system_clock::now();
m_vertex_upload_time += std::chrono::duration_cast<std::chrono::microseconds>(now - then).count();
if (draw_command == rsx::draw_command::indexed)
{
m_ebo.data(vertex_index_array.size(), vertex_index_array.data());
}
else if (!is_primitive_native(draw_mode))
{
m_ebo.data(vertex_index_array.size(), vertex_index_array.data());
}
return offset_in_index_buffer;
}