From 6761d4a30cb19a8466c4ff1f43d5e52ad64686ad Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sun, 12 Jun 2016 12:05:22 +0300 Subject: [PATCH] gl: use streaming buffers for uniform & elem buffer as well gl: stream uniform data using stream buffer gl: vertex streaming improvements and bugfixes gl: add basic timing info check for profiling gl: ebo streaming fixes and enhancements --- rpcs3/Emu/RSX/GL/GLGSRender.cpp | 134 +++++++++++++++++------------ rpcs3/Emu/RSX/GL/GLGSRender.h | 16 ++-- rpcs3/Emu/RSX/GL/GLProcTable.h | 1 + rpcs3/Emu/RSX/GL/gl_helpers.h | 37 +++++--- rpcs3/Emu/RSX/GL/vertex_buffer.cpp | 63 ++++++++------ 5 files changed, 150 insertions(+), 101 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 8838d23896..778267c3f3 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -7,6 +7,7 @@ #include "../Common/BufferUtils.h" extern cfg::bool_entry g_cfg_rsx_debug_output; +extern cfg::bool_entry g_cfg_rsx_overlay; #define DUMP_VERTEX_DATA 0 @@ -70,6 +71,8 @@ void GLGSRender::begin() init_buffers(); + std::chrono::time_point then = std::chrono::system_clock::now(); + u32 color_mask = rsx::method_registers[NV4097_SET_COLOR_MASK]; bool color_mask_b = !!(color_mask & 0xff); bool color_mask_g = !!((color_mask >> 8) & 0xff); @@ -241,6 +244,10 @@ void GLGSRender::begin() { __glcheck glPrimitiveRestartIndex(rsx::method_registers[NV4097_SET_RESTART_INDEX]); } + + std::chrono::time_point now = std::chrono::system_clock::now(); + m_begin_time += std::chrono::duration_cast(now - then).count(); + m_draw_calls++; } namespace @@ -266,8 +273,6 @@ void GLGSRender::end() return; } - //LOG_NOTICE(Log::RSX, "draw()"); - draw_fbo.bind(); m_program->use(); @@ -292,13 +297,11 @@ void GLGSRender::end() } } - set_vertex_buffer(); + u32 offset_in_index_buffer = set_vertex_buffer(); + m_vao.bind(); + + std::chrono::time_point then = std::chrono::system_clock::now(); - /** - * Validate fails if called right after linking a program because the VS and FS both use textures bound using different - * samplers. So far only sampler2D has been largely used, hiding the problem. This call shall also degrade performance further - * if used every draw call. Fixes shader validation issues on AMD. - */ if (g_cfg_rsx_debug_output) m_program->validate(); @@ -307,19 +310,22 @@ void GLGSRender::end() rsx::index_array_type indexed_type = rsx::to_index_array_type(rsx::method_registers[NV4097_SET_INDEX_ARRAY_DMA] >> 4); if (indexed_type == rsx::index_array_type::u32) - __glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_INT, nullptr); + __glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_INT, (GLvoid *)(offset_in_index_buffer)); if (indexed_type == rsx::index_array_type::u16) - __glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_SHORT, nullptr); + __glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_SHORT, (GLvoid *)(offset_in_index_buffer)); } else if (!is_primitive_native(draw_mode)) { - __glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_SHORT, nullptr); + __glcheck glDrawElements(gl::draw_mode(draw_mode), vertex_draw_count, GL_UNSIGNED_SHORT, (GLvoid *)(offset_in_index_buffer)); } else { draw_fbo.draw_arrays(draw_mode, vertex_draw_count); } + std::chrono::time_point now = std::chrono::system_clock::now(); + m_draw_time += std::chrono::duration_cast(now - then).count(); + write_buffers(); rsx::thread::end(); @@ -376,18 +382,6 @@ void GLGSRender::on_init_thread() glEnable(GL_VERTEX_PROGRAM_POINT_SIZE); m_vao.create(); - m_vbo.create(); - m_ebo.create(); - m_scale_offset_buffer.create(32 * sizeof(float)); - m_vertex_constants_buffer.create(512 * 4 * sizeof(float)); - m_fragment_constants_buffer.create(); - - glBindBufferBase(GL_UNIFORM_BUFFER, 0, m_scale_offset_buffer.id()); - glBindBufferBase(GL_UNIFORM_BUFFER, 1, m_vertex_constants_buffer.id()); - glBindBufferBase(GL_UNIFORM_BUFFER, 2, m_fragment_constants_buffer.id()); - - m_vao.array_buffer = m_vbo; - m_vao.element_array_buffer = m_ebo; for (gl::texture &tex : m_gl_attrib_buffers) { @@ -395,7 +389,11 @@ void GLGSRender::on_init_thread() tex.set_target(gl::texture::target::textureBuffer); } - m_attrib_ring_buffer.reset(new gl::ring_buffer(16 * 0x100000)); + m_attrib_ring_buffer.reset(new gl::ring_buffer(16 * 0x100000, gl::buffer::target::texture)); + m_uniform_ring_buffer.reset(new gl::ring_buffer(16 * 0x100000, gl::buffer::target::uniform)); + m_index_ring_buffer.reset(new gl::ring_buffer(0x100000, gl::buffer::target::element_array)); + + m_vao.element_array_buffer = m_index_ring_buffer->get_buffer(); m_gl_texture_cache.initialize_rtt_cache(); } @@ -414,30 +412,17 @@ void GLGSRender::on_exit() if (m_flip_tex_color) m_flip_tex_color.remove(); - if (m_vbo) - m_vbo.remove(); - - if (m_ebo) - m_ebo.remove(); - if (m_vao) m_vao.remove(); - if (m_scale_offset_buffer) - m_scale_offset_buffer.remove(); - - if (m_vertex_constants_buffer) - m_vertex_constants_buffer.remove(); - - if (m_fragment_constants_buffer) - m_fragment_constants_buffer.remove(); - for (gl::texture &tex : m_gl_attrib_buffers) { tex.remove(); } m_attrib_ring_buffer->destroy(); + m_uniform_ring_buffer->destroy(); + m_index_ring_buffer->destroy(); } void nv4097_clear_surface(u32 arg, GLGSRender* renderer) @@ -570,32 +555,47 @@ bool GLGSRender::load_program() (m_program.recreate() += { fp.compile(), vp.compile() }).make(); #endif - size_t max_buffer_sz =(size_t) m_vertex_constants_buffer.size(); - size_t fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program); - if (fragment_constants_sz > max_buffer_sz) - max_buffer_sz = fragment_constants_sz; + u32 fragment_constants_sz = m_prog_buffer.get_fragment_constants_buffer_size(fragment_program); + fragment_constants_sz = std::max(32U, fragment_constants_sz); + u32 max_buffer_sz = 8192 + 512 + fragment_constants_sz; u32 is_alpha_tested = !!(rsx::method_registers[NV4097_SET_ALPHA_TEST_ENABLE]); u8 alpha_ref_raw = (u8)(rsx::method_registers[NV4097_SET_ALPHA_REF] & 0xFF); float alpha_ref = alpha_ref_raw / 255.f; - std::vector client_side_buf(max_buffer_sz); + u8 *buf; + u32 scale_offset_offset; + u32 vertex_constants_offset; + u32 fragment_constants_offset; - fill_scale_offset_data(client_side_buf.data(), false); - memcpy(client_side_buf.data() + 16 * sizeof(float), &rsx::method_registers[NV4097_SET_FOG_PARAMS], sizeof(float)); - memcpy(client_side_buf.data() + 17 * sizeof(float), &rsx::method_registers[NV4097_SET_FOG_PARAMS + 1], sizeof(float)); - memcpy(client_side_buf.data() + 18 * sizeof(float), &is_alpha_tested, sizeof(u32)); - memcpy(client_side_buf.data() + 19 * sizeof(float), &alpha_ref, sizeof(float)); - m_scale_offset_buffer.data(m_scale_offset_buffer.size(), nullptr); - m_scale_offset_buffer.sub_data(0, m_scale_offset_buffer.size(), client_side_buf.data()); + m_uniform_ring_buffer->reserve_and_map(max_buffer_sz); + auto mapping = m_uniform_ring_buffer->alloc_from_reserve(512); + buf = static_cast(mapping.first); + scale_offset_offset = mapping.second; - fill_vertex_program_constants_data(client_side_buf.data()); - m_vertex_constants_buffer.data(m_vertex_constants_buffer.size(), nullptr); - m_vertex_constants_buffer.sub_data(0, m_vertex_constants_buffer.size(), client_side_buf.data()); + fill_scale_offset_data(buf, false); + memcpy(buf + 16 * sizeof(float), &rsx::method_registers[NV4097_SET_FOG_PARAMS], sizeof(float)); + memcpy(buf + 17 * sizeof(float), &rsx::method_registers[NV4097_SET_FOG_PARAMS + 1], sizeof(float)); + memcpy(buf + 18 * sizeof(float), &is_alpha_tested, sizeof(u32)); + memcpy(buf + 19 * sizeof(float), &alpha_ref, sizeof(float)); - m_prog_buffer.fill_fragment_constans_buffer({ reinterpret_cast(client_side_buf.data()), gsl::narrow(fragment_constants_sz) }, fragment_program); - m_fragment_constants_buffer.data(fragment_constants_sz, nullptr); - m_fragment_constants_buffer.sub_data(0, fragment_constants_sz, client_side_buf.data()); + mapping = m_uniform_ring_buffer->alloc_from_reserve(512 * 16); + buf = static_cast(mapping.first); + vertex_constants_offset = mapping.second; + + fill_vertex_program_constants_data(buf); + + mapping = m_uniform_ring_buffer->alloc_from_reserve(fragment_constants_sz); + buf = static_cast(mapping.first); + fragment_constants_offset = mapping.second; + + m_prog_buffer.fill_fragment_constans_buffer({ reinterpret_cast(buf), gsl::narrow(fragment_constants_sz) }, fragment_program); + + m_uniform_ring_buffer->unmap(); + + glBindBufferRange(GL_UNIFORM_BUFFER, 0, m_uniform_ring_buffer->get_buffer().id(), scale_offset_offset, 512); + glBindBufferRange(GL_UNIFORM_BUFFER, 1, m_uniform_ring_buffer->get_buffer().id(), vertex_constants_offset, 512 * 16); + glBindBufferRange(GL_UNIFORM_BUFFER, 2, m_uniform_ring_buffer->get_buffer().id(), fragment_constants_offset, fragment_constants_sz); return true; } @@ -714,6 +714,26 @@ void GLGSRender::flip(int buffer) } m_frame->flip(m_context); + + if (g_cfg_rsx_overlay) + { + //TODO: Display overlay in a cross-platform manner + //Core context throws wgl font functions out of the window as they use display lists + //Only show debug info if the user really requests it + + if (g_cfg_rsx_debug_output) + { + std::string message = + "draw_calls: " + std::to_string(m_draw_calls) + ", " + "draw_call_setup: " + std::to_string(m_begin_time) + "us, " + "vertex_upload_time: " + std::to_string(m_vertex_upload_time) + "us, " + "draw_call_execution: " + std::to_string(m_draw_time) + "us"; + + LOG_ERROR(RSX, message.c_str()); + } + } + + m_draw_calls = 0; + m_begin_time = 0; + m_draw_time = 0; + m_vertex_upload_time = 0; for (auto &tex : m_rtts.invalidated_resources) { diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.h b/rpcs3/Emu/RSX/GL/GLGSRender.h index ae732c1abd..ce22766993 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.h +++ b/rpcs3/Emu/RSX/GL/GLGSRender.h @@ -29,6 +29,13 @@ private: gl::texture m_gl_attrib_buffers[rsx::limits::vertex_count]; std::unique_ptr m_attrib_ring_buffer; + std::unique_ptr m_uniform_ring_buffer; + std::unique_ptr m_index_ring_buffer; + + u32 m_draw_calls = 0; + u32 m_begin_time = 0; + u32 m_draw_time = 0; + u32 m_vertex_upload_time = 0; public: gl::fbo draw_fbo; @@ -40,12 +47,7 @@ private: gl::fbo m_flip_fbo; gl::texture m_flip_tex_color; - gl::buffer m_scale_offset_buffer; - gl::buffer m_vertex_constants_buffer; - gl::buffer m_fragment_constants_buffer; - - gl::buffer m_vbo; - gl::buffer m_ebo; + //vaos are mandatory for core profile gl::vao m_vao; public: @@ -54,7 +56,7 @@ public: private: static u32 enable(u32 enable, u32 cap); static u32 enable(u32 enable, u32 cap, u32 index); - void set_vertex_buffer(); + u32 set_vertex_buffer(); public: bool load_program(); diff --git a/rpcs3/Emu/RSX/GL/GLProcTable.h b/rpcs3/Emu/RSX/GL/GLProcTable.h index 4b374615f2..04d99aee12 100644 --- a/rpcs3/Emu/RSX/GL/GLProcTable.h +++ b/rpcs3/Emu/RSX/GL/GLProcTable.h @@ -164,6 +164,7 @@ OPENGL_PROC(PFNGLGETINTEGER64VPROC, GetInteger64v); OPENGL_PROC(PFNGLCHECKFRAMEBUFFERSTATUSPROC, CheckFramebufferStatus); OPENGL_PROC(PFNGLMAPBUFFERRANGEPROC, MapBufferRange); +OPENGL_PROC(PFNGLBINDBUFFERRANGEPROC, BindBufferRange); OPENGL_PROC(PFNGLBINDBUFFERBASEPROC, BindBufferBase); //Texture Buffers diff --git a/rpcs3/Emu/RSX/GL/gl_helpers.h b/rpcs3/Emu/RSX/GL/gl_helpers.h index 7fda56c72c..c24e9487ac 100644 --- a/rpcs3/Emu/RSX/GL/gl_helpers.h +++ b/rpcs3/Emu/RSX/GL/gl_helpers.h @@ -372,7 +372,9 @@ namespace gl pixel_pack = GL_PIXEL_PACK_BUFFER, pixel_unpack = GL_PIXEL_UNPACK_BUFFER, array = GL_ARRAY_BUFFER, - element_array = GL_ELEMENT_ARRAY_BUFFER + element_array = GL_ELEMENT_ARRAY_BUFFER, + uniform = GL_UNIFORM_BUFFER, + texture = GL_TEXTURE_BUFFER }; enum class access { @@ -421,6 +423,8 @@ namespace gl case target::pixel_unpack: pname = GL_PIXEL_UNPACK_BUFFER_BINDING; break; case target::array: pname = GL_ARRAY_BUFFER_BINDING; break; case target::element_array: pname = GL_ELEMENT_ARRAY_BUFFER_BINDING; break; + case target::uniform: pname = GL_UNIFORM_BUFFER_BINDING; break; + case target::texture: pname = GL_TEXTURE_BUFFER_BINDING; break; } glGetIntegerv(pname, &m_last_binding); @@ -465,6 +469,13 @@ namespace gl data(size, data_); } + void create(target target_, GLsizeiptr size, const void* data_ = nullptr) + { + create(); + m_target = target_; + data(size, data_); + } + void data(GLsizeiptr size, const void* data_ = nullptr) { target target_ = current_target(); @@ -572,6 +583,7 @@ namespace gl class ring_buffer { buffer storage_buffer; + buffer::target m_target; u32 m_data_loc = 0; u32 m_size; @@ -582,11 +594,12 @@ namespace gl void *m_mapped_base = nullptr; public: - ring_buffer(u32 initial_size) + ring_buffer(u32 initial_size, buffer::target target) { storage_buffer.create(); storage_buffer.data(initial_size); m_size = initial_size; + m_target = target; } void destroy() @@ -598,13 +611,10 @@ namespace gl { size = (size + 255) & ~255; - //storage_buffer.bind(storage_buffer.current_target()); - glBindBuffer(GL_TEXTURE_BUFFER, storage_buffer.id()); + glBindBuffer((GLenum)m_target, storage_buffer.id()); u32 limit = m_data_loc + size; if (limit > m_size) { - //Orphan this buffer and have the driver allocate a new one instead of looping back to the front. - //Hopefully, the driver will track usage here and re-use if sync is not a problem if (size > m_size) m_size = size; @@ -612,7 +622,7 @@ namespace gl m_data_loc = 0; } - void *ptr = glMapBufferRange(GL_TEXTURE_BUFFER, m_data_loc, size, GL_MAP_WRITE_BIT|GL_MAP_INVALIDATE_RANGE_BIT|GL_MAP_UNSYNCHRONIZED_BIT); + void *ptr = glMapBufferRange((GLenum)m_target, m_data_loc, size, GL_MAP_WRITE_BIT|GL_MAP_INVALIDATE_RANGE_BIT|GL_MAP_UNSYNCHRONIZED_BIT); u32 offset = m_data_loc; m_data_loc += size; return std::make_pair(ptr, offset); @@ -620,8 +630,7 @@ namespace gl void unmap() { - //storage_buffer.unmap(); - glUnmapBuffer(GL_TEXTURE_BUFFER); + glUnmapBuffer((GLenum)m_target); m_mapped_block_size = 0; m_mapped_base = 0; } @@ -638,21 +647,29 @@ namespace gl std::pair alloc_from_reserve(u32 size) { - size = (size + 255) & ~255; + size = (size + 15) & ~15; if (m_mapped_bytes_available < size || !m_mapped_base) { if (m_mapped_base) + { + //This doesn't really work for some reason, probably since the caller should bind the target + //before making this call as the block may be reallocated + LOG_ERROR(RSX, "reserved allocation exceeded. check for corruption!"); unmap(); + } reserve_and_map((size > 4096) ? size : 4096); } + EXPECTS(m_mapped_bytes_available >= size); + void *ptr = (char*)m_mapped_base + m_mapped_reserve_offset; u32 offset = m_mapped_reserve_offset + m_mapped_block_offset; m_mapped_reserve_offset += size; m_mapped_bytes_available -= size; + EXPECTS((offset & 15) == 0); return std::make_pair(ptr, offset); } diff --git a/rpcs3/Emu/RSX/GL/vertex_buffer.cpp b/rpcs3/Emu/RSX/GL/vertex_buffer.cpp index 31a38fa17b..903745553d 100644 --- a/rpcs3/Emu/RSX/GL/vertex_buffer.cpp +++ b/rpcs3/Emu/RSX/GL/vertex_buffer.cpp @@ -127,8 +127,8 @@ namespace throw EXCEPTION("unknow vertex type"); } - // return vertex count and filled index array if primitive type is not native (empty array otherwise) - std::tuple> get_index_array_for_emulated_non_indexed_draw(const std::vector> &first_count_commands, rsx::primitive_type primitive_mode) + // return vertex count if primitive type is not native (empty array otherwise) + std::tuple get_index_array_for_emulated_non_indexed_draw(const std::vector> &first_count_commands, rsx::primitive_type primitive_mode, gl::ring_buffer &dst) { u32 vertex_draw_count = 0; assert(!is_primitive_native(primitive_mode)); @@ -138,9 +138,10 @@ namespace vertex_draw_count += (u32)get_index_count(primitive_mode, pair.second); } - std::vector vertex_index_array(vertex_draw_count * sizeof(u16)); u32 first = 0; - char* mapped_buffer = (char*)vertex_index_array.data(); + auto mapping = dst.alloc_and_map(vertex_draw_count * sizeof(u16)); + char *mapped_buffer = (char *)mapping.first; + for (const auto &pair : first_count_commands) { size_t element_count = get_index_count(primitive_mode, pair.second); @@ -149,16 +150,17 @@ namespace first += pair.second; } - return std::make_tuple(vertex_draw_count, vertex_index_array); + dst.unmap(); + return std::make_tuple(vertex_draw_count, mapping.second); } } -void GLGSRender::set_vertex_buffer() +u32 GLGSRender::set_vertex_buffer() { //initialize vertex attributes - //merge all vertex arrays - //std::vector vertex_arrays_data; + + std::chrono::time_point then = std::chrono::system_clock::now(); const std::string reg_table[] = { @@ -171,18 +173,21 @@ void GLGSRender::set_vertex_buffer() }; u32 input_mask = rsx::method_registers[NV4097_SET_VERTEX_ATTRIB_INPUT_MASK]; - - std::vector vertex_index_array; - vertex_draw_count = 0; - u32 min_index, max_index; - + u32 min_index = 0, max_index = 0; u32 max_vertex_attrib_size = 0; + u32 offset_in_index_buffer = 0; + + vertex_draw_count = 0; + + //place holder; replace with actual index buffer + gsl::span index_array; + for (u8 index = 0; index < rsx::limits::vertex_count; ++index) { if (vertex_arrays_info[index].size == 0) continue; - max_vertex_attrib_size += (vertex_arrays_info[index].size << 2); + max_vertex_attrib_size += 16; } if (draw_command == rsx::draw_command::indexed) @@ -193,12 +198,19 @@ void GLGSRender::set_vertex_buffer() { vertex_draw_count += first_count.second; } + // Index count vertex_draw_count = (u32)get_index_count(draw_mode, gsl::narrow(vertex_draw_count)); - vertex_index_array.resize(vertex_draw_count * type_size); + u32 block_sz = vertex_draw_count * type_size; + + auto mapping = m_index_ring_buffer->alloc_and_map(block_sz); + void *ptr = mapping.first; + offset_in_index_buffer = mapping.second; - gsl::span dst{ reinterpret_cast(vertex_index_array.data()), gsl::narrow(vertex_index_array.size()) }; + gsl::span dst{ reinterpret_cast(ptr), gsl::narrow(block_sz) }; std::tie(min_index, max_index) = write_index_array_data_to_buffer(dst, type, draw_mode, first_count_commands); + + m_index_ring_buffer->unmap(); } if (draw_command == rsx::draw_command::inlined_array) @@ -270,7 +282,7 @@ void GLGSRender::set_vertex_buffer() m_program->uniforms.texture(location, index + rsx::limits::textures_count, texture); if (!is_primitive_native(draw_mode)) { - std::tie(vertex_draw_count, vertex_index_array) = get_index_array_for_emulated_non_indexed_draw({ { 0, vertex_draw_count } }, draw_mode); + std::tie(vertex_draw_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw({ { 0, vertex_draw_count } }, draw_mode, *m_index_ring_buffer); } } } @@ -285,7 +297,8 @@ void GLGSRender::set_vertex_buffer() if (draw_command == rsx::draw_command::array || draw_command == rsx::draw_command::indexed) { - m_attrib_ring_buffer->reserve_and_map(vertex_draw_count * max_vertex_attrib_size); + u32 verts_allocated = std::max(vertex_draw_count, max_index + 1); + m_attrib_ring_buffer->reserve_and_map(verts_allocated * max_vertex_attrib_size); for (int index = 0; index < rsx::limits::vertex_count; ++index) { @@ -395,20 +408,16 @@ void GLGSRender::set_vertex_buffer() continue; } } + if (draw_command == rsx::draw_command::array && !is_primitive_native(draw_mode)) { - std::tie(vertex_draw_count, vertex_index_array) = get_index_array_for_emulated_non_indexed_draw(first_count_commands, draw_mode); + std::tie(vertex_draw_count, offset_in_index_buffer) = get_index_array_for_emulated_non_indexed_draw(first_count_commands, draw_mode, *m_index_ring_buffer); } } m_attrib_ring_buffer->unmap(); + std::chrono::time_point now = std::chrono::system_clock::now(); + m_vertex_upload_time += std::chrono::duration_cast(now - then).count(); - if (draw_command == rsx::draw_command::indexed) - { - m_ebo.data(vertex_index_array.size(), vertex_index_array.data()); - } - else if (!is_primitive_native(draw_mode)) - { - m_ebo.data(vertex_index_array.size(), vertex_index_array.data()); - } + return offset_in_index_buffer; } \ No newline at end of file