From 4fcf9c6229a1837d816c372b627ba8fe5f92def5 Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Sun, 4 Jan 2015 11:20:42 -0800 Subject: [PATCH] MultiDrawIndirect draw batching - now down to <20us per draw. --- src/xenia/gpu/gl4/circular_buffer.h | 2 + src/xenia/gpu/gl4/command_processor.cc | 756 ++++++++------------- src/xenia/gpu/gl4/command_processor.h | 120 +--- src/xenia/gpu/gl4/draw_batcher.cc | 384 +++++++++++ src/xenia/gpu/gl4/draw_batcher.h | 230 +++++++ src/xenia/gpu/gl4/gl4_shader.cc | 206 ++++-- src/xenia/gpu/gl4/gl4_shader.h | 6 + src/xenia/gpu/gl4/gl4_shader_translator.cc | 4 +- src/xenia/gpu/gl4/gl_context.cc | 11 +- src/xenia/gpu/gl4/sources.gypi | 2 + 10 files changed, 1120 insertions(+), 601 deletions(-) create mode 100644 src/xenia/gpu/gl4/draw_batcher.cc create mode 100644 src/xenia/gpu/gl4/draw_batcher.h diff --git a/src/xenia/gpu/gl4/circular_buffer.h b/src/xenia/gpu/gl4/circular_buffer.h index d7288d5a8..aaa7ea7e8 100644 --- a/src/xenia/gpu/gl4/circular_buffer.h +++ b/src/xenia/gpu/gl4/circular_buffer.h @@ -35,6 +35,8 @@ class CircularBuffer { void Shutdown(); GLuint handle() const { return buffer_; } + GLuint64 gpu_handle() const { return gpu_base_; } + size_t capacity() const { return capacity_; } bool CanAcquire(size_t length); Allocation Acquire(size_t length); diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc index 750079ae5..c686a94af 100644 --- a/src/xenia/gpu/gl4/command_processor.cc +++ b/src/xenia/gpu/gl4/command_processor.cc @@ -25,6 +25,8 @@ #define XETRACECP(fmt, ...) \ if (FLAGS_trace_ring_buffer) XELOGGPU(fmt, ##__VA_ARGS__) +#define FINE_GRAINED_DRAW_SCOPES 1 + namespace xe { namespace gpu { namespace gl4 { @@ -39,6 +41,7 @@ const GLuint kAnyTarget = UINT_MAX; // with the GPU, so this should be large enough to prevent that in a normal // frame. 
const size_t kScratchBufferCapacity = 256 * 1024 * 1024; +const size_t kScratchBufferAlignment = 256; CommandProcessor::CachedPipeline::CachedPipeline() : vertex_program(0), fragment_program(0), handles({0}) {} @@ -69,12 +72,12 @@ CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system) active_vertex_shader_(nullptr), active_pixel_shader_(nullptr), active_framebuffer_(nullptr), - vertex_array_(0), point_list_geometry_program_(0), rect_list_geometry_program_(0), quad_list_geometry_program_(0), - scratch_buffer_(kScratchBufferCapacity) { - std::memset(&draw_command_, 0, sizeof(draw_command_)); + draw_index_count_(0), + draw_batcher_(graphics_system_->register_file()), + scratch_buffer_(kScratchBufferCapacity, kScratchBufferAlignment) { LARGE_INTEGER perf_counter; QueryPerformanceCounter(&perf_counter); time_base_ = perf_counter.QuadPart; @@ -163,6 +166,9 @@ void CommandProcessor::WorkerMain() { } bool CommandProcessor::SetupGL() { + if (FLAGS_vendor_gl_extensions && GLEW_NV_vertex_buffer_unified_memory) { + has_bindless_vbos_ = true; + } // Circular buffer holding scratch vertex/index data. if (!scratch_buffer_.Initialize()) { @@ -170,27 +176,18 @@ bool CommandProcessor::SetupGL() { return false; } + // Command buffer. + if (!draw_batcher_.Initialize(&scratch_buffer_)) { + PLOGE("Unable to initialize command buffer"); + return false; + } + // Texture cache that keeps track of any textures/samplers used. if (!texture_cache_.Initialize(membase_, &scratch_buffer_)) { PLOGE("Unable to initialize texture cache"); return false; } - // TODO(benvanik): cache. 
- glGenVertexArrays(1, &vertex_array_); - glBindVertexArray(vertex_array_); - - if (FLAGS_vendor_gl_extensions && GLEW_NV_vertex_buffer_unified_memory) { - has_bindless_vbos_ = true; - glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); - glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); - } - GLint max_vertex_attribs = 0; - glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &max_vertex_attribs); - for (GLint i = 0; i < max_vertex_attribs; ++i) { - glEnableVertexAttribArray(i); - } - const std::string geometry_header = "#version 450\n" "#extension all : warn\n" @@ -346,8 +343,8 @@ void CommandProcessor::ShutdownGL() { glDeleteProgram(point_list_geometry_program_); glDeleteProgram(rect_list_geometry_program_); glDeleteProgram(quad_list_geometry_program_); - glDeleteVertexArrays(1, &vertex_array_); texture_cache_.Shutdown(); + draw_batcher_.Shutdown(); scratch_buffer_.Shutdown(); } @@ -437,7 +434,8 @@ void CommandProcessor::PrepareForWait() { // TODO(benvanik): fences and fancy stuff. We should figure out a way to // make interrupt callbacks from the GPU so that we don't have to do a full // synchronize here. - glFlush(); + //glFlush(); + glFinish(); if (FLAGS_thread_safe_gl) { context_->ClearCurrent(); @@ -1106,46 +1104,45 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader, uint32_t index_count = dword1 >> 16; auto prim_type = static_cast(dword1 & 0x3F); - uint32_t index_base = 0; - uint32_t index_size = 0; - Endian index_endianness = Endian::kUnspecified; - bool index_32bit = false; uint32_t src_sel = (dword1 >> 6) & 0x3; if (src_sel == 0x0) { // Indexed draw. 
- index_base = reader->Read(); - index_size = reader->Read(); - index_endianness = static_cast(index_size >> 30); + index_buffer_info_.guest_base = reader->Read(); + uint32_t index_size = reader->Read(); + index_buffer_info_.endianness = static_cast(index_size >> 30); index_size &= 0x00FFFFFF; - index_32bit = (dword1 >> 11) & 0x1; + bool index_32bit = (dword1 >> 11) & 0x1; + index_buffer_info_.format = + index_32bit ? IndexFormat::kInt32 : IndexFormat::kInt16; index_size *= index_32bit ? 4 : 2; + index_buffer_info_.length = index_size; + index_buffer_info_.count = index_count; } else if (src_sel == 0x2) { // Auto draw. + index_buffer_info_.guest_base = 0; + index_buffer_info_.length = 0; } else { // Unknown source select. assert_always(); } + draw_index_count_ = index_count; - PrepareDraw(&draw_command_); - draw_command_.prim_type = prim_type; - draw_command_.start_index = 0; - draw_command_.index_count = index_count; - draw_command_.base_vertex = 0; + bool draw_valid = false; if (src_sel == 0x0) { // Indexed draw. - draw_command_.index_buffer.address = membase_ + index_base; - draw_command_.index_buffer.size = index_size; - draw_command_.index_buffer.endianness = index_endianness; - draw_command_.index_buffer.format = - index_32bit ? IndexFormat::kInt32 : IndexFormat::kInt16; + draw_valid = draw_batcher_.BeginDrawElements(prim_type, index_count, + index_buffer_info_.format); } else if (src_sel == 0x2) { // Auto draw. - draw_command_.index_buffer.address = nullptr; + draw_valid = draw_batcher_.BeginDrawArrays(prim_type, index_count); } else { // Unknown source select. assert_always(); } - return IssueDraw(&draw_command_); + if (!draw_valid) { + return false; + } + return IssueDraw(); } bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader, @@ -1164,14 +1161,15 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader, uint32_t indices_size = index_count * (index_32bit ? 
4 : 2); reader->CheckRead(indices_size / sizeof(uint32_t)); uint32_t index_ptr = reader->ptr(); + index_buffer_info_.guest_base = 0; + index_buffer_info_.length = 0; reader->Advance(count - 1); - PrepareDraw(&draw_command_); - draw_command_.prim_type = prim_type; - draw_command_.start_index = 0; - draw_command_.index_count = index_count; - draw_command_.base_vertex = 0; - draw_command_.index_buffer.address = nullptr; - return IssueDraw(&draw_command_); + draw_index_count_ = index_count; + bool draw_valid = draw_batcher_.BeginDrawArrays(prim_type, index_count); + if (!draw_valid) { + return false; + } + return IssueDraw(); } bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader, @@ -1319,58 +1317,30 @@ bool CommandProcessor::LoadShader(ShaderType shader_type, return true; } -void CommandProcessor::PrepareDraw(DrawCommand* draw_command) { - auto& regs = *register_file_; - auto& cmd = *draw_command; - - // Reset the things we don't modify so that we have clean state. - cmd.prim_type = PrimitiveType::kPointList; - cmd.index_count = 0; - cmd.index_buffer.address = nullptr; - - // Starting index when drawing indexed. - cmd.start_index = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32; - - // Min/max index ranges. This is often [0,FFFF|FFFFFF], but if it's not we - // can use it to do a glDrawRangeElements. - cmd.min_index = regs[XE_GPU_REG_VGT_MIN_VTX_INDX].u32; - cmd.max_index = regs[XE_GPU_REG_VGT_MAX_VTX_INDX].u32; - - // ? - cmd.base_vertex = 0; - - cmd.state_data = nullptr; -} - -bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { +bool CommandProcessor::IssueDraw() { +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = *register_file_; - auto& cmd = *draw_command; auto enable_mode = static_cast(regs[XE_GPU_REG_RB_MODECONTROL].u32 & 0x7); if (enable_mode == ModeControl::kIgnore) { // Ignored. 
+ draw_batcher_.DiscardDraw(); return true; } else if (enable_mode == ModeControl::kCopy) { // Special copy handling. - return IssueCopy(draw_command); - } - - // Allocate a state data block. - // Everything the shaders access lives here. - auto allocation = scratch_buffer_.Acquire(sizeof(UniformDataBlock)); - scratch_buffer_stats_.total_state_data_size += sizeof(UniformDataBlock); - cmd.state_data = reinterpret_cast(allocation.host_ptr); - if (!cmd.state_data) { - PLOGE("Unable to allocate uniform data buffer"); - return false; + draw_batcher_.DiscardDraw(); + return IssueCopy(); } #define CHECK_ISSUE_UPDATE_STATUS(status, mismatch, error_message) \ { \ if (status == UpdateStatus::kError) { \ PLOGE(error_message); \ + draw_batcher_.DiscardDraw(); \ return false; \ } else if (status == UpdateStatus::kMismatch) { \ mismatch = true; \ @@ -1379,93 +1349,31 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { UpdateStatus status; bool mismatch = false; - status = UpdateShaders(draw_command); + status = UpdateShaders(draw_batcher_.prim_type()); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to prepare draw shaders"); - status = UpdateRenderTargets(draw_command); + status = UpdateRenderTargets(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup render targets"); if (!active_framebuffer_) { // No framebuffer, so nothing we do will actually have an effect. // Treat it as a no-op. + // TODO(benvanik): if we have a vs export, still allow it to go. 
XETRACECP("No-op draw (no framebuffer set)"); + draw_batcher_.DiscardDraw(); return true; } - status = UpdateState(draw_command); + status = UpdateState(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup render state"); - status = UpdateConstants(draw_command); - CHECK_ISSUE_UPDATE_STATUS(status, mismatch, - "Unable to update shader constants"); - status = PopulateSamplers(draw_command); + status = PopulateSamplers(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to prepare draw samplers"); - status = PopulateIndexBuffer(draw_command); + status = PopulateIndexBuffer(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup index buffer"); - status = PopulateVertexBuffers(draw_command); + status = PopulateVertexBuffers(); CHECK_ISSUE_UPDATE_STATUS(status, mismatch, "Unable to setup vertex buffers"); - GLenum prim_type = 0; - switch (cmd.prim_type) { - case PrimitiveType::kPointList: - prim_type = GL_POINTS; - break; - case PrimitiveType::kLineList: - prim_type = GL_LINES; - break; - case PrimitiveType::kLineStrip: - prim_type = GL_LINE_STRIP; - break; - case PrimitiveType::kLineLoop: - prim_type = GL_LINE_LOOP; - break; - case PrimitiveType::kTriangleList: - prim_type = GL_TRIANGLES; - break; - case PrimitiveType::kTriangleStrip: - prim_type = GL_TRIANGLE_STRIP; - break; - case PrimitiveType::kTriangleFan: - prim_type = GL_TRIANGLE_FAN; - break; - case PrimitiveType::kRectangleList: - prim_type = GL_TRIANGLE_STRIP; - break; - case PrimitiveType::kQuadList: - prim_type = GL_LINES_ADJACENCY; - break; - default: - case PrimitiveType::kUnknown0x07: - prim_type = GL_POINTS; - XELOGE("unsupported primitive type %d", cmd.prim_type); - assert_unhandled_case(cmd.prim_type); - return false; - } - - // Commit the state buffer - nothing can change after this. 
- glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, scratch_buffer_.handle(), - allocation.offset, allocation.length); - scratch_buffer_.Commit(std::move(allocation)); - scratch_buffer_.Flush(); - - if (cmd.index_buffer.address) { - // Indexed draw. - // PopulateIndexBuffer has our element array setup. - size_t element_size = cmd.index_buffer.format == IndexFormat::kInt32 - ? sizeof(uint32_t) - : sizeof(uint16_t); - glDrawElementsBaseVertex( - prim_type, cmd.index_count, - cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT - : GL_UNSIGNED_SHORT, - reinterpret_cast(cmd.index_buffer.buffer_offset + - cmd.start_index * element_size), - cmd.base_vertex); - } else { - // Auto draw. - glDrawArrays(prim_type, cmd.start_index, cmd.index_count); - } - - return true; + return draw_batcher_.CommitDraw(); } bool CommandProcessor::SetShadowRegister(uint32_t& dest, @@ -1487,8 +1395,129 @@ bool CommandProcessor::SetShadowRegister(float& dest, uint32_t register_name) { return true; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateRenderTargets( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateShaders( + PrimitiveType prim_type) { + auto& regs = update_shaders_regs_; + + bool dirty = false; + dirty |= SetShadowRegister(regs.sq_program_cntl, XE_GPU_REG_SQ_PROGRAM_CNTL); + dirty |= regs.vertex_shader != active_vertex_shader_; + dirty |= regs.pixel_shader != active_pixel_shader_; + dirty |= regs.prim_type != prim_type; + if (!dirty) { + return UpdateStatus::kCompatible; + } + regs.vertex_shader = active_vertex_shader_; + regs.pixel_shader = active_pixel_shader_; + regs.prim_type = prim_type; + + SCOPE_profile_cpu_f("gpu"); + + draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + + xe_gpu_program_cntl_t program_cntl; + program_cntl.dword_0 = regs.sq_program_cntl; + if (!active_vertex_shader_->has_prepared()) { + if (!active_vertex_shader_->PrepareVertexShader(program_cntl)) { + XELOGE("Unable to prepare vertex shader"); 
+ return UpdateStatus::kError; + } + } else if (!active_vertex_shader_->is_valid()) { + XELOGE("Vertex shader invalid"); + return UpdateStatus::kError; + } + + if (!active_pixel_shader_->has_prepared()) { + if (!active_pixel_shader_->PreparePixelShader(program_cntl)) { + XELOGE("Unable to prepare pixel shader"); + return UpdateStatus::kError; + } + } else if (!active_pixel_shader_->is_valid()) { + XELOGE("Pixel shader invalid"); + return UpdateStatus::kError; + } + + GLuint vertex_program = active_vertex_shader_->program(); + GLuint fragment_program = active_pixel_shader_->program(); + + uint64_t key = (uint64_t(vertex_program) << 32) | fragment_program; + CachedPipeline* cached_pipeline = nullptr; + auto it = cached_pipelines_.find(key); + if (it == cached_pipelines_.end()) { + // Existing pipeline for these programs not found - create it. + auto new_pipeline = std::make_unique(); + new_pipeline->vertex_program = vertex_program; + new_pipeline->fragment_program = fragment_program; + new_pipeline->handles.default_pipeline = 0; + cached_pipeline = new_pipeline.get(); + all_pipelines_.emplace_back(std::move(new_pipeline)); + cached_pipelines_.insert({key, cached_pipeline}); + } else { + // Found a pipeline container - it may or may not have what we want. + cached_pipeline = it->second; + } + if (!cached_pipeline->handles.default_pipeline) { + // Perhaps it's a bit wasteful to do all of these, but oh well. 
+ GLuint pipelines[4]; + glCreateProgramPipelines(GLsizei(poly::countof(pipelines)), pipelines); + + glUseProgramStages(pipelines[0], GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipelines[0], GL_FRAGMENT_SHADER_BIT, fragment_program); + cached_pipeline->handles.default_pipeline = pipelines[0]; + + glUseProgramStages(pipelines[1], GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipelines[1], GL_GEOMETRY_SHADER_BIT, + point_list_geometry_program_); + glUseProgramStages(pipelines[1], GL_FRAGMENT_SHADER_BIT, fragment_program); + cached_pipeline->handles.point_list_pipeline = pipelines[1]; + + glUseProgramStages(pipelines[2], GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipelines[2], GL_GEOMETRY_SHADER_BIT, + rect_list_geometry_program_); + glUseProgramStages(pipelines[2], GL_FRAGMENT_SHADER_BIT, fragment_program); + cached_pipeline->handles.rect_list_pipeline = pipelines[2]; + + glUseProgramStages(pipelines[3], GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipelines[3], GL_GEOMETRY_SHADER_BIT, + quad_list_geometry_program_); + glUseProgramStages(pipelines[3], GL_FRAGMENT_SHADER_BIT, fragment_program); + cached_pipeline->handles.quad_list_pipeline = pipelines[3]; + + // This can be set once, as the buffer never changes. 
+ if (has_bindless_vbos_) { + glBindVertexArray(active_vertex_shader_->vao()); + glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, + scratch_buffer_.gpu_handle(), + scratch_buffer_.capacity()); + } else { + glVertexArrayElementBuffer(active_vertex_shader_->vao(), + scratch_buffer_.handle()); + } + } + + GLuint pipeline = cached_pipeline->handles.default_pipeline; + switch (regs.prim_type) { + case PrimitiveType::kPointList: + pipeline = cached_pipeline->handles.point_list_pipeline; + break; + case PrimitiveType::kRectangleList: + pipeline = cached_pipeline->handles.rect_list_pipeline; + break; + case PrimitiveType::kQuadList: + pipeline = cached_pipeline->handles.quad_list_pipeline; + break; + } + + draw_batcher_.ReconfigurePipeline(active_vertex_shader_, active_pixel_shader_, + pipeline); + + glBindProgramPipeline(pipeline); + glBindVertexArray(active_vertex_shader_->vao()); + + return UpdateStatus::kMismatch; +} + +CommandProcessor::UpdateStatus CommandProcessor::UpdateRenderTargets() { auto& regs = update_render_targets_regs_; bool dirty = false; @@ -1509,6 +1538,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRenderTargets( SCOPE_profile_cpu_f("gpu"); + draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + auto enable_mode = static_cast(regs.rb_modecontrol & 0x7); // RB_SURFACE_INFO @@ -1586,10 +1617,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRenderTargets( return UpdateStatus::kMismatch; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateState() { auto& regs = *register_file_; - auto state_data = draw_command->state_data; bool mismatch = false; @@ -1597,10 +1626,9 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateState( // Deprecated in GL, implemented in shader. 
// if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard; uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL].u32; - state_data->alpha_test.x = - (color_control & 0x4) ? 1.0f : 0.0f; // ALPAHTESTENABLE - state_data->alpha_test.y = float(color_control & 0x3); // ALPHAFUNC - state_data->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32; + draw_batcher_.set_alpha_test((color_control & 0x4) != 0, // ALPHATESTENABLE + color_control & 0x3, // ALPHAFUNC + regs[XE_GPU_REG_RB_ALPHA_REF].f32); #define CHECK_UPDATE_STATUS(status, mismatch, error_message) \ { \ @@ -1613,22 +1641,20 @@ } UpdateStatus status; - status = UpdateViewportState(draw_command); + status = UpdateViewportState(); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update viewport state"); - status = UpdateRasterizerState(draw_command); + status = UpdateRasterizerState(); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update rasterizer state"); - status = UpdateBlendState(draw_command); + status = UpdateBlendState(); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update blend state"); - status = UpdateDepthStencilState(draw_command); + status = UpdateDepthStencilState(); CHECK_UPDATE_STATUS(status, mismatch, "Unable to update depth/stencil state"); return mismatch ? UpdateStatus::kMismatch : UpdateStatus::kCompatible; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState() { auto& regs = *register_file_; - auto state_data = draw_command->state_data; // NOTE: we don't track state here as this is all cheap to update (ish). 
@@ -1644,18 +1670,16 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( // https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c if ((mode_control >> 17) & 1) { uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; - state_data->window_offset.x = float(window_offset & 0x7FFF); - state_data->window_offset.y = float((window_offset >> 16) & 0x7FFF); + draw_batcher_.set_window_offset(window_offset & 0x7FFF, + (window_offset >> 16) & 0x7FFF); } else { - state_data->window_offset.x = 0; - state_data->window_offset.y = 0; + draw_batcher_.set_window_offset(0, 0); } uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; - state_data->window_scissor.x = float(window_scissor_tl & 0x7FFF); - state_data->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF); - state_data->window_scissor.z = float(window_scissor_br & 0x7FFF); - state_data->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF); + draw_batcher_.set_window_scissor( + window_scissor_tl & 0x7FFF, (window_scissor_tl >> 16) & 0x7FFF, + window_scissor_br & 0x7FFF, (window_scissor_br >> 16) & 0x7FFF); // HACK: no clue where to get these values. // RB_SURFACE_INFO @@ -1676,8 +1700,7 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( window_height_scalar = 2; break; } - state_data->window_offset.z = window_width_scalar; - state_data->window_offset.w = window_height_scalar; + draw_batcher_.set_window_scalar(window_width_scalar, window_height_scalar); // Whether each of the viewport settings is enabled. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf @@ -1693,33 +1716,25 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( vport_yoffset_enable == vport_zoffset_enable); // Viewport scaling. Only enabled if the flags are all set. - state_data->viewport_scale.x = - vport_xscale_enable ? 
regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1; // 640 - state_data->viewport_offset.x = vport_xoffset_enable - ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 - : 0; // 640 - state_data->viewport_scale.y = vport_yscale_enable - ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 - : 1; // -360 - state_data->viewport_offset.y = vport_yoffset_enable - ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 - : 0; // 360 - state_data->viewport_scale.z = - vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1; // 1 - state_data->viewport_offset.z = - vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0; // 0 + draw_batcher_.set_viewport_offset( + vport_xoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 : 0, + vport_yoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 : 0, + vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0); + draw_batcher_.set_viewport_scale( + vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1, + vport_yscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 : 1, + vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1); // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf // VTX_XY_FMT = true: the incoming X, Y have already been multiplied by 1/W0. // = false: multiply the X, Y coordinates by 1/W0. - state_data->vtx_fmt.x = state_data->vtx_fmt.y = - (vte_control >> 8) & 0x1 ? 1.0f : 0.0f; // VTX_Z_FMT = true: the incoming Z has already been multiplied by 1/W0. // = false: multiply the Z coordinate by 1/W0. - state_data->vtx_fmt.z = (vte_control >> 9) & 0x1 ? 1.0f : 0.0f; // VTX_W0_FMT = true: the incoming W0 is not 1/W0. Perform the reciprocal to // get 1/W0. - state_data->vtx_fmt.w = (vte_control >> 10) & 0x1 ? 1.0f : 0.0f; + draw_batcher_.set_vtx_fmt((vte_control >> 8) & 0x1 ? 1.0f : 0.0f, + (vte_control >> 9) & 0x1 ? 1.0f : 0.0f, + (vte_control >> 10) & 0x1 ? 1.0f : 0.0f); // Clipping. 
// https://github.com/freedreno/amd-gpu/blob/master/include/reg/yamato/14/yamato_genenum.h#L1587 @@ -1732,8 +1747,7 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateViewportState( return UpdateStatus::kCompatible; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState() { auto& regs = update_rasterizer_state_regs_; bool dirty = false; @@ -1749,6 +1763,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( SCOPE_profile_cpu_f("gpu"); + draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + // Scissoring. if (regs.pa_sc_screen_scissor_tl != 0 && regs.pa_sc_screen_scissor_br != 0x20002000) { @@ -1766,10 +1782,6 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( glDisable(GL_SCISSOR_TEST); } - // Rect lists aren't culled. There may be other things they skip too. - assert_true((regs.pa_su_sc_mode_cntl & 0x3) == 0 || - draw_command->prim_type != PrimitiveType::kRectangleList); - switch (regs.pa_su_sc_mode_cntl & 0x3) { case 0: glDisable(GL_CULL_FACE); @@ -1784,6 +1796,12 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( break; } + if (regs.pa_su_sc_mode_cntl & (1 << 20)) { + glProvokingVertex(GL_LAST_VERTEX_CONVENTION); + } else { + glProvokingVertex(GL_FIRST_VERTEX_CONVENTION); + } + if (regs.pa_su_sc_mode_cntl & 0x4) { glFrontFace(GL_CW); } else { @@ -1797,8 +1815,7 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateRasterizerState( return UpdateStatus::kMismatch; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateBlendState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateBlendState() { auto& regs = update_blend_state_regs_; bool dirty = false; @@ -1820,6 +1837,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateBlendState( SCOPE_profile_cpu_f("gpu"); + 
draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + static const GLenum blend_map[] = { /* 0 */ GL_ZERO, /* 1 */ GL_ONE, @@ -1882,8 +1901,7 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateBlendState( return UpdateStatus::kMismatch; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateDepthStencilState( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::UpdateDepthStencilState() { auto& regs = update_depth_stencil_state_regs_; bool dirty = false; @@ -1896,6 +1914,8 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateDepthStencilState( SCOPE_profile_cpu_f("gpu"); + draw_batcher_.Flush(DrawBatcher::FlushMode::kStateChange); + static const GLenum compare_func_map[] = { /* 0 */ GL_NEVER, /* 1 */ GL_LESS, @@ -1977,192 +1997,72 @@ CommandProcessor::UpdateStatus CommandProcessor::UpdateDepthStencilState( return UpdateStatus::kMismatch; } -CommandProcessor::UpdateStatus CommandProcessor::UpdateConstants( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::PopulateIndexBuffer() { auto& regs = *register_file_; - auto state_data = draw_command->state_data; - - // TODO(benvanik): partial updates, etc. We could use shader constant access - // knowledge that we get at compile time to only upload those constants - // required. If we did this as a variable length then we could really cut - // down on state block sizes. - - // Copy over all constants. 
- std::memcpy(&state_data->float_consts, - ®s[XE_GPU_REG_SHADER_CONSTANT_000_X].f32, - sizeof(state_data->float_consts)); - std::memcpy( - &state_data->bool_consts, - ®s[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].f32, - sizeof(state_data->bool_consts) + sizeof(state_data->loop_consts)); - - return UpdateStatus::kCompatible; -} - -CommandProcessor::UpdateStatus CommandProcessor::UpdateShaders( - DrawCommand* draw_command) { - auto& regs = update_shaders_regs_; - auto& cmd = *draw_command; - - bool dirty = false; - dirty |= SetShadowRegister(regs.sq_program_cntl, XE_GPU_REG_SQ_PROGRAM_CNTL); - dirty |= regs.vertex_shader != active_vertex_shader_; - dirty |= regs.pixel_shader != active_pixel_shader_; - dirty |= regs.prim_type != cmd.prim_type; - if (!dirty) { + auto& info = index_buffer_info_; + if (!info.guest_base) { + // No index buffer or auto draw. return UpdateStatus::kCompatible; } - regs.vertex_shader = active_vertex_shader_; - regs.pixel_shader = active_pixel_shader_; - regs.prim_type = cmd.prim_type; +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES - xe_gpu_program_cntl_t program_cntl; - program_cntl.dword_0 = regs.sq_program_cntl; - if (!active_vertex_shader_->has_prepared()) { - if (!active_vertex_shader_->PrepareVertexShader(program_cntl)) { - XELOGE("Unable to prepare vertex shader"); - return UpdateStatus::kError; - } - } else if (!active_vertex_shader_->is_valid()) { - XELOGE("Vertex shader invalid"); - return UpdateStatus::kError; - } - - if (!active_pixel_shader_->has_prepared()) { - if (!active_pixel_shader_->PreparePixelShader(program_cntl)) { - XELOGE("Unable to prepare pixel shader"); - return UpdateStatus::kError; - } - } else if (!active_pixel_shader_->is_valid()) { - XELOGE("Pixel shader invalid"); - return UpdateStatus::kError; - } - - GLuint vertex_program = active_vertex_shader_->program(); - GLuint fragment_program = active_pixel_shader_->program(); - - uint64_t key = (uint64_t(vertex_program) << 
32) | fragment_program; - CachedPipeline* cached_pipeline = nullptr; - auto it = cached_pipelines_.find(key); - if (it == cached_pipelines_.end()) { - // Existing pipeline for these programs not found - create it. - auto new_pipeline = std::make_unique(); - new_pipeline->vertex_program = vertex_program; - new_pipeline->fragment_program = fragment_program; - new_pipeline->handles.default_pipeline = 0; - cached_pipeline = new_pipeline.get(); - all_pipelines_.emplace_back(std::move(new_pipeline)); - cached_pipelines_.insert({key, cached_pipeline}); - } else { - // Found a pipeline container - it may or may not have what we want. - cached_pipeline = it->second; - } - if (!cached_pipeline->handles.default_pipeline) { - // Perhaps it's a bit wasteful to do all of these, but oh well. - GLuint pipelines[4]; - glCreateProgramPipelines(GLsizei(poly::countof(pipelines)), pipelines); - - glUseProgramStages(pipelines[0], GL_VERTEX_SHADER_BIT, vertex_program); - glUseProgramStages(pipelines[0], GL_FRAGMENT_SHADER_BIT, fragment_program); - cached_pipeline->handles.default_pipeline = pipelines[0]; - - glUseProgramStages(pipelines[1], GL_VERTEX_SHADER_BIT, vertex_program); - glUseProgramStages(pipelines[1], GL_GEOMETRY_SHADER_BIT, - point_list_geometry_program_); - glUseProgramStages(pipelines[1], GL_FRAGMENT_SHADER_BIT, fragment_program); - cached_pipeline->handles.point_list_pipeline = pipelines[1]; - - glUseProgramStages(pipelines[2], GL_VERTEX_SHADER_BIT, vertex_program); - glUseProgramStages(pipelines[2], GL_GEOMETRY_SHADER_BIT, - rect_list_geometry_program_); - glUseProgramStages(pipelines[2], GL_FRAGMENT_SHADER_BIT, fragment_program); - cached_pipeline->handles.rect_list_pipeline = pipelines[2]; - - glUseProgramStages(pipelines[3], GL_VERTEX_SHADER_BIT, vertex_program); - glUseProgramStages(pipelines[3], GL_GEOMETRY_SHADER_BIT, - quad_list_geometry_program_); - glUseProgramStages(pipelines[3], GL_FRAGMENT_SHADER_BIT, fragment_program); - 
cached_pipeline->handles.quad_list_pipeline = pipelines[3]; - } - - GLuint pipeline = cached_pipeline->handles.default_pipeline; - switch (regs.prim_type) { - case PrimitiveType::kPointList: - pipeline = cached_pipeline->handles.point_list_pipeline; - break; - case PrimitiveType::kRectangleList: - pipeline = cached_pipeline->handles.rect_list_pipeline; - break; - case PrimitiveType::kQuadList: - pipeline = cached_pipeline->handles.quad_list_pipeline; - break; - } - glBindProgramPipeline(pipeline); - - return UpdateStatus::kMismatch; -} - -CommandProcessor::UpdateStatus CommandProcessor::PopulateIndexBuffer( - DrawCommand* draw_command) { - auto& cmd = *draw_command; - - auto& info = cmd.index_buffer; - if (!cmd.index_count || !info.address) { - // No index buffer or auto draw. - return UpdateStatus::kMismatch; // ? - } - - SCOPE_profile_cpu_f("gpu"); + // Min/max index ranges. This is often [0,FFFF|FFFFFF], but if it's not we + can use it to do a glDrawRangeElements. + uint32_t min_index = regs[XE_GPU_REG_VGT_MIN_VTX_INDX].u32; + uint32_t max_index = regs[XE_GPU_REG_VGT_MAX_VTX_INDX].u32; + assert_true(min_index == 0); + assert_true(max_index == 0xFFFF || max_index == 0xFFFFFF); assert_true(info.endianness == Endian::k8in16 || info.endianness == Endian::k8in32); size_t total_size = - cmd.index_count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t) - : sizeof(uint16_t)); + info.count * (info.format == IndexFormat::kInt32 ? 
sizeof(uint32_t) + : sizeof(uint16_t)); auto allocation = scratch_buffer_.Acquire(total_size); - scratch_buffer_stats_.total_indices_size += total_size; if (info.format == IndexFormat::kInt32) { - poly::copy_and_swap_32_aligned( - reinterpret_cast(allocation.host_ptr), - reinterpret_cast(cmd.index_buffer.address), - cmd.index_count); + auto dest = reinterpret_cast(allocation.host_ptr); + auto src = reinterpret_cast(membase_ + info.guest_base); + uint32_t max_index_found; + poly::copy_and_swap_32_aligned(dest, src, info.count, &max_index_found); + index_buffer_info_.max_index_found = max_index_found; } else { - poly::copy_and_swap_16_aligned( - reinterpret_cast(allocation.host_ptr), - reinterpret_cast(cmd.index_buffer.address), - cmd.index_count); + auto dest = reinterpret_cast(allocation.host_ptr); + auto src = reinterpret_cast(membase_ + info.guest_base); + uint16_t max_index_found; + poly::copy_and_swap_16_aligned(dest, src, info.count, &max_index_found); + index_buffer_info_.max_index_found = max_index_found; } - if (has_bindless_vbos_) { - glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, allocation.gpu_ptr, - allocation.length); - } else { - // Offset is used in glDrawElements. 
- cmd.index_buffer.buffer_offset = allocation.offset; - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, scratch_buffer_.handle()); - } + draw_batcher_.set_index_buffer(allocation); + scratch_buffer_.Commit(std::move(allocation)); - return UpdateStatus::kMismatch; + return UpdateStatus::kCompatible; } -CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers() { +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = *register_file_; - auto& cmd = *draw_command; assert_not_null(active_vertex_shader_); - const auto& buffer_inputs = active_vertex_shader_->buffer_inputs(); + if (!has_bindless_vbos_) { + // TODO(benvanik): find a way to get around glVertexArrayVertexBuffer below. + draw_batcher_.Flush(DrawBatcher::FlushMode::kMakeCoherent); + } uint32_t el_index = 0; - for (uint32_t n = 0; n < buffer_inputs.count; n++) { - const auto& desc = buffer_inputs.descs[n]; - + const auto& buffer_inputs = active_vertex_shader_->buffer_inputs(); + for (uint32_t buffer_index = 0; buffer_index < buffer_inputs.count; + ++buffer_index) { + const auto& desc = buffer_inputs.descs[buffer_index]; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + (desc.fetch_slot / 3) * 6; auto group = reinterpret_cast(®s.values[r]); xe_gpu_vertex_fetch_t* fetch = nullptr; @@ -2177,14 +2077,16 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers( fetch = &group->vertex_fetch_2; break; } - assert_not_null(fetch); - assert_true(fetch->type == 0x3); // must be of type vertex - // TODO(benvanik): some games have type 2, which is texture - maybe - // fetch_slot wrong? - assert_not_zero(fetch->size); - auto allocation = scratch_buffer_.Acquire(fetch->size * sizeof(uint32_t)); - scratch_buffer_stats_.total_vertices_size += fetch->size * sizeof(uint32_t); + // Constrain the vertex upload to just what we are interested in. 
+ const size_t kRangeKludge = 5; // could pick index count based on prim. + uint32_t max_index = index_buffer_info_.guest_base + ? index_buffer_info_.max_index_found + : draw_index_count_; + size_t valid_range = (max_index + kRangeKludge) * desc.stride_words * 4; + valid_range = std::min(valid_range, size_t(fetch->size * 4)); + + auto allocation = scratch_buffer_.Acquire(valid_range); // Copy and byte swap the entire buffer. // We could be smart about this to save GPU bandwidth by building a CRC @@ -2193,93 +2095,35 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateVertexBuffers( poly::copy_and_swap_32_aligned( reinterpret_cast(allocation.host_ptr), reinterpret_cast(membase_ + (fetch->address << 2)), - fetch->size); + valid_range / 4); if (!has_bindless_vbos_) { - glBindVertexBuffer(n, scratch_buffer_.handle(), allocation.offset, - desc.stride_words * 4); + // TODO(benvanik): if we could find a way to avoid this, we could use + // multidraw without flushing. + glVertexArrayVertexBuffer(active_vertex_shader_->vao(), buffer_index, + scratch_buffer_.handle(), allocation.offset, + desc.stride_words * 4); } - for (uint32_t i = 0; i < desc.element_count; ++i) { - const auto& el = desc.elements[i]; - auto comp_count = GetVertexFormatComponentCount(el.format); - GLenum comp_type; - switch (el.format) { - case VertexFormat::k_8_8_8_8: - comp_type = el.is_signed ? GL_BYTE : GL_UNSIGNED_BYTE; - break; - case VertexFormat::k_2_10_10_10: - comp_type = el.is_signed ? GL_INT_2_10_10_10_REV - : GL_UNSIGNED_INT_2_10_10_10_REV; - break; - case VertexFormat::k_10_11_11: - assert_false(el.is_signed); - comp_type = GL_UNSIGNED_INT_10F_11F_11F_REV; - break; - /*case VertexFormat::k_11_11_10: - break;*/ - case VertexFormat::k_16_16: - comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT; - break; - case VertexFormat::k_16_16_FLOAT: - comp_type = GL_HALF_FLOAT; - break; - case VertexFormat::k_16_16_16_16: - comp_type = el.is_signed ? 
GL_SHORT : GL_UNSIGNED_SHORT; - break; - case VertexFormat::k_16_16_16_16_FLOAT: - comp_type = GL_HALF_FLOAT; - break; - case VertexFormat::k_32: - comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; - break; - case VertexFormat::k_32_32: - comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; - break; - case VertexFormat::k_32_32_32_32: - comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; - break; - case VertexFormat::k_32_FLOAT: - comp_type = GL_FLOAT; - break; - case VertexFormat::k_32_32_FLOAT: - comp_type = GL_FLOAT; - break; - case VertexFormat::k_32_32_32_FLOAT: - comp_type = GL_FLOAT; - break; - case VertexFormat::k_32_32_32_32_FLOAT: - comp_type = GL_FLOAT; - break; - default: - assert_unhandled_case(el.format); - break; + if (has_bindless_vbos_) { + for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) { + const auto& el = desc.elements[i]; + draw_batcher_.set_vertex_buffer(el_index, 0, desc.stride_words * 4, + allocation); } - if (has_bindless_vbos_) { - glVertexAttribFormatNV(el_index, comp_count, comp_type, - el.is_normalized, - desc.stride_words * sizeof(uint32_t)); - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index, - allocation.gpu_ptr + (el.offset_words * 4), - allocation.length - (el.offset_words * 4)); - } else { - glVertexAttribBinding(el_index, n); - glVertexAttribFormat(el_index, comp_count, comp_type, el.is_normalized, - el.offset_words * 4); - } - ++el_index; } - // Flush buffer before we draw. 
scratch_buffer_.Commit(std::move(allocation)); } - return UpdateStatus::kMismatch; + return UpdateStatus::kCompatible; } -CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers( - DrawCommand* draw_command) { +CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers() { +#if FINE_GRAINED_DRAW_SCOPES SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + auto& regs = *register_file_; bool mismatch = false; @@ -2296,7 +2140,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers( continue; } has_setup_sampler[desc.fetch_slot] = true; - auto status = PopulateSampler(draw_command, desc); + auto status = PopulateSampler(desc); if (status == UpdateStatus::kError) { return status; } else if (status == UpdateStatus::kMismatch) { @@ -2312,7 +2156,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers( continue; } has_setup_sampler[desc.fetch_slot] = true; - auto status = PopulateSampler(draw_command, desc); + auto status = PopulateSampler(desc); if (status == UpdateStatus::kError) { return UpdateStatus::kError; } else if (status == UpdateStatus::kMismatch) { @@ -2324,7 +2168,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSamplers( } CommandProcessor::UpdateStatus CommandProcessor::PopulateSampler( - DrawCommand* draw_command, const Shader::SamplerDesc& desc) { + const Shader::SamplerDesc& desc) { auto& regs = *register_file_; int r = XE_GPU_REG_SHADER_CONSTANT_FETCH_00_0 + desc.fetch_slot * 6; auto group = reinterpret_cast(®s.values[r]); @@ -2332,7 +2176,7 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSampler( // Reset slot. // If we fail, we still draw but with an invalid texture. 
- draw_command->state_data->texture_samplers[desc.fetch_slot] = 0; + draw_batcher_.set_texture_sampler(desc.fetch_slot, 0); if (FLAGS_disable_textures) { return UpdateStatus::kCompatible; @@ -2363,13 +2207,13 @@ CommandProcessor::UpdateStatus CommandProcessor::PopulateSampler( } // Shaders will use bindless to fetch right from it. - draw_command->state_data->texture_samplers[desc.fetch_slot] = - entry_view->texture_sampler_handle; + draw_batcher_.set_texture_sampler(desc.fetch_slot, + entry_view->texture_sampler_handle); return UpdateStatus::kCompatible; } -bool CommandProcessor::IssueCopy(DrawCommand* draw_command) { +bool CommandProcessor::IssueCopy() { SCOPE_profile_cpu_f("gpu"); auto& regs = *register_file_; diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h index d1cdcd141..481bb98ff 100644 --- a/src/xenia/gpu/gl4/command_processor.h +++ b/src/xenia/gpu/gl4/command_processor.h @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -41,73 +42,6 @@ struct SwapParameters { GLenum attachment; }; -// This must match the layout in gl4_shader.cc. -struct UniformDataBlock { - union float4 { - float v[4]; - struct { - float x, y, z, w; - }; - }; - - float4 window_offset; // tx,ty,sx,sy - float4 window_scissor; // x0,y0,x1,y1 - float4 vtx_fmt; - float4 viewport_offset; // tx,ty,tz,? - float4 viewport_scale; // sx,sy,sz,? - // TODO(benvanik): vertex format xyzw? - - float4 alpha_test; // alpha test enable, func, ref, ? - - // TODO(benvanik): pack tightly - uint64_t texture_samplers[32]; - - // Register data from 0x4000 to 0x4927. - // UpdateConstants relies on the packing of these. - struct { - // SHADER_CONSTANT_000_X... - float4 float_consts[512]; - // SHADER_CONSTANT_FETCH_00_0 is omitted - // SHADER_CONSTANT_BOOL_000_031... - int32_t bool_consts[8]; - // SHADER_CONSTANT_LOOP_00... 
- int32_t loop_consts[32]; - }; -}; -static_assert(sizeof(UniformDataBlock) <= 16 * 1024, "Need <=16k uniform data"); - -// TODO(benvanik): move more of the enums in here? -struct DrawCommand { - PrimitiveType prim_type; - uint32_t start_index; - uint32_t min_index; - uint32_t max_index; - uint32_t index_count; - uint32_t base_vertex; - - // Index buffer, if present. - // If index_count > 0 but buffer is nullptr then auto draw. - struct { - const uint8_t* address; - size_t size; - xenos::Endian endianness; - xenos::IndexFormat format; - size_t buffer_offset; - } index_buffer; - - // Texture samplers. - struct SamplerInput { - uint32_t input_index; - // TextureResource* texture; - // SamplerStateResource* sampler_state; - }; - SamplerInput vertex_shader_samplers[32]; - SamplerInput pixel_shader_samplers[32]; - - // NOTE: do not read from this - the mapped memory is likely write combined. - UniformDataBlock* state_data; -}; - class CommandProcessor { public: CommandProcessor(GL4GraphicsSystem* graphics_system); @@ -241,22 +175,19 @@ class CommandProcessor { bool LoadShader(ShaderType shader_type, const uint32_t* address, uint32_t dword_count); - void PrepareDraw(DrawCommand* draw_command); - bool IssueDraw(DrawCommand* draw_command); - UpdateStatus UpdateRenderTargets(DrawCommand* draw_command); - UpdateStatus UpdateState(DrawCommand* draw_command); - UpdateStatus UpdateViewportState(DrawCommand* draw_command); - UpdateStatus UpdateRasterizerState(DrawCommand* draw_command); - UpdateStatus UpdateBlendState(DrawCommand* draw_command); - UpdateStatus UpdateDepthStencilState(DrawCommand* draw_command); - UpdateStatus UpdateConstants(DrawCommand* draw_command); - UpdateStatus UpdateShaders(DrawCommand* draw_command); - UpdateStatus PopulateIndexBuffer(DrawCommand* draw_command); - UpdateStatus PopulateVertexBuffers(DrawCommand* draw_command); - UpdateStatus PopulateSamplers(DrawCommand* draw_command); - UpdateStatus PopulateSampler(DrawCommand* draw_command, - const 
Shader::SamplerDesc& desc); - bool IssueCopy(DrawCommand* draw_command); + bool IssueDraw(); + UpdateStatus UpdateShaders(PrimitiveType prim_type); + UpdateStatus UpdateRenderTargets(); + UpdateStatus UpdateState(); + UpdateStatus UpdateViewportState(); + UpdateStatus UpdateRasterizerState(); + UpdateStatus UpdateBlendState(); + UpdateStatus UpdateDepthStencilState(); + UpdateStatus PopulateIndexBuffer(); + UpdateStatus PopulateVertexBuffers(); + UpdateStatus PopulateSamplers(); + UpdateStatus PopulateSampler(const Shader::SamplerDesc& desc); + bool IssueCopy(); CachedFramebuffer* GetFramebuffer(GLuint color_targets[4], GLuint depth_target); @@ -306,21 +237,23 @@ class CommandProcessor { std::vector cached_depth_render_targets_; std::vector> all_pipelines_; std::unordered_map cached_pipelines_; - GLuint vertex_array_; GLuint point_list_geometry_program_; GLuint rect_list_geometry_program_; GLuint quad_list_geometry_program_; + struct { + xenos::IndexFormat format; + xenos::Endian endianness; + uint32_t count; + uint32_t guest_base; + size_t length; + uint32_t max_index_found; + } index_buffer_info_; + uint32_t draw_index_count_; TextureCache texture_cache_; + DrawBatcher draw_batcher_; CircularBuffer scratch_buffer_; - struct ScratchBufferStats { - size_t total_state_data_size = 0; - size_t total_indices_size = 0; - size_t total_vertices_size = 0; - } scratch_buffer_stats_; - - DrawCommand draw_command_; private: bool SetShadowRegister(uint32_t& dest, uint32_t register_name); @@ -341,7 +274,6 @@ class CommandProcessor { void Reset() { std::memset(this, 0, sizeof(*this)); } } update_render_targets_regs_; struct UpdateViewportStateRegisters { - // UpdateViewportStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } } update_viewport_state_regs_; @@ -367,7 +299,6 @@ class CommandProcessor { UpdateDepthStencilStateRegisters() { Reset(); } void Reset() { std::memset(this, 0, sizeof(*this)); } } update_depth_stencil_state_regs_; - // 
TODO(benvanik): constant bitmask? struct UpdateShadersRegisters { PrimitiveType prim_type; uint32_t sq_program_cntl; @@ -380,9 +311,6 @@ class CommandProcessor { vertex_shader = pixel_shader = nullptr; } } update_shaders_regs_; - // ib - // vb - // samplers }; } // namespace gl4 diff --git a/src/xenia/gpu/gl4/draw_batcher.cc b/src/xenia/gpu/gl4/draw_batcher.cc new file mode 100644 index 000000000..c5d0bdb39 --- /dev/null +++ b/src/xenia/gpu/gl4/draw_batcher.cc @@ -0,0 +1,384 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include + +#include +#include +#include +#include + +namespace xe { +namespace gpu { +namespace gl4 { + +using namespace xe::gpu::xenos; + +extern "C" GLEWContext* glewGetContext(); + +const size_t kCommandBufferCapacity = 16 * (1024 * 1024); +const size_t kCommandBufferAlignment = 4; +const size_t kStateBufferCapacity = 64 * (1024 * 1024); +const size_t kStateBufferAlignment = 256; + +DrawBatcher::DrawBatcher(RegisterFile* register_file) + : register_file_(register_file), + command_buffer_(kCommandBufferCapacity, kCommandBufferAlignment), + state_buffer_(kStateBufferCapacity, kStateBufferAlignment), + array_data_buffer_(nullptr), + has_bindless_mdi_(false), + draw_open_(false) { + std::memset(&batch_state_, 0, sizeof(batch_state_)); + batch_state_.needs_reconfigure = true; + batch_state_.command_range_start = batch_state_.state_range_start = + UINTPTR_MAX; + std::memset(&active_draw_, 0, sizeof(active_draw_)); +} + +bool DrawBatcher::Initialize(CircularBuffer* array_data_buffer) { + array_data_buffer_ = array_data_buffer; + if (!command_buffer_.Initialize()) 
{ + return false; + } + if (!state_buffer_.Initialize()) { + return false; + } + glBindBuffer(GL_DRAW_INDIRECT_BUFFER, command_buffer_.handle()); + if (FLAGS_vendor_gl_extensions && GLEW_NV_bindless_multi_draw_indirect) { + has_bindless_mdi_ = true; + } + return true; +} + +void DrawBatcher::Shutdown() { + command_buffer_.Shutdown(); + state_buffer_.Shutdown(); +} + +bool DrawBatcher::ReconfigurePipeline(GL4Shader* vertex_shader, + GL4Shader* pixel_shader, + GLuint pipeline) { + if (batch_state_.pipeline == pipeline) { + // No-op. + return true; + } + if (!Flush(FlushMode::kReconfigure)) { + return false; + } + + batch_state_.vertex_shader = vertex_shader; + batch_state_.pixel_shader = pixel_shader; + batch_state_.pipeline = pipeline; + + return true; +} + +bool DrawBatcher::BeginDrawArrays(PrimitiveType prim_type, + uint32_t index_count) { + assert_false(draw_open_); + if (batch_state_.prim_type != prim_type || batch_state_.indexed) { + if (!Flush(FlushMode::kReconfigure)) { + return false; + } + } + batch_state_.prim_type = prim_type; + batch_state_.indexed = false; + + if (!BeginDraw()) { + return false; + } + + auto cmd = active_draw_.draw_arrays_cmd; + cmd->base_instance = 0; + cmd->instance_count = 1; + cmd->count = index_count; + cmd->first_index = 0; + + return true; +} + +bool DrawBatcher::BeginDrawElements(PrimitiveType prim_type, + uint32_t index_count, + IndexFormat index_format) { + assert_false(draw_open_); + GLenum index_type = + index_format == IndexFormat::kInt32 ? 
GL_UNSIGNED_INT : GL_UNSIGNED_SHORT; + if (batch_state_.prim_type != prim_type || !batch_state_.indexed || + batch_state_.index_type != index_type) { + if (!Flush(FlushMode::kReconfigure)) { + return false; + } + } + batch_state_.prim_type = prim_type; + batch_state_.indexed = true; + batch_state_.index_type = index_type; + + if (!BeginDraw()) { + return false; + } + + uint32_t start_index = register_file_->values[XE_GPU_REG_VGT_INDX_OFFSET].u32; + assert_zero(start_index); + + auto cmd = active_draw_.draw_elements_cmd; + cmd->base_instance = 0; + cmd->instance_count = 1; + cmd->count = index_count; + cmd->first_index = start_index; + cmd->base_vertex = 0; + + if (has_bindless_mdi_) { + auto bindless_cmd = active_draw_.draw_elements_bindless_cmd; + bindless_cmd->reserved_zero = 0; + } + return true; +} + +bool DrawBatcher::BeginDraw() { + draw_open_ = true; + + if (batch_state_.needs_reconfigure) { + batch_state_.needs_reconfigure = false; + // Have been reconfigured since last draw - need to compute state size. + // Layout: + // [draw command] + // [common header] + // [consts] + + // Padded to max. + GLsizei command_size = 0; + if (has_bindless_mdi_) { + if (batch_state_.indexed) { + command_size = sizeof(DrawElementsIndirectBindlessCommandNV); + } else { + command_size = sizeof(DrawArraysIndirectBindlessCommandNV); + } + } else { + if (batch_state_.indexed) { + command_size = sizeof(DrawElementsIndirectCommand); + } else { + command_size = sizeof(DrawArraysIndirectCommand); + } + } + batch_state_.command_stride = + poly::round_up(command_size, GLsizei(kCommandBufferAlignment)); + + GLsizei header_size = sizeof(CommonHeader); + + // TODO(benvanik); consts sizing. 
+ // GLsizei float_consts_size = sizeof(float4) * 512; + // GLsizei bool_consts_size = sizeof(uint32_t) * 8; + // GLsizei loop_consts_size = sizeof(uint32_t) * 32; + // GLsizei consts_size = + // float_consts_size + bool_consts_size + loop_consts_size; + // batch_state_.float_consts_offset = batch_state_.header_offset + + // header_size; + // batch_state_.bool_consts_offset = + // batch_state_.float_consts_offset + float_consts_size; + // batch_state_.loop_consts_offset = + // batch_state_.bool_consts_offset + bool_consts_size; + GLsizei consts_size = 0; + + batch_state_.state_stride = header_size + consts_size; + } + + // Allocate a command data block. + // We should treat it as write-only. + if (!command_buffer_.CanAcquire(batch_state_.command_stride)) { + Flush(FlushMode::kMakeCoherent); + } + active_draw_.command_allocation = + command_buffer_.Acquire(batch_state_.command_stride); + assert_not_null(active_draw_.command_allocation.host_ptr); + + // Allocate a state data block. + // We should treat it as write-only. + if (!state_buffer_.CanAcquire(batch_state_.state_stride)) { + Flush(FlushMode::kMakeCoherent); + } + active_draw_.state_allocation = + state_buffer_.Acquire(batch_state_.state_stride); + assert_not_null(active_draw_.state_allocation.host_ptr); + + active_draw_.command_address = + reinterpret_cast(active_draw_.command_allocation.host_ptr); + auto state_host_ptr = + reinterpret_cast(active_draw_.state_allocation.host_ptr); + active_draw_.header = reinterpret_cast(state_host_ptr); + // active_draw_.float_consts = + // reinterpret_cast(state_host_ptr + + // batch_state_.float_consts_offset); + // active_draw_.bool_consts = + // reinterpret_cast(state_host_ptr + + // batch_state_.bool_consts_offset); + // active_draw_.loop_consts = + // reinterpret_cast(state_host_ptr + + // batch_state_.loop_consts_offset); + return true; +} + +void DrawBatcher::DiscardDraw() { + if (!draw_open_) { + // No-op. 
+ return; + } + draw_open_ = false; + + command_buffer_.Discard(std::move(active_draw_.command_allocation)); + state_buffer_.Discard(std::move(active_draw_.state_allocation)); +} + +bool DrawBatcher::CommitDraw() { + assert_true(draw_open_); + draw_open_ = false; + + // Copy over required constants. + CopyConstants(); + + if (batch_state_.state_range_start == UINTPTR_MAX) { + batch_state_.command_range_start = active_draw_.command_allocation.offset; + batch_state_.state_range_start = active_draw_.state_allocation.offset; + } + batch_state_.command_range_length += + active_draw_.command_allocation.aligned_length; + batch_state_.state_range_length += + active_draw_.state_allocation.aligned_length; + + command_buffer_.Commit(std::move(active_draw_.command_allocation)); + state_buffer_.Commit(std::move(active_draw_.state_allocation)); + + ++batch_state_.draw_count; + return true; +} + +bool DrawBatcher::Flush(FlushMode mode) { + if (batch_state_.draw_count) { + SCOPE_profile_cpu_f("gpu"); + + assert_not_zero(batch_state_.command_stride); + assert_not_zero(batch_state_.state_stride); + + // Flush pending buffer changes. + command_buffer_.Flush(); + state_buffer_.Flush(); + array_data_buffer_->Flush(); + + // State data is indexed by draw ID. 
+ glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, state_buffer_.handle(), + batch_state_.state_range_start, + batch_state_.state_range_length); + + GLenum prim_type = 0; + switch (batch_state_.prim_type) { + case PrimitiveType::kPointList: + prim_type = GL_POINTS; + break; + case PrimitiveType::kLineList: + prim_type = GL_LINES; + break; + case PrimitiveType::kLineStrip: + prim_type = GL_LINE_STRIP; + break; + case PrimitiveType::kLineLoop: + prim_type = GL_LINE_LOOP; + break; + case PrimitiveType::kTriangleList: + prim_type = GL_TRIANGLES; + break; + case PrimitiveType::kTriangleStrip: + prim_type = GL_TRIANGLE_STRIP; + break; + case PrimitiveType::kTriangleFan: + prim_type = GL_TRIANGLE_FAN; + break; + case PrimitiveType::kRectangleList: + prim_type = GL_TRIANGLE_STRIP; + // Rect lists aren't culled. There may be other things they skip too. + // assert_true((register_file_->values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 + // & + // 0x3) == 0); + break; + case PrimitiveType::kQuadList: + prim_type = GL_LINES_ADJACENCY; + break; + default: + case PrimitiveType::kUnknown0x07: + prim_type = GL_POINTS; + XELOGE("unsupported primitive type %d", batch_state_.prim_type); + assert_unhandled_case(batch_state_.prim_type); + DiscardDraw(); + return false; + } + + // Fast path for single draws. 
+ void* indirect_offset = + reinterpret_cast(batch_state_.command_range_start); + + if (has_bindless_mdi_) { + int vertex_buffer_count = + batch_state_.vertex_shader->buffer_inputs().total_elements_count; + assert_true(vertex_buffer_count < 8); + if (batch_state_.indexed) { + glMultiDrawElementsIndirectBindlessNV( + prim_type, batch_state_.index_type, indirect_offset, + batch_state_.draw_count, batch_state_.command_stride, + vertex_buffer_count); + } else { + glMultiDrawArraysIndirectBindlessNV( + prim_type, indirect_offset, batch_state_.draw_count, + batch_state_.command_stride, vertex_buffer_count); + } + } else { + if (batch_state_.indexed) { + glMultiDrawElementsIndirect(prim_type, batch_state_.index_type, + indirect_offset, batch_state_.draw_count, + batch_state_.command_stride); + } else { + glMultiDrawArraysIndirect(prim_type, indirect_offset, + batch_state_.draw_count, + batch_state_.command_stride); + } + } + + batch_state_.command_range_start = UINTPTR_MAX; + batch_state_.command_range_length = 0; + batch_state_.state_range_start = UINTPTR_MAX; + batch_state_.state_range_length = 0; + batch_state_.draw_count = 0; + } + + if (mode == FlushMode::kReconfigure) { + // Reset - we'll update it as soon as we have all the information. + batch_state_.needs_reconfigure = true; + } + + return true; +} + +void DrawBatcher::CopyConstants() { + // TODO(benvanik): partial updates, etc. We could use shader constant access + // knowledge that we get at compile time to only upload those constants + // required. If we did this as a variable length then we could really cut + // down on state block sizes. 
+ + std::memcpy(active_draw_.header->float_consts, + ®ister_file_->values[XE_GPU_REG_SHADER_CONSTANT_000_X].f32, + sizeof(active_draw_.header->float_consts)); + std::memcpy( + active_draw_.header->bool_consts, + ®ister_file_->values[XE_GPU_REG_SHADER_CONSTANT_BOOL_000_031].f32, + sizeof(active_draw_.header->bool_consts)); + std::memcpy(active_draw_.header->loop_consts, + ®ister_file_->values[XE_GPU_REG_SHADER_CONSTANT_LOOP_00].f32, + sizeof(active_draw_.header->loop_consts)); +} + +} // namespace gl4 +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/gl4/draw_batcher.h b/src/xenia/gpu/gl4/draw_batcher.h new file mode 100644 index 000000000..f930ba2a0 --- /dev/null +++ b/src/xenia/gpu/gl4/draw_batcher.h @@ -0,0 +1,230 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2014 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#ifndef XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_ +#define XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_ + +#include +#include +#include +#include +#include +#include + +namespace xe { +namespace gpu { +namespace gl4 { + +union float4 { + float v[4]; + struct { + float x, y, z, w; + }; +}; + +#pragma pack(push, 4) +struct DrawArraysIndirectCommand { + GLuint count; + GLuint instance_count; + GLuint first_index; + GLuint base_instance; +}; +struct DrawElementsIndirectCommand { + GLuint count; + GLuint instance_count; + GLuint first_index; + GLint base_vertex; + GLuint base_instance; +}; +struct BindlessPtrNV { + GLuint index; + GLuint reserved_zero; + GLuint64 address; + GLuint64 length; +}; +struct DrawArraysIndirectBindlessCommandNV { + DrawArraysIndirectCommand cmd; + // NOTE: the spec is wrong here. 
For fucks sake. + // GLuint reserved_zero; + BindlessPtrNV vertex_buffers[8]; +}; +struct DrawElementsIndirectBindlessCommandNV { + DrawElementsIndirectCommand cmd; + GLuint reserved_zero; + BindlessPtrNV index_buffer; + BindlessPtrNV vertex_buffers[8]; +}; +#pragma pack(pop) + +class DrawBatcher { + public: + enum class FlushMode { + kMakeCoherent, + kStateChange, + kReconfigure, + }; + + DrawBatcher(RegisterFile* register_file); + + bool Initialize(CircularBuffer* array_data_buffer); + void Shutdown(); + + PrimitiveType prim_type() const { return batch_state_.prim_type; } + + void set_window_offset(uint32_t x, uint32_t y) { + active_draw_.header->window_offset.x = float(x); + active_draw_.header->window_offset.y = float(y); + } + void set_window_scissor(uint32_t left, uint32_t top, uint32_t right, + uint32_t bottom) { + active_draw_.header->window_scissor.x = float(left); + active_draw_.header->window_scissor.y = float(top); + active_draw_.header->window_scissor.z = float(right); + active_draw_.header->window_scissor.w = float(bottom); + } + void set_window_scalar(float width_scalar, float height_scalar) { + active_draw_.header->window_offset.z = width_scalar; + active_draw_.header->window_offset.w = height_scalar; + } + void set_viewport_offset(float offset_x, float offset_y, float offset_z) { + active_draw_.header->viewport_offset.x = offset_x; + active_draw_.header->viewport_offset.y = offset_y; + active_draw_.header->viewport_offset.z = offset_z; + } + void set_viewport_scale(float scale_x, float scale_y, float scale_z) { + active_draw_.header->viewport_scale.x = scale_x; + active_draw_.header->viewport_scale.y = scale_y; + active_draw_.header->viewport_scale.z = scale_z; + } + void set_vtx_fmt(float xy, float z, float w) { + active_draw_.header->vtx_fmt.x = xy; + active_draw_.header->vtx_fmt.y = xy; + active_draw_.header->vtx_fmt.z = z; + active_draw_.header->vtx_fmt.w = w; + } + void set_alpha_test(bool enabled, uint32_t func, float ref) { + 
active_draw_.header->alpha_test.x = enabled ? 1.0f : 0.0f; + active_draw_.header->alpha_test.y = float(func); + active_draw_.header->alpha_test.z = ref; + } + void set_texture_sampler(int index, GLuint64 handle) { + active_draw_.header->texture_samplers[index] = handle; + } + void set_index_buffer(const CircularBuffer::Allocation& allocation) { + if (has_bindless_mdi_) { + auto& ptr = active_draw_.draw_elements_bindless_cmd->index_buffer; + ptr.reserved_zero = 0; + ptr.index = 0; + ptr.address = allocation.gpu_ptr; + ptr.length = allocation.length; + } else { + // Offset is used in glDrawElements. + auto& cmd = active_draw_.draw_elements_cmd; + size_t index_size = batch_state_.index_type == GL_UNSIGNED_SHORT ? 2 : 4; + cmd->first_index = GLuint(allocation.offset / index_size); + } + } + void set_vertex_buffer(int index, GLsizei offset, GLsizei stride, + const CircularBuffer::Allocation& allocation) { + if (has_bindless_mdi_) { + BindlessPtrNV* ptr; + if (batch_state_.indexed) { + ptr = &active_draw_.draw_elements_bindless_cmd->vertex_buffers[index]; + } else { + ptr = &active_draw_.draw_arrays_bindless_cmd->vertex_buffers[index]; + } + ptr->reserved_zero = 0; + ptr->index = index; + ptr->address = allocation.gpu_ptr + offset; + ptr->length = allocation.length - offset; + } + } + + bool ReconfigurePipeline(GL4Shader* vertex_shader, GL4Shader* pixel_shader, + GLuint pipeline); + + bool BeginDrawArrays(PrimitiveType prim_type, uint32_t index_count); + bool BeginDrawElements(PrimitiveType prim_type, uint32_t index_count, + xenos::IndexFormat index_format); + void DiscardDraw(); + bool CommitDraw(); + bool Flush(FlushMode mode); + + private: + bool BeginDraw(); + void CopyConstants(); + + RegisterFile* register_file_; + CircularBuffer command_buffer_; + CircularBuffer state_buffer_; + CircularBuffer* array_data_buffer_; + + bool has_bindless_mdi_; + + struct BatchState { + bool needs_reconfigure; + PrimitiveType prim_type; + bool indexed; + GLenum index_type; + + 
GL4Shader* vertex_shader; + GL4Shader* pixel_shader; + GLuint pipeline; + + GLsizei command_stride; + GLsizei state_stride; + GLsizei float_consts_offset; + GLsizei bool_consts_offset; + GLsizei loop_consts_offset; + + uintptr_t command_range_start; + uintptr_t command_range_length; + uintptr_t state_range_start; + uintptr_t state_range_length; + GLsizei draw_count; + } batch_state_; + + // This must match GL4Shader's header. + struct CommonHeader { + float4 window_offset; // tx,ty,sx,sy + float4 window_scissor; // x0,y0,x1,y1 + float4 viewport_offset; // tx,ty,tz,? + float4 viewport_scale; // sx,sy,sz,? + float4 vtx_fmt; // + float4 alpha_test; // alpha test enable, func, ref, ? + + // TODO(benvanik): pack tightly + GLuint64 texture_samplers[32]; + + float4 float_consts[512]; + uint32_t bool_consts[8]; + uint32_t loop_consts[32]; + }; + struct { + CircularBuffer::Allocation command_allocation; + CircularBuffer::Allocation state_allocation; + + union { + DrawArraysIndirectCommand* draw_arrays_cmd; + DrawElementsIndirectCommand* draw_elements_cmd; + DrawArraysIndirectBindlessCommandNV* draw_arrays_bindless_cmd; + DrawElementsIndirectBindlessCommandNV* draw_elements_bindless_cmd; + uintptr_t command_address; + }; + + CommonHeader* header; + } active_draw_; + bool draw_open_; +}; + +} // namespace gl4 +} // namespace gpu +} // namespace xe + +#endif // XENIA_GPU_GL4_GL4_STATE_DATA_BUILDER_H_ diff --git a/src/xenia/gpu/gl4/gl4_shader.cc b/src/xenia/gpu/gl4/gl4_shader.cc index 3409824a2..2255be226 100644 --- a/src/xenia/gpu/gl4/gl4_shader.cc +++ b/src/xenia/gpu/gl4/gl4_shader.cc @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -18,6 +19,8 @@ namespace xe { namespace gpu { namespace gl4 { +using namespace xe::gpu::xenos; + extern "C" GLEWContext* glewGetContext(); // Stateful, but minimally. 
@@ -25,41 +28,147 @@ thread_local GL4ShaderTranslator shader_translator_; GL4Shader::GL4Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t dword_count) - : Shader(shader_type, data_hash, dword_ptr, dword_count), program_(0) {} + : Shader(shader_type, data_hash, dword_ptr, dword_count), + program_(0), + vao_(0) {} -GL4Shader::~GL4Shader() { glDeleteProgram(program_); } +GL4Shader::~GL4Shader() { + glDeleteProgram(program_); + glDeleteVertexArrays(1, &vao_); +} -const std::string header = - "#version 450\n" - "#extension all : warn\n" - "#extension GL_ARB_bindless_texture : require\n" - "#extension GL_ARB_explicit_uniform_location : require\n" - "#extension GL_ARB_shading_language_420pack : require\n" - "#extension GL_ARB_shader_storage_buffer_object : require\n" - "precision highp float;\n" - "precision highp int;\n" - "layout(std140, column_major) uniform;\n" - "layout(std430, column_major) buffer;\n" - "struct StateData {\n" - " vec4 window_offset;\n" - " vec4 window_scissor;\n" - " vec4 vtx_fmt;\n" - " vec4 viewport_offset;\n" - " vec4 viewport_scale;\n" - " vec4 alpha_test;\n" - " uvec2 texture_samplers[32];\n" - " vec4 float_consts[512];\n" - " uint fetch_consts[32 * 6];\n" - " int bool_consts[8];\n" - " int loop_consts[32];\n" - "};\n" - "struct VertexData {\n" - " vec4 o[16];\n" - "};\n" - "\n" - "layout(binding = 0) buffer State {\n" - " StateData state;\n" - "};\n"; +std::string GL4Shader::GetHeader() { + static const std::string header = + "#version 450\n" + "#extension all : warn\n" + "#extension GL_ARB_bindless_texture : require\n" + "#extension GL_ARB_explicit_uniform_location : require\n" + "#extension GL_ARB_shader_draw_parameters : require\n" + "#extension GL_ARB_shader_storage_buffer_object : require\n" + "#extension GL_ARB_shading_language_420pack : require\n" + "precision highp float;\n" + "precision highp int;\n" + "layout(std140, column_major) uniform;\n" + "layout(std430, column_major) buffer;\n" + "\n" + // 
This must match DrawBatcher::CommonHeader. + "struct StateData {\n" + " vec4 window_offset;\n" + " vec4 window_scissor;\n" + " vec4 viewport_offset;\n" + " vec4 viewport_scale;\n" + " vec4 vtx_fmt;\n" + " vec4 alpha_test;\n" + // TODO(benvanik): variable length. + " uvec2 texture_samplers[32];\n" + " vec4 float_consts[512];\n" + " int bool_consts[8];\n" + " int loop_consts[32];\n" + "};\n" + "layout(binding = 0) buffer State {\n" + " StateData states[];\n" + "};\n" + "\n" + "struct VertexData {\n" + " vec4 o[16];\n" + "};\n"; + return header; +} + +bool GL4Shader::PrepareVertexArrayObject() { + glCreateVertexArrays(1, &vao_); + + bool has_bindless_vbos = false; + if (FLAGS_vendor_gl_extensions && GLEW_NV_vertex_buffer_unified_memory) { + has_bindless_vbos = true; + // Nasty, but no DSA for this. + glBindVertexArray(vao_); + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); + } + + uint32_t el_index = 0; + for (uint32_t buffer_index = 0; buffer_index < buffer_inputs_.count; + ++buffer_index) { + const auto& desc = buffer_inputs_.descs[buffer_index]; + + for (uint32_t i = 0; i < desc.element_count; ++i, ++el_index) { + const auto& el = desc.elements[i]; + auto comp_count = GetVertexFormatComponentCount(el.format); + GLenum comp_type; + switch (el.format) { + case VertexFormat::k_8_8_8_8: + comp_type = el.is_signed ? GL_BYTE : GL_UNSIGNED_BYTE; + break; + case VertexFormat::k_2_10_10_10: + comp_type = el.is_signed ? GL_INT_2_10_10_10_REV + : GL_UNSIGNED_INT_2_10_10_10_REV; + break; + case VertexFormat::k_10_11_11: + assert_false(el.is_signed); + comp_type = GL_UNSIGNED_INT_10F_11F_11F_REV; + break; + /*case VertexFormat::k_11_11_10: + break;*/ + case VertexFormat::k_16_16: + comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT; + break; + case VertexFormat::k_16_16_FLOAT: + comp_type = GL_HALF_FLOAT; + break; + case VertexFormat::k_16_16_16_16: + comp_type = el.is_signed ? 
GL_SHORT : GL_UNSIGNED_SHORT; + break; + case VertexFormat::k_16_16_16_16_FLOAT: + comp_type = GL_HALF_FLOAT; + break; + case VertexFormat::k_32: + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_32: + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_32_32_32: + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_FLOAT: + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_FLOAT: + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_32_FLOAT: + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_32_32_FLOAT: + comp_type = GL_FLOAT; + break; + default: + assert_unhandled_case(el.format); + return false; + } + + glEnableVertexArrayAttrib(vao_, el_index); + if (has_bindless_vbos) { + // NOTE: MultiDrawIndirectBindlessMumble doesn't handle separate + // vertex bindings/formats. + glVertexAttribFormat(el_index, comp_count, comp_type, el.is_normalized, + el.offset_words * 4); + glVertexArrayVertexBuffer(vao_, el_index, 0, 0, desc.stride_words * 4); + } else { + glVertexArrayAttribBinding(vao_, el_index, buffer_index); + glVertexArrayAttribFormat(vao_, el_index, comp_count, comp_type, + el.is_normalized, el.offset_words * 4); + } + } + } + + if (has_bindless_vbos) { + glBindVertexArray(0); + } + + return true; +} bool GL4Shader::PrepareVertexShader( const xenos::xe_gpu_program_cntl_t& program_cntl) { @@ -68,8 +177,14 @@ bool GL4Shader::PrepareVertexShader( } has_prepared_ = true; + // Build static vertex array descriptor. 
+ if (!PrepareVertexArrayObject()) { + PLOGE("Unable to prepare vertex shader array object"); + return false; + } + std::string apply_transform = - "vec4 applyTransform(vec4 pos) {\n" + "vec4 applyTransform(const in StateData state, vec4 pos) {\n" " // Clip->NDC with perspective divide.\n" " // We do this here because it's programmable on the 360.\n" " float w = pos.w;\n" @@ -107,14 +222,15 @@ bool GL4Shader::PrepareVertexShader( " return pos;\n" "}\n"; std::string source = - header + apply_transform + + GetHeader() + apply_transform + "out gl_PerVertex {\n" " vec4 gl_Position;\n" " float gl_PointSize;\n" " float gl_ClipDistance[];\n" "};\n" - "layout(location = 0) out VertexData vtx;\n" - "void processVertex();\n" + "layout(location = 0) flat out uint draw_id;\n" + "layout(location = 1) out VertexData vtx;\n" + "void processVertex(const in StateData state);\n" "void main() {\n" + (alloc_counts().positions ? " gl_Position = vec4(0.0, 0.0, 0.0, 1.0);\n" : "") + @@ -122,8 +238,10 @@ bool GL4Shader::PrepareVertexShader( " for (int i = 0; i < vtx.o.length(); ++i) {\n" " vtx.o[i] = vec4(0.0, 0.0, 0.0, 0.0);\n" " }\n" - " processVertex();\n" - " gl_Position = applyTransform(gl_Position);\n" + " const StateData state = states[gl_DrawIDARB];\n" + " processVertex(state);\n" + " gl_Position = applyTransform(state, gl_Position);\n" + " draw_id = gl_DrawIDARB;\n" "}\n"; std::string translated_source = @@ -149,12 +267,14 @@ bool GL4Shader::PreparePixelShader( } has_prepared_ = true; - std::string source = header + - "layout(location = 0) in VertexData vtx;\n" + std::string source = GetHeader() + + "layout(location = 0) flat in uint draw_id;\n" + "layout(location = 1) in VertexData vtx;\n" "layout(location = 0) out vec4 oC[4];\n" - "void processFragment();\n" + "void processFragment(const in StateData state);\n" "void main() {\n" + - " processFragment();\n" + " const StateData state = states[draw_id];\n" + " processFragment(state);\n" "}\n"; std::string translated_source = diff 
--git a/src/xenia/gpu/gl4/gl4_shader.h b/src/xenia/gpu/gl4/gl4_shader.h index da3c3df78..1dac6b4c3 100644 --- a/src/xenia/gpu/gl4/gl4_shader.h +++ b/src/xenia/gpu/gl4/gl4_shader.h @@ -10,6 +10,8 @@ #ifndef XENIA_GPU_GL4_GL4_SHADER_H_ #define XENIA_GPU_GL4_GL4_SHADER_H_ +#include + #include #include #include @@ -25,14 +27,18 @@ class GL4Shader : public Shader { ~GL4Shader() override; GLuint program() const { return program_; } + GLuint vao() const { return vao_; } bool PrepareVertexShader(const xenos::xe_gpu_program_cntl_t& program_cntl); bool PreparePixelShader(const xenos::xe_gpu_program_cntl_t& program_cntl); protected: + std::string GetHeader(); + bool PrepareVertexArrayObject(); bool CompileProgram(std::string source); GLuint program_; + GLuint vao_; }; } // namespace gl4 diff --git a/src/xenia/gpu/gl4/gl4_shader_translator.cc b/src/xenia/gpu/gl4/gl4_shader_translator.cc index e314191ea..1b3aff02b 100644 --- a/src/xenia/gpu/gl4/gl4_shader_translator.cc +++ b/src/xenia/gpu/gl4/gl4_shader_translator.cc @@ -91,7 +91,7 @@ std::string GL4ShaderTranslator::TranslateVertexShader( const auto& alloc_counts = vertex_shader->alloc_counts(); // Vertex shader main() header. - Append("void processVertex() {\n"); + Append("void processVertex(const in StateData state) {\n"); // Add temporaries for any registers we may use. uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; @@ -126,7 +126,7 @@ std::string GL4ShaderTranslator::TranslatePixelShader( // (and less than the number of required registers), things may die. // Pixel shader main() header. - Append("void processFragment() {\n"); + Append("void processFragment(const in StateData state) {\n"); // Add temporary registers. 
uint32_t temp_regs = program_cntl.vs_regs + program_cntl.ps_regs; diff --git a/src/xenia/gpu/gl4/gl_context.cc b/src/xenia/gpu/gl4/gl_context.cc index d5e39fa34..3d4600514 100644 --- a/src/xenia/gpu/gl4/gl_context.cc +++ b/src/xenia/gpu/gl4/gl_context.cc @@ -132,12 +132,15 @@ std::unique_ptr GLContext::CreateShared() { GLContextLock context_lock(this); int context_flags = 0; + //int profile = WGL_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB; + int profile = WGL_CONTEXT_CORE_PROFILE_BIT_ARB; #if DEBUG context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB; -#endif // DEBUG - int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, // - WGL_CONTEXT_MINOR_VERSION_ARB, 5, // - WGL_CONTEXT_FLAGS_ARB, context_flags, // +#endif // DEBUG + int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, // + WGL_CONTEXT_MINOR_VERSION_ARB, 5, // + WGL_CONTEXT_FLAGS_ARB, context_flags, // + WGL_CONTEXT_PROFILE_MASK_ARB, profile, // 0}; new_glrc = wglCreateContextAttribsARB(dc_, glrc_, attrib_list); if (!new_glrc) { diff --git a/src/xenia/gpu/gl4/sources.gypi b/src/xenia/gpu/gl4/sources.gypi index efc52f2b6..2f5c0db72 100644 --- a/src/xenia/gpu/gl4/sources.gypi +++ b/src/xenia/gpu/gl4/sources.gypi @@ -5,6 +5,8 @@ 'circular_buffer.h', 'command_processor.cc', 'command_processor.h', + 'draw_batcher.cc', + 'draw_batcher.h', 'gl4_gpu-private.h', 'gl4_gpu.cc', 'gl4_gpu.h',