From 02d52167d3eb5ab3c54ca98466d943bd29e43b2d Mon Sep 17 00:00:00 2001 From: Ben Vanik Date: Tue, 23 Dec 2014 20:32:41 -0800 Subject: [PATCH] GL context on command processor. --- src/xenia/gpu/gl4/command_processor.cc | 476 +++++++++++++++++++++-- src/xenia/gpu/gl4/command_processor.h | 56 ++- src/xenia/gpu/gl4/gl4_graphics_system.cc | 16 +- src/xenia/gpu/gl4/gl_context.cc | 61 ++- src/xenia/gpu/gl4/gl_context.h | 8 +- src/xenia/gpu/gl4/wgl_control.cc | 8 +- src/xenia/gpu/shader.cc | 6 +- 7 files changed, 569 insertions(+), 62 deletions(-) diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc index 6f7e964b4..bc97be59d 100644 --- a/src/xenia/gpu/gl4/command_processor.cc +++ b/src/xenia/gpu/gl4/command_processor.cc @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -27,6 +28,8 @@ namespace gl4 { using namespace xe::gpu::xenos; +extern "C" extern "C" GLEWContext* glewGetContext(); + CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system) : memory_(graphics_system->memory()), membase_(graphics_system->memory()->membase()), @@ -46,6 +49,7 @@ CommandProcessor::CommandProcessor(GL4GraphicsSystem* graphics_system) bin_mask_(0xFFFFFFFFull), active_vertex_shader_(nullptr), active_pixel_shader_(nullptr) { + std::memset(&draw_command_, 0, sizeof(draw_command_)); LARGE_INTEGER perf_counter; QueryPerformanceCounter(&perf_counter); time_base_ = perf_counter.QuadPart; @@ -59,33 +63,37 @@ uint64_t CommandProcessor::QueryTime() { return perf_counter.QuadPart - time_base_; } -void CommandProcessor::Initialize(uint32_t ptr, uint32_t page_count) { - primary_buffer_ptr_ = ptr; - // Not sure this is correct, but it's a way to take the page_count back to - // the number of bytes allocated by the physical alloc. 
- uint32_t original_size = 1 << (0x1C - page_count - 1); - primary_buffer_size_ = original_size; - read_ptr_index_ = 0; +bool CommandProcessor::Initialize(std::unique_ptr context) { + context_ = std::move(context); worker_running_ = true; worker_thread_ = std::thread([this]() { poly::threading::set_name("GL4 Worker"); xe::Profiler::ThreadEnter("GL4 Worker"); + context_->MakeCurrent(); WorkerMain(); xe::Profiler::ThreadExit(); }); + + return true; } void CommandProcessor::Shutdown() { worker_running_ = false; SetEvent(write_ptr_index_event_); worker_thread_.join(); + context_.reset(); all_shaders_.clear(); shader_cache_.clear(); } void CommandProcessor::WorkerMain() { + if (!SetupGL()) { + PFATAL("Unable to setup command processor GL state"); + return; + } + while (worker_running_) { uint32_t write_ptr_index = write_ptr_index_.load(); while (write_ptr_index == 0xBAADF00D || @@ -94,6 +102,7 @@ void CommandProcessor::WorkerMain() { // We wait a short bit here to yield time. Since we are also running the // main window display we don't want to pause too long, though. // YieldProcessor(); + PrepareForWait(); const int wait_time_ms = 5; if (WaitForSingleObject(write_ptr_index_event_, wait_time_ms) == WAIT_TIMEOUT) { @@ -117,6 +126,28 @@ void CommandProcessor::WorkerMain() { read_ptr_index_); } } + + ShutdownGL(); +} + +bool CommandProcessor::SetupGL() { + // Uniform buffer that stores the per-draw state (constants, etc). + glGenBuffers(1, &uniform_data_buffer_); + glNamedBufferStorage(uniform_data_buffer_, 16 * 1024, nullptr, GL_MAP_WRITE_BIT); + + return true; +} + +void CommandProcessor::ShutdownGL() { + glDeleteBuffers(1, &uniform_data_buffer_); +} + +void CommandProcessor::InitializeRingBuffer(uint32_t ptr, uint32_t page_count) { + primary_buffer_ptr_ = ptr; + // Not sure this is correct, but it's a way to take the page_count back to + // the number of bytes allocated by the physical alloc. 
+ uint32_t original_size = 1 << (0x1C - page_count - 1); + primary_buffer_size_ = original_size; } void CommandProcessor::EnableReadPointerWriteBack(uint32_t ptr, @@ -162,6 +193,8 @@ void CommandProcessor::WriteRegister(uint32_t packet_ptr, uint32_t index, } void CommandProcessor::MakeCoherent() { + SCOPE_profile_cpu_f("gpu"); + // Status host often has 0x01000000 or 0x03000000. // This is likely toggling VC (vertex cache) or TC (texture cache). // Or, it also has a direction in here maybe - there is probably @@ -189,6 +222,16 @@ void CommandProcessor::MakeCoherent() { regs->values[XE_GPU_REG_COHER_STATUS_HOST].u32 = status_host; } +void CommandProcessor::PrepareForWait() { + SCOPE_profile_cpu_f("gpu"); + + // TODO(benvanik): fences and fancy stuff. We should figure out a way to + // make interrupt callbacks from the GPU so that we don't have to do a full + // synchronize here. + // glFlush(); + glFinish(); +} + class CommandProcessor::RingbufferReader { public: RingbufferReader(uint8_t* membase, uint32_t base_ptr, uint32_t ptr_mask, @@ -274,6 +317,8 @@ void CommandProcessor::ExecutePrimaryBuffer(uint32_t start_index, } void CommandProcessor::ExecuteIndirectBuffer(uint32_t ptr, uint32_t length) { + SCOPE_profile_cpu_f("gpu"); + XETRACECP("[%.8X] ExecuteIndirectBuffer(%dw)", ptr, length); // Execute commands! @@ -625,6 +670,7 @@ bool CommandProcessor::ExecutePacketType3_WAIT_REG_MEM(RingbufferReader* reader, if (!matched) { // Wait. if (wait >= 0x100) { + PrepareForWait(); Sleep(wait / 0x100); } else { SwitchToThread(); @@ -790,14 +836,19 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader, uint32_t dword1 = reader->Read(); uint32_t index_count = dword1 >> 16; auto prim_type = static_cast(dword1 & 0x3F); + + uint32_t index_base = 0; + uint32_t index_size = 0; + Endian index_endianness = Endian::kUnspecified; + bool index_32bit = false; uint32_t src_sel = (dword1 >> 6) & 0x3; if (src_sel == 0x0) { // Indexed draw. 
- uint32_t index_base = reader->Read(); - uint32_t index_size = reader->Read(); - auto endianness = static_cast(index_size >> 30); + index_base = reader->Read(); + index_size = reader->Read(); + index_endianness = static_cast(index_size >> 30); index_size &= 0x00FFFFFF; - bool index_32bit = (dword1 >> 11) & 0x1; + index_32bit = (dword1 >> 11) & 0x1; index_size *= index_32bit ? 4 : 2; } else if (src_sel == 0x2) { // Auto draw. @@ -805,33 +856,31 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX(RingbufferReader* reader, // Unknown source select. assert_always(); } - // if (!driver_->PrepareDraw(draw_command_)) { - // draw_command_.prim_type = prim_type; - // draw_command_.start_index = 0; - // draw_command_.index_count = index_count; - // draw_command_.base_vertex = 0; - // if (src_sel == 0x0) { - // // Indexed draw. - // // TODO(benvanik): detect subregions of larger index - // buffers! - // driver_->PrepareDrawIndexBuffer( - // draw_command_, index_base, index_size, - // endianness, - // index_32bit ? INDEX_FORMAT_32BIT : INDEX_FORMAT_16BIT); - // } else if (src_sel == 0x2) { - // // Auto draw. - // draw_command_.index_buffer = nullptr; - // } else { - // // Unknown source select. - // assert_always(); - // } - // driver_->Draw(draw_command_); - // } else { - // if (src_sel == 0x0) { - // reader->Advance(2); // skip - // } - // } - return true; + + if (!PrepareDraw(&draw_command_)) { + PLOGE("Invalid DRAW_INDX; ignoring"); + return false; + } + draw_command_.prim_type = prim_type; + draw_command_.start_index = 0; + draw_command_.index_count = index_count; + draw_command_.base_vertex = 0; + if (src_sel == 0x0) { + // Indexed draw. + // TODO(benvanik): detect subregions of larger index buffers + /*driver_->PrepareDrawIndexBuffer( + draw_command_, index_base, index_size, + endianness, + index_32bit ? INDEX_FORMAT_32BIT : INDEX_FORMAT_16BIT);*/ + draw_command_.index_buffer = nullptr; + } else if (src_sel == 0x2) { + // Auto draw. 
+ draw_command_.index_buffer = nullptr; + } else { + // Unknown source select. + assert_always(); + } + return IssueDraw(&draw_command_); } bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader, @@ -849,16 +898,17 @@ bool CommandProcessor::ExecutePacketType3_DRAW_INDX_2(RingbufferReader* reader, bool index_32bit = (dword0 >> 11) & 0x1; uint32_t indices_size = index_count * (index_32bit ? 4 : 2); reader->CheckRead(indices_size / sizeof(uint32_t)); - /*if (!driver_->PrepareDraw(draw_command_)) { + uint32_t index_ptr = reader->ptr(); + reader->Advance(count - 1); + if (!PrepareDraw(&draw_command_)) { + return false; + } draw_command_.prim_type = prim_type; draw_command_.start_index = 0; draw_command_.index_count = index_count; draw_command_.base_vertex = 0; draw_command_.index_buffer = nullptr; - driver_->Draw(draw_command_); - }*/ - reader->Advance(count - 1); - return true; + return IssueDraw(&draw_command_); } bool CommandProcessor::ExecutePacketType3_SET_CONSTANT(RingbufferReader* reader, @@ -967,6 +1017,8 @@ bool CommandProcessor::ExecutePacketType3_INVALIDATE_STATE( bool CommandProcessor::LoadShader(ShaderType shader_type, const uint32_t* address, uint32_t dword_count) { + SCOPE_profile_cpu_f("gpu"); + // Hash the input memory and lookup the shader. GL4Shader* shader_ptr = nullptr; uint64_t hash = XXH64(address, dword_count * sizeof(uint32_t), 0); @@ -1004,6 +1056,344 @@ bool CommandProcessor::LoadShader(ShaderType shader_type, return true; } +bool CommandProcessor::PrepareDraw(DrawCommand* draw_command) { + SCOPE_profile_cpu_f("gpu"); + auto& regs = *register_file_; + auto& cmd = *draw_command; + + // Reset the things we don't modify so that we have clean state. + cmd.prim_type = PrimitiveType::kPointList; + cmd.index_count = 0; + cmd.index_buffer = nullptr; + + // Generic stuff. 
+ cmd.start_index = regs[XE_GPU_REG_VGT_INDX_OFFSET].u32; + cmd.base_vertex = 0; + + if (!UpdateState(draw_command)) { + return false; + } + if (!UpdateRenderTargets()) { + return false; + } + return true; +} + +bool CommandProcessor::UpdateState(DrawCommand* draw_command) { + // Much of this state machine is extracted from: + // https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c + // http://fossies.org/dox/MesaLib-10.3.5/fd2__gmem_8c_source.html + // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf + + auto& regs = *register_file_; + + union float4 { + float v[4]; + struct { + float x, y, z, w; + }; + }; + struct UniformDataBlock { + float4 window_offset; // tx,ty,?,? + float4 window_scissor; // x0,y0,x1,y1 + float4 viewport_offset; // tx,ty,tz,? + float4 viewport_scale; // sx,sy,sz,? + // TODO(benvanik): vertex format xyzw? + + float4 alpha_test; // alpha test enable, func, ref, ? + + // Register data from 0x4000 to 0x4927. + // SHADER_CONSTANT_000_X... + float4 float_consts[512]; + // SHADER_CONSTANT_FETCH_00_0... + uint32_t fetch_consts[32 * 6]; + // SHADER_CONSTANT_BOOL_000_031... + int32_t bool_consts[8]; + // SHADER_CONSTANT_LOOP_00... + int32_t loop_consts[32]; + }; + static_assert(sizeof(UniformDataBlock) <= 16 * 1024, + "Need <=16k uniform data"); + + auto buffer_ptr = reinterpret_cast( + glMapNamedBufferRange(uniform_data_buffer_, 0, 0, + GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT)); + if (!buffer_ptr) { + PLOGE("Unable to map uniform data buffer"); + return false; + } + + // Window parameters. 
+ // See r200UpdateWindow: + // https://github.com/freedreno/mesa/blob/master/src/mesa/drivers/dri/r200/r200_state.c + uint32_t window_offset = regs[XE_GPU_REG_PA_SC_WINDOW_OFFSET].u32; + buffer_ptr->window_offset.x = float(window_offset & 0x7FFF); + buffer_ptr->window_offset.y = float((window_offset >> 16) & 0x7FFF); + uint32_t window_scissor_tl = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_TL].u32; + uint32_t window_scissor_br = regs[XE_GPU_REG_PA_SC_WINDOW_SCISSOR_BR].u32; + buffer_ptr->window_scissor.x = float(window_scissor_tl & 0x7FFF); + buffer_ptr->window_scissor.y = float((window_scissor_tl >> 16) & 0x7FFF); + buffer_ptr->window_scissor.z = float(window_scissor_br & 0x7FFF); + buffer_ptr->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF); + + // Viewport scaling. Only enabled if the flags are all set. + buffer_ptr->viewport_scale.x = + regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; // 640 + buffer_ptr->viewport_offset.x = + regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; // 640 + buffer_ptr->viewport_scale.y = + regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // -360 + buffer_ptr->viewport_offset.y = + regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; // 360 + buffer_ptr->viewport_scale.z = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; // 1 + buffer_ptr->viewport_offset.z = + regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; // 0 + + // Whether each of the viewport settings is enabled. + // We require it to be all or nothing right now. 
+ uint32_t vte_control = regs[XE_GPU_REG_PA_CL_VTE_CNTL].u32; + bool vport_xscale_enable = (vte_control & (1 << 0)) > 0; + bool vport_xoffset_enable = (vte_control & (1 << 1)) > 0; + bool vport_yscale_enable = (vte_control & (1 << 2)) > 0; + bool vport_yoffset_enable = (vte_control & (1 << 3)) > 0; + bool vport_zscale_enable = (vte_control & (1 << 4)) > 0; + bool vport_zoffset_enable = (vte_control & (1 << 5)) > 0; + assert_true(vport_xscale_enable == vport_yscale_enable == + vport_zscale_enable == vport_xoffset_enable == + vport_yoffset_enable == vport_zoffset_enable); + // TODO(benvanik): pass to shaders? disable transform? etc? + glViewport(0, 0, 1280, 720); + + // Copy over all constants. + // TODO(benvanik): partial updates, etc. We could use shader constant access + // knowledge that we get at compile time to only upload those constants + // required. + std::memcpy( + &buffer_ptr->float_consts, &regs[XE_GPU_REG_SHADER_CONSTANT_000_X].f32, + sizeof(buffer_ptr->float_consts) + sizeof(buffer_ptr->fetch_consts) + + sizeof(buffer_ptr->loop_consts) + sizeof(buffer_ptr->bool_consts)); + + // Scissoring. + int32_t screen_scissor_tl = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_TL].u32; + int32_t screen_scissor_br = regs[XE_GPU_REG_PA_SC_SCREEN_SCISSOR_BR].u32; + if (screen_scissor_tl != 0 && screen_scissor_br != 0x20002000) { + glEnable(GL_SCISSOR_TEST); + // TODO(benvanik): signed? + int32_t screen_scissor_x = screen_scissor_tl & 0x7FFF; + int32_t screen_scissor_y = (screen_scissor_tl >> 16) & 0x7FFF; + int32_t screen_scissor_w = screen_scissor_br & 0x7FFF - screen_scissor_x; + int32_t screen_scissor_h = + (screen_scissor_br >> 16) & 0x7FFF - screen_scissor_y; + glScissor(screen_scissor_x, screen_scissor_y, screen_scissor_w, + screen_scissor_h); + } else { + glDisable(GL_SCISSOR_TEST); + } + + // Rasterizer state. + uint32_t mode_control = regs[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32; + if (draw_command->prim_type == PrimitiveType::kRectangleList) { + // Rect lists aren't culled. 
There may be other things they skip too. + glDisable(GL_CULL_FACE); + } else { + switch (mode_control & 0x3) { + case 0: + glDisable(GL_CULL_FACE); + break; + case 1: + glEnable(GL_CULL_FACE); + glCullFace(GL_FRONT); + break; + case 2: + glEnable(GL_CULL_FACE); + glCullFace(GL_BACK); + break; + } + } + if (mode_control & 0x4) { + glFrontFace(GL_CW); + } else { + glFrontFace(GL_CCW); + } + // TODO(benvanik): wireframe mode. + // glPolygonMode(GL_FRONT_AND_BACK, GL_LINE); + glPolygonMode(GL_FRONT_AND_BACK, GL_FILL); + + // Alpha testing -- ALPHAREF, ALPHAFUNC, ALPHATESTENABLE + // Deprecated in GL, implemented in shader. + // if(ALPHATESTENABLE && frag_out.a [<=/ALPHAFUNC] ALPHAREF) discard; + uint32_t color_control = regs[XE_GPU_REG_RB_COLORCONTROL].u32; + buffer_ptr->alpha_test.x = + (color_control & 0x4) ? 1.0f : 0.0f; // ALPAHTESTENABLE + buffer_ptr->alpha_test.y = float(color_control & 0x3); // ALPHAFUNC + buffer_ptr->alpha_test.z = regs[XE_GPU_REG_RB_ALPHA_REF].f32; + + static const GLenum blend_map[] = { + /* 0 */ GL_ZERO, + /* 1 */ GL_ONE, + /* 2 */ GL_ZERO, // ? + /* 3 */ GL_ZERO, // ? 
+ /* 4 */ GL_SRC_COLOR, + /* 5 */ GL_ONE_MINUS_SRC_COLOR, + /* 6 */ GL_SRC_ALPHA, + /* 7 */ GL_ONE_MINUS_SRC_ALPHA, + /* 8 */ GL_DST_COLOR, + /* 9 */ GL_ONE_MINUS_DST_COLOR, + /* 10 */ GL_DST_ALPHA, + /* 11 */ GL_ONE_MINUS_DST_ALPHA, + /* 12 */ GL_CONSTANT_COLOR, + /* 13 */ GL_ONE_MINUS_CONSTANT_COLOR, + /* 14 */ GL_CONSTANT_ALPHA, + /* 15 */ GL_ONE_MINUS_CONSTANT_ALPHA, + /* 16 */ GL_SRC_ALPHA_SATURATE, + }; + static const GLenum blend_op_map[] = { + /* 0 */ GL_FUNC_ADD, + /* 1 */ GL_FUNC_SUBTRACT, + /* 2 */ GL_MIN, + /* 3 */ GL_MAX, + /* 4 */ GL_FUNC_REVERSE_SUBTRACT, + }; + uint32_t color_mask = regs[XE_GPU_REG_RB_COLOR_MASK].u32; + uint32_t blend_control[4] = { + regs[XE_GPU_REG_RB_BLENDCONTROL_0].u32, + regs[XE_GPU_REG_RB_BLENDCONTROL_1].u32, + regs[XE_GPU_REG_RB_BLENDCONTROL_2].u32, + regs[XE_GPU_REG_RB_BLENDCONTROL_3].u32, + }; + for (int n = 0; n < poly::countof(blend_control); n++) { + // A2XX_RB_BLEND_CONTROL_COLOR_SRCBLEND + auto src_blend = blend_map[(blend_control[n] & 0x0000001F) >> 0]; + // A2XX_RB_BLEND_CONTROL_COLOR_DESTBLEND + auto dest_blend = blend_map[(blend_control[n] & 0x00001F00) >> 8]; + // A2XX_RB_BLEND_CONTROL_COLOR_COMB_FCN + auto blend_op = blend_op_map[(blend_control[n] & 0x000000E0) >> 5]; + // A2XX_RB_BLEND_CONTROL_ALPHA_SRCBLEND + auto src_blend_alpha = blend_map[(blend_control[n] & 0x001F0000) >> 16]; + // A2XX_RB_BLEND_CONTROL_ALPHA_DESTBLEND + auto dest_blend_alpha = blend_map[(blend_control[n] & 0x1F000000) >> 24]; + // A2XX_RB_BLEND_CONTROL_ALPHA_COMB_FCN + auto blend_op_alpha = blend_op_map[(blend_control[n] & 0x00E00000) >> 21]; + // A2XX_RB_COLOR_MASK_WRITE_* + uint32_t write_mask = (color_mask >> (n * 4)) & 0xF; + // A2XX_RB_COLORCONTROL_BLEND_DISABLE ?? Can't find this! + // Just guess based on actions. 
+ bool blend_enable = + !((src_blend == GL_ONE) && (dest_blend == GL_ZERO) && + (blend_op == GL_FUNC_ADD) && (src_blend_alpha == GL_ONE) && + (dest_blend_alpha == GL_ZERO) && (blend_op_alpha == GL_FUNC_ADD)); + if (blend_enable) { + glEnablei(GL_BLEND, n); + glBlendEquationSeparatei(n, blend_op, blend_op_alpha); + glBlendFuncSeparatei(n, src_blend, dest_blend, src_blend_alpha, + dest_blend_alpha); + } else { + glDisablei(GL_BLEND, n); + } + } + float blend_color[4] = { + regs[XE_GPU_REG_RB_BLEND_RED].f32, regs[XE_GPU_REG_RB_BLEND_GREEN].f32, + regs[XE_GPU_REG_RB_BLEND_BLUE].f32, regs[XE_GPU_REG_RB_BLEND_ALPHA].f32, + }; + glBlendColor(blend_color[0], blend_color[1], blend_color[2], blend_color[3]); + + static const GLenum compare_func_map[] = { + /* 0 */ GL_NEVER, + /* 1 */ GL_LESS, + /* 2 */ GL_EQUAL, + /* 3 */ GL_LEQUAL, + /* 4 */ GL_GREATER, + /* 5 */ GL_NOTEQUAL, + /* 6 */ GL_GEQUAL, + /* 7 */ GL_ALWAYS, + }; + static const GLenum stencil_op_map[] = { + /* 0 */ GL_KEEP, + /* 1 */ GL_ZERO, + /* 2 */ GL_REPLACE, + /* 3 */ GL_INCR_WRAP, + /* 4 */ GL_DECR_WRAP, + /* 5 */ GL_INVERT, + /* 6 */ GL_INCR, + /* 7 */ GL_DECR, + }; + uint32_t depth_control = regs[XE_GPU_REG_RB_DEPTHCONTROL].u32; + // A2XX_RB_DEPTHCONTROL_Z_ENABLE + if (depth_control & 0x00000002) { + glEnable(GL_DEPTH_TEST); + } else { + glDisable(GL_DEPTH_TEST); + } + // A2XX_RB_DEPTHCONTROL_Z_WRITE_ENABLE + glDepthMask((depth_control & 0x00000004) ? GL_TRUE : GL_FALSE); + // A2XX_RB_DEPTHCONTROL_EARLY_Z_ENABLE + // ? 
+ // A2XX_RB_DEPTHCONTROL_ZFUNC + glDepthFunc(compare_func_map[(depth_control & 0x00000070) >> 4]); + // A2XX_RB_DEPTHCONTROL_STENCIL_ENABLE + if (depth_control & 0x00000001) { + glEnable(GL_STENCIL_TEST); + } else { + glDisable(GL_STENCIL_TEST); + } + uint32_t stencil_ref_mask = regs[XE_GPU_REG_RB_STENCILREFMASK].u32; + // RB_STENCILREFMASK_STENCILREF + uint32_t stencil_ref = (stencil_ref_mask & 0x000000FF); + // RB_STENCILREFMASK_STENCILMASK + uint32_t stencil_read_mask = (stencil_ref_mask & 0x0000FF00) >> 8; + // RB_STENCILREFMASK_STENCILWRITEMASK + glStencilMask((stencil_ref_mask & 0x00FF0000) >> 16); + // A2XX_RB_DEPTHCONTROL_BACKFACE_ENABLE + bool backface_enabled = (depth_control & 0x00000080) != 0; + if (backface_enabled) { + // A2XX_RB_DEPTHCONTROL_STENCILFUNC + glStencilFuncSeparate(GL_FRONT, + compare_func_map[(depth_control & 0x00000700) >> 8], + stencil_ref, stencil_read_mask); + // A2XX_RB_DEPTHCONTROL_STENCILFAIL + // A2XX_RB_DEPTHCONTROL_STENCILZFAIL + // A2XX_RB_DEPTHCONTROL_STENCILZPASS + glStencilOpSeparate(GL_FRONT, + stencil_op_map[(depth_control & 0x00003800) >> 11], + stencil_op_map[(depth_control & 0x000E0000) >> 17], + stencil_op_map[(depth_control & 0x0001C000) >> 14]); + // A2XX_RB_DEPTHCONTROL_STENCILFUNC_BF + glStencilFuncSeparate(GL_BACK, + compare_func_map[(depth_control & 0x00700000) >> 20], + stencil_ref, stencil_read_mask); + // A2XX_RB_DEPTHCONTROL_STENCILFAIL_BF + // A2XX_RB_DEPTHCONTROL_STENCILZFAIL_BF + // A2XX_RB_DEPTHCONTROL_STENCILZPASS_BF + glStencilOpSeparate(GL_BACK, + stencil_op_map[(depth_control & 0x03800000) >> 23], + stencil_op_map[(depth_control & 0xE0000000) >> 29], + stencil_op_map[(depth_control & 0x1C000000) >> 26]); + } else { + // Backfaces disabled - treat backfaces as frontfaces. 
+ glStencilFunc(compare_func_map[(depth_control & 0x00000700) >> 8], + stencil_ref, stencil_read_mask); + glStencilOp(stencil_op_map[(depth_control & 0x00003800) >> 11], + stencil_op_map[(depth_control & 0x000E0000) >> 17], + stencil_op_map[(depth_control & 0x0001C000) >> 14]); + } + + glUnmapNamedBuffer(uniform_data_buffer_); + + return true; +} + +bool CommandProcessor::UpdateRenderTargets() { + auto& regs = *register_file_; + + return true; +} + +bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { + SCOPE_profile_cpu_f("gpu"); + + return true; +} + } // namespace gl4 } // namespace gpu } // namespace xe diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h index 81be5e5b5..5839d0c87 100644 --- a/src/xenia/gpu/gl4/command_processor.h +++ b/src/xenia/gpu/gl4/command_processor.h @@ -12,10 +12,12 @@ #include #include +#include #include #include #include +#include #include #include #include @@ -27,6 +29,42 @@ namespace gl4 { class GL4GraphicsSystem; +// TODO(benvanik): move more of the enums in here? +struct DrawCommand { + PrimitiveType prim_type; + uint32_t start_index; + uint32_t index_count; + uint32_t base_vertex; + + GL4Shader* vertex_shader; + GL4Shader* pixel_shader; + + // Index buffer, if present. + // If index_count > 0 but buffer is nullptr then auto draw. + //IndexBufferResource* index_buffer; + void* index_buffer; + + // Vertex buffers. + struct { + uint32_t input_index; + //VertexBufferResource* buffer; + uint32_t stride; + uint32_t offset; + } vertex_buffers[96]; + size_t vertex_buffer_count; + + // Texture samplers. 
+ struct SamplerInput { + uint32_t input_index; + //TextureResource* texture; + //SamplerStateResource* sampler_state; + }; + SamplerInput vertex_shader_samplers[32]; + size_t vertex_shader_sampler_count; + SamplerInput pixel_shader_samplers[32]; + size_t pixel_shader_sampler_count; +}; + class CommandProcessor { public: CommandProcessor(GL4GraphicsSystem* graphics_system); @@ -38,8 +76,10 @@ class CommandProcessor { uint32_t counter() const { return counter_; } void increment_counter() { counter_++; } - void Initialize(uint32_t ptr, uint32_t page_count); + bool Initialize(std::unique_ptr context); void Shutdown(); + + void InitializeRingBuffer(uint32_t ptr, uint32_t page_count); void EnableReadPointerWriteBack(uint32_t ptr, uint32_t block_size); void UpdateWritePointer(uint32_t value); @@ -48,9 +88,12 @@ class CommandProcessor { class RingbufferReader; void WorkerMain(); + bool SetupGL(); + void ShutdownGL(); void WriteRegister(uint32_t packet_ptr, uint32_t index, uint32_t value); void MakeCoherent(); + void PrepareForWait(); void ExecutePrimaryBuffer(uint32_t start_index, uint32_t end_index); void ExecuteIndirectBuffer(uint32_t ptr, uint32_t length); @@ -113,6 +156,11 @@ class CommandProcessor { bool LoadShader(ShaderType shader_type, const uint32_t* address, uint32_t dword_count); + bool PrepareDraw(DrawCommand* draw_command); + bool UpdateState(DrawCommand* draw_command); + bool UpdateRenderTargets(); + bool IssueDraw(DrawCommand* draw_command); + Memory* memory_; uint8_t* membase_; GL4GraphicsSystem* graphics_system_; @@ -120,7 +168,7 @@ class CommandProcessor { std::thread worker_thread_; std::atomic worker_running_; - + std::unique_ptr context_; std::function swap_handler_; uint64_t time_base_; @@ -143,6 +191,10 @@ class CommandProcessor { std::unordered_map shader_cache_; GL4Shader* active_vertex_shader_; GL4Shader* active_pixel_shader_; + + GLuint uniform_data_buffer_; + + DrawCommand draw_command_; }; } // namespace gl4 diff --git 
a/src/xenia/gpu/gl4/gl4_graphics_system.cc b/src/xenia/gpu/gl4/gl4_graphics_system.cc index fbff321ff..239911dc2 100644 --- a/src/xenia/gpu/gl4/gl4_graphics_system.cc +++ b/src/xenia/gpu/gl4/gl4_graphics_system.cc @@ -32,9 +32,19 @@ X_STATUS GL4GraphicsSystem::Setup() { // This must happen on the UI thread. poly::threading::Fence control_ready_fence; auto loop = emulator_->main_window()->loop(); + std::unique_ptr processor_context; loop->Post([&]() { + // Setup the GL control that actually does the drawing. + // We run here in the loop and only touch it (and its context) on this + // thread. That means some sync-fu when we want to swap. control_ = std::make_unique(loop); emulator_->main_window()->AddChild(control_.get()); + + // Setup the GL context the command processor will do all its drawing in. + // It's shared with the control context so that we can resolve framebuffers + // from it. + processor_context = control_->context()->CreateShared(); + control_ready_fence.Signal(); }); control_ready_fence.Wait(); @@ -42,6 +52,10 @@ X_STATUS GL4GraphicsSystem::Setup() { // Create command processor. This will spin up a thread to process all // incoming ringbuffer packets. 
command_processor_ = std::make_unique(this); + if (!command_processor_->Initialize(std::move(processor_context))) { + PLOGE("Unable to initialize command processor"); + return X_STATUS_UNSUCCESSFUL; + } command_processor_->set_swap_handler( std::bind(&GL4GraphicsSystem::SwapHandler, this)); @@ -76,7 +90,7 @@ void GL4GraphicsSystem::Shutdown() { void GL4GraphicsSystem::InitializeRingBuffer(uint32_t ptr, uint32_t page_count) { - command_processor_->Initialize(ptr, page_count); + command_processor_->InitializeRingBuffer(ptr, page_count); } void GL4GraphicsSystem::EnableReadPointerWriteBack(uint32_t ptr, diff --git a/src/xenia/gpu/gl4/gl_context.cc b/src/xenia/gpu/gl4/gl_context.cc index 82439e337..594bde6ef 100644 --- a/src/xenia/gpu/gl4/gl_context.cc +++ b/src/xenia/gpu/gl4/gl_context.cc @@ -9,6 +9,7 @@ #include +#include #include namespace xe { @@ -20,17 +21,26 @@ thread_local WGLEWContext* tls_wglew_context_ = nullptr; extern "C" GLEWContext* glewGetContext() { return tls_glew_context_; } extern "C" WGLEWContext* wglewGetContext() { return tls_wglew_context_; } -GLContext::GLContext() : dc_(nullptr), glrc_(nullptr) {} +GLContext::GLContext() : hwnd_(nullptr), dc_(nullptr), glrc_(nullptr) {} + +GLContext::GLContext(HWND hwnd, HGLRC glrc) + : hwnd_(hwnd), dc_(nullptr), glrc_(glrc) { + dc_ = GetDC(hwnd); +} GLContext::~GLContext() { wglMakeCurrent(nullptr, nullptr); if (glrc_) { wglDeleteContext(glrc_); } + if (dc_) { + ReleaseDC(hwnd_, dc_); + } } -bool GLContext::Initialize(HDC dc) { - dc_ = dc; +bool GLContext::Initialize(HWND hwnd) { + hwnd_ = hwnd; + dc_ = GetDC(hwnd); PIXELFORMATDESCRIPTOR pfd = {0}; pfd.nSize = sizeof(pfd); @@ -59,6 +69,7 @@ bool GLContext::Initialize(HDC dc) { tls_glew_context_ = &glew_context_; tls_wglew_context_ = &wglew_context_; + glewExperimental = GL_TRUE; if (glewInit() != GLEW_OK) { PLOGE("Unable to initialize GLEW"); return false; @@ -73,11 +84,10 @@ bool GLContext::Initialize(HDC dc) { return false; } - int context_flags = 
WGL_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB; + int context_flags = 0; #if DEBUG context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB; -#endif // DEBUG - +#endif // DEBUG int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, // WGL_CONTEXT_MINOR_VERSION_ARB, 5, // WGL_CONTEXT_FLAGS_ARB, context_flags, // @@ -99,6 +109,45 @@ bool GLContext::Initialize(HDC dc) { return true; } +std::unique_ptr GLContext::CreateShared() { + assert_not_null(glrc_); + + int context_flags = 0; +#if DEBUG + context_flags |= WGL_CONTEXT_DEBUG_BIT_ARB; +#endif // DEBUG + int attrib_list[] = {WGL_CONTEXT_MAJOR_VERSION_ARB, 4, // + WGL_CONTEXT_MINOR_VERSION_ARB, 5, // + WGL_CONTEXT_FLAGS_ARB, context_flags, // + 0}; + auto new_glrc = wglCreateContextAttribsARB(dc_, glrc_, attrib_list); + if (!new_glrc) { + PLOGE("Could not create shared context"); + return nullptr; + } + + auto new_context = std::make_unique(hwnd_, new_glrc); + if (!new_context->MakeCurrent()) { + PLOGE("Could not make new GL context current"); + return nullptr; + } + + glewExperimental = GL_TRUE; + if (glewInit() != GLEW_OK) { + PLOGE("Unable to initialize GLEW"); + return nullptr; + } + if (wglewInit() != GLEW_OK) { + PLOGE("Unable to initialize WGLEW"); + return nullptr; + } + + new_context->ClearCurrent(); + MakeCurrent(); + + return new_context; +} + bool GLContext::MakeCurrent() { if (!wglMakeCurrent(dc_, glrc_)) { return false; diff --git a/src/xenia/gpu/gl4/gl_context.h b/src/xenia/gpu/gl4/gl_context.h index 5a3d6005a..05d3c5206 100644 --- a/src/xenia/gpu/gl4/gl_context.h +++ b/src/xenia/gpu/gl4/gl_context.h @@ -10,6 +10,8 @@ #ifndef XENIA_GPU_GL4_GL_CONTEXT_H_ #define XENIA_GPU_GL4_GL_CONTEXT_H_ +#include + #include #include @@ -20,16 +22,20 @@ namespace gl4 { class GLContext { public: GLContext(); + GLContext(HWND hwnd, HGLRC glrc); ~GLContext(); - bool Initialize(HDC dc); + bool Initialize(HWND hwnd); HDC dc() const { return dc_; } + std::unique_ptr CreateShared(); + bool MakeCurrent(); void ClearCurrent(); private: + HWND hwnd_; 
HDC dc_; HGLRC glrc_; diff --git a/src/xenia/gpu/gl4/wgl_control.cc b/src/xenia/gpu/gl4/wgl_control.cc index 5040fcd3a..527463fb7 100644 --- a/src/xenia/gpu/gl4/wgl_control.cc +++ b/src/xenia/gpu/gl4/wgl_control.cc @@ -56,13 +56,7 @@ bool WGLControl::Create() { return false; } - HDC dc = GetDC(hwnd_); - if (!dc) { - PLOGE("No DC for WGL window"); - return false; - } - - if (!context_.Initialize(dc)) { + if (!context_.Initialize(hwnd_)) { PFATAL("Unable to initialize GL context"); return false; } diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc index d49fa5554..009e93af7 100644 --- a/src/xenia/gpu/shader.cc +++ b/src/xenia/gpu/shader.cc @@ -100,7 +100,8 @@ void Shader::GatherExec(const instr_cf_exec_t* cf) { uint32_t alu_off = (cf->address + i); int sync = sequence & 0x2; if (sequence & 0x1) { - auto fetch = reinterpret_cast(&data_[alu_off * 3]); + auto fetch = + reinterpret_cast(data_.data() + alu_off * 3); switch (fetch->opc) { case VTX_FETCH: GatherVertexFetch(&fetch->vtx); @@ -121,7 +122,8 @@ void Shader::GatherExec(const instr_cf_exec_t* cf) { } } else { // TODO(benvanik): gather registers used, predicate bits used, etc. - auto alu = reinterpret_cast(&data_[alu_off * 3]); + auto alu = + reinterpret_cast(data_.data() + alu_off * 3); if (alu->vector_write_mask) { if (alu->export_data && alu->vector_dest == 63) { alloc_counts_.point_size = true;