diff --git a/src/xenia/gpu/gl4/circular_buffer.h b/src/xenia/gpu/gl4/circular_buffer.h index 2ef75853f..dde0e41d1 100644 --- a/src/xenia/gpu/gl4/circular_buffer.h +++ b/src/xenia/gpu/gl4/circular_buffer.h @@ -31,6 +31,8 @@ class CircularBuffer { bool Initialize(); + GLuint handle() const { return buffer_; } + Allocation Acquire(size_t length); void Commit(Allocation allocation); diff --git a/src/xenia/gpu/gl4/command_processor.cc b/src/xenia/gpu/gl4/command_processor.cc index a15d53116..da4839329 100644 --- a/src/xenia/gpu/gl4/command_processor.cc +++ b/src/xenia/gpu/gl4/command_processor.cc @@ -151,13 +151,17 @@ bool CommandProcessor::SetupGL() { GL_MAP_WRITE_BIT | GL_DYNAMIC_STORAGE_BIT); // Circular buffer holding scratch vertex/index data. - glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); - glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); if (!scratch_buffer_.Initialize()) { PLOGE("Unable to initialize scratch buffer"); return false; } + GLuint vertex_array; + glGenVertexArrays(1, &vertex_array); + glBindVertexArray(vertex_array); + glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); + return true; } @@ -251,8 +255,7 @@ void CommandProcessor::PrepareForWait() { // TODO(benvanik): fences and fancy stuff. We should figure out a way to // make interrupt callbacks from the GPU so that we don't have to do a full // synchronize here. - // glFlush(); - glFinish(); + glFlush(); if (FLAGS_thread_safe_gl) { context_->ClearCurrent(); @@ -1162,10 +1165,11 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { return false; } - // if (!PopulateShaders(draw_command)) { - // XELOGE("Unable to prepare draw shaders"); - // return false; - //} + if (!UpdateShaders(draw_command)) { + PLOGE("Unable to prepare draw shaders"); + return false; + } + // if (!PopulateSamplers(draw_command)) { // XELOGE("Unable to prepare draw samplers"); // return false; @@ -1176,25 +1180,77 @@ bool CommandProcessor::IssueDraw(DrawCommand* draw_command) { return false; } if (!PopulateVertexBuffers(draw_command)) { - XELOGE("Unable to setup vertex buffers"); + PLOGE("Unable to setup vertex buffers"); return false; } + GLenum prim_type = 0; + switch (cmd.prim_type) { + case PrimitiveType::kPointList: + prim_type = GL_POINTS; + /*if (vs->DemandGeometryShader( + D3D11VertexShaderResource::POINT_SPRITE_SHADER, &geometry_shader)) { + return 1; + }*/ + break; + case PrimitiveType::kLineList: + prim_type = GL_LINES; + break; + case PrimitiveType::kLineStrip: + prim_type = GL_LINE_STRIP; + break; + case PrimitiveType::kLineLoop: + prim_type = GL_LINE_LOOP; + break; + case PrimitiveType::kTriangleList: + prim_type = GL_TRIANGLES; + break; + case PrimitiveType::kTriangleStrip: + prim_type = GL_TRIANGLE_STRIP; + break; + case PrimitiveType::kTriangleFan: + prim_type = GL_TRIANGLE_FAN; + break; + case PrimitiveType::kRectangleList: + prim_type = GL_TRIANGLE_STRIP; + /*if (vs->DemandGeometryShader( + D3D11VertexShaderResource::RECT_LIST_SHADER, &geometry_shader)) { + return 1; + }*/ + break; + case PrimitiveType::kQuadList: + prim_type = GL_LINES_ADJACENCY; + /*if + (vs->DemandGeometryShader(D3D11VertexShaderResource::QUAD_LIST_SHADER, + &geometry_shader)) { + return 1; + }*/ + break; + default: + case PrimitiveType::kUnknown0x07: + prim_type = GL_POINTS; + XELOGE("D3D11: unsupported primitive type %d", cmd.prim_type); + break; + } + + // HACK HACK HACK + glDisable(GL_DEPTH_TEST); + if (cmd.index_buffer.address) { // Indexed draw. // PopulateIndexBuffer has our element array setup. - //size_t element_size = cmd.index_buffer.format == IndexFormat::kInt32 - // ? sizeof(uint32_t) - // : sizeof(uint16_t); - //glDrawElementsBaseVertex( - // prim_type, cmd.index_count, - // cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT - // : GL_UNSIGNED_SHORT, - // reinterpret_cast(cmd.start_index * element_size), - // cmd.base_vertex); + size_t element_size = cmd.index_buffer.format == IndexFormat::kInt32 + ? sizeof(uint32_t) + : sizeof(uint16_t); + glDrawElementsBaseVertex( + prim_type, cmd.index_count, + cmd.index_buffer.format == IndexFormat::kInt32 ? GL_UNSIGNED_INT + : GL_UNSIGNED_SHORT, + reinterpret_cast(cmd.start_index * element_size), + cmd.base_vertex); } else { // Auto draw. - //glDrawArrays(prim_type, cmd.start_index, cmd.index_count); + glDrawArrays(prim_type, cmd.start_index, cmd.index_count); } return true; @@ -1215,10 +1271,10 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { }; }; struct UniformDataBlock { - float4 window_offset; // tx,ty,?,? - float4 window_scissor; // x0,y0,x1,y1 - float4 viewport_offset; // tx,ty,tz,? - float4 viewport_scale; // sx,sy,sz,? + float4 window_offset; // tx,ty,rt_w,rt_h + float4 window_scissor; // x0,y0,x1,y1 + float4 viewport_offset; // tx,ty,tz,? + float4 viewport_scale; // sx,sy,sz,? // TODO(benvanik): vertex format xyzw? float4 alpha_test; // alpha test enable, func, ref, ? @@ -1236,11 +1292,10 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { static_assert(sizeof(UniformDataBlock) <= 16 * 1024, "Need <=16k uniform data"); - auto buffer_ptr = reinterpret_cast( - glMapNamedBufferRange(uniform_data_buffer_, 0, 16 * 1024, - GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT)); + auto allocation = scratch_buffer_.Acquire(16 * 1024); + auto buffer_ptr = reinterpret_cast(allocation.host_ptr); if (!buffer_ptr) { - PLOGE("Unable to map uniform data buffer"); + PLOGE("Unable to allocate uniform data buffer"); return false; } @@ -1257,18 +1312,9 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { buffer_ptr->window_scissor.z = float(window_scissor_br & 0x7FFF); buffer_ptr->window_scissor.w = float((window_scissor_br >> 16) & 0x7FFF); - // Viewport scaling. Only enabled if the flags are all set. - buffer_ptr->viewport_scale.x = - regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32; // 640 - buffer_ptr->viewport_offset.x = - regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32; // 640 - buffer_ptr->viewport_scale.y = - regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32; // -360 - buffer_ptr->viewport_offset.y = - regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32; // 360 - buffer_ptr->viewport_scale.z = regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32; // 1 - buffer_ptr->viewport_offset.z = - regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32; // 0 + // HACK: no clue where to get these values. + buffer_ptr->window_offset.z = 1280; + buffer_ptr->window_offset.w = 720; // Whether each of the viewport settings is enabled. // http://www.x.org/docs/AMD/old/evergreen_3D_registers_v2.pdf @@ -1282,6 +1328,23 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { assert_true(vport_xscale_enable == vport_yscale_enable == vport_zscale_enable == vport_xoffset_enable == vport_yoffset_enable == vport_zoffset_enable); + + // Viewport scaling. Only enabled if the flags are all set. + buffer_ptr->viewport_scale.x = + vport_xscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_XSCALE].f32 : 1; // 640 + buffer_ptr->viewport_offset.x = vport_xoffset_enable + ? regs[XE_GPU_REG_PA_CL_VPORT_XOFFSET].f32 + : 0; // 640 + buffer_ptr->viewport_scale.y = vport_yscale_enable + ? regs[XE_GPU_REG_PA_CL_VPORT_YSCALE].f32 + : 1; // -360 + buffer_ptr->viewport_offset.y = vport_yoffset_enable + ? regs[XE_GPU_REG_PA_CL_VPORT_YOFFSET].f32 + : 0; // 360 + buffer_ptr->viewport_scale.z = + vport_zscale_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZSCALE].f32 : 1; // 1 + buffer_ptr->viewport_offset.z = + vport_zoffset_enable ? regs[XE_GPU_REG_PA_CL_VPORT_ZOFFSET].f32 : 0; // 0 // VTX_XY_FMT = true: the incoming X, Y have already been multiplied by 1/W0. // = false: multiply the X, Y coordinates by 1/W0. bool vtx_xy_fmt = (vte_control >> 8) & 0x1; @@ -1504,7 +1567,9 @@ bool CommandProcessor::UpdateState(DrawCommand* draw_command) { stencil_op_map[(depth_control & 0x0001C000) >> 14]); } - glUnmapNamedBuffer(uniform_data_buffer_); + // Stash - program setup will bind this to uniforms. + draw_command->state_data_gpu_ptr = allocation.gpu_ptr; + scratch_buffer_.Commit(std::move(allocation)); return true; } @@ -1590,11 +1655,80 @@ bool CommandProcessor::UpdateRenderTargets(DrawCommand* draw_command) { // TEST TEST TEST TEST TEST TEST TEST TEST TEST TEST // Pretend we are drawing. - glEnable(GL_SCISSOR_TEST); - glScissor(100, 100, 100, 100); - float red[] = {rand() / (float)RAND_MAX, 0, 0, 1.0f}; - glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0, red); - glDisable(GL_SCISSOR_TEST); + // glEnable(GL_SCISSOR_TEST); + // glScissor(100, 100, 100, 100); + // float red[] = {rand() / (float)RAND_MAX, 0, 0, 1.0f}; + // glClearNamedFramebufferfv(active_framebuffer_->framebuffer, GL_COLOR, 0, + // red); + // glDisable(GL_SCISSOR_TEST); + + return true; +} + +bool CommandProcessor::UpdateShaders(DrawCommand* draw_command) { + SCOPE_profile_cpu_f("gpu"); + auto& regs = *register_file_; + auto& cmd = *draw_command; + + xe_gpu_program_cntl_t program_cntl; + program_cntl.dword_0 = regs[XE_GPU_REG_SQ_PROGRAM_CNTL].u32; + if (!active_vertex_shader_->has_prepared()) { + if (!active_vertex_shader_->PrepareVertexShader(program_cntl)) { + XELOGE("Unable to prepare vertex shader"); + return false; + } + } else if (!active_vertex_shader_->is_valid()) { + XELOGE("Vertex shader invalid"); + return false; + } + + if (!active_pixel_shader_->has_prepared()) { + if (!active_pixel_shader_->PreparePixelShader(program_cntl, + active_vertex_shader_)) { + XELOGE("Unable to prepare pixel shader"); + return false; + } + } else if (!active_pixel_shader_->is_valid()) { + XELOGE("Pixel shader invalid"); + return false; + } + + GLuint vertex_program = active_vertex_shader_->program(); + GLuint geometry_program = 0; + GLuint fragment_program = active_pixel_shader_->program(); + + GLuint pipeline; + glCreateProgramPipelines(1, &pipeline); + glUseProgramStages(pipeline, GL_VERTEX_SHADER_BIT, vertex_program); + glUseProgramStages(pipeline, GL_GEOMETRY_SHADER_BIT, geometry_program); + glUseProgramStages(pipeline, GL_FRAGMENT_SHADER_BIT, fragment_program); + + // HACK: layout(location=0) on a bindless uniform crashes nvidia driver. + GLint vertex_state_loc = glGetUniformLocation(vertex_program, "state"); + assert_true(vertex_state_loc == -1 || vertex_state_loc == 0); + GLint geometry_state_loc = + geometry_program ? glGetUniformLocation(geometry_program, "state") : -1; + assert_true(geometry_state_loc == -1 || geometry_state_loc == 0); + GLint fragment_state_loc = glGetUniformLocation(fragment_program, "state"); + assert_true(fragment_state_loc == -1 || fragment_state_loc == 0); + + // TODO(benvanik): do we need to do this for all stages if the locations + // match? + if (vertex_state_loc != -1) { + glProgramUniformHandleui64ARB(vertex_program, vertex_state_loc, + cmd.state_data_gpu_ptr); + } + if (geometry_program && geometry_state_loc != -1) { + glProgramUniformHandleui64ARB(geometry_program, geometry_state_loc, + cmd.state_data_gpu_ptr); + } + if (fragment_state_loc != -1) { + glProgramUniformHandleui64ARB(fragment_program, fragment_state_loc, + cmd.state_data_gpu_ptr); + } + + glBindProgramPipeline(pipeline); + // glDeleteProgramPipelines(1, &pipeline); return true; } @@ -1641,15 +1775,9 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) { SCOPE_profile_cpu_f("gpu"); auto& regs = *register_file_; auto& cmd = *draw_command; + assert_not_null(active_vertex_shader_); - if (!cmd.vertex_shader) { - // No vertex shader, no-op. - return true; - } - - const auto& buffer_inputs = cmd.vertex_shader->buffer_inputs(); - - // glBindVertexArray(vertex_array); + const auto& buffer_inputs = active_vertex_shader_->buffer_inputs(); for (size_t n = 0; n < buffer_inputs.count; n++) { const auto& desc = buffer_inputs.descs[n]; @@ -1685,9 +1813,100 @@ bool CommandProcessor::PopulateVertexBuffers(DrawCommand* draw_command) { reinterpret_cast(membase_ + (fetch->address << 2)), fetch->size); - /*glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, - desc.input_index, - allocation.gpu_ptr, allocation.length);*/ + uint32_t el_index = 0; + for (uint32_t i = 0; i < desc.element_count; ++i) { + const auto& el = desc.elements[i]; + GLuint comp_count; + GLuint comp_size; + GLenum comp_type; + switch (el.format) { + case VertexFormat::k_8_8_8_8: + comp_count = 4; + comp_size = 1; + comp_type = el.is_signed ? GL_BYTE : GL_UNSIGNED_BYTE; + break; + case VertexFormat::k_2_10_10_10: + comp_count = 4; + comp_size = 4; + comp_type = el.is_signed ? GL_INT_2_10_10_10_REV + : GL_UNSIGNED_INT_2_10_10_10_REV; + break; + case VertexFormat::k_10_11_11: + comp_count = 3; + comp_size = 4; + assert_false(el.is_signed); + comp_type = GL_UNSIGNED_INT_10F_11F_11F_REV; + break; + /*case VertexFormat::k_11_11_10: + break;*/ + case VertexFormat::k_16_16: + comp_count = 2; + comp_size = 2; + comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT; + break; + case VertexFormat::k_16_16_FLOAT: + comp_count = 2; + comp_size = 2; + comp_type = GL_HALF_FLOAT; + break; + case VertexFormat::k_16_16_16_16: + comp_count = 4; + comp_size = 2; + comp_type = el.is_signed ? GL_SHORT : GL_UNSIGNED_SHORT; + break; + case VertexFormat::k_16_16_16_16_FLOAT: + comp_count = 4; + comp_size = 2; + comp_type = GL_HALF_FLOAT; + break; + case VertexFormat::k_32: + comp_count = 1; + comp_size = 4; + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_32: + comp_count = 2; + comp_size = 4; + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_32_32_32: + comp_count = 4; + comp_size = 4; + comp_type = el.is_signed ? GL_INT : GL_UNSIGNED_INT; + break; + case VertexFormat::k_32_FLOAT: + comp_count = 1; + comp_size = 4; + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_FLOAT: + comp_count = 2; + comp_size = 4; + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_32_FLOAT: + comp_count = 3; + comp_size = 4; + comp_type = GL_FLOAT; + break; + case VertexFormat::k_32_32_32_32_FLOAT: + comp_count = 4; + comp_size = 4; + comp_type = GL_FLOAT; + break; + default: + assert_unhandled_case(el.format); + break; + } + size_t offset = el.offset_words * sizeof(uint32_t); + glEnableVertexAttribArray(el_index); + glVertexAttribFormatNV(el_index, comp_count, comp_type, el.is_normalized, + desc.stride_words * sizeof(uint32_t)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, el_index, + allocation.gpu_ptr + offset, + allocation.length - offset); + ++el_index; + } // Flush buffer before we draw. scratch_buffer_.Commit(std::move(allocation)); @@ -1782,7 +2001,7 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) { GLenum read_format; GLenum read_type; switch (copy_dest_format) { - case ColorFormat::kColor_8_8_8_8: + case ColorFormat::k_8_8_8_8: read_format = copy_dest_swap ? GL_BGRA : GL_RGBA; read_type = GL_UNSIGNED_BYTE; break; @@ -1832,10 +2051,10 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) { // glBindBuffer(GL_READ_FRAMEBUFFER, framebuffer) glNamedFramebufferReadBuffer(source_framebuffer->framebuffer, GL_COLOR_ATTACHMENT0 + copy_src_select); - glReadPixels(x, y, w, h, read_format, read_type, ptr); + //glReadPixels(x, y, w, h, read_format, read_type, ptr); } else { // Source from the bound depth/stencil target. - glReadPixels(x, y, w, h, GL_DEPTH_STENCIL, read_type, ptr); + //glReadPixels(x, y, w, h, GL_DEPTH_STENCIL, read_type, ptr); } break; case CopyCommand::kRaw: @@ -1876,7 +2095,7 @@ bool CommandProcessor::IssueCopy(DrawCommand* draw_command) { glClearNamedFramebufferfi(source_framebuffer->framebuffer, GL_DEPTH_STENCIL, depth.float_value, stencil); } - + return true; } @@ -1890,8 +2109,8 @@ GLuint CommandProcessor::GetColorRenderTarget(uint32_t pitch, uint32_t height = 2560; // NOTE: we strip gamma formats down to normal ones. - if (format == ColorRenderTargetFormat::k8888Gamma) { - format = ColorRenderTargetFormat::k8888; + if (format == ColorRenderTargetFormat::k_8_8_8_8_GAMMA) { + format = ColorRenderTargetFormat::k_8_8_8_8; } for (auto& it = cached_color_render_targets_.begin(); @@ -1910,8 +2129,8 @@ GLuint CommandProcessor::GetColorRenderTarget(uint32_t pitch, GLenum internal_format; switch (format) { - case ColorRenderTargetFormat::k8888: - case ColorRenderTargetFormat::k8888Gamma: + case ColorRenderTargetFormat::k_8_8_8_8: + case ColorRenderTargetFormat::k_8_8_8_8_GAMMA: internal_format = GL_RGBA8; break; default: diff --git a/src/xenia/gpu/gl4/command_processor.h b/src/xenia/gpu/gl4/command_processor.h index 242325699..e73464d38 100644 --- a/src/xenia/gpu/gl4/command_processor.h +++ b/src/xenia/gpu/gl4/command_processor.h @@ -47,9 +47,6 @@ struct DrawCommand { uint32_t index_count; uint32_t base_vertex; - GL4Shader* vertex_shader; - GL4Shader* pixel_shader; - // Index buffer, if present. // If index_count > 0 but buffer is nullptr then auto draw. struct { @@ -69,6 +66,8 @@ struct DrawCommand { size_t vertex_shader_sampler_count; SamplerInput pixel_shader_samplers[32]; size_t pixel_shader_sampler_count; + + GLuint64 state_data_gpu_ptr; }; class CommandProcessor { @@ -188,6 +187,7 @@ class CommandProcessor { bool IssueDraw(DrawCommand* draw_command); bool UpdateState(DrawCommand* draw_command); bool UpdateRenderTargets(DrawCommand* draw_command); + bool UpdateShaders(DrawCommand* draw_command); bool PopulateIndexBuffer(DrawCommand* draw_command); bool PopulateVertexBuffers(DrawCommand* draw_command); bool IssueCopy(DrawCommand* draw_command); diff --git a/src/xenia/gpu/gl4/gl4_graphics_system.cc b/src/xenia/gpu/gl4/gl4_graphics_system.cc index c61d4a6fa..a977f2a6f 100644 --- a/src/xenia/gpu/gl4/gl4_graphics_system.cc +++ b/src/xenia/gpu/gl4/gl4_graphics_system.cc @@ -11,6 +11,7 @@ #include #include +#include #include namespace xe { @@ -42,11 +43,19 @@ X_STATUS GL4GraphicsSystem::Setup() { control_ = std::make_unique(loop); emulator_->main_window()->AddChild(control_.get()); + if (FLAGS_thread_safe_gl) { + control_->context()->MakeCurrent(); + } + // Setup the GL context the command processor will do all its drawing in. // It's shared with the control context so that we can resolve framebuffers // from it. processor_context = control_->context()->CreateShared(); + if (FLAGS_thread_safe_gl) { + control_->context()->ClearCurrent(); + } + control_ready_fence.Signal(); }); control_ready_fence.Wait(); diff --git a/src/xenia/gpu/gl4/gl4_shader.cc b/src/xenia/gpu/gl4/gl4_shader.cc index d4861f1e2..251e9f197 100644 --- a/src/xenia/gpu/gl4/gl4_shader.cc +++ b/src/xenia/gpu/gl4/gl4_shader.cc @@ -15,7 +15,163 @@ namespace xe { namespace gpu { namespace gl4 { -bool GL4Shader::TranslateImpl() { return true; } +extern "C" GLEWContext* glewGetContext(); + +GL4Shader::GL4Shader(ShaderType shader_type, uint64_t data_hash, + const uint32_t* dword_ptr, uint32_t dword_count) + : Shader(shader_type, data_hash, dword_ptr, dword_count), program_(0) {} + +GL4Shader::~GL4Shader() { glDeleteProgram(program_); } + +const std::string header = + "#version 450\n" + "#extension all : warn\n" + "#extension GL_ARB_bindless_texture : require\n" + "#extension GL_ARB_explicit_uniform_location : require\n" + "#extension GL_ARB_shading_language_420pack : require\n" + "#extension GL_ARB_shader_storage_buffer_object : require\n" + "#extension GL_NV_shader_buffer_load : require\n" + "precision highp float;\n" + "precision highp int;\n" + "layout(std140, column_major) uniform;\n" + "layout(std430, column_major) buffer;\n" + "struct StateData {\n" + " vec4 window_offset;\n" + " vec4 window_scissor;\n" + " vec4 viewport_offset;\n" + " vec4 viewport_scale;\n" + " vec4 alpha_test;\n" + " vec4 float_consts[512];\n" + " uint fetch_consts[32 * 6];\n" + " int bool_consts[8];\n" + " int loop_consts[32];\n" + "};\n" + "struct VertexData {\n" + " vec4 o[16];\n" + "};\n" + "\n" + "uniform StateData* state;\n"; + +bool GL4Shader::PrepareVertexShader( + const xenos::xe_gpu_program_cntl_t& program_cntl) { + if (has_prepared_) { + return is_valid_; + } + has_prepared_ = true; + + std::string apply_viewport = + "vec4 applyViewport(vec4 pos) {\n" + // TODO(benvanik): piecewise viewport_enable -> offset/scale logic. + " if (false) {\n" + " } else {\n" + /*" pos.xy = pos.xy / vec2(state->window_offset.z / 2.0, " + "-state->window_offset.w / 2.0) + vec2(-1.0, 1.0);\n" + " pos.zw = vec2(0.0, 1.0);\n"*/ + " pos.xy = pos.xy / vec2(1280.0 / 2.0, " + "-720.0 / 2.0) + vec2(-1.0, 1.0);\n" + " //pos.zw = vec2(0.0, 1.0);\n" + " }\n" + " pos.x = pos.x * state->viewport_scale.x + \n" + " state->viewport_offset.x;\n" + " pos.y = pos.y * state->viewport_scale.y + \n" + " state->viewport_offset.y;\n" + " pos.z = pos.z * state->viewport_scale.z + \n" + " state->viewport_offset.z;\n" + " pos.xy += state->window_offset.xy;\n" + " return pos;\n" + "}\n"; + std::string source = + header + apply_viewport + + "out gl_PerVertex {\n" + " vec4 gl_Position;\n" + " float gl_PointSize;\n" + " float gl_ClipDistance[];\n" + "};\n" + "layout(location = 0) in vec3 iF0;\n" + "layout(location = 1) in vec4 iF1;\n" + "layout(location = 0) out VertexData vtx;\n" + "void main() {\n" + //" vec4 oPos = vec4(iF0.xy, 0.0, 1.0);\n" + " vec4 oPos = iF0.xxxx * state->float_consts[0];\n" + " oPos = (iF0.yyyy * state->float_consts[1]) + oPos;\n" + " oPos = (iF0.zzzz * state->float_consts[2]) + oPos;\n" + " oPos = (vec4(1.0, 1.0, 1.0, 1.0) * state->float_consts[3]) + oPos;\n" + //" gl_PointSize = 1.0;\n" + " for (int i = 0; i < vtx.o.length(); ++i) {\n" + " vtx.o[0] = vec4(0.0, 0.0, 0.0, 0.0);\n" + " }\n" + " vtx.o[0] = iF1;\n" + " gl_Position = applyViewport(oPos);\n" + //" gl_Position = oPos;\n" + "}\n"; + + if (!CompileProgram(source)) { + return false; + } + + is_valid_ = true; + return true; +} + +bool GL4Shader::PreparePixelShader( + const xenos::xe_gpu_program_cntl_t& program_cntl, + GL4Shader* vertex_shader) { + if (has_prepared_) { + return is_valid_; + } + has_prepared_ = true; + + std::string source = header + + "layout(location = 0) in VertexData vtx;\n" + "layout(location = 0) out vec4 oC[4];\n" + "void main() {\n" + " for (int i = 0; i < oC.length(); ++i) {\n" + " oC[i] = vec4(1.0, 0.0, 0.0, 1.0);\n" + " }\n" + " oC[0] = vtx.o[0];\n" + //" gl_FragDepth = 0.0;\n" + "}\n"; + + if (!CompileProgram(source)) { + return false; + } + + is_valid_ = true; + return true; +} + +bool GL4Shader::CompileProgram(std::string source) { + assert_zero(program_); + + translated_disassembly_ = std::move(source); + const char* source_str = translated_disassembly_.c_str(); + + program_ = glCreateShaderProgramv(shader_type_ == ShaderType::kVertex + ? GL_VERTEX_SHADER + : GL_FRAGMENT_SHADER, + 1, &source_str); + if (!program_) { + PLOGE("Unable to create shader program"); + return false; + } + + GLint link_status = 0; + glGetProgramiv(program_, GL_LINK_STATUS, &link_status); + if (!link_status) { + // log_length includes the null character. + GLint log_length = 0; + glGetProgramiv(program_, GL_INFO_LOG_LENGTH, &log_length); + std::string info_log; + info_log.resize(log_length - 1); + glGetProgramInfoLog(program_, log_length, &log_length, + const_cast(info_log.data())); + PLOGE("Unable to link program: %s", info_log.c_str()); + error_log_ = std::move(info_log); + return false; + } + + return true; +} } // namespace gl4 } // namespace gpu diff --git a/src/xenia/gpu/gl4/gl4_shader.h b/src/xenia/gpu/gl4/gl4_shader.h index 079307e7a..94489d766 100644 --- a/src/xenia/gpu/gl4/gl4_shader.h +++ b/src/xenia/gpu/gl4/gl4_shader.h @@ -11,6 +11,7 @@ #define XENIA_GPU_GL4_GL4_SHADER_H_ #include +#include #include namespace xe { @@ -19,10 +20,20 @@ namespace gl4 { class GL4Shader : public Shader { public: - using Shader::Shader; + GL4Shader(ShaderType shader_type, uint64_t data_hash, + const uint32_t* dword_ptr, uint32_t dword_count); + ~GL4Shader() override; + + GLuint program() const { return program_; } + + bool PrepareVertexShader(const xenos::xe_gpu_program_cntl_t& program_cntl); + bool PreparePixelShader(const xenos::xe_gpu_program_cntl_t& program_cntl, + GL4Shader* vertex_shader); protected: - bool TranslateImpl() override; + bool CompileProgram(std::string source); + + GLuint program_; }; } // namespace gl4 diff --git a/src/xenia/gpu/shader.cc b/src/xenia/gpu/shader.cc index 334331716..d2cb0bd5d 100644 --- a/src/xenia/gpu/shader.cc +++ b/src/xenia/gpu/shader.cc @@ -16,10 +16,14 @@ namespace xe { namespace gpu { using namespace xe::gpu::ucode; +using namespace xe::gpu::xenos; Shader::Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, uint32_t dword_count) - : shader_type_(shader_type), data_hash_(data_hash), is_valid_(false) { + : shader_type_(shader_type), + data_hash_(data_hash), + has_prepared_(false), + is_valid_(false) { data_.resize(dword_count); poly::copy_and_swap(data_.data(), dword_ptr, dword_count); std::memset(&alloc_counts_, 0, sizeof(alloc_counts_)); @@ -35,18 +39,7 @@ Shader::Shader(ShaderType shader_type, uint64_t data_hash, GatherIO(); } -bool Shader::Translate() { - assert_false(is_valid_); - - // TODO(benvanik): disk cache/etc - lookup hash and load if found. - // TODO(benvanik): dump to disk. - - // Attempt implementation-specific translation. - // This may take awhile, and probably will fail. - // TODO(benvanik): parallelize? (allow two translations at once, etc). - is_valid_ = TranslateImpl(); - return is_valid_; -} +Shader::~Shader() = default; void Shader::GatherIO() { // Process all execution blocks. @@ -203,44 +196,43 @@ void Shader::GatherVertexFetch(const instr_fetch_vtx_t* vtx) { } el->vtx_fetch = *vtx; - el->format = vtx->format; + el->format = static_cast(vtx->format); el->is_normalized = vtx->num_format_all == 0; el->is_signed = vtx->format_comp_all == 1; el->offset_words = vtx->offset; el->size_words = 0; switch (el->format) { - case FMT_8_8_8_8: - case FMT_2_10_10_10: - case FMT_10_11_11: - case FMT_11_11_10: + case VertexFormat::k_8_8_8_8: + case VertexFormat::k_2_10_10_10: + case VertexFormat::k_10_11_11: + case VertexFormat::k_11_11_10: el->size_words = 1; break; - case FMT_16_16: - case FMT_16_16_FLOAT: + case VertexFormat::k_16_16: + case VertexFormat::k_16_16_FLOAT: el->size_words = 1; break; - case FMT_16_16_16_16: - case FMT_16_16_16_16_FLOAT: + case VertexFormat::k_16_16_16_16: + case VertexFormat::k_16_16_16_16_FLOAT: el->size_words = 2; break; - case FMT_32: - case FMT_32_FLOAT: + case VertexFormat::k_32: + case VertexFormat::k_32_FLOAT: el->size_words = 1; break; - case FMT_32_32: - case FMT_32_32_FLOAT: + case VertexFormat::k_32_32: + case VertexFormat::k_32_32_FLOAT: el->size_words = 2; break; - case FMT_32_32_32_FLOAT: + case VertexFormat::k_32_32_32_FLOAT: el->size_words = 3; break; - case FMT_32_32_32_32: - case FMT_32_32_32_32_FLOAT: + case VertexFormat::k_32_32_32_32: + case VertexFormat::k_32_32_32_32_FLOAT: el->size_words = 4; break; default: - XELOGE("Unknown vertex format: %d", el->format); - assert_always(); + assert_unhandled_case(el->format); break; } } diff --git a/src/xenia/gpu/shader.h b/src/xenia/gpu/shader.h index 0b755e7c9..05438657c 100644 --- a/src/xenia/gpu/shader.h +++ b/src/xenia/gpu/shader.h @@ -20,21 +20,19 @@ namespace gpu { class Shader { public: - Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, - uint32_t dword_count); + virtual ~Shader(); ShaderType type() const { return shader_type_; } + bool has_prepared() const { return has_prepared_; } bool is_valid() const { return is_valid_; } const std::string& ucode_disassembly() const { return ucode_disassembly_; } const std::string& translated_disassembly() const { return translated_disassembly_; } - bool Translate(); - struct BufferDescElement { ucode::instr_fetch_vtx_t vtx_fetch; - uint32_t format; + xenos::VertexFormat format; uint32_t offset_words; uint32_t size_words; bool is_signed; @@ -76,7 +74,8 @@ class Shader { const std::vector& allocs() const { return allocs_; } protected: - virtual bool TranslateImpl() = 0; + Shader(ShaderType shader_type, uint64_t data_hash, const uint32_t* dword_ptr, + uint32_t dword_count); void GatherIO(); void GatherAlloc(const ucode::instr_cf_alloc_t* cf); @@ -87,10 +86,12 @@ class Shader { ShaderType shader_type_; uint64_t data_hash_; std::vector data_; + bool has_prepared_; bool is_valid_; std::string ucode_disassembly_; std::string translated_disassembly_; + std::string error_log_; AllocCounts alloc_counts_; std::vector execs_; diff --git a/src/xenia/gpu/xenos.h b/src/xenia/gpu/xenos.h index 97c48254c..668f94aae 100644 --- a/src/xenia/gpu/xenos.h +++ b/src/xenia/gpu/xenos.h @@ -72,9 +72,9 @@ enum class MsaaSamples : uint32_t { }; enum class ColorRenderTargetFormat : uint32_t { - k8888 = 0, // D3DFMT_A8R8G8B8 (or ABGR?) - k8888Gamma = 1, // D3DFMT_A8R8G8B8 with gamma correction - // ... + k_8_8_8_8 = 0, // D3DFMT_A8R8G8B8 (or ABGR?) + k_8_8_8_8_GAMMA = 1, // D3DFMT_A8R8G8B8 with gamma correction + // ... }; enum class DepthRenderTargetFormat : uint32_t { @@ -98,29 +98,47 @@ enum class CopyCommand : uint32_t { // Subset of a2xx_sq_surfaceformat. enum class ColorFormat : uint32_t { - kColor_8 = 2, - kColor_1_5_5_5 = 3, - kColor_5_6_5 = 4, - kColor_6_5_5 = 5, - kColor_8_8_8_8 = 6, - kColor_2_10_10_10 = 7, - kColor_8_A = 8, - kColor_8_B = 9, - kColor_8_8 = 10, - kColor_8_8_8_8_A = 14, - kColor_4_4_4_4 = 15, - kColor_10_11_11 = 16, - kColor_11_11_10 = 17, - kColor_16 = 24, - kColor_16_16 = 25, - kColor_16_16_16_16 = 26, - kColor_16_FLOAT = 30, - kColor_16_16_FLOAT = 31, - kColor_16_16_16_16_FLOAT = 32, - kColor_32_FLOAT = 36, - kColor_32_32_FLOAT = 37, - kColor_32_32_32_32_FLOAT = 38, - kColor_2_10_10_10_FLOAT = 62, + k_8 = 2, + k_1_5_5_5 = 3, + k_5_6_5 = 4, + k_6_5_5 = 5, + k_8_8_8_8 = 6, + k_2_10_10_10 = 7, + k_8_A = 8, + k_8_B = 9, + k_8_8 = 10, + k_8_8_8_8_A = 14, + k_4_4_4_4 = 15, + k_10_11_11 = 16, + k_11_11_10 = 17, + k_16 = 24, + k_16_16 = 25, + k_16_16_16_16 = 26, + k_16_FLOAT = 30, + k_16_16_FLOAT = 31, + k_16_16_16_16_FLOAT = 32, + k_32_FLOAT = 36, + k_32_32_FLOAT = 37, + k_32_32_32_32_FLOAT = 38, + k_2_10_10_10_FLOAT = 62, +}; + +enum class VertexFormat : uint32_t { + k_8_8_8_8 = 6, + k_2_10_10_10 = 7, + k_10_11_11 = 16, + k_11_11_10 = 17, + k_16_16 = 25, + k_16_16_16_16 = 26, + k_16_16_FLOAT = 31, + k_16_16_16_16_FLOAT = 32, + k_32 = 33, + k_32_32 = 34, + k_32_32_32_32 = 35, + k_32_FLOAT = 36, + k_32_32_FLOAT = 37, + k_32_32_32_32_FLOAT = 38, + k_32_32_32_FLOAT = 57, }; #define XE_GPU_MAKE_SWIZZLE(x, y, z, w) \