diff --git a/rpcs3/Emu/RSX/GL/GLExecutionState.h b/rpcs3/Emu/RSX/GL/GLExecutionState.h index 660674ef32..8c1b58fa74 100644 --- a/rpcs3/Emu/RSX/GL/GLExecutionState.h +++ b/rpcs3/Emu/RSX/GL/GLExecutionState.h @@ -21,6 +21,7 @@ namespace gl bool ARB_shader_draw_parameters_supported = false; bool ARB_depth_buffer_float_supported = false; bool ARB_texture_barrier_supported = false; + bool ARB_shader_stencil_export_supported = false; /* Set during extension enumeration when GL_ARB_shader_stencil_export is listed */ bool NV_texture_barrier_supported = false; bool NV_gpu_shader5_supported = false; bool AMD_gpu_shader_half_float_supported = false; @@ -45,7 +46,7 @@ namespace gl void initialize() { - int find_count = 14; + int find_count = 15; /* One more extension is searched for, so the countdown starts one higher */ int ext_count = 0; glGetIntegerv(GL_NUM_EXTENSIONS, &ext_count); @@ -162,6 +163,13 @@ namespace gl find_count--; continue; } + + if (check(ext_name, "GL_ARB_shader_stencil_export")) + { + ARB_shader_stencil_export_supported = true; + find_count--; + continue; + } } // Check GL_VERSION and GL_RENDERER for the presence of Mesa diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index a30b477c45..c8b3388d50 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -364,6 +364,7 @@ void GLGSRender::on_exit() // Globals // TODO: Move these gl::destroy_compute_tasks(); + gl::destroy_overlay_passes(); gl::destroy_global_texture_resources(); diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index a48e296560..5c8dfc5142 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -402,6 +402,16 @@ namespace gl m_alignment = value; return *this; } + + bool get_swap_bytes() const /* Returns the stored m_swap_bytes flag */ + { + return m_swap_bytes; + } + + int get_row_length() const /* Returns the stored m_row_length value */ + { + return m_row_length; + } }; class vao; diff --git a/rpcs3/Emu/RSX/GL/GLOverlays.cpp b/rpcs3/Emu/RSX/GL/GLOverlays.cpp index 1bdb3a9b54..a6ec591b26 100644 --- a/rpcs3/Emu/RSX/GL/GLOverlays.cpp +++ b/rpcs3/Emu/RSX/GL/GLOverlays.cpp @@ -2,6 +2,19 @@ namespace gl { + // Lame: global cache of overlay passes, created on first use and keyed by a per-type index +
std::unordered_map> g_overlay_passes; + + void destroy_overlay_passes() /* Destroys every cached pass, then empties the registry */ + { + for (auto& [key, prog] : g_overlay_passes) + { + prog->destroy(); + } + + g_overlay_passes.clear(); + } + void overlay_pass::create() { if (!compiled) @@ -505,17 +518,8 @@ namespace gl video_out_calibration_pass::video_out_calibration_pass() { vs_src = - "#version 420\n\n" - "layout(location=0) out vec2 tc0;\n" - "\n" - "void main()\n" - "{\n" - " vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n" - " vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n" - " tc0 = coords[gl_VertexID % 4];\n" - " vec2 pos = positions[gl_VertexID % 4];\n" - " gl_Position = vec4(pos, 0., 1.);\n" - "}\n"; + #include "../Program/GLSLSnippets/GenericVSPassthrough.glsl" + ; fs_src = "#version 420\n\n" @@ -578,4 +582,39 @@ namespace gl overlay_pass::run(cmd, viewport, GL_NONE, false, false); } + + rp_ssbo_to_d24x8_texture::rp_ssbo_to_d24x8_texture() + { + vs_src = + #include "../Program/GLSLSnippets/GenericVSPassthrough.glsl" + ; + + fs_src = + #include "../Program/GLSLSnippets/CopyBufferToD24x8.glsl" + ; + + std::pair repl_list[] = + { + { "%set, ", "" }, + { "%loc", std::to_string(GL_COMPUTE_BUFFER_SLOT(0)) }, + { "%push_block", fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(1)) } + }; + + fs_src = fmt::replace_all(fs_src, repl_list); + } + + void rp_ssbo_to_d24x8_texture::run(gl::command_context& cmd, + const buffer* src, const texture* dst, + const u32 src_offset, const coordu& dst_region, + const pixel_unpack_settings& settings) + { + const int row_length = settings.get_row_length(); /* 0 is treated as "unset": the region width is used as the pitch instead */ + program_handle.uniforms["src_pitch"] = row_length ? row_length : static_cast(dst_region.width); + program_handle.uniforms["swap_bytes"] = settings.get_swap_bytes() ? 
1 : 0; + src->bind_range(GL_COMPUTE_BUFFER_SLOT(0), src_offset, (row_length ? row_length : dst_region.width) * 4 * dst_region.height); /* Bind pitch * rows 32-bit texels using the same fallback pitch as the src_pitch uniform; the raw row_length may be 0, which would bind an empty range. Assumes the size argument is in bytes - TODO confirm */ + + cmd->stencil_mask(0xFF); + + overlay_pass::run(cmd, dst_region, dst->id(), true); + } } diff --git a/rpcs3/Emu/RSX/GL/GLOverlays.h b/rpcs3/Emu/RSX/GL/GLOverlays.h index bd8b69b4f3..18a89dac38 100644 --- a/rpcs3/Emu/RSX/GL/GLOverlays.h +++ b/rpcs3/Emu/RSX/GL/GLOverlays.h @@ -108,4 +108,30 @@ namespace gl void run(gl::command_context& cmd, const areau& viewport, const rsx::simple_array& source, f32 gamma, bool limited_rgb, bool _3d); }; + + struct rp_ssbo_to_d24x8_texture : public overlay_pass + { + rp_ssbo_to_d24x8_texture(); + void run(gl::command_context& cmd, const buffer* src, const texture* dst, const u32 src_offset, const coordu& dst_region, const pixel_unpack_settings& settings); + }; + + // TODO: Replace with a proper manager + extern std::unordered_map> g_overlay_passes; + + template + T* get_overlay_pass() + { + u32 index = id_manager::typeinfo::get_index(); + auto &e = g_overlay_passes[index]; + + if (!e) + { + e = std::make_unique(); + e->create(); + } + + return static_cast(e.get()); + } + + void destroy_overlay_passes(); } diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index 6eae223f5e..98ec717349 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -2,6 +2,7 @@ #include "GLTexture.h" #include "GLCompute.h" #include "GLRenderTargets.h" +#include "GLOverlays.h" #include "../GCM.h" #include "../RSXThread.h" #include "../RSXTexture.h" @@ -622,16 +623,36 @@ namespace gl fmt::throw_exception("Invalid depth/stencil type 0x%x", unpack_info.type); } - if (!skip_barrier) + const auto& caps = gl::get_driver_caps(); + if (dst->get_internal_format() == gl::texture::internal_format::depth24_stencil8 && + dst->get_target() == gl::texture::target::texture2D && // Only 2D output supported for the moment. + !caps.vendor_NVIDIA && // NVIDIA has native support for D24X8 data as they introduced this extension. 
+ caps.ARB_shader_stencil_export_supported) // The driver needs to support stencil export at the very least { - glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); + // This optimized path handles the data load on the GPU without context switching to compute. + // The upside is that it is very fast if you have headroom. + // The downside is that it is linear. Not that it matters that much as most drivers seem to be downloading the entire data source and doing really slow things with it. + if (!skip_barrier) + { + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + } + + auto pass = gl::get_overlay_pass(); + pass->run(cmd, transfer_buf, dst, out_offset, {{dst_region.x, dst_region.y}, {dst_region.width, dst_region.height}}, {}); } + else + { + if (!skip_barrier) + { + glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); + } - glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); - transfer_buf->bind(buffer::target::pixel_unpack); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); + transfer_buf->bind(buffer::target::pixel_unpack); - dst->copy_from(reinterpret_cast(u64(out_offset)), static_cast(unpack_info.format), - static_cast(unpack_info.type), dst_level, dst_region, {}); + dst->copy_from(reinterpret_cast(u64(out_offset)), static_cast(unpack_info.format), + static_cast(unpack_info.type), dst_level, dst_region, {}); + } if (scratch_mem) scratch_mem.remove(); } diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToD24x8.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToD24x8.glsl new file mode 100644 index 0000000000..bc9d88305b --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/CopyBufferToD24x8.glsl @@ -0,0 +1,48 @@ +R"( +#version 430 +#extension GL_ARB_shader_stencil_export : enable + +layout(%set, binding=%loc) readonly restrict buffer RawDataBlock +{ + uint data[]; +}; + +#if USE_UBO +layout(%push_block) uniform UnpackConfiguration +{ + uint swap_bytes; + uint src_pitch; +}; +#else + uniform int swap_bytes; + uniform int src_pitch; +#endif + +int getDataOffset() +{ + 
const ivec2 coords = ivec2(gl_FragCoord.xy); /* NOTE(review): gl_FragCoord is window-relative; if the destination region can start at a non-zero x/y this offset would need rebasing - confirm */ + return coords.y * src_pitch + coords.x; +} + +void main() +{ + const int virtual_address = getDataOffset(); + uint real_data = data[virtual_address]; + + const uint stencil_byte = bitfieldExtract(real_data, 0, 8); + uint depth_bytes; + + if (swap_bytes > 0) + { + // CCBBAA00 -> 00AABBCC -> AABBCC. Stencil byte does not actually move. AA lands in bits 16..23 so the value stays within 24 bits + depth_bytes = bitfieldExtract(real_data, 24, 8) | (bitfieldExtract(real_data, 16, 8) << 8) | (bitfieldExtract(real_data, 8, 8) << 16); + } + else + { + depth_bytes = bitfieldExtract(real_data, 8, 24); + } + + gl_FragDepth = float(depth_bytes) / 0xffffff; + gl_FragStencilRefARB = int(stencil_byte); +} +)" diff --git a/rpcs3/Emu/RSX/Program/GLSLSnippets/GenericVSPassthrough.glsl b/rpcs3/Emu/RSX/Program/GLSLSnippets/GenericVSPassthrough.glsl new file mode 100644 index 0000000000..5c4eca74bb --- /dev/null +++ b/rpcs3/Emu/RSX/Program/GLSLSnippets/GenericVSPassthrough.glsl @@ -0,0 +1,13 @@ +R"( +#version 420 +layout(location=0) out vec2 tc0; + +void main() +{ + vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)}; + vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)}; + tc0 = coords[gl_VertexID % 4]; + vec2 pos = positions[gl_VertexID % 4]; + gl_Position = vec4(pos, 0., 1.); +} +)"