gl: Implement on-chip buffer-to-d24x8 conversion

This commit is contained in:
kd-11 2022-05-31 01:47:55 +03:00 committed by kd-11
parent dd6cb054a7
commit d167582f6b
8 changed files with 184 additions and 18 deletions

View File

@ -21,6 +21,7 @@ namespace gl
bool ARB_shader_draw_parameters_supported = false; bool ARB_shader_draw_parameters_supported = false;
bool ARB_depth_buffer_float_supported = false; bool ARB_depth_buffer_float_supported = false;
bool ARB_texture_barrier_supported = false; bool ARB_texture_barrier_supported = false;
bool ARB_shader_stencil_export_supported = false;
bool NV_texture_barrier_supported = false; bool NV_texture_barrier_supported = false;
bool NV_gpu_shader5_supported = false; bool NV_gpu_shader5_supported = false;
bool AMD_gpu_shader_half_float_supported = false; bool AMD_gpu_shader_half_float_supported = false;
@ -45,7 +46,7 @@ namespace gl
void initialize() void initialize()
{ {
int find_count = 14; int find_count = 15;
int ext_count = 0; int ext_count = 0;
glGetIntegerv(GL_NUM_EXTENSIONS, &ext_count); glGetIntegerv(GL_NUM_EXTENSIONS, &ext_count);
@ -162,6 +163,13 @@ namespace gl
find_count--; find_count--;
continue; continue;
} }
if (check(ext_name, "GL_ARB_shader_stencil_export"))
{
ARB_shader_stencil_export_supported = true;
find_count--;
continue;
}
} }
// Check GL_VERSION and GL_RENDERER for the presence of Mesa // Check GL_VERSION and GL_RENDERER for the presence of Mesa

View File

@ -364,6 +364,7 @@ void GLGSRender::on_exit()
// Globals // Globals
// TODO: Move these // TODO: Move these
gl::destroy_compute_tasks(); gl::destroy_compute_tasks();
gl::destroy_overlay_passes();
gl::destroy_global_texture_resources(); gl::destroy_global_texture_resources();

View File

@ -402,6 +402,16 @@ namespace gl
m_alignment = value; m_alignment = value;
return *this; return *this;
} }
// Returns the stored byte-swap flag of these settings.
bool get_swap_bytes() const { return m_swap_bytes; }
// Returns the stored row length of these settings.
int get_row_length() const { return m_row_length; }
}; };
class vao; class vao;

View File

@ -2,6 +2,19 @@
namespace gl namespace gl
{ {
// Lame
std::unordered_map<u32, std::unique_ptr<gl::overlay_pass>> g_overlay_passes;
// Destroys every pass held in g_overlay_passes and then empties the map.
void destroy_overlay_passes()
{
	for (auto& entry : g_overlay_passes)
	{
		// Only the stored pass matters here; the map key is not needed.
		entry.second->destroy();
	}

	g_overlay_passes.clear();
}
void overlay_pass::create() void overlay_pass::create()
{ {
if (!compiled) if (!compiled)
@ -505,17 +518,8 @@ namespace gl
video_out_calibration_pass::video_out_calibration_pass() video_out_calibration_pass::video_out_calibration_pass()
{ {
vs_src = vs_src =
"#version 420\n\n" #include "../Program/GLSLSnippets/GenericVSPassthrough.glsl"
"layout(location=0) out vec2 tc0;\n" ;
"\n"
"void main()\n"
"{\n"
" vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};\n"
" vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};\n"
" tc0 = coords[gl_VertexID % 4];\n"
" vec2 pos = positions[gl_VertexID % 4];\n"
" gl_Position = vec4(pos, 0., 1.);\n"
"}\n";
fs_src = fs_src =
"#version 420\n\n" "#version 420\n\n"
@ -578,4 +582,39 @@ namespace gl
overlay_pass::run(cmd, viewport, GL_NONE, false, false); overlay_pass::run(cmd, viewport, GL_NONE, false, false);
} }
// Sets up the vertex and fragment shader sources for the buffer-to-D24X8 pass.
rp_ssbo_to_d24x8_texture::rp_ssbo_to_d24x8_texture()
{
	// Each snippet file is a raw string literal, so including it in place yields the shader text.
	vs_src =
	#include "../Program/GLSLSnippets/GenericVSPassthrough.glsl"
	;

	fs_src =
	#include "../Program/GLSLSnippets/CopyBufferToD24x8.glsl"
	;

	// Values substituted for the %-placeholders found in the fragment shader source.
	std::string ssbo_binding = std::to_string(GL_COMPUTE_BUFFER_SLOT(0));
	std::string push_block_layout = fmt::format("binding=%d, std140", GL_COMPUTE_BUFFER_SLOT(1));

	std::pair<std::string_view, std::string> substitutions[] =
	{
		// The "%set, " qualifier is removed outright, leaving only the binding
		{ "%set, ", "" },
		{ "%loc", ssbo_binding },
		{ "%push_block", push_block_layout }
	};

	fs_src = fmt::replace_all(fs_src, substitutions);
}
// Draws the packed depth/stencil words stored in a buffer into a depth-stencil texture.
//
// cmd        - command context; used here to set the stencil mask before drawing.
// src        - buffer holding the source data. The fragment shader reads it as an array
//              of 32-bit words, one word per texel.
// dst        - destination texture; its id is forwarded to overlay_pass::run.
// src_offset - offset into src at which the bound range starts.
// dst_region - destination rectangle. Its width doubles as the row pitch when the
//              settings do not provide one.
// settings   - unpack settings providing the row length and the byte-swap flag.
void rp_ssbo_to_d24x8_texture::run(gl::command_context& cmd,
	const buffer* src, const texture* dst,
	const u32 src_offset, const coordu& dst_region,
	const pixel_unpack_settings& settings)
{
	// A row length of 0 is treated as "unset"; rows are then as wide as the region itself.
	const int row_length = settings.get_row_length();
	const int src_pitch = row_length ? row_length : static_cast<int>(dst_region.width);

	program_handle.uniforms["src_pitch"] = src_pitch;
	program_handle.uniforms["swap_bytes"] = settings.get_swap_bytes() ? 1 : 0;

	// Size the bound range from the effective pitch and express it in bytes.
	// Using the raw row length bound an empty range whenever the row length was unset,
	// and a texel count is 4x too small for a buffer of 32-bit words.
	// NOTE(review): the shader indexes by fragment coordinate, so a region with a
	// non-zero origin may read beyond this range - confirm against overlay_pass::run.
	constexpr u32 bytes_per_texel = 4;
	const u32 src_length = static_cast<u32>(src_pitch) * dst_region.height * bytes_per_texel;
	src->bind_range(GL_COMPUTE_BUFFER_SLOT(0), src_offset, src_length);

	// Set the stencil mask to 0xFF ahead of the draw
	cmd->stencil_mask(0xFF);

	overlay_pass::run(cmd, dst_region, dst->id(), true);
}
} }

View File

@ -108,4 +108,30 @@ namespace gl
void run(gl::command_context& cmd, const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d); void run(gl::command_context& cmd, const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d);
}; };
// Overlay pass that takes a source buffer and a destination texture and
// draws the buffer contents into the given region of that texture.
struct rp_ssbo_to_d24x8_texture : public overlay_pass
{
	rp_ssbo_to_d24x8_texture();

	// Runs the pass, reading from src starting at src_offset and targeting dst_region of dst.
	void run(
		gl::command_context& cmd,
		const buffer* src,
		const texture* dst,
		const u32 src_offset,
		const coordu& dst_region,
		const pixel_unpack_settings& settings);
};
// TODO: Replace with a proper manager
extern std::unordered_map<u32, std::unique_ptr<gl::overlay_pass>> g_overlay_passes;
template<class T>
T* get_overlay_pass()
{
u32 index = id_manager::typeinfo::get_index<T>();
auto &e = g_overlay_passes[index];
if (!e)
{
e = std::make_unique<T>();
e->create();
}
return static_cast<T*>(e.get());
}
void destroy_overlay_passes();
} }

View File

@ -2,6 +2,7 @@
#include "GLTexture.h" #include "GLTexture.h"
#include "GLCompute.h" #include "GLCompute.h"
#include "GLRenderTargets.h" #include "GLRenderTargets.h"
#include "GLOverlays.h"
#include "../GCM.h" #include "../GCM.h"
#include "../RSXThread.h" #include "../RSXThread.h"
#include "../RSXTexture.h" #include "../RSXTexture.h"
@ -622,16 +623,36 @@ namespace gl
fmt::throw_exception("Invalid depth/stencil type 0x%x", unpack_info.type); fmt::throw_exception("Invalid depth/stencil type 0x%x", unpack_info.type);
} }
if (!skip_barrier) const auto caps = gl::get_driver_caps();
if (dst->get_internal_format() == gl::texture::internal_format::depth24_stencil8 &&
dst->get_target() == gl::texture::target::texture2D && // Only 2D output supported for the moment.
!caps.vendor_NVIDIA && // NVIDIA has native support for D24X8 data as they introduced this extension.
caps.ARB_shader_stencil_export_supported) // The driver needs to support stencil export at the very least
{ {
glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // This optimized path handles the data load on the GPU without context switching to compute.
// The upside is that it is very fast if you have headroom.
// The downside is that it is linear. Not that it matters that much as most drivers seem to be downloading the entire data source and doing really slow things with it.
if (!skip_barrier)
{
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
}
auto pass = gl::get_overlay_pass<gl::rp_ssbo_to_d24x8_texture>();
pass->run(cmd, transfer_buf, dst, out_offset, {{dst_region.x, dst_region.y}, {dst_region.width, dst_region.height}}, {});
} }
else
{
if (!skip_barrier)
{
glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT);
}
glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE);
transfer_buf->bind(buffer::target::pixel_unpack); transfer_buf->bind(buffer::target::pixel_unpack);
dst->copy_from(reinterpret_cast<void*>(u64(out_offset)), static_cast<texture::format>(unpack_info.format), dst->copy_from(reinterpret_cast<void*>(u64(out_offset)), static_cast<texture::format>(unpack_info.format),
static_cast<texture::type>(unpack_info.type), dst_level, dst_region, {}); static_cast<texture::type>(unpack_info.type), dst_level, dst_region, {});
}
if (scratch_mem) scratch_mem.remove(); if (scratch_mem) scratch_mem.remove();
} }

View File

@ -0,0 +1,48 @@
R"(
#version 430
#extension GL_ARB_shader_stencil_export : enable
layout(%set, binding=%loc) readonly restrict buffer RawDataBlock
{
uint data[];
};
#if USE_UBO
layout(%push_block) uniform UnpackConfiguration
{
uint swap_bytes;
uint src_pitch;
};
#else
uniform int swap_bytes;
uniform int src_pitch;
#endif
// Maps the current fragment position to an index into the data array,
// using src_pitch as the number of elements per row.
int getDataOffset()
{
	const int column = int(gl_FragCoord.x);
	const int row = int(gl_FragCoord.y);
	return row * src_pitch + column;
}
// Unpacks one 32-bit word from the source buffer into the fragment's depth and stencil outputs.
// The low 8 bits are the stencil value and the upper 24 bits hold the depth.
void main()
{
	const int virtual_address = getDataOffset();
	uint real_data = data[virtual_address];

	// The stencil byte sits in the low 8 bits regardless of the swap mode
	const uint stencil_byte = bitfieldExtract(real_data, 0, 8);
	uint depth_bytes;

	if (swap_bytes > 0)
	{
		// CCBBAA00 -> 00AABBCC -> AABBCC. Stencil byte does not actually move
		// The byte taken from bits 8..15 becomes the most significant of the three depth
		// bytes, so it is shifted by 16. A larger shift would leave the 24-bit range.
		depth_bytes = bitfieldExtract(real_data, 24, 8) | (bitfieldExtract(real_data, 16, 8) << 8) | (bitfieldExtract(real_data, 8, 8) << 16);
	}
	else
	{
		depth_bytes = bitfieldExtract(real_data, 8, 24);
	}

	// Normalize the 24-bit integer depth against its maximum value
	gl_FragDepth = float(depth_bytes) / 0xffffff;
	gl_FragStencilRefARB = int(stencil_byte);
}
)"

View File

@ -0,0 +1,13 @@
R"(
#version 420
layout(location=0) out vec2 tc0;
// Emits one corner of the clip-space square spanning -1..1 on both axes, chosen by
// gl_VertexID modulo 4, together with a texture coordinate whose Y axis is flipped
// relative to the position.
void main()
{
	vec2 positions[] = {vec2(-1., -1.), vec2(1., -1.), vec2(-1., 1.), vec2(1., 1.)};
	vec2 coords[] = {vec2(0., 1.), vec2(1., 1.), vec2(0., 0.), vec2(1., 0.)};

	// Vertex ids wrap around every four vertices
	int corner = gl_VertexID % 4;

	tc0 = coords[corner];
	gl_Position = vec4(positions[corner], 0., 1.);
}
)"