gl: Support loading data from SSBO using compute shaders

- Gives better performance than using raw draw calls.
- Does not work with all formats. The draw call version is still used when needed.
This commit is contained in:
kd-11 2022-06-26 17:59:09 +03:00 committed by kd-11
parent f60002e87d
commit 82439327fa
13 changed files with 326 additions and 94 deletions

View File

@ -351,4 +351,47 @@ namespace gl
const int num_invocations = utils::aligned_div(region.width * region.height, optimal_kernel_size);
compute_task::run(cmd, num_invocations);
}
// Prepares the compute task and generates its GLSL source by filling in the
// placeholders of the CopyBufferToColorImage template.
cs_ssbo_to_color_image::cs_ssbo_to_color_image()
{
	initialize();

	// The snippet file is a raw string literal, so including it yields the template text
	const auto shader_template =
#include "../Program/GLSLSnippets/CopyBufferToColorImage.glsl"
	;

	// Values spliced into the template
	const std::string ssbo_binding = std::to_string(GL_COMPUTE_BUFFER_SLOT(0));
	const std::string group_size = std::to_string(optimal_group_size);
	const std::string kernel_size = std::to_string(optimal_kernel_size);

	// Placeholder -> replacement text. "%set, " is stripped entirely for this backend.
	const std::pair<std::string_view, std::string> substitutions[] =
	{
		{ "%set, ", "" },
		{ "%loc", ssbo_binding },
		{ "%ws", group_size },
		{ "%wks", kernel_size }
	};

	m_src = fmt::replace_all(shader_template, substitutions);
}
// Decodes texel data stored in 'src' (starting at 'src_offset') into the 'dst_region'
// rectangle of the image behind 'dst', using the compute shader built in the constructor.
void cs_ssbo_to_color_image::run(gl::command_context& cmd, const buffer* src, const texture_view* dst, const u32 src_offset, const coordu& dst_region, const pixel_buffer_layout& layout)
{
	// The source pitch handed to the shader is expressed in texels and equals the region width
	const auto texels_per_row = static_cast<u32>(dst_region.width);

	// Bytes per texel, derived from the destination image's pitch and width
	const auto target_image = dst->image();
	const u32 bytes_per_texel = target_image->pitch() / target_image->width();

	// Size in bytes of the buffer window made visible to the shader
	const auto src_length = texels_per_row * bytes_per_texel * dst_region.height;

	// Each kernel run covers optimal_kernel_size texels
	const int num_invocations = utils::aligned_div(dst_region.width * dst_region.height, optimal_kernel_size);

	// Shader configuration
	m_program.uniforms["swap_bytes"] = layout.swap_bytes;
	m_program.uniforms["src_pitch"] = texels_per_row;
	m_program.uniforms["format"] = static_cast<GLenum>(target_image->get_internal_format());
	m_program.uniforms["region_offset"] = color2i(dst_region.x, dst_region.y);
	m_program.uniforms["region_size"] = color2i(dst_region.width, dst_region.height);

	// Input buffer window and output image
	src->bind_range(gl::buffer::target::ssbo, GL_COMPUTE_BUFFER_SLOT(0), src_offset, src_length);
	glBindImageTexture(GL_COMPUTE_IMAGE_SLOT(0), dst->id(), 0, GL_FALSE, 0, GL_WRITE_ONLY, dst->view_format());

	compute_task::run(cmd, num_invocations);
}
// Convenience overload for a plain texture: wraps it in a temporary nil_texture_view
// and forwards to the texture_view overload.
void cs_ssbo_to_color_image::run(gl::command_context& cmd, const buffer* src, texture* dst, const u32 src_offset, const coordu& dst_region, const pixel_buffer_layout& layout)
{
	gl::nil_texture_view whole_image(dst);
	const texture_view* as_view = &whole_image;

	run(cmd, src, as_view, src_offset, dst_region, layout);
}
}

View File

@ -360,6 +360,13 @@ namespace gl
void run(gl::command_context& cmd, gl::viewable_image* src, const gl::buffer* dst, u32 out_offset, const coordu& region, const gl::pixel_buffer_layout& layout, const gl::pixel_pack_settings& settings) override;
};
// Compute task that decodes raw texel data held in a buffer (bound as an SSBO)
// and writes it into a color image through image store.
struct cs_ssbo_to_color_image : compute_task
{
// Builds the shader source from the CopyBufferToColorImage.glsl template
cs_ssbo_to_color_image();
// Decodes 'src' starting at 'src_offset' into 'dst_region' of the image referenced by the view 'dst'
void run(gl::command_context& cmd, const buffer* src, const texture_view* dst, const u32 src_offset, const coordu& dst_region, const pixel_buffer_layout& layout);
// Same as above, but takes the texture directly; it is wrapped in a temporary nil_texture_view
void run(gl::command_context& cmd, const buffer* src, texture* dst, const u32 src_offset, const coordu& dst_region, const pixel_buffer_layout& layout);
};
// TODO: Replace with a proper manager
extern std::unordered_map<u32, std::unique_ptr<gl::compute_task>> g_compute_tasks;

View File

@ -595,14 +595,14 @@ namespace gl
overlay_pass::run(cmd, viewport, GL_NONE, gl::image_aspect::color, false);
}
rp_ssbo_to_texture::rp_ssbo_to_texture()
rp_ssbo_to_generic_texture::rp_ssbo_to_generic_texture()
{
vs_src =
#include "../Program/GLSLSnippets/GenericVSPassthrough.glsl"
;
fs_src =
#include "../Program/GLSLSnippets/CopyBufferToImage.glsl"
#include "../Program/GLSLSnippets/CopyBufferToGenericImage.glsl"
;
std::pair<std::string_view, std::string> repl_list[] =
@ -616,7 +616,7 @@ namespace gl
fs_src = fmt::replace_all(fs_src, repl_list);
}
void rp_ssbo_to_texture::run(gl::command_context& cmd,
void rp_ssbo_to_generic_texture::run(gl::command_context& cmd,
const buffer* src, const texture_view* dst,
const u32 src_offset, const coordu& dst_region,
const pixel_buffer_layout& layout)
@ -634,7 +634,7 @@ namespace gl
overlay_pass::run(cmd, dst_region, dst->id(), dst->aspect());
}
void rp_ssbo_to_texture::run(gl::command_context& cmd,
void rp_ssbo_to_generic_texture::run(gl::command_context& cmd,
const buffer* src, texture* dst,
const u32 src_offset, const coordu& dst_region,
const pixel_buffer_layout& layout)

View File

@ -114,9 +114,9 @@ namespace gl
void run(gl::command_context& cmd, const areau& viewport, const rsx::simple_array<GLuint>& source, f32 gamma, bool limited_rgb, bool _3d);
};
struct rp_ssbo_to_texture : public overlay_pass
struct rp_ssbo_to_generic_texture : public overlay_pass
{
rp_ssbo_to_texture();
rp_ssbo_to_generic_texture();
void run(gl::command_context& cmd, const buffer* src, texture* dst, const u32 src_offset, const coordu& dst_region, const pixel_buffer_layout& layout);
void run(gl::command_context& cmd, const buffer* src, const texture_view* dst, const u32 src_offset, const coordu& dst_region, const pixel_buffer_layout& layout);
};

View File

@ -210,6 +210,9 @@ OPENGL_PROC(PFNGLNAMEDBUFFERDATAEXTPROC, NamedBufferDataEXT);
OPENGL_PROC(PFNGLNAMEDBUFFERSUBDATAPROC, NamedBufferSubData);
OPENGL_PROC(PFNGLNAMEDBUFFERSUBDATAEXTPROC, NamedBufferSubDataEXT);
// ARB_shader_image_load_store
OPENGL_PROC(PFNGLBINDIMAGETEXTUREPROC, BindImageTexture);
// Sampler Objects
OPENGL_PROC(PFNGLGENSAMPLERSPROC, GenSamplers);
OPENGL_PROC(PFNGLDELETESAMPLERSPROC, DeleteSamplers);

View File

@ -631,7 +631,29 @@ namespace gl
}
}
gl::get_overlay_pass<gl::rp_ssbo_to_texture>()->run(cmd, transfer_buf, scratch_view.get(), out_offset, image_region, unpack_info);
// If possible, decode using a compute transform to potentially have asynchronous scheduling
bool use_compute_transform = (dst->aspect() == gl::image_aspect::color);
switch (dst->get_internal_format())
{
case texture::internal_format::bgr5a1:
case texture::internal_format::rgb5a1:
case texture::internal_format::rgb565:
case texture::internal_format::rgba4:
// Packed formats are a problem with image_load_store
use_compute_transform = false;
break;
default:
break;
}
if (use_compute_transform)
{
gl::get_compute_task<gl::cs_ssbo_to_color_image>()->run(cmd, transfer_buf, scratch_view.get(), out_offset, image_region, unpack_info);
}
else
{
gl::get_overlay_pass<gl::rp_ssbo_to_generic_texture>()->run(cmd, transfer_buf, scratch_view.get(), out_offset, image_region, unpack_info);
}
if (dst->get_target() == texture::target::texture3D)
{

View File

@ -21,6 +21,7 @@
#define GL_INTERPRETER_VERTEX_BLOCK SSBO_SLOT(0)
#define GL_INTERPRETER_FRAGMENT_BLOCK SSBO_SLOT(1)
#define GL_COMPUTE_BUFFER_SLOT(index) SSBO_SLOT(2 + index)
#define GL_COMPUTE_IMAGE_SLOT(index) UBO_SLOT(index)
//Function call wrapped in ARB_DSA vs EXT_DSA compat check
#define DSA_CALL(func, object_name, target, ...)\

View File

@ -235,14 +235,15 @@ namespace gl
void texture_view::create(texture* data, GLenum target, GLenum sized_format, const subresource_range& range, const GLenum* argb_swizzle)
{
m_target = target;
m_format = sizedfmt_to_ifmt(sized_format);
m_format = sized_format;
m_view_format = sizedfmt_to_ifmt(sized_format);
m_image_data = data;
m_aspect_flags = range.aspect_mask & data->aspect();
ensure(m_aspect_flags);
glGenTextures(1, &m_id);
glTextureView(m_id, target, data->id(), m_format, range.min_level, range.num_levels, range.min_layer, range.num_layers);
glTextureView(m_id, target, data->id(), m_view_format, range.min_level, range.num_levels, range.min_layer, range.num_layers);
if (argb_swizzle)
{
@ -287,6 +288,26 @@ namespace gl
cmd->bind_texture(layer, m_target, m_id);
}
// Builds a view that aliases 'data' directly: it reuses the texture's own GL id,
// target, format and aspect instead of creating a separate view object.
nil_texture_view::nil_texture_view(texture* data)
{
	m_image_data = data;
	m_id = data->id();
	m_aspect_flags = data->aspect();
	m_target = static_cast<GLenum>(data->get_target());

	// The view format is derived from the image's internal format
	m_format = static_cast<GLenum>(data->get_internal_format());
	m_view_format = sizedfmt_to_ifmt(m_format);

	// Identity swizzle: every channel maps to itself
	const GLenum identity_swizzle[4] = { GL_RED, GL_GREEN, GL_BLUE, GL_ALPHA };
	for (int channel = 0; channel < 4; ++channel)
	{
		component_swizzle[channel] = identity_swizzle[channel];
	}
}
// Resets the stored GL id, which the constructor copied from the wrapped texture.
// NOTE(review): presumably this stops the texture_view base destructor from deleting
// a GL object this view does not own - the base destructor is not visible here, confirm.
nil_texture_view::~nil_texture_view()
{
m_id = GL_NONE;
}
texture_view* viewable_image::get_view(u32 remap_encoding, const std::pair<std::array<u8, 4>, std::array<u8, 4>>& remap_, GLenum aspect_flags)
{
auto remap = remap_;

View File

@ -346,6 +346,7 @@ namespace gl
GLuint m_id = GL_NONE;
GLenum m_target = 0;
GLenum m_format = 0;
GLenum m_view_format = 0;
GLenum m_aspect_flags = 0;
texture* m_image_data = nullptr;
@ -406,6 +407,11 @@ namespace gl
return m_format;
}
// Returns m_view_format: the result of sizedfmt_to_ifmt() on the view's sized format,
// i.e. the format that create() hands to glTextureView.
GLenum view_format() const
{
return m_view_format;
}
GLenum aspect() const
{
return m_aspect_flags;
@ -442,25 +448,8 @@ namespace gl
class nil_texture_view : public texture_view
{
public:
nil_texture_view(texture* data)
: texture_view()
{
m_id = data->id();
m_target = static_cast<GLenum>(data->get_target());
m_format = static_cast<GLenum>(data->get_internal_format());
m_aspect_flags = data->aspect();
m_image_data = data;
component_swizzle[0] = GL_RED;
component_swizzle[1] = GL_GREEN;
component_swizzle[2] = GL_BLUE;
component_swizzle[3] = GL_ALPHA;
}
~nil_texture_view()
{
m_id = GL_NONE;
}
nil_texture_view(texture* data);
~nil_texture_view();
};
class viewable_image : public texture

View File

@ -0,0 +1,188 @@
R"(
#version 450
layout(local_size_x = %ws, local_size_y = 1, local_size_z = 1) in;
#define SSBO_LOCATION(x) (x + %loc)
#define IMAGE_LOCATION(x) (x)
layout(%set, binding=IMAGE_LOCATION(0)) uniform writeonly restrict image2D output2D;
// Values compared against the 'format' uniform in write_output() to select a decoder
#define FMT_GL_RGBA8 0x8058
#define FMT_GL_BGRA8 0x80E1
#define FMT_GL_R8 0x8229
#define FMT_GL_R16 0x822A
#define FMT_GL_R32F 0x822E
#define FMT_GL_RG8 0x822B
#define FMT_GL_RG8_SNORM 0x8F95
#define FMT_GL_RG16 0x822C
#define FMT_GL_RG16F 0x822F
#define FMT_GL_RGBA16F 0x881A
#define FMT_GL_RGBA32F 0x8814
// Byte-swap helpers.
// bswap_u16 swaps the two bytes inside each 16-bit half of a 32-bit value.
// bswap_u32 reverses all four bytes of a 32-bit value.
// NOTE: neither the expansion nor the 'bits' argument is parenthesised, so only pass a
// plain identifier and do not combine the result with higher-precedence operators.
#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8
#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24
// Source texel data, viewed as an array of 32-bit words.
// '%set, ' and '%loc' (inside SSBO_LOCATION) are placeholders substituted before compilation.
layout(%set, binding=SSBO_LOCATION(0), std430) readonly restrict buffer RawDataBlock
{
uint data[];
};
// Unpack configuration. Both variants declare the same parameters:
//   swap_bytes    - non-zero to byte-swap 16-bit and 32-bit reads
//   src_pitch     - row length in texels, used to turn a linear index into a 2D coordinate
//   format        - one of the FMT_GL_* values, selects the decoder
//   region_offset - added to the computed coordinate before the image store
//   region_size   - extent of the destination region, used for bounds rejection
// NOTE(review): USE_UBO and '%push_block' are not defined/substituted anywhere in this
// snippet; presumably the block variant is meant for another consumer - confirm.
#if USE_UBO
layout(%push_block) uniform UnpackConfiguration
{
uint swap_bytes;
uint src_pitch;
uint format;
uint reserved;
ivec2 region_offset;
ivec2 region_size;
};
#else
uniform uint swap_bytes;
uniform uint src_pitch;
uniform uint format;
uniform ivec2 region_offset;
uniform ivec2 region_size;
#endif
// Flattens the 2D global invocation id into a single linear index (row-major over the dispatch).
uint linear_invocation_id()
{
	// Total number of invocations along X for the whole dispatch
	const uint invocations_per_row = gl_NumWorkGroups.x * gl_WorkGroupSize.x;
	const uvec2 gid = gl_GlobalInvocationID.xy;
	return gid.x + (gid.y * invocations_per_row);
}
// Converts a linear texel index into an (x, y) coordinate using src_pitch as the row length.
ivec2 linear_id_to_output_coord(uint index)
{
	const uint column = index % src_pitch;
	const uint row = index / src_pitch;
	return ivec2(int(column), int(row));
}
// Decoders. Beware of multi-wide swapped types (e.g swap(16x2) != swap(32x1))
// Reads one byte; 'address' is a byte index into the raw data.
uint readUint8(const in uint address)
{
	// Four bytes per 32-bit word: select the word, then the byte lane within it
	const uint word = data[address >> 2];
	const int lane_shift = int(address & 3u) * 8;
	return bitfieldExtract(word, lane_shift, 8);
}
// Reads one 16-bit value; 'address' is an index in 16-bit units.
// The two bytes are swapped when swap_bytes is non-zero.
uint readUint16(const in uint address)
{
	// Two halfwords per 32-bit word: select the word, then the half within it
	const uint word = data[address >> 1];
	const int half_shift = int(address & 1u) * 16;
	const uint halfword = bitfieldExtract(word, half_shift, 16);

	return (swap_bytes != 0) ? (bswap_u16(halfword)) : halfword;
}
// Reads one 32-bit word; 'address' is a word index.
// All four bytes are reversed when swap_bytes is non-zero.
uint readUint32(const in uint address)
{
	const uint word = data[address];
	if (swap_bytes != 0)
	{
		return bswap_u32(word);
	}

	return word;
}
// Reads a pair of bytes stored as one 16-bit value ('address' in 16-bit units).
// x receives the low byte and y the high byte of the (optionally swapped) halfword.
uvec2 readUint8x2(const in uint address)
{
	const uint pair_bits = readUint16(address);
	return uvec2(pair_bits & 0xFFu, (pair_bits >> 8) & 0xFFu);
}
// Reads a pair of bytes and reinterprets each as a signed 8-bit value.
ivec2 readInt8x2(const in uint address)
{
	const ivec2 bytes = ivec2(readUint8x2(address));

	// Values above 127 represent negatives: subtract 256 from exactly those lanes
	const ivec2 negative_lanes = ivec2(greaterThan(bytes, ivec2(127)));
	return bytes - (negative_lanes * 256);
}
// 8-bit fixed-point decoders built on the integer readers above.
// Unsigned values are divided by 255, signed values by 127.
// NOTE: the expansions are not parenthesised; use them as a whole right-hand side only.
#define readFixed8(address) readUint8(address) / 255.f
#define readFixed8x2(address) readUint8x2(address) / 255.f
#define readFixed8x2Snorm(address) readInt8x2(address) / 127.f
// Reads one 32-bit word ('address' is a word index) and decodes its four bytes as
// unsigned normalized values; x comes from the lowest byte, w from the highest.
vec4 readFixed8x4(const in uint address)
{
	const uint word = readUint32(address);

	// Shift each byte lane down to bit 0 and mask it off
	const uvec4 channels = (uvec4(word) >> uvec4(0u, 8u, 16u, 24u)) & uvec4(0xFFu);
	return vec4(channels) / 255.f;
}
// 16-bit and 32-bit decoders.
// The single-component forms take an element index; the x2/x4 forms take a texel index
// and read 2 or 4 consecutive elements starting at (address * 2) or (address * 4).
// NOTE: 'address' is not parenthesised inside the x2/x4 expansions, so pass a plain
// identifier (as write_output() does), not a compound expression.
#define readFixed16(address) readUint16(uint(address)) / 65535.f
#define readFixed16x2(address) vec2(readFixed16(address * 2 + 0), readFixed16(address * 2 + 1))
#define readFixed16x4(address) vec4(readFixed16(address * 4 + 0), readFixed16(address * 4 + 1), readFixed16(address * 4 + 2), readFixed16(address * 4 + 3))
// Half-float variants: the 16-bit value is expanded with unpackHalf2x16 and the .x result is used
#define readFloat16(address) unpackHalf2x16(readUint16(uint(address))).x
#define readFloat16x2(address) vec2(readFloat16(address * 2 + 0), readFloat16(address * 2 + 1))
#define readFloat16x4(address) vec4(readFloat16(address * 4 + 0), readFloat16(address * 4 + 1), readFloat16(address * 4 + 2), readFloat16(address * 4 + 3))
// 32-bit float variants: the raw word bits are reinterpreted as float
#define readFloat32(address) uintBitsToFloat(readUint32(address))
#define readFloat32x4(address) uintBitsToFloat(uvec4(readUint32(address * 4 + 0), readUint32(address * 4 + 1), readUint32(address * 4 + 2), readUint32(address * 4 + 3)))
// Number of texels processed by each invocation; '%wks' is substituted before compilation
#define KERNEL_SIZE %wks
// Decodes the texel with linear index 'invocation_id' from the raw buffer according to
// the 'format' uniform and stores it at (coordinate + region_offset) in the output image.
// Texels that fall outside the destination region are skipped.
void write_output(const in uint invocation_id)
{
	// Reject out-of-region texels BEFORE reading the buffer. The dispatch is rounded up,
	// so trailing invocations map to indices past the last texel; decoding them first
	// would read beyond the bound buffer range.
	// Valid coordinates are [0, region_size), hence the inclusive comparison: a strict
	// 'greaterThan' lets the row at y == region_size.y through and writes an extra row.
	const ivec2 coord = linear_id_to_output_coord(invocation_id);
	if (any(greaterThanEqual(coord, region_size)))
	{
		return;
	}

	// Defined defaults for the channels that single/dual channel formats do not write
	vec4 outColor = vec4(0.f, 0.f, 0.f, 1.f);

	switch (format)
	{
	// Simple color
	case FMT_GL_RGBA8:
		outColor = readFixed8x4(invocation_id);
		break;
	case FMT_GL_BGRA8:
		outColor = readFixed8x4(invocation_id).bgra;
		break;
	case FMT_GL_R8:
		outColor.r = readFixed8(invocation_id);
		break;
	case FMT_GL_R16:
		outColor.r = readFixed16(invocation_id);
		break;
	case FMT_GL_R32F:
		outColor.r = readFloat32(invocation_id);
		break;
	case FMT_GL_RG8:
		outColor.rg = readFixed8x2(invocation_id);
		break;
	case FMT_GL_RG8_SNORM:
		outColor.rg = readFixed8x2Snorm(invocation_id);
		break;
	case FMT_GL_RG16:
		outColor.rg = readFixed16x2(invocation_id);
		break;
	case FMT_GL_RG16F:
		outColor.rg = readFloat16x2(invocation_id);
		break;
	case FMT_GL_RGBA16F:
		outColor = readFloat16x4(invocation_id);
		break;
	case FMT_GL_RGBA32F:
		outColor = readFloat32x4(invocation_id);
		break;
	default:
		// Unknown format: the default color above is stored
		break;
	}

	imageStore(output2D, coord + region_offset, outColor);
}
// Entry point: each invocation handles KERNEL_SIZE consecutive texels,
// starting at (linear invocation id * KERNEL_SIZE).
void main()
{
	const uint first_texel = linear_invocation_id() * KERNEL_SIZE;
	for (uint step = 0u; step < uint(KERNEL_SIZE); ++step)
	{
		write_output(first_texel + step);
	}
}
)"

View File

@ -9,26 +9,15 @@ R"(
#define FMT_GL_DEPTH24_STENCIL8 0x88F0
#define FMT_GL_DEPTH32F_STENCIL8 0x8CAD
#define FMT_GL_RGBA8 0x8058
#define FMT_GL_BGRA8 0x80E1
#define FMT_GL_RGB565 0x8D62
#define FMT_GL_RGB5_A1 0x8057
#define FMT_GL_BGR5_A1 0x99F0
#define FMT_GL_RGBA4 0x8056
#define FMT_GL_R8 0x8229
#define FMT_GL_R16 0x822A
#define FMT_GL_R32F 0x822E
#define FMT_GL_RG8 0x822B
#define FMT_GL_RG8_SNORM 0x8F95
#define FMT_GL_RG16 0x822C
#define FMT_GL_RG16F 0x822F
#define FMT_GL_RGBA16F 0x881A
#define FMT_GL_RGBA32F 0x8814
#define bswap_u16(bits) (bits & 0xFF) << 8 | (bits & 0xFF00) >> 8 | (bits & 0xFF0000) << 8 | (bits & 0xFF000000) >> 8
#define bswap_u32(bits) (bits & 0xFF) << 24 | (bits & 0xFF00) << 8 | (bits & 0xFF0000) >> 8 | (bits & 0xFF000000) >> 24
layout(location=0) out vec4 fragColor;
layout(location=0) out vec4 outColor;
layout(%set, binding=%loc, std430) readonly restrict buffer RawDataBlock
{
@ -165,68 +154,33 @@ void main()
#endif
// Simple color
case FMT_GL_RGBA8:
fragColor = readFixed8x4(texel_address);
break;
case FMT_GL_BGRA8:
fragColor = readFixed8x4(texel_address).bgra;
break;
case FMT_GL_R8:
fragColor.r = readFixed8(texel_address);
break;
case FMT_GL_R16:
fragColor.r = readFixed16(texel_address);
break;
case FMT_GL_R32F:
fragColor.r = readFloat32(texel_address);
break;
case FMT_GL_RG8:
fragColor.rg = readFixed8x2(texel_address);
break;
case FMT_GL_RG8_SNORM:
fragColor.rg = readFixed8x2Snorm(texel_address);
break;
case FMT_GL_RG16:
fragColor.rg = readFixed16x2(texel_address);
break;
case FMT_GL_RG16F:
fragColor.rg = readFloat16x2(texel_address);
break;
case FMT_GL_RGBA16F:
fragColor = readFloat16x4(texel_address);
break;
case FMT_GL_RGBA32F:
fragColor = readFloat32x4(texel_address);
break;
// Packed color
case FMT_GL_RGB565:
utmp = readUint16(texel_address);
fragColor.b = bitfieldExtract(utmp, 0, 5) / 31.f;
fragColor.g = bitfieldExtract(utmp, 5, 6) / 63.f;
fragColor.r = bitfieldExtract(utmp, 11, 5) / 31.f;
outColor.b = bitfieldExtract(utmp, 0, 5) / 31.f;
outColor.g = bitfieldExtract(utmp, 5, 6) / 63.f;
outColor.r = bitfieldExtract(utmp, 11, 5) / 31.f;
break;
case FMT_GL_BGR5_A1:
utmp = readUint16(texel_address);
fragColor.b = bitfieldExtract(utmp, 0, 5) / 31.f;
fragColor.g = bitfieldExtract(utmp, 5, 5) / 31.f;
fragColor.r = bitfieldExtract(utmp, 10, 5) / 31.f;
fragColor.a = bitfieldExtract(utmp, 15, 1) * 1.f;
outColor.b = bitfieldExtract(utmp, 0, 5) / 31.f;
outColor.g = bitfieldExtract(utmp, 5, 5) / 31.f;
outColor.r = bitfieldExtract(utmp, 10, 5) / 31.f;
outColor.a = bitfieldExtract(utmp, 15, 1) * 1.f;
break;
case FMT_GL_RGB5_A1:
utmp = readUint16(texel_address);
fragColor.a = bitfieldExtract(utmp, 0, 1) * 1.f;
fragColor.b = bitfieldExtract(utmp, 1, 5) / 31.f;
fragColor.g = bitfieldExtract(utmp, 6, 5) / 31.f;
fragColor.r = bitfieldExtract(utmp, 11, 5) / 31.f;
outColor.a = bitfieldExtract(utmp, 0, 1) * 1.f;
outColor.b = bitfieldExtract(utmp, 1, 5) / 31.f;
outColor.g = bitfieldExtract(utmp, 6, 5) / 31.f;
outColor.r = bitfieldExtract(utmp, 11, 5) / 31.f;
break;
case FMT_GL_RGBA4:
utmp = readUint16(texel_address);
fragColor.b = bitfieldExtract(utmp, 0, 4) / 15.f;
fragColor.g = bitfieldExtract(utmp, 4, 4) / 15.f;
fragColor.r = bitfieldExtract(utmp, 8, 4) / 15.f;
fragColor.a = bitfieldExtract(utmp, 12, 4) / 15.f;
outColor.b = bitfieldExtract(utmp, 0, 4) / 15.f;
outColor.g = bitfieldExtract(utmp, 4, 4) / 15.f;
outColor.r = bitfieldExtract(utmp, 8, 4) / 15.f;
outColor.a = bitfieldExtract(utmp, 12, 4) / 15.f;
break;
}
}

View File

@ -831,7 +831,8 @@
<ItemGroup>
<None Include="Emu\RSX\Program\GLSLInterpreter\FragmentInterpreter.glsl" />
<None Include="Emu\RSX\Program\GLSLInterpreter\VertexInterpreter.glsl" />
<None Include="Emu\RSX\Program\GLSLSnippets\CopyBufferToImage.glsl" />
<None Include="Emu\RSX\Program\GLSLSnippets\CopyBufferToColorImage.glsl" />
<None Include="Emu\RSX\Program\GLSLSnippets\CopyBufferToGenericImage.glsl" />
<None Include="Emu\RSX\Program\GLSLSnippets\CopyD24x8ToBuffer.glsl" />
<None Include="Emu\RSX\Program\GLSLSnippets\CopyRGBA8ToBuffer.glsl" />
<None Include="Emu\RSX\Program\GLSLSnippets\GenericVSPassthrough.glsl" />

View File

@ -2178,9 +2178,6 @@
<None Include="Emu\RSX\Program\GLSLSnippets\ShuffleBytes.glsl">
<Filter>Emu\GPU\RSX\Program\Snippets</Filter>
</None>
<None Include="Emu\RSX\Program\GLSLSnippets\CopyBufferToImage.glsl">
<Filter>Emu\GPU\RSX\Program\Snippets</Filter>
</None>
<None Include="Emu\RSX\Program\GLSLSnippets\GenericVSPassthrough.glsl">
<Filter>Emu\GPU\RSX\Program\Snippets</Filter>
</None>
@ -2190,5 +2187,11 @@
<None Include="Emu\RSX\Program\GLSLSnippets\CopyRGBA8ToBuffer.glsl">
<Filter>Emu\GPU\RSX\Program\Snippets</Filter>
</None>
<None Include="Emu\RSX\Program\GLSLSnippets\CopyBufferToColorImage.glsl">
<Filter>Emu\GPU\RSX\Program\Snippets</Filter>
</None>
<None Include="Emu\RSX\Program\GLSLSnippets\CopyBufferToGenericImage.glsl">
<Filter>Emu\GPU\RSX\Program\Snippets</Filter>
</None>
</ItemGroup>
</Project>