gl: Accelerate D24X8_UINT operations

- Adds compute decoding for D24X8_UINT on both download and upload routines
- Adds support for D24X8_UINT operations for typeless copy
This commit is contained in:
kd-11 2020-09-05 18:27:24 +03:00 committed by kd-11
parent abc715bc5c
commit 220e86bbd1
5 changed files with 277 additions and 71 deletions

View File

@ -745,7 +745,6 @@ namespace gl
m_target = static_cast<GLenum>(target_);
}
~save_binding_state()
{
glBindBuffer(m_target, m_last_binding);
@ -942,6 +941,18 @@ namespace gl
{
glBindBufferRange(static_cast<GLenum>(target_), index, id(), offset, size);
}
// Copies `size` bytes from this buffer (starting at src_offset) into `other`
// (starting at dst_offset) using whichever direct-state-access entry point
// the driver reports as available.
void copy_to(buffer* other, u64 src_offset, u64 dst_offset, u64 size)
{
	const auto src_id = id();
	const auto dst_id = other->id();

	if (!get_driver_caps().ARB_dsa_supported)
	{
		// No ARB_direct_state_access reported; use the EXT variant instead
		glNamedCopyBufferSubDataEXT(src_id, dst_id, src_offset, dst_offset, size);
		return;
	}

	glCopyNamedBufferSubData(src_id, dst_id, src_offset, dst_offset, size);
}
};
class ring_buffer : public buffer
@ -1638,7 +1649,12 @@ namespace gl
m_aspect_flags = image_aspect::depth;
break;
}
case GL_DEPTH_COMPONENT32: // Unimplemented decode
case GL_DEPTH_COMPONENT32F:
{
m_pitch = width * 4;
m_aspect_flags = image_aspect::depth;
break;
}
case GL_DEPTH24_STENCIL8:
case GL_DEPTH32F_STENCIL8:
{
@ -1678,17 +1694,17 @@ namespace gl
{
fmt::throw_exception("Unhandled GL format 0x%X" HERE, sized_format);
}
}
if (format_class == RSX_FORMAT_CLASS_UNDEFINED)
{
if (m_aspect_flags != image_aspect::color)
if (format_class == RSX_FORMAT_CLASS_UNDEFINED)
{
rsx_log.error("Undefined format class for depth texture is not allowed");
}
else
{
format_class = RSX_FORMAT_CLASS_COLOR;
if (m_aspect_flags != image_aspect::color)
{
rsx_log.error("Undefined format class for depth texture is not allowed");
}
else
{
format_class = RSX_FORMAT_CLASS_COLOR;
}
}
}

View File

@ -189,6 +189,9 @@ OPENGL_PROC(PFNGLTEXTURESUBIMAGE3DPROC, TextureSubImage3D);
OPENGL_PROC(PFNGLCLEARBUFFERFVPROC, ClearBufferfv);
OPENGL_PROC(PFNGLCOPYNAMEDBUFFERSUBDATAPROC, CopyNamedBufferSubData);
OPENGL_PROC(PFNGLNAMEDCOPYBUFFERSUBDATAEXTPROC, NamedCopyBufferSubDataEXT);
// Sampler Objects
OPENGL_PROC(PFNGLGENSAMPLERSPROC, GenSamplers);
OPENGL_PROC(PFNGLDELETESAMPLERSPROC, DeleteSamplers);

View File

@ -54,8 +54,8 @@ namespace gl
void initialize_memory(gl::command_context& cmd, bool read_access);
public:
render_target(GLuint width, GLuint height, GLenum sized_format)
: viewable_image(GL_TEXTURE_2D, width, height, 1, 1, sized_format)
render_target(GLuint width, GLuint height, GLenum sized_format, rsx::format_class format_class)
: viewable_image(GL_TEXTURE_2D, width, height, 1, 1, sized_format, format_class)
{}
// Internal pitch is the actual row length in bytes of the openGL texture
@ -146,7 +146,8 @@ struct gl_render_target_traits
auto format = rsx::internals::surface_color_format_to_gl(surface_color_format);
std::unique_ptr<gl::render_target> result(new gl::render_target(rsx::apply_resolution_scale(static_cast<u16>(width), true),
rsx::apply_resolution_scale(static_cast<u16>(height), true), static_cast<GLenum>(format.internal_format)));
rsx::apply_resolution_scale(static_cast<u16>(height), true), static_cast<GLenum>(format.internal_format),
RSX_FORMAT_CLASS_COLOR));
result->set_aa_mode(antialias);
result->set_native_pitch(static_cast<u16>(width) * get_format_block_size_in_bytes(surface_color_format) * result->samples_x);
@ -173,7 +174,8 @@ struct gl_render_target_traits
{
auto format = rsx::internals::surface_depth_format_to_gl(surface_depth_format);
std::unique_ptr<gl::render_target> result(new gl::render_target(rsx::apply_resolution_scale(static_cast<u16>(width), true),
rsx::apply_resolution_scale(static_cast<u16>(height), true), static_cast<GLenum>(format.internal_format)));
rsx::apply_resolution_scale(static_cast<u16>(height), true), static_cast<GLenum>(format.internal_format),
rsx::classify_format(surface_depth_format)));
result->set_aa_mode(antialias);
result->set_surface_dimensions(static_cast<u16>(width), static_cast<u16>(height), static_cast<u16>(pitch));
@ -202,7 +204,7 @@ struct gl_render_target_traits
const auto new_w = rsx::apply_resolution_scale(prev.width, true, ref->get_surface_width(rsx::surface_metrics::pixels));
const auto new_h = rsx::apply_resolution_scale(prev.height, true, ref->get_surface_height(rsx::surface_metrics::pixels));
sink = std::make_unique<gl::render_target>(new_w, new_h, internal_format);
sink = std::make_unique<gl::render_target>(new_w, new_h, internal_format, ref->format_class());
sink->add_ref();
sink->memory_usage_flags = rsx::surface_usage_flags::storage;

View File

@ -69,9 +69,9 @@ namespace gl
case CELL_GCM_TEXTURE_G8B8: return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_R6G5B5: return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5);
case CELL_GCM_TEXTURE_DEPTH24_D8: return std::make_tuple(GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8);
case CELL_GCM_TEXTURE_DEPTH24_D8_FLOAT: return std::make_tuple(GL_DEPTH_STENCIL, GL_FLOAT); // TODO, requires separate aspect readback
case CELL_GCM_TEXTURE_DEPTH24_D8_FLOAT: return std::make_tuple(GL_DEPTH_STENCIL, GL_FLOAT_32_UNSIGNED_INT_24_8_REV);
case CELL_GCM_TEXTURE_DEPTH16: return std::make_tuple(GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT);
case CELL_GCM_TEXTURE_DEPTH16_FLOAT: return std::make_tuple(GL_DEPTH_COMPONENT, GL_HALF_FLOAT);
case CELL_GCM_TEXTURE_DEPTH16_FLOAT: return std::make_tuple(GL_DEPTH_COMPONENT, GL_FLOAT);
case CELL_GCM_TEXTURE_X16: return std::make_tuple(GL_RED, GL_UNSIGNED_SHORT);
case CELL_GCM_TEXTURE_Y16_X16: return std::make_tuple(GL_RG, GL_UNSIGNED_SHORT);
case CELL_GCM_TEXTURE_R5G5B5A1: return std::make_tuple(GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1);
@ -126,6 +126,8 @@ namespace gl
return { GL_RGBA, GL_FLOAT, 4, true };
case texture::internal_format::depth16:
return { GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, 2, true };
case texture::internal_format::depth32f:
return { GL_DEPTH_COMPONENT, GL_FLOAT, 2, true };
case texture::internal_format::depth24_stencil8:
case texture::internal_format::depth32f_stencil8:
return { GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 4, true };
@ -154,7 +156,13 @@ namespace gl
}
}
return get_format_type(ifmt);
auto ret = get_format_type(ifmt);
if (tex->format_class() == RSX_FORMAT_CLASS_DEPTH24_FLOAT_X8_PACK32)
{
ret.type = GL_FLOAT_32_UNSIGNED_INT_24_8_REV;
}
return ret;
}
GLenum get_srgb_format(GLenum in_format)
@ -459,6 +467,7 @@ namespace gl
GLenum target;
GLenum internal_format = get_sized_internal_format(gcm_format);
auto format_class = rsx::classify_format(gcm_format);
switch (type)
{
@ -476,7 +485,7 @@ namespace gl
break;
}
return new gl::viewable_image(target, width, height, depth, mipmaps, internal_format);
return new gl::viewable_image(target, width, height, depth, mipmaps, internal_format, format_class);
}
void fill_texture(rsx::texture_dimension_extended dim, u16 mipmap_count, int format, u16 width, u16 height, u16 depth,
@ -538,6 +547,14 @@ namespace gl
else
{
bool apply_settings = true;
buffer upload_scratch_mem, compute_scratch_mem;
cs_shuffle_base* pixel_transform = nullptr;
gsl::span<gsl::byte> dst_buffer = staging_buffer;
void* out_pointer = staging_buffer.data();
u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format);
u64 image_linear_size;
switch (gl_type)
{
case GL_UNSIGNED_INT_8_8_8_8:
@ -552,6 +569,21 @@ namespace gl
apply_settings = (gl_format == GL_RED);
caps.supports_byteswap = apply_settings;
break;
case GL_UNSIGNED_INT_24_8:
if (gl::get_driver_caps().ARB_compute_shader_supported)
{
apply_settings = false;
pixel_transform = gl::get_compute_task<cs_shuffle_x8d24_to_d24x8<true>>();
}
break;
case GL_FLOAT:
// TODO: Expand depth16f to depth32f
gl_type = GL_HALF_FLOAT;
break;
case GL_FLOAT_32_UNSIGNED_INT_24_8_REV:
// TODO: Expand depth24 to depth32f
gl_type = GL_UNSIGNED_INT_24_8;
break;
default:
break;
}
@ -561,10 +593,39 @@ namespace gl
unpack_settings.apply();
}
if (pixel_transform)
{
upload_scratch_mem.create(staging_buffer.size(), nullptr, buffer::memory_type::host_visible, GL_STREAM_DRAW);
compute_scratch_mem.create(staging_buffer.size(), nullptr, buffer::memory_type::local, GL_STATIC_COPY);
out_pointer = nullptr;
}
for (const rsx::subresource_layout& layout : input_layouts)
{
auto op = upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps);
if (apply_settings)
if (pixel_transform)
{
const u64 row_pitch = rsx::align2(layout.width_in_block * block_size_in_bytes, caps.alignment);
image_linear_size = row_pitch * layout.height_in_block * layout.depth;
dst_buffer = { reinterpret_cast<gsl::byte*>(upload_scratch_mem.map(buffer::access::write)), image_linear_size };
}
auto op = upload_texture_subresource(dst_buffer, layout, format, is_swizzled, caps);
if (pixel_transform)
{
	// The subresource was decoded into the mapped upload buffer; hand it over
	// to the compute job and source the texture upload from its output.

	// 1. Unmap buffer
	upload_scratch_mem.unmap();

	// 2. Execute compute job
	upload_scratch_mem.copy_to(&compute_scratch_mem, 0, 0, image_linear_size);
	pixel_transform->run(&compute_scratch_mem, image_linear_size);

	// 3. Bind compute buffer as pixel unpack buffer
	// glMemoryBarrier expects a bitfield of *_BARRIER_BIT flags, not a buffer
	// binding target. GL_PIXEL_BUFFER_BARRIER_BIT is the flag that orders the
	// shader writes before the glTexSubImage* read from the unpack buffer.
	glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT);
	glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE);
	compute_scratch_mem.bind(buffer::target::pixel_unpack);
}
else if (apply_settings)
{
unpack_settings.swap_bytes(op.require_swap);
unpack_settings.apply();
@ -574,22 +635,28 @@ namespace gl
switch (dim)
{
case rsx::texture_dimension_extended::texture_dimension_1d:
glTexSubImage1D(GL_TEXTURE_1D, layout.level, 0, layout.width_in_texel, gl_format, gl_type, staging_buffer.data());
glTexSubImage1D(GL_TEXTURE_1D, layout.level, 0, layout.width_in_texel, gl_format, gl_type, out_pointer);
break;
case rsx::texture_dimension_extended::texture_dimension_2d:
glTexSubImage2D(GL_TEXTURE_2D, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, staging_buffer.data());
glTexSubImage2D(GL_TEXTURE_2D, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, out_pointer);
break;
case rsx::texture_dimension_extended::texture_dimension_cubemap:
glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + layout.layer, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, staging_buffer.data());
glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + layout.layer, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, out_pointer);
break;
case rsx::texture_dimension_extended::texture_dimension_3d:
glTexSubImage3D(GL_TEXTURE_3D, layout.layer, 0, 0, 0, layout.width_in_texel, layout.height_in_texel, depth, gl_format, gl_type, staging_buffer.data());
glTexSubImage3D(GL_TEXTURE_3D, layout.layer, 0, 0, 0, layout.width_in_texel, layout.height_in_texel, depth, gl_format, gl_type, out_pointer);
break;
default:
ASSUME(0);
fmt::throw_exception("Unreachable" HERE);
}
}
if (pixel_transform)
{
upload_scratch_mem.remove();
compute_scratch_mem.remove();
}
}
}
@ -754,40 +821,97 @@ namespace gl
return false;
}
cs_shuffle_base* get_pixel_transform_job(const pixel_buffer_layout& pack_info)
cs_shuffle_base* get_trivial_transform_job(const pixel_buffer_layout& pack_info)
{
const bool is_depth_stencil = (pack_info.type == GL_UNSIGNED_INT_24_8);
if (!is_depth_stencil) [[likely]]
if (!pack_info.swap_bytes)
{
if (!pack_info.swap_bytes)
return nullptr;
}
switch (pack_info.size)
{
case 1:
return nullptr;
case 2:
return gl::get_compute_task<gl::cs_shuffle_16>();
break;
case 4:
return gl::get_compute_task<gl::cs_shuffle_32>();
break;
default:
fmt::throw_exception("Unsupported format");
}
}
cs_shuffle_base* get_image_to_buffer_job(const pixel_buffer_layout& pack_info, u32 aspect_mask)
{
switch (aspect_mask)
{
case image_aspect::color:
{
return get_trivial_transform_job(pack_info);
}
case image_aspect::depth:
{
if (pack_info.type == GL_FLOAT)
{
// TODO: D16F
return nullptr;
}
switch (pack_info.size)
{
case 1:
return nullptr;
case 2:
return gl::get_compute_task<gl::cs_shuffle_16>();
break;
case 4:
return gl::get_compute_task<gl::cs_shuffle_32>();
break;
default:
fmt::throw_exception("Unsupported format");
}
return get_trivial_transform_job(pack_info);
}
else
case image_aspect::depth | image_aspect::stencil:
{
if (pack_info.swap_bytes)
verify(HERE), pack_info.swap_bytes;
if (pack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV)
{
return gl::get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<true>>();
// TODO: D24FX8
return nullptr;
}
else
return gl::get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<true>>();
}
default:
{
fmt::throw_exception("Invalid aspect mask 0x%x" HERE, aspect_mask);
}
}
}
cs_shuffle_base* get_buffer_to_image_job(const pixel_buffer_layout& unpack_info, u32 aspect_mask)
{
switch (aspect_mask)
{
case image_aspect::color:
{
return get_trivial_transform_job(unpack_info);
}
case image_aspect::depth:
{
if (unpack_info.type == GL_FLOAT)
{
return gl::get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<false>>();
// TODO: D16F
return nullptr;
}
return get_trivial_transform_job(unpack_info);
}
case image_aspect::depth | image_aspect::stencil:
{
verify(HERE), unpack_info.swap_bytes;
if (unpack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV)
{
// TODO: D24FX8
return nullptr;
}
return gl::get_compute_task<gl::cs_shuffle_x8d24_to_d24x8<true>>();
}
default:
{
fmt::throw_exception("Invalid aspect mask 0x%x" HERE, aspect_mask);
}
}
}
@ -807,6 +931,28 @@ namespace gl
auto pack_info = get_format_type(src);
auto unpack_info = get_format_type(dst);
if (!caps.ARB_compute_shader_supported)
{
	// Without compute shader support, replace the float depth transfer types
	// with GL_UNSIGNED_INT_24_8 / GL_HALF_FLOAT on both ends of the copy.
	// NOTE(review): presumably because the float conversions are only
	// performed by compute jobs - confirm against the transform job selection.
	auto remove_depth_transformation = [](const texture* tex, pixel_buffer_layout& layout)
	{
		// Only images with a depth aspect are affected
		if (!(tex->aspect() & image_aspect::depth))
		{
			return;
		}

		if (layout.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV)
		{
			layout.type = GL_UNSIGNED_INT_24_8;
		}
		else if (layout.type == GL_FLOAT)
		{
			layout.type = GL_HALF_FLOAT;
		}
	};

	remove_depth_transformation(src, pack_info);
	remove_depth_transformation(dst, unpack_info);
}
// Start pack operation
g_typeless_transfer_buffer.bind(buffer::target::pixel_pack);
@ -829,8 +975,8 @@ namespace gl
if (caps.ARB_compute_shader_supported) [[likely]]
{
auto src_transform = get_pixel_transform_job(pack_info);
auto dst_transform = get_pixel_transform_job(unpack_info);
auto src_transform = get_image_to_buffer_job(pack_info, src->aspect());
auto dst_transform = get_buffer_to_image_job(unpack_info, dst->aspect());
if (src->aspect() == gl::image_aspect::color && dst->aspect() == gl::image_aspect::color)
{

View File

@ -15,6 +15,7 @@
#include "GLRenderTargets.h"
#include "GLOverlays.h"
#include "GLTexture.h"
#include "GLCompute.h"
#include "../Common/TextureUtils.h"
#include "../Common/texture_cache.h"
@ -151,9 +152,7 @@ namespace gl
void dma_transfer(gl::command_context& /*cmd*/, gl::texture* src, const areai& /*src_area*/, const utils::address_range& /*valid_range*/, u32 pitch)
{
init_buffer(src);
glGetError();
pbo.bind(buffer::target::pixel_pack);
if (context == rsx::texture_upload_context::dma)
{
@ -161,23 +160,68 @@ namespace gl
const auto format_info = gl::get_format_type(src->get_internal_format());
format = static_cast<gl::texture::format>(format_info.format);
type = static_cast<gl::texture::type>(format_info.type);
pack_unpack_swap_bytes = format_info.swap_bytes;
}
if ((src->aspect() & gl::image_aspect::stencil) == 0)
bool use_driver_pixel_transform = true;
if (get_driver_caps().ARB_compute_shader_supported) [[likely]]
{
if (src->aspect() & image_aspect::stencil)
{
pack_unpack_swap_bytes = format_info.swap_bytes;
}
else
{
// Z24S8 decode is done on the CPU for now
pack_unpack_swap_bytes = false;
buffer scratch_mem;
scratch_mem.create(buffer::target::pixel_pack, pbo.size(), nullptr, buffer::memory_type::local, GL_STATIC_COPY);
scratch_mem.bind();
pixel_pack_settings pack_settings;
pack_settings.alignment(1);
src->copy_to(nullptr, format, type, pack_settings);
// Invoke compute
if (auto error = glGetError(); !error) [[likely]]
{
cs_shuffle_base * job;
if (pack_unpack_swap_bytes)
{
job = get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<true>>();
}
else
{
job = get_compute_task<gl::cs_shuffle_d24x8_to_x8d24<false>>();
}
const auto job_length = src->pitch() * src->height();
job->run(&scratch_mem, job_length);
glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE);
glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT);
scratch_mem.copy_to(&pbo, 0, 0, job_length);
}
else
{
rsx_log.error("Memory transfer failed with error 0x%x. Format=0x%x, Type=0x%x", error, static_cast<u32>(format), static_cast<u32>(type));
}
scratch_mem.remove();
use_driver_pixel_transform = false;
}
}
pixel_pack_settings pack_settings;
pack_settings.alignment(1);
pack_settings.swap_bytes(pack_unpack_swap_bytes);
if (use_driver_pixel_transform)
{
if (src->aspect() & image_aspect::stencil)
{
pack_unpack_swap_bytes = false;
}
pbo.bind(buffer::target::pixel_pack);
pixel_pack_settings pack_settings;
pack_settings.alignment(1);
pack_settings.swap_bytes(pack_unpack_swap_bytes);
src->copy_to(nullptr, format, type, pack_settings);
}
src->copy_to(nullptr, format, type, pack_settings);
real_pitch = src->pitch();
rsx_pitch = pitch;
@ -297,20 +341,15 @@ namespace gl
const u32 valid_length = valid_range.second;
void *dst = get_ptr(get_section_base() + valid_offset);
if (pack_unpack_swap_bytes)
if (!gl::get_driver_caps().ARB_compute_shader_supported)
{
// Shuffle
// TODO: Do this with a compute shader
switch (type)
{
case gl::texture::type::sbyte:
case gl::texture::type::ubyte:
{
if (pack_unpack_swap_bytes)
{
// byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty
rsx::shuffle_texel_data_wzyx<u8>(dst, rsx_pitch, width, align(valid_length, rsx_pitch) / rsx_pitch);
}
// byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty
verify(HERE), !pack_unpack_swap_bytes;
break;
}
case gl::texture::type::uint_24_8: