From 220e86bbd1da0cd85ac5aa73602320c049f644df Mon Sep 17 00:00:00 2001 From: kd-11 Date: Sat, 5 Sep 2020 18:27:24 +0300 Subject: [PATCH] gl: Accelerate D24X8_UINT operations - Adds compute decoding for D24X8_UINT on both download and upload routines - Adds support for D24X8_UINT operations for typeless copy --- rpcs3/Emu/RSX/GL/GLHelpers.h | 38 +++-- rpcs3/Emu/RSX/GL/GLProcTable.h | 3 + rpcs3/Emu/RSX/GL/GLRenderTargets.h | 12 +- rpcs3/Emu/RSX/GL/GLTexture.cpp | 214 ++++++++++++++++++++++++----- rpcs3/Emu/RSX/GL/GLTextureCache.h | 81 ++++++++--- 5 files changed, 277 insertions(+), 71 deletions(-) diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index d6cf4cd9f4..d9ee67be8a 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -745,7 +745,6 @@ namespace gl m_target = static_cast(target_); } - ~save_binding_state() { glBindBuffer(m_target, m_last_binding); @@ -942,6 +941,18 @@ namespace gl { glBindBufferRange(static_cast(target_), index, id(), offset, size); } + + void copy_to(buffer* other, u64 src_offset, u64 dst_offset, u64 size) + { + if (get_driver_caps().ARB_dsa_supported) + { + glCopyNamedBufferSubData(this->id(), other->id(), src_offset, dst_offset, size); + } + else + { + glNamedCopyBufferSubDataEXT(this->id(), other->id(), src_offset, dst_offset, size); + } + } }; class ring_buffer : public buffer @@ -1638,7 +1649,12 @@ namespace gl m_aspect_flags = image_aspect::depth; break; } - case GL_DEPTH_COMPONENT32: // Unimplemented decode + case GL_DEPTH_COMPONENT32F: + { + m_pitch = width * 4; + m_aspect_flags = image_aspect::depth; + break; + } case GL_DEPTH24_STENCIL8: case GL_DEPTH32F_STENCIL8: { @@ -1678,17 +1694,17 @@ namespace gl { fmt::throw_exception("Unhandled GL format 0x%X" HERE, sized_format); } - } - if (format_class == RSX_FORMAT_CLASS_UNDEFINED) - { - if (m_aspect_flags != image_aspect::color) + if (format_class == RSX_FORMAT_CLASS_UNDEFINED) { - rsx_log.error("Undefined format class for depth texture 
is not allowed"); - } - else - { - format_class = RSX_FORMAT_CLASS_COLOR; + if (m_aspect_flags != image_aspect::color) + { + rsx_log.error("Undefined format class for depth texture is not allowed"); + } + else + { + format_class = RSX_FORMAT_CLASS_COLOR; + } } } diff --git a/rpcs3/Emu/RSX/GL/GLProcTable.h b/rpcs3/Emu/RSX/GL/GLProcTable.h index 4916521aec..59071462bf 100644 --- a/rpcs3/Emu/RSX/GL/GLProcTable.h +++ b/rpcs3/Emu/RSX/GL/GLProcTable.h @@ -189,6 +189,9 @@ OPENGL_PROC(PFNGLTEXTURESUBIMAGE3DPROC, TextureSubImage3D); OPENGL_PROC(PFNGLCLEARBUFFERFVPROC, ClearBufferfv); +OPENGL_PROC(PFNGLCOPYNAMEDBUFFERSUBDATAPROC, CopyNamedBufferSubData); +OPENGL_PROC(PFNGLNAMEDCOPYBUFFERSUBDATAEXTPROC, NamedCopyBufferSubDataEXT); + // Sampler Objects OPENGL_PROC(PFNGLGENSAMPLERSPROC, GenSamplers); OPENGL_PROC(PFNGLDELETESAMPLERSPROC, DeleteSamplers); diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.h b/rpcs3/Emu/RSX/GL/GLRenderTargets.h index 79b3743880..23dd8fde91 100644 --- a/rpcs3/Emu/RSX/GL/GLRenderTargets.h +++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.h @@ -54,8 +54,8 @@ namespace gl void initialize_memory(gl::command_context& cmd, bool read_access); public: - render_target(GLuint width, GLuint height, GLenum sized_format) - : viewable_image(GL_TEXTURE_2D, width, height, 1, 1, sized_format) + render_target(GLuint width, GLuint height, GLenum sized_format, rsx::format_class format_class) + : viewable_image(GL_TEXTURE_2D, width, height, 1, 1, sized_format, format_class) {} // Internal pitch is the actual row length in bytes of the openGL texture @@ -146,7 +146,8 @@ struct gl_render_target_traits auto format = rsx::internals::surface_color_format_to_gl(surface_color_format); std::unique_ptr result(new gl::render_target(rsx::apply_resolution_scale(static_cast(width), true), - rsx::apply_resolution_scale(static_cast(height), true), static_cast(format.internal_format))); + rsx::apply_resolution_scale(static_cast(height), true), static_cast(format.internal_format), + 
RSX_FORMAT_CLASS_COLOR)); result->set_aa_mode(antialias); result->set_native_pitch(static_cast(width) * get_format_block_size_in_bytes(surface_color_format) * result->samples_x); @@ -173,7 +174,8 @@ struct gl_render_target_traits { auto format = rsx::internals::surface_depth_format_to_gl(surface_depth_format); std::unique_ptr result(new gl::render_target(rsx::apply_resolution_scale(static_cast(width), true), - rsx::apply_resolution_scale(static_cast(height), true), static_cast(format.internal_format))); + rsx::apply_resolution_scale(static_cast(height), true), static_cast(format.internal_format), + rsx::classify_format(surface_depth_format))); result->set_aa_mode(antialias); result->set_surface_dimensions(static_cast(width), static_cast(height), static_cast(pitch)); @@ -202,7 +204,7 @@ struct gl_render_target_traits const auto new_w = rsx::apply_resolution_scale(prev.width, true, ref->get_surface_width(rsx::surface_metrics::pixels)); const auto new_h = rsx::apply_resolution_scale(prev.height, true, ref->get_surface_height(rsx::surface_metrics::pixels)); - sink = std::make_unique(new_w, new_h, internal_format); + sink = std::make_unique(new_w, new_h, internal_format, ref->format_class()); sink->add_ref(); sink->memory_usage_flags = rsx::surface_usage_flags::storage; diff --git a/rpcs3/Emu/RSX/GL/GLTexture.cpp b/rpcs3/Emu/RSX/GL/GLTexture.cpp index ae173dfc78..dd6ee9eb40 100644 --- a/rpcs3/Emu/RSX/GL/GLTexture.cpp +++ b/rpcs3/Emu/RSX/GL/GLTexture.cpp @@ -69,9 +69,9 @@ namespace gl case CELL_GCM_TEXTURE_G8B8: return std::make_tuple(GL_RG, GL_UNSIGNED_BYTE); case CELL_GCM_TEXTURE_R6G5B5: return std::make_tuple(GL_RGB, GL_UNSIGNED_SHORT_5_6_5); case CELL_GCM_TEXTURE_DEPTH24_D8: return std::make_tuple(GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8); - case CELL_GCM_TEXTURE_DEPTH24_D8_FLOAT: return std::make_tuple(GL_DEPTH_STENCIL, GL_FLOAT); // TODO, requires separate aspect readback + case CELL_GCM_TEXTURE_DEPTH24_D8_FLOAT: return std::make_tuple(GL_DEPTH_STENCIL, 
GL_FLOAT_32_UNSIGNED_INT_24_8_REV); case CELL_GCM_TEXTURE_DEPTH16: return std::make_tuple(GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT); - case CELL_GCM_TEXTURE_DEPTH16_FLOAT: return std::make_tuple(GL_DEPTH_COMPONENT, GL_HALF_FLOAT); + case CELL_GCM_TEXTURE_DEPTH16_FLOAT: return std::make_tuple(GL_DEPTH_COMPONENT, GL_FLOAT); case CELL_GCM_TEXTURE_X16: return std::make_tuple(GL_RED, GL_UNSIGNED_SHORT); case CELL_GCM_TEXTURE_Y16_X16: return std::make_tuple(GL_RG, GL_UNSIGNED_SHORT); case CELL_GCM_TEXTURE_R5G5B5A1: return std::make_tuple(GL_RGBA, GL_UNSIGNED_SHORT_5_5_5_1); @@ -126,6 +126,8 @@ namespace gl return { GL_RGBA, GL_FLOAT, 4, true }; case texture::internal_format::depth16: return { GL_DEPTH_COMPONENT, GL_UNSIGNED_SHORT, 2, true }; + case texture::internal_format::depth32f: + return { GL_DEPTH_COMPONENT, GL_FLOAT, 2, true }; case texture::internal_format::depth24_stencil8: case texture::internal_format::depth32f_stencil8: return { GL_DEPTH_STENCIL, GL_UNSIGNED_INT_24_8, 4, true }; @@ -154,7 +156,13 @@ namespace gl } } - return get_format_type(ifmt); + auto ret = get_format_type(ifmt); + if (tex->format_class() == RSX_FORMAT_CLASS_DEPTH24_FLOAT_X8_PACK32) + { + ret.type = GL_FLOAT_32_UNSIGNED_INT_24_8_REV; + } + + return ret; } GLenum get_srgb_format(GLenum in_format) @@ -459,6 +467,7 @@ namespace gl GLenum target; GLenum internal_format = get_sized_internal_format(gcm_format); + auto format_class = rsx::classify_format(gcm_format); switch (type) { @@ -476,7 +485,7 @@ namespace gl break; } - return new gl::viewable_image(target, width, height, depth, mipmaps, internal_format); + return new gl::viewable_image(target, width, height, depth, mipmaps, internal_format, format_class); } void fill_texture(rsx::texture_dimension_extended dim, u16 mipmap_count, int format, u16 width, u16 height, u16 depth, @@ -538,6 +547,14 @@ namespace gl else { bool apply_settings = true; + buffer upload_scratch_mem, compute_scratch_mem; + + cs_shuffle_base* pixel_transform = nullptr; + 
gsl::span dst_buffer = staging_buffer; + void* out_pointer = staging_buffer.data(); + u8 block_size_in_bytes = rsx::get_format_block_size_in_bytes(format); + u64 image_linear_size; + switch (gl_type) { case GL_UNSIGNED_INT_8_8_8_8: @@ -552,6 +569,21 @@ namespace gl apply_settings = (gl_format == GL_RED); caps.supports_byteswap = apply_settings; break; + case GL_UNSIGNED_INT_24_8: + if (gl::get_driver_caps().ARB_compute_shader_supported) + { + apply_settings = false; + pixel_transform = gl::get_compute_task>(); + } + break; + case GL_FLOAT: + // TODO: Expand depth16f to depth32f + gl_type = GL_HALF_FLOAT; + break; + case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: + // TODO: Expand depth24 to depth32f + gl_type = GL_UNSIGNED_INT_24_8; + break; default: break; } @@ -561,10 +593,39 @@ namespace gl unpack_settings.apply(); } + if (pixel_transform) + { + upload_scratch_mem.create(staging_buffer.size(), nullptr, buffer::memory_type::host_visible, GL_STREAM_DRAW); + compute_scratch_mem.create(staging_buffer.size(), nullptr, buffer::memory_type::local, GL_STATIC_COPY); + out_pointer = nullptr; + } + for (const rsx::subresource_layout& layout : input_layouts) { - auto op = upload_texture_subresource(staging_buffer, layout, format, is_swizzled, caps); - if (apply_settings) + if (pixel_transform) + { + const u64 row_pitch = rsx::align2(layout.width_in_block * block_size_in_bytes, caps.alignment); + image_linear_size = row_pitch * layout.height_in_block * layout.depth; + dst_buffer = { reinterpret_cast(upload_scratch_mem.map(buffer::access::write)), image_linear_size }; + } + + auto op = upload_texture_subresource(dst_buffer, layout, format, is_swizzled, caps); + + if (pixel_transform) + { + // 1. Unmap buffer + upload_scratch_mem.unmap(); + + // 2. Execute compute job + upload_scratch_mem.copy_to(&compute_scratch_mem, 0, 0, image_linear_size); + pixel_transform->run(&compute_scratch_mem, image_linear_size); + + // 3. 
Bind compute buffer as pixel unpack buffer (barrier bit makes the compute writes visible to pixel transfers) + glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); + glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); + compute_scratch_mem.bind(buffer::target::pixel_unpack); + } + else if (apply_settings) { unpack_settings.swap_bytes(op.require_swap); unpack_settings.apply(); @@ -574,22 +635,28 @@ namespace gl switch (dim) { case rsx::texture_dimension_extended::texture_dimension_1d: - glTexSubImage1D(GL_TEXTURE_1D, layout.level, 0, layout.width_in_texel, gl_format, gl_type, staging_buffer.data()); + glTexSubImage1D(GL_TEXTURE_1D, layout.level, 0, layout.width_in_texel, gl_format, gl_type, out_pointer); break; case rsx::texture_dimension_extended::texture_dimension_2d: - glTexSubImage2D(GL_TEXTURE_2D, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, staging_buffer.data()); + glTexSubImage2D(GL_TEXTURE_2D, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, out_pointer); break; case rsx::texture_dimension_extended::texture_dimension_cubemap: - glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + layout.layer, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, staging_buffer.data()); + glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + layout.layer, layout.level, 0, 0, layout.width_in_texel, layout.height_in_texel, gl_format, gl_type, out_pointer); break; case rsx::texture_dimension_extended::texture_dimension_3d: - glTexSubImage3D(GL_TEXTURE_3D, layout.layer, 0, 0, 0, layout.width_in_texel, layout.height_in_texel, depth, gl_format, gl_type, staging_buffer.data()); + glTexSubImage3D(GL_TEXTURE_3D, layout.layer, 0, 0, 0, layout.width_in_texel, layout.height_in_texel, depth, gl_format, gl_type, out_pointer); break; default: ASSUME(0); fmt::throw_exception("Unreachable" HERE); } } + + if (pixel_transform) + { + upload_scratch_mem.remove(); + compute_scratch_mem.remove(); + } } } @@ -754,40 +821,97 @@ namespace gl return false; } - cs_shuffle_base* 
get_pixel_transform_job(const pixel_buffer_layout& pack_info) + cs_shuffle_base* get_trivial_transform_job(const pixel_buffer_layout& pack_info) { - const bool is_depth_stencil = (pack_info.type == GL_UNSIGNED_INT_24_8); - if (!is_depth_stencil) [[likely]] + if (!pack_info.swap_bytes) { - if (!pack_info.swap_bytes) + return nullptr; + } + + switch (pack_info.size) + { + case 1: + return nullptr; + case 2: + return gl::get_compute_task(); + break; + case 4: + return gl::get_compute_task(); + break; + default: + fmt::throw_exception("Unsupported format"); + } + } + + cs_shuffle_base* get_image_to_buffer_job(const pixel_buffer_layout& pack_info, u32 aspect_mask) + { + switch (aspect_mask) + { + case image_aspect::color: + { + return get_trivial_transform_job(pack_info); + } + case image_aspect::depth: + { + if (pack_info.type == GL_FLOAT) { + // TODO: D16F return nullptr; } - switch (pack_info.size) - { - case 1: - return nullptr; - case 2: - return gl::get_compute_task(); - break; - case 4: - return gl::get_compute_task(); - break; - default: - fmt::throw_exception("Unsupported format"); - } + return get_trivial_transform_job(pack_info); } - else + case image_aspect::depth | image_aspect::stencil: { - if (pack_info.swap_bytes) + verify(HERE), pack_info.swap_bytes; + if (pack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) { - return gl::get_compute_task>(); + // TODO: D24FX8 + return nullptr; } - else + + return gl::get_compute_task>(); + } + default: + { + fmt::throw_exception("Invalid aspect mask 0x%x" HERE, aspect_mask); + } + } + } + + cs_shuffle_base* get_buffer_to_image_job(const pixel_buffer_layout& unpack_info, u32 aspect_mask) + { + switch (aspect_mask) + { + case image_aspect::color: + { + return get_trivial_transform_job(unpack_info); + } + case image_aspect::depth: + { + if (unpack_info.type == GL_FLOAT) { - return gl::get_compute_task>(); + // TODO: D16F + return nullptr; } + + return get_trivial_transform_job(unpack_info); + } + case image_aspect::depth 
| image_aspect::stencil: + { + verify(HERE), unpack_info.swap_bytes; + if (unpack_info.type == GL_FLOAT_32_UNSIGNED_INT_24_8_REV) + { + // TODO: D24FX8 + return nullptr; + } + + return gl::get_compute_task>(); + } + default: + { + fmt::throw_exception("Invalid aspect mask 0x%x" HERE, aspect_mask); + } } } @@ -807,6 +931,28 @@ namespace gl auto pack_info = get_format_type(src); auto unpack_info = get_format_type(dst); + if (!caps.ARB_compute_shader_supported) + { + auto remove_depth_transformation = [](const texture* tex, pixel_buffer_layout& pack_info) + { + if (tex->aspect() & image_aspect::depth) + { + switch (pack_info.type) + { + case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: + pack_info.type = GL_UNSIGNED_INT_24_8; + break; + case GL_FLOAT: + pack_info.type = GL_HALF_FLOAT; + break; + } + } + }; + + remove_depth_transformation(src, pack_info); + remove_depth_transformation(dst, unpack_info); + } + // Start pack operation g_typeless_transfer_buffer.bind(buffer::target::pixel_pack); @@ -829,8 +975,8 @@ namespace gl if (caps.ARB_compute_shader_supported) [[likely]] { - auto src_transform = get_pixel_transform_job(pack_info); - auto dst_transform = get_pixel_transform_job(unpack_info); + auto src_transform = get_image_to_buffer_job(pack_info, src->aspect()); + auto dst_transform = get_buffer_to_image_job(unpack_info, dst->aspect()); if (src->aspect() == gl::image_aspect::color && dst->aspect() == gl::image_aspect::color) { diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index 69dee1d630..4ff754d516 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -15,6 +15,7 @@ #include "GLRenderTargets.h" #include "GLOverlays.h" #include "GLTexture.h" +#include "GLCompute.h" #include "../Common/TextureUtils.h" #include "../Common/texture_cache.h" @@ -151,9 +152,7 @@ namespace gl void dma_transfer(gl::command_context& /*cmd*/, gl::texture* src, const areai& /*src_area*/, const utils::address_range& 
/*valid_range*/, u32 pitch) { init_buffer(src); - glGetError(); - pbo.bind(buffer::target::pixel_pack); if (context == rsx::texture_upload_context::dma) { @@ -161,23 +160,68 @@ namespace gl const auto format_info = gl::get_format_type(src->get_internal_format()); format = static_cast(format_info.format); type = static_cast(format_info.type); + pack_unpack_swap_bytes = format_info.swap_bytes; + } - if ((src->aspect() & gl::image_aspect::stencil) == 0) + bool use_driver_pixel_transform = true; + if (get_driver_caps().ARB_compute_shader_supported) [[likely]] + { + if (src->aspect() & image_aspect::stencil) { - pack_unpack_swap_bytes = format_info.swap_bytes; - } - else - { - // Z24S8 decode is done on the CPU for now - pack_unpack_swap_bytes = false; + buffer scratch_mem; + scratch_mem.create(buffer::target::pixel_pack, pbo.size(), nullptr, buffer::memory_type::local, GL_STATIC_COPY); + scratch_mem.bind(); + + pixel_pack_settings pack_settings; + pack_settings.alignment(1); + src->copy_to(nullptr, format, type, pack_settings); + + // Invoke compute + if (auto error = glGetError(); !error) [[likely]] + { + cs_shuffle_base * job; + if (pack_unpack_swap_bytes) + { + job = get_compute_task>(); + } + else + { + job = get_compute_task>(); + } + + const auto job_length = src->pitch() * src->height(); + job->run(&scratch_mem, job_length); + + glBindBuffer(GL_SHADER_STORAGE_BUFFER, GL_NONE); + glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + scratch_mem.copy_to(&pbo, 0, 0, job_length); + } + else + { + rsx_log.error("Memory transfer failed with error 0x%x. 
Format=0x%x, Type=0x%x", error, static_cast(format), static_cast(type)); + } + + scratch_mem.remove(); + use_driver_pixel_transform = false; } } - pixel_pack_settings pack_settings; - pack_settings.alignment(1); - pack_settings.swap_bytes(pack_unpack_swap_bytes); + if (use_driver_pixel_transform) + { + if (src->aspect() & image_aspect::stencil) + { + pack_unpack_swap_bytes = false; + } + + pbo.bind(buffer::target::pixel_pack); + + pixel_pack_settings pack_settings; + pack_settings.alignment(1); + pack_settings.swap_bytes(pack_unpack_swap_bytes); + + src->copy_to(nullptr, format, type, pack_settings); + } - src->copy_to(nullptr, format, type, pack_settings); real_pitch = src->pitch(); rsx_pitch = pitch; @@ -297,20 +341,15 @@ namespace gl const u32 valid_length = valid_range.second; void *dst = get_ptr(get_section_base() + valid_offset); - if (pack_unpack_swap_bytes) + if (!gl::get_driver_caps().ARB_compute_shader_supported) { - // Shuffle - // TODO: Do this with a compute shader switch (type) { case gl::texture::type::sbyte: case gl::texture::type::ubyte: { - if (pack_unpack_swap_bytes) - { - // byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty - rsx::shuffle_texel_data_wzyx(dst, rsx_pitch, width, align(valid_length, rsx_pitch) / rsx_pitch); - } + // byte swapping does not work on byte types, use uint_8_8_8_8 for rgba8 instead to avoid penalty + verify(HERE), !pack_unpack_swap_bytes; break; } case gl::texture::type::uint_24_8: