From 2033f3f7dcae6fcdb9497d9ed8f46a9e2781c9a0 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 14 Aug 2017 00:27:19 +0300 Subject: [PATCH] rsx/vk/gl: Refactoring and reimplementation of blit engine Fix rsx offscreen-render-to-display-buffer-blit surface reads - Also, properly scale display output height if reading from compressed tile gl: Fix broken dst height computation - The extra padding is only there to force power-of-2 sizes and isnt used gl: Ignore compression scaling if output is rendered to in a renderpass rsx/gl/vk: Cleanup for GPU texture scaling. Initial impl [WIP] - TODO: Refactor more shared code into RSX/common --- rpcs3/Emu/RSX/Common/surface_store.h | 205 +++++++++++++ rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h | 12 + rpcs3/Emu/RSX/GL/GLGSRender.cpp | 52 +++- rpcs3/Emu/RSX/GL/GLHelpers.h | 6 + rpcs3/Emu/RSX/GL/GLRenderTargets.h | 209 +------------ rpcs3/Emu/RSX/GL/GLTextureCache.h | 107 +++---- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 14 +- rpcs3/Emu/RSX/VK/VKGSRender.h | 1 + rpcs3/Emu/RSX/VK/VKRenderTargets.h | 11 + rpcs3/Emu/RSX/VK/VKTextureCache.h | 312 +++++++++++++++++++- rpcs3/Emu/RSX/rsx_methods.cpp | 31 -- 11 files changed, 664 insertions(+), 296 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/surface_store.h b/rpcs3/Emu/RSX/Common/surface_store.h index eddd1d57f6..46d28bec0f 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.h +++ b/rpcs3/Emu/RSX/Common/surface_store.h @@ -13,6 +13,36 @@ namespace rsx size_t get_packed_pitch(surface_color_format format, u32 width); } + template + struct surface_subresource_storage + { + surface_type surface = nullptr; + + u16 x = 0; + u16 y = 0; + u16 w = 0; + u16 h = 0; + + bool is_bound = false; + bool is_depth_surface = false; + bool is_clipped = false; + + surface_subresource_storage() {} + + surface_subresource_storage(surface_type src, u16 X, u16 Y, u16 W, u16 H, bool _Bound, bool _Depth, bool _Clipped = false) + : surface(src), x(X), y(Y), w(W), h(H), is_bound(_Bound), is_depth_surface(_Depth), 
is_clipped(_Clipped) + {} + }; + + struct surface_format_info + { + u32 surface_width; + u32 surface_height; + u16 native_pitch; + u16 rsx_pitch; + u8 bpp; + }; + /** * Helper for surface (ie color and depth stencil render target) management. * It handles surface creation and storage. Backend should only retrieve pointer to surface. @@ -64,6 +94,7 @@ namespace rsx using surface_type = typename Traits::surface_type; using command_list_type = typename Traits::command_list_type; using download_buffer_object = typename Traits::download_buffer_object; + using surface_subresource = typename surface_subresource_storage; std::unordered_map m_render_targets_storage = {}; std::unordered_map m_depth_stencil_storage = {}; @@ -437,5 +468,179 @@ namespace rsx for (auto &ds : m_depth_stencil_storage) Traits::invalidate_depth_surface_contents(command_list, Traits::get(std::get<1>(ds)), nullptr, true); } + + /** + * Clipping and fitting lookup funcrions + * surface_overlaps - returns true if surface overlaps a given surface address and returns the relative x and y position of the surface address within the surface + * address_is_bound - returns true if the surface at a given address is actively bound + * get_surface_subresource_if_available - returns a sectiion descriptor that allows to crop surfaces stored in memory + */ + bool surface_overlaps_address(surface_type surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y, bool scale_to_fit) + { + bool is_subslice = false; + u16 x_offset = 0; + u16 y_offset = 0; + + if (surface_address > texaddr) + return false; + + u32 offset = texaddr - surface_address; + if (texaddr >= surface_address) + { + + if (offset == 0) + { + is_subslice = true; + } + else + { + surface_format_info info; + Traits::get_surface_info(surface, &info); + + u32 range = info.rsx_pitch * info.surface_height; + if (offset < range) + { + const u32 y = (offset / info.rsx_pitch); + u32 x = (offset % info.rsx_pitch) / info.bpp; + + if (scale_to_fit) + { + const f32 
x_scale = (f32)info.rsx_pitch / info.native_pitch; + x = (u32)((f32)x / x_scale); + } + + x_offset = x; + y_offset = y; + + is_subslice = true; + } + } + + if (is_subslice) + { + *x = x_offset; + *y = y_offset; + + return true; + } + } + + return false; + } + + bool address_is_bound(u32 address, bool is_depth) const + { + if (is_depth) + { + const u32 bound_depth_address = std::get<0>(m_bound_depth_stencil); + return (bound_depth_address == address); + } + + for (auto &surface : m_bound_render_targets) + { + const u32 bound_address = std::get<0>(surface); + if (bound_address == address) + return true; + } + + return false; + } + + inline bool region_fits(u16 region_width, u16 region_height, u16 x_offset, u16 y_offset, u16 width, u16 height) const + { + if ((x_offset + width) > region_width) return false; + if ((y_offset + height) > region_height) return false; + + return true; + } + + surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch, bool scale_to_fit = false, bool crop = false, bool ignore_depth_formats = false) + { + auto test_surface = [&](surface_type surface, u32 this_address, u16 &x_offset, u16 &y_offset, u16 &w, u16 &h, bool &clipped) + { + if (surface_overlaps_address(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit)) + { + surface_format_info info; + Traits::get_surface_info(surface, &info); + + if (info.rsx_pitch != requested_pitch) + return false; + + u16 real_width = requested_width; + + if (scale_to_fit) + { + f32 pitch_scaling = (f32)requested_pitch / info.native_pitch; + real_width = (u16)((f32)requested_width / pitch_scaling); + } + + if (region_fits(info.surface_width, info.surface_height, x_offset, y_offset, real_width, requested_height)) + { + w = info.surface_width; + h = info.surface_height; + clipped = false; + + return true; + } + else + { + if (crop) //Forcefully fit the requested region by clipping and scaling + { + u16 remaining_width = 
info.surface_width - x_offset; + u16 remaining_height = info.surface_height - y_offset; + + w = remaining_width; + h = remaining_height; + clipped = true; + + return true; + } + + if (info.surface_width >= requested_width && info.surface_height >= requested_height) + { + LOG_WARNING(RSX, "Overlapping surface exceeds bounds; returning full surface region"); + w = requested_width; + h = requested_height; + clipped = true; + + return true; + } + } + } + + return false; + }; + + surface_type surface = nullptr; + bool clipped = false; + u16 x_offset = 0; + u16 y_offset = 0; + u16 w; + u16 h; + + for (auto &tex_info : m_render_targets_storage) + { + u32 this_address = std::get<0>(tex_info); + surface = std::get<1>(tex_info).get(); + + if (test_surface(surface, this_address, x_offset, y_offset, w, h, clipped)) + return { surface, x_offset, y_offset, w, h, address_is_bound(this_address, false), false, clipped }; + } + + if (ignore_depth_formats) + return{}; + + //Check depth surfaces for overlap + for (auto &tex_info : m_depth_stencil_storage) + { + u32 this_address = std::get<0>(tex_info); + surface = std::get<1>(tex_info).get(); + + if (test_surface(surface, this_address, x_offset, y_offset, w, h, clipped)) + return { surface, x_offset, y_offset, w, h, address_is_bound(this_address, true), true, clipped }; + } + + return{}; + } }; } diff --git a/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h b/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h index c6a9fbaedb..3c8a9ba157 100644 --- a/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h +++ b/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h @@ -54,6 +54,18 @@ struct render_target_traits return rtt; } + static + void get_surface_info(ID3D12Resource *surface, rsx::surface_format_info *info) + { + //TODO + auto desc = surface->GetDesc(); + info->rsx_pitch = desc.Width; + info->native_pitch = desc.Width; + info->surface_width = desc.Width; + info->surface_height = desc.Height; + info->bpp = 1; + } + static void prepare_rtt_for_drawing( 
gsl::not_null command_list, diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 56679938b3..91d24de2de 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -1028,7 +1028,6 @@ void GLGSRender::flip(int buffer) // Calculate blit coordinates coordi aspect_ratio; - areai screen_area = coordi({}, { (int)buffer_width, (int)buffer_height }); sizei csize(m_frame->client_width(), m_frame->client_height()); sizei new_size = csize; @@ -1055,19 +1054,33 @@ void GLGSRender::flip(int buffer) // Find the source image rsx::tiled_region buffer_region = get_tiled_address(display_buffers[buffer].offset, CELL_GCM_LOCATION_LOCAL); u32 absolute_address = buffer_region.address + buffer_region.base; - gl::texture *render_target_texture = m_rtts.get_texture_from_render_target_if_applicable(absolute_address); m_flip_fbo.recreate(); m_flip_fbo.bind(); - if (render_target_texture) + //The render might have been done offscreen and a blit used to display + //Check the texture cache for a blitted copy + const u32 size = buffer_pitch * buffer_height; + auto surface = m_gl_texture_cache.find_texture_from_range(absolute_address, size); + bool ignore_scaling = false; + + if (surface != nullptr) + { + auto dims = surface->get_dimensions(); + buffer_width = std::get<0>(dims); + buffer_height = std::get<1>(dims); + + m_flip_fbo.color = surface->id(); + m_flip_fbo.read_buffer(m_flip_fbo.color); + } + else if (auto render_target_texture = m_rtts.get_texture_from_render_target_if_applicable(absolute_address)) { buffer_width = render_target_texture->width(); buffer_height = render_target_texture->height(); - __glcheck m_flip_fbo.color = *render_target_texture; - __glcheck m_flip_fbo.read_buffer(m_flip_fbo.color); - + m_flip_fbo.color = *render_target_texture; + m_flip_fbo.read_buffer(m_flip_fbo.color); + ignore_scaling = true; } else { @@ -1077,7 +1090,7 @@ void GLGSRender::flip(int buffer) { 
m_flip_tex_color.recreate(gl::texture::target::texture2D); - __glcheck m_flip_tex_color.config() + m_flip_tex_color.config() .size({ (int)buffer_width, (int)buffer_height }) .type(gl::texture::type::uint_8_8_8_8) .format(gl::texture::format::bgra); @@ -1089,23 +1102,38 @@ void GLGSRender::flip(int buffer) { std::unique_ptr temp(new u8[buffer_height * buffer_pitch]); buffer_region.read(temp.get(), buffer_width, buffer_height, buffer_pitch); - __glcheck m_flip_tex_color.copy_from(temp.get(), gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8); + m_flip_tex_color.copy_from(temp.get(), gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8); } else { - __glcheck m_flip_tex_color.copy_from(buffer_region.ptr, gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8); + m_flip_tex_color.copy_from(buffer_region.ptr, gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8); } m_flip_fbo.color = m_flip_tex_color; - __glcheck m_flip_fbo.read_buffer(m_flip_fbo.color); + m_flip_fbo.read_buffer(m_flip_fbo.color); + ignore_scaling = true; + } + + if (!ignore_scaling && buffer_region.tile && buffer_region.tile->comp != CELL_GCM_COMPMODE_DISABLED) + { + LOG_ERROR(RSX, "Output buffer compression mode = 0x%X", buffer_region.tile->comp); + + switch (buffer_region.tile->comp) + { + case CELL_GCM_COMPMODE_C32_2X2: + case CELL_GCM_COMPMODE_C32_2X1: + buffer_height = display_buffers[buffer].height / 2; + break; + } } // Blit source image to the screen // Disable scissor test (affects blit) glDisable(GL_SCISSOR_TEST); - gl::screen.clear(gl::buffers::color_depth_stencil); - __glcheck m_flip_fbo.blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical(), gl::buffers::color, gl::filter::linear); + areai screen_area = coordi({}, { (int)buffer_width, (int)buffer_height }); + gl::screen.clear(gl::buffers::color); + m_flip_fbo.blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical(), gl::buffers::color, gl::filter::linear); if (g_cfg.video.overlay) { diff 
--git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index 192e0ba5d1..2bfd55d8ba 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -1953,6 +1953,12 @@ namespace gl case texture::target::texture3D: glFramebufferTexture3D(GL_FRAMEBUFFER, m_id, GL_TEXTURE_3D, rhs.id(), rhs.level(), 0); break; } } + + void operator = (const GLuint rhs) + { + save_binding_state save(m_parent); + glFramebufferTexture2D(GL_FRAMEBUFFER, m_id, GL_TEXTURE_2D, rhs, 0); + } }; class indexed_attachment : public attachment diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.h b/rpcs3/Emu/RSX/GL/GLRenderTargets.h index 608fc90ea3..5e012fcf75 100644 --- a/rpcs3/Emu/RSX/GL/GLRenderTargets.h +++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.h @@ -115,40 +115,6 @@ namespace gl { return compatible_internal_format; } - - // For an address within the texture, extract this sub-section's rect origin - // Checks whether we need to scale the subresource if it is not handled in shader - // NOTE1: When surface->real_pitch < rsx_pitch, the surface is assumed to have been scaled to fill the rsx_region - std::tuple get_texture_subresource(u32 offset, bool scale_to_fit) - { - if (!offset) - { - return std::make_tuple(true, 0, 0); - } - - if (!surface_height) surface_height = height(); - if (!surface_width) surface_width = width(); - - u32 range = rsx_pitch * surface_height; - if (offset < range) - { - if (!surface_pixel_size) - surface_pixel_size = native_pitch / surface_width; - - const u32 y = (offset / rsx_pitch); - u32 x = (offset % rsx_pitch) / surface_pixel_size; - - if (scale_to_fit) - { - const f32 x_scale = (f32)rsx_pitch / native_pitch; - x = (u32)((f32)x / x_scale); - } - - return std::make_tuple(true, (u16)x, (u16)y); - } - else - return std::make_tuple(false, 0, 0); - } }; } @@ -235,6 +201,18 @@ struct gl_render_target_traits return result; } + static + void get_surface_info(gl::render_target *surface, rsx::surface_format_info *info) + { + const auto dims = 
surface->get_dimensions(); + + info->rsx_pitch = surface->get_rsx_pitch(); + info->native_pitch = surface->get_native_pitch(); + info->surface_width = std::get<0>(dims); + info->surface_height = std::get<1>(dims); + info->bpp = static_cast(info->native_pitch / info->surface_width); + } + static void prepare_rtt_for_drawing(void *, gl::render_target*) {} static void prepare_rtt_for_sampling(void *, gl::render_target*) {} @@ -307,169 +285,6 @@ struct gl_render_target_traits } }; -struct surface_subresource -{ - gl::render_target *surface = nullptr; - - u16 x = 0; - u16 y = 0; - u16 w = 0; - u16 h = 0; - - bool is_bound = false; - bool is_depth_surface = false; - bool is_clipped = false; - - surface_subresource() {} - - surface_subresource(gl::render_target *src, u16 X, u16 Y, u16 W, u16 H, bool _Bound, bool _Depth, bool _Clipped = false) - : surface(src), x(X), y(Y), w(W), h(H), is_bound(_Bound), is_depth_surface(_Depth), is_clipped(_Clipped) - {} -}; - class gl_render_targets : public rsx::surface_store { -private: - bool surface_overlaps(gl::render_target *surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y, bool scale_to_fit) - { - bool is_subslice = false; - u16 x_offset = 0; - u16 y_offset = 0; - - if (surface_address > texaddr) - return false; - - u32 offset = texaddr - surface_address; - if (texaddr >= surface_address) - { - std::tie(is_subslice, x_offset, y_offset) = surface->get_texture_subresource(offset, scale_to_fit); - if (is_subslice) - { - *x = x_offset; - *y = y_offset; - - return true; - } - } - - return false; - } - - bool is_bound(u32 address, bool is_depth) - { - if (is_depth) - { - const u32 bound_depth_address = std::get<0>(m_bound_depth_stencil); - return (bound_depth_address == address); - } - - for (auto &surface: m_bound_render_targets) - { - const u32 bound_address = std::get<0>(surface); - if (bound_address == address) - return true; - } - - return false; - } - - bool fits(gl::render_target*, std::pair &dims, u16 x_offset, u16 
y_offset, u16 width, u16 height) const - { - if ((x_offset + width) > dims.first) return false; - if ((y_offset + height) > dims.second) return false; - - return true; - } - -public: - surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch, bool scale_to_fit=false, bool crop=false, bool ignore_depth_formats=false) - { - gl::render_target *surface = nullptr; - u16 x_offset = 0; - u16 y_offset = 0; - - for (auto &tex_info : m_render_targets_storage) - { - u32 this_address = std::get<0>(tex_info); - surface = std::get<1>(tex_info).get(); - - if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit)) - { - if (surface->get_rsx_pitch() != requested_pitch) - continue; - - auto dims = surface->get_dimensions(); - - if (scale_to_fit) - { - f32 pitch_scaling = (f32)requested_pitch / surface->get_native_pitch(); - requested_width = (u16)((f32)requested_width / pitch_scaling); - } - - if (fits(surface, dims, x_offset, y_offset, requested_width, requested_height)) - return{ surface, x_offset, y_offset, requested_width, requested_height, is_bound(this_address, false), false }; - else - { - if (crop) //Forcefully fit the requested region by clipping and scaling - { - u16 remaining_width = dims.first - x_offset; - u16 remaining_height = dims.second - y_offset; - - return{ surface, x_offset, y_offset, remaining_width, remaining_height, is_bound(this_address, false), false, true }; - } - - if (dims.first >= requested_width && dims.second >= requested_height) - { - LOG_WARNING(RSX, "Overlapping surface exceeds bounds; returning full surface region"); - return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, false), false, true }; - } - } - } - } - - if (ignore_depth_formats) - return{}; - - //Check depth surfaces for overlap - for (auto &tex_info : m_depth_stencil_storage) - { - u32 this_address = std::get<0>(tex_info); - surface = 
std::get<1>(tex_info).get(); - - if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit)) - { - if (surface->get_rsx_pitch() != requested_pitch) - continue; - - auto dims = surface->get_dimensions(); - - if (scale_to_fit) - { - f32 pitch_scaling = (f32)requested_pitch / surface->get_native_pitch(); - requested_width = (u16)((f32)requested_width / pitch_scaling); - } - - if (fits(surface, dims, x_offset, y_offset, requested_width, requested_height)) - return{ surface, x_offset, y_offset, requested_width, requested_height, is_bound(this_address, true), true }; - else - { - if (crop) //Forcefully fit the requested region by clipping and scaling - { - u16 remaining_width = dims.first - x_offset; - u16 remaining_height = dims.second - y_offset; - - return{ surface, x_offset, y_offset, remaining_width, remaining_height, is_bound(this_address, true), true, true }; - } - - if (dims.first >= requested_width && dims.second >= requested_height) - { - LOG_WARNING(RSX, "Overlapping depth surface exceeds bounds; returning full surface region"); - return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, true), true, true }; - } - } - } - } - - return {}; - } }; diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index cd888af0e0..cb57102199 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -460,40 +460,6 @@ namespace gl GLGSRender *m_renderer; std::thread::id m_renderer_thread; - cached_texture_section *find_texture_from_dimensions(u32 texaddr, u32 w, u32 h) - { - reader_lock lock(m_section_mutex); - - for (cached_texture_section &tex : read_only_memory_sections) - { - if (tex.matches(texaddr, w, h) && !tex.is_dirty()) - return &tex; - } - - return nullptr; - } - - /** - * Searches for a texture from read_only memory sections - * Texture origin + size must be a subsection of the existing texture - */ - cached_texture_section *find_texture_from_range(u32 
texaddr, u32 range) - { - reader_lock lock(m_section_mutex); - - auto test = std::make_pair(texaddr, range); - for (cached_texture_section &tex : read_only_memory_sections) - { - if (tex.get_section_base() > texaddr) - continue; - - if (tex.overlaps(test, true) && !tex.is_dirty()) - return &tex; - } - - return nullptr; - } - cached_texture_section& create_texture(u32 id, u32 texaddr, u32 texsize, u32 w, u32 h) { for (cached_texture_section &tex : read_only_memory_sections) @@ -536,19 +502,6 @@ namespace gl clear_temporary_surfaces(); } - cached_texture_section* find_cached_rtt_section(u32 base, u32 size) - { - for (cached_texture_section &rtt : no_access_memory_sections) - { - if (rtt.matches(base, size)) - { - return &rtt; - } - } - - return nullptr; - } - cached_texture_section *create_locked_view_of_section(u32 base, u32 size) { cached_texture_section *region = find_cached_rtt_section(base, size); @@ -647,6 +600,53 @@ namespace gl m_hw_blitter.destroy(); } + cached_texture_section *find_texture_from_dimensions(u32 texaddr, u32 w, u32 h) + { + reader_lock lock(m_section_mutex); + + for (cached_texture_section &tex : read_only_memory_sections) + { + if (tex.matches(texaddr, w, h) && !tex.is_dirty()) + return &tex; + } + + return nullptr; + } + + /** + * Searches for a texture from read_only memory sections + * Texture origin + size must be a subsection of the existing texture + */ + cached_texture_section *find_texture_from_range(u32 texaddr, u32 range) + { + reader_lock lock(m_section_mutex); + + auto test = std::make_pair(texaddr, range); + for (cached_texture_section &tex : read_only_memory_sections) + { + if (tex.get_section_base() > texaddr) + continue; + + if (tex.overlaps(test, true) && !tex.is_dirty()) + return &tex; + } + + return nullptr; + } + + cached_texture_section* find_cached_rtt_section(u32 base, u32 size) + { + for (cached_texture_section &rtt : no_access_memory_sections) + { + if (rtt.matches(base, size)) + { + return &rtt; + } + } + + return 
nullptr; + } + template void upload_texture(int index, RsxTextureType &tex, rsx::gl::texture &gl_texture, gl_render_targets &m_rtts) { @@ -739,7 +739,7 @@ namespace gl const f32 internal_scale = (f32)tex_pitch / native_pitch; const u32 internal_width = (const u32)(tex_width * internal_scale); - const surface_subresource rsc = m_rtts.get_surface_subresource_if_applicable(texaddr, internal_width, tex_height, tex_pitch, true); + const auto rsc = m_rtts.get_surface_subresource_if_applicable(texaddr, internal_width, tex_height, tex_pitch, true); if (rsc.surface) { //Check that this region is not cpu-dirty before doing a copy @@ -1078,7 +1078,7 @@ namespace gl const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0)); //Check if src/dst are parts of render targets - surface_subresource dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true, true); + auto dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true, true); dst_is_render_target = dst_subres.surface != nullptr; u16 max_dst_width = dst.width; @@ -1097,7 +1097,8 @@ namespace gl position2i dst_offset = { dst.offset_x, dst.offset_y }; size2i clip_dimensions = { dst.clip_width, dst.clip_height }; - const size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.height }; + //Dimensions passed are restricted to powers of 2; get real height from clip_height and width from pitch + const size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.clip_height }; //Offset in x and y for src is 0 (it is already accounted for when getting pixels_src) //Reproject final clip onto source... 
@@ -1184,7 +1185,7 @@ namespace gl } //TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate - surface_subresource src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true, true); + auto src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true, true); src_is_render_target = src_subres.surface != nullptr; //Create source texture if does not exist @@ -1283,7 +1284,9 @@ namespace gl //If so, add this texture to the no_access queue not the read_only queue writer_lock lock(m_section_mutex); - cached_texture_section &cached = create_texture(texture_id, dst.rsx_address, dst.pitch * dst.clip_height, dst.width, dst.clip_height); + const u8 bpp = dst_is_argb8 ? 4 : 2; + const u32 real_width = dst.pitch / bpp; + cached_texture_section &cached = create_texture(texture_id, dst.rsx_address, dst.pitch * dst.clip_height, real_width, dst.clip_height); //These textures are completely GPU resident so we dont watch for CPU access //There's no data to be fetched from the CPU //Its is possible for a title to attempt to read from the region, but the CPU path should be used in such cases diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index f7f750ac91..51b7685fdd 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -2080,10 +2080,12 @@ void VKGSRender::prepare_rtts() for (u8 index : draw_buffers) { - bound_images.push_back(std::get<1>(m_rtts.m_bound_render_targets[index])); + auto surface = std::get<1>(m_rtts.m_bound_render_targets[index]); + bound_images.push_back(surface); m_surface_info[index].address = surface_addresses[index]; m_surface_info[index].pitch = surface_pitchs[index]; + surface->rsx_pitch = surface_pitchs[index]; if (surface_pitchs[index] <= 64) { @@ -2095,10 +2097,12 @@ void VKGSRender::prepare_rtts() if 
(std::get<0>(m_rtts.m_bound_depth_stencil) != 0) { - bound_images.push_back(std::get<1>(m_rtts.m_bound_depth_stencil)); + auto ds = std::get<1>(m_rtts.m_bound_depth_stencil); + bound_images.push_back(ds); m_depth_surface_info.address = zeta_address; m_depth_surface_info.pitch = rsx::method_registers.surface_z_pitch(); + ds->rsx_pitch = m_depth_surface_info.pitch; if (m_depth_surface_info.pitch <= 64 && clip_width > m_depth_surface_info.pitch) m_depth_surface_info.pitch = 0; @@ -2519,3 +2523,9 @@ void VKGSRender::flip(int buffer) m_uploads_8k = 0; m_uploads_16k = 0; } + +bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) +{ + return m_texture_cache.upload_scaled_image(src, dst, interpolate, (*m_device), *m_current_command_buffer, m_memory_type_mapping, + m_swap_chain->get_present_queue(), m_rtts, m_texture_upload_buffer_ring_info, m_texture_upload_buffer_ring_info.heap.get()); +} diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index 4f405728a5..6feb21270a 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -294,6 +294,7 @@ protected: void flip(int buffer) override; void do_local_task() override; + bool scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) override; bool on_access_violation(u32 address, bool is_writing) override; void on_notify_memory_unmapped(u32 address_base, u32 size) override; diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.h b/rpcs3/Emu/RSX/VK/VKRenderTargets.h index 3a99503872..7762fe55e9 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.h +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.h @@ -20,6 +20,7 @@ namespace vk { bool dirty = false; u16 native_pitch = 0; + u16 rsx_pitch = 0; VkImageAspectFlags attachment_aspect_flag = VK_IMAGE_ASPECT_COLOR_BIT; std::unique_ptr view; @@ -171,6 +172,16 @@ namespace rsx return ds; } + static + void get_surface_info(vk::render_target *surface, rsx::surface_format_info 
*info) + { + info->rsx_pitch = surface->rsx_pitch; + info->native_pitch = surface->native_pitch; + info->surface_width = surface->info.extent.width; + info->surface_height = surface->info.extent.height; + info->bpp = static_cast(info->native_pitch / info->surface_width); + } + static void prepare_rtt_for_drawing(vk::command_buffer* pcmd, vk::render_target *surface) { VkImageSubresourceRange range = vk::get_image_subresource_range(0, 0, 1, 1, surface->attachment_aspect_flag); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index 9fe0bfcfe3..d58e10a5ed 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -94,7 +94,16 @@ namespace vk if (!width && !height && !mipmaps) return true; - return (width == this->width && height == this->height && mipmaps == this->mipmaps); + if (width && width != this->width) + return false; + + if (height && height != this->height) + return false; + + if (mipmaps && mipmaps != this->mipmaps) + return false; + + return true; } return false; @@ -361,6 +370,40 @@ namespace vk const s32 m_max_zombie_objects = 32; //Limit on how many texture objects to keep around for reuse after they are invalidated s32 m_unreleased_texture_objects = 0; //Number of invalidated objects not yet freed from memory + cached_texture_section *find_texture_from_range(u32 rsx_address, u32 range) + { + auto test = std::make_pair(rsx_address, range); + for (auto &address_range : m_cache) + { + auto &range_data = address_range.second; + for (auto &tex : range_data.data) + { + if (!tex.is_dirty() && tex.overlaps(test, true)) + return &tex; + } + } + + return nullptr; + } + + cached_texture_section *find_texture_from_dimensions(u32 rsx_address, u32 rsx_size, u16 width = 0, u16 height = 0, u16 mipmaps = 0) + { + auto found = m_cache.find(rsx_address); + if (found != m_cache.end()) + { + auto &range_data = found->second; + for (auto &tex : range_data.data) + { + if (tex.matches(rsx_address, width, height, 
mipmaps) && !tex.is_dirty()) + { + return &tex; + } + } + } + + return nullptr; + } + cached_texture_section& find_cached_texture(u32 rsx_address, u32 rsx_size, bool confirm_dimensions = false, u16 width = 0, u16 height = 0, u16 mipmaps = 0) { { @@ -565,7 +608,7 @@ namespace vk } template - vk::image_view* upload_texture(command_buffer cmd, RsxTextureType &tex, rsx::vk_render_targets &m_rtts, const vk::memory_type_mapping &memory_type_mapping, vk_data_heap& upload_heap, vk::buffer* upload_buffer) + vk::image_view* upload_texture(command_buffer &cmd, RsxTextureType &tex, rsx::vk_render_targets &m_rtts, const vk::memory_type_mapping &memory_type_mapping, vk_data_heap& upload_heap, vk::buffer* upload_buffer) { const u32 texaddr = rsx::get_address(tex.offset(), tex.location()); const u32 range = (u32)get_texture_size(tex); @@ -1048,5 +1091,270 @@ namespace vk value.misses --; } } + + bool upload_scaled_image(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, + vk::render_device& dev, vk::command_buffer& cmd, vk::memory_type_mapping& memory_types, VkQueue submit_queue, + rsx::vk_render_targets &m_rtts, vk_data_heap &upload_heap, vk::buffer* upload_buffer) + { + //Since we will have dst in vram, we can 'safely' ignore the swizzle flag + //TODO: Verify correct behavior + + bool src_is_render_target = false; + bool dst_is_render_target = false; + bool dst_is_argb8 = (dst.format == rsx::blit_engine::transfer_destination_format::a8r8g8b8); + bool src_is_argb8 = (src.format == rsx::blit_engine::transfer_source_format::a8r8g8b8); + + VkFormat src_vk_format = src_is_argb8 ? 
VK_FORMAT_B8G8R8A8_UNORM : VK_FORMAT_R5G6B5_UNORM_PACK16; + + vk::image* vram_texture = nullptr; + vk::image* dest_texture = nullptr; + + const u32 src_address = (u32)((u64)src.pixels - (u64)vm::base(0)); + const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0)); + + //Check if src/dst are parts of render targets + auto dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true, true); + dst_is_render_target = dst_subres.surface != nullptr; + + u16 max_dst_width = dst.width; + u16 max_dst_height = dst.height; + + //Prepare areas and offsets + //Copy from [src.offset_x, src.offset_y] a region of [clip.width, clip.height] + //Stretch onto [dst.offset_x, y] with clipping performed on the source region + //The implementation here adds the inverse scaled clip dimensions onto the source to completely bypass final clipping step + + float scale_x = (f32)dst.width / src.width; + float scale_y = (f32)dst.height / src.height; + + //Clip offset is unused if the clip offsets are reprojected onto the source + position2i clip_offset = { 0, 0 };//{ dst.clip_x, dst.clip_y }; + position2i dst_offset = { dst.offset_x, dst.offset_y }; + + size2i clip_dimensions = { dst.clip_width, dst.clip_height }; + //Dimensions passed are restricted to powers of 2; get real height from clip_height and width from pitch + const size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.clip_height }; + + //Offset in x and y for src is 0 (it is already accounted for when getting pixels_src) + //Reproject final clip onto source... 
+ const u16 src_w = (const u16)((f32)clip_dimensions.width / scale_x); + const u16 src_h = (const u16)((f32)clip_dimensions.height / scale_y); + + areai src_area = { 0, 0, src_w, src_h }; + areai dst_area = { 0, 0, dst.clip_width, dst.clip_height }; + + //If destination is neither a render target nor an existing texture in VRAM + //it's possible that this method is being used to perform a memcpy into RSX memory, so we check + //parameters. Whenever a simple memcpy can get the job done, use it instead. + //Dai-3-ji Super Robot Taisen for example uses this to copy program code to GPU RAM + + bool is_memcpy = false; + u32 memcpy_bytes_length = 0; + if (dst_is_argb8 == src_is_argb8 && !dst.swizzled) + { + if ((src.slice_h == 1 && dst.clip_height == 1) || + (dst.clip_width == src.width && dst.clip_height == src.slice_h && src.pitch == dst.pitch)) + { + const u8 bpp = dst_is_argb8 ? 4 : 2; + is_memcpy = true; + memcpy_bytes_length = dst.clip_width * bpp * dst.clip_height; + } + } + + if (!dst_is_render_target) + { + //First check if this surface exists in VRAM with exact dimensions + //Since scaled GPU resources are not invalidated by the CPU, we need to reuse older surfaces if possible + auto cached_dest = find_texture_from_dimensions(dst.rsx_address, dst.pitch * dst.clip_height, dst_dimensions.width, dst_dimensions.height); + + //Check for any available region that will fit this one + if (!cached_dest) cached_dest = find_texture_from_range(dst.rsx_address, dst.pitch * dst.clip_height); + + if (cached_dest) + { + //TODO: Verify that the new surface will fit + dest_texture = cached_dest->get_texture().get(); + + //TODO: Move this code into utils since it is used a lot + const u32 address_offset = dst.rsx_address - cached_dest->get_section_base(); + + const u16 bpp = dst_is_argb8 ? 
4 : 2; + const u16 offset_y = address_offset / dst.pitch; + const u16 offset_x = address_offset % dst.pitch; + + dst_offset.x += offset_x / bpp; + dst_offset.y += offset_y; + + max_dst_width = cached_dest->get_width(); + max_dst_height = cached_dest->get_height(); + } + else if (is_memcpy) + { + memcpy(dst.pixels, src.pixels, memcpy_bytes_length); + return true; + } + } + else + { + dst_offset.x = dst_subres.x; + dst_offset.y = dst_subres.y; + + dest_texture = dst_subres.surface; + + max_dst_width = dst_subres.surface->width(); + max_dst_height = dst_subres.surface->height(); + + if (is_memcpy) + { + //Some render target descriptions are actually invalid + //Confirm this is a flushable RTT + const auto rsx_pitch = dst_subres.surface->rsx_pitch; + const auto native_pitch = dst_subres.surface->native_pitch; + + if (rsx_pitch <= 64 && native_pitch != rsx_pitch) + { + memcpy(dst.pixels, src.pixels, memcpy_bytes_length); + return true; + } + } + } + + //TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate + auto src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true, true); + src_is_render_target = src_subres.surface != nullptr; + + //Create source texture if does not exist + if (!src_is_render_target) + { + auto preloaded_texture = find_texture_from_dimensions(src_address, src.pitch * src.slice_h, src.width, src.slice_h); + + if (preloaded_texture != nullptr) + { + vram_texture = preloaded_texture->get_texture().get(); + } + else + { + flush_address(src_address, dev, cmd, memory_types, submit_queue); + writer_lock lock(m_cache_mutex); + + //Upload texture from CPU + vk::image *image = new vk::image(*vk::get_current_renderer(), memory_types.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VK_IMAGE_TYPE_2D, + src_vk_format, + src.width, src.slice_h, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, + 
VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, 0); + + vk::image_view *view = new vk::image_view(*vk::get_current_renderer(), image->value, VK_IMAGE_VIEW_TYPE_2D, src_vk_format, + { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }, + { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }); + + cached_texture_section& region = find_cached_texture(dst.rsx_address, src.pitch * src.slice_h, true, src.width, src.slice_h, 1); + region.reset(src.rsx_address, src.pitch * src.slice_h); + region.create(src.width, src.slice_h, 1, 1, view, dest_texture); + region.protect(utils::protection::ro); + region.set_dirty(false); + + read_only_range = region.get_min_max(read_only_range); + + vk::enter_uninterruptible(); + + std::vector layout(1); + auto &subres = layout.back(); + subres.width_in_block = src.width; + subres.height_in_block = src.slice_h; + subres.pitch_in_bytes = src.pitch; + subres.depth = 1; + subres.data = {(const gsl::byte*)src.pixels, src.pitch * src.slice_h}; + + copy_mipmaped_image_using_buffer(cmd, image->value, layout, src_vk_format, false, 1, + upload_heap, upload_buffer); + + vk::leave_uninterruptible(); + } + } + else + { + if (src_subres.w != clip_dimensions.width || + src_subres.h != clip_dimensions.height) + { + f32 subres_scaling_x = (f32)src.pitch / src_subres.surface->native_pitch; + + dst_area.x2 = (int)(src_subres.w * scale_x * subres_scaling_x); + dst_area.y2 = (int)(src_subres.h * scale_y); + } + + src_area.x2 = src_subres.w; + src_area.y2 = src_subres.h; + + src_area.x1 += src_subres.x; + src_area.x2 += src_subres.x; + src_area.y1 += src_subres.y; + src_area.y2 += src_subres.y; + + vram_texture = src_subres.surface; + } + + //Validate clip offsets (Persona 4 Arena at 720p) + //Check if can fit + //NOTE: It is possible that the check is simpler (if (clip_x >= clip_width)) + //Needs verification + if ((dst.offset_x + dst.clip_x + dst.clip_width) > max_dst_width) dst.clip_x = 
0; + if ((dst.offset_y + dst.clip_y + dst.clip_height) > max_dst_height) dst.clip_y = 0; + + if (dst.clip_x || dst.clip_y) + { + //Reproject clip offsets onto source + const u16 scaled_clip_offset_x = (const u16)((f32)dst.clip_x / scale_x); + const u16 scaled_clip_offset_y = (const u16)((f32)dst.clip_y / scale_y); + + src_area.x1 += scaled_clip_offset_x; + src_area.x2 += scaled_clip_offset_x; + src_area.y1 += scaled_clip_offset_y; + src_area.y2 += scaled_clip_offset_y; + } + + bool dest_exists = dest_texture != nullptr; + const VkFormat dst_vk_format = dst_is_argb8 ? VK_FORMAT_R8G8B8A8_UNORM : VK_FORMAT_R5G6B5_UNORM_PACK16; + const u8 bpp = dst_is_argb8 ? 4 : 2; + const u32 real_width = dst.pitch / bpp; + + if (!dest_exists) + { + dest_texture = new vk::image(*vk::get_current_renderer(), memory_types.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VK_IMAGE_TYPE_2D, + dst_vk_format, + real_width, dst.clip_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, 0); + } + + //Copy data + copy_scaled_image(cmd, vram_texture->value, dest_texture->value, vram_texture->current_layout, dest_texture->current_layout, + src_area.x1, src_area.y1, src_w, src_h, dst_area.x1, dst_area.y1, dst.clip_width, dst.clip_height, 1, VK_IMAGE_ASPECT_COLOR_BIT); + + if (dest_exists) + return true; + + //TODO: Verify if any titles ever scale into CPU memory. 
It defeats the purpose of uploading data to the GPU, but it could happen + //If so, add this texture to the no_access queue not the read_only queue + cached_texture_section& region = find_cached_texture(dst.rsx_address, dst.pitch * dst.clip_height, true, real_width, dst.clip_height, 1); + writer_lock lock(m_cache_mutex); + + //These textures are completely GPU resident so we don't watch for CPU access + //There's no data to be fetched from the CPU + //It is possible for a title to attempt to read from the region, but the CPU path should be used in such cases + + vk::image_view *view = new vk::image_view(*vk::get_current_renderer(), dest_texture->value, VK_IMAGE_VIEW_TYPE_2D, dst_vk_format, + { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }, + { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }); + + region.reset(dst.rsx_address, dst.pitch * dst.clip_height); + region.create(real_width, dst.clip_height, 1, 1, view, dest_texture); + region.protect(utils::protection::rw); + region.set_dirty(false); + + read_only_range = region.get_min_max(read_only_range); + + return true; + } }; } diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index bc7402e917..1079b93b88 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -580,37 +580,6 @@ namespace rsx return; } - if (dst_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER) - { - //HACK: it's extension of the flip-hack. 
remove this when textures cache would be properly implemented - for (int i = 0; i < rsx::limits::color_buffers_count; ++i) - { - u32 begin = rsx->display_buffers[i].offset; - - if (dst_offset < begin || !begin) - { - continue; - } - - if (rsx->display_buffers[i].width < 720 || rsx->display_buffers[i].height < 480) - { - continue; - } - - if (begin == dst_offset) - { - return; - } - - u32 end = begin + rsx->display_buffers[i].height * rsx->display_buffers[i].pitch; - - if (dst_offset < end) - { - return; - } - } - } - const u32 in_bpp = (src_color_format == rsx::blit_engine::transfer_source_format::r5g6b5) ? 2 : 4; // bytes per pixel const u32 out_bpp = (dst_color_format == rsx::blit_engine::transfer_destination_format::r5g6b5) ? 2 : 4;