From 858014b71812bbb6ccfaec6e520a92f2be7a2288 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Wed, 4 Sep 2019 22:19:58 +0300 Subject: [PATCH] rsx: Experiments with nul sink --- rpcs3/Emu/RSX/Common/TextureUtils.h | 3 +- rpcs3/Emu/RSX/Common/surface_store.h | 3 + rpcs3/Emu/RSX/Common/surface_utils.h | 5 + rpcs3/Emu/RSX/Common/texture_cache.h | 140 +++++---- rpcs3/Emu/RSX/Common/texture_cache_utils.h | 11 +- rpcs3/Emu/RSX/GL/GLHelpers.h | 2 +- rpcs3/Emu/RSX/GL/GLTextureCache.h | 169 +++++------ rpcs3/Emu/RSX/VK/VKCompute.h | 26 +- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 2 +- rpcs3/Emu/RSX/VK/VKHelpers.h | 2 +- rpcs3/Emu/RSX/VK/VKTexture.cpp | 37 ++- rpcs3/Emu/RSX/VK/VKTextureCache.h | 321 ++++++++++++--------- 12 files changed, 420 insertions(+), 301 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/TextureUtils.h b/rpcs3/Emu/RSX/Common/TextureUtils.h index 19e64b22a4..aeca9b0c86 100644 --- a/rpcs3/Emu/RSX/Common/TextureUtils.h +++ b/rpcs3/Emu/RSX/Common/TextureUtils.h @@ -12,7 +12,8 @@ namespace rsx shader_read = 1, blit_engine_src = 2, blit_engine_dst = 4, - framebuffer_storage = 8 + framebuffer_storage = 8, + dma = 16 }; enum texture_colorspace : u32 diff --git a/rpcs3/Emu/RSX/Common/surface_store.h b/rpcs3/Emu/RSX/Common/surface_store.h index c804e75b7b..edbe849a3e 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.h +++ b/rpcs3/Emu/RSX/Common/surface_store.h @@ -775,6 +775,9 @@ namespace rsx continue; auto surface = tex_info.second.get(); + if (access == rsx::surface_access::transfer && surface->write_through()) + continue; + if (!rsx::pitch_compatible(surface, required_pitch, required_height)) continue; diff --git a/rpcs3/Emu/RSX/Common/surface_utils.h b/rpcs3/Emu/RSX/Common/surface_utils.h index 82c2da644c..03fdb4deea 100644 --- a/rpcs3/Emu/RSX/Common/surface_utils.h +++ b/rpcs3/Emu/RSX/Common/surface_utils.h @@ -309,6 +309,11 @@ namespace rsx return (state_flags != rsx::surface_state_flags::ready) || !old_contents.empty(); } + bool write_through() const + { + return 
(state_flags & rsx::surface_state_flags::erase_bkgnd) && old_contents.empty(); + } + #if (ENABLE_SURFACE_CACHE_DEBUG) u64 hash_block() const { diff --git a/rpcs3/Emu/RSX/Common/texture_cache.h b/rpcs3/Emu/RSX/Common/texture_cache.h index 920755accb..0f5e6cdd4f 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache.h +++ b/rpcs3/Emu/RSX/Common/texture_cache.h @@ -362,6 +362,7 @@ namespace rsx rsx::texture_upload_context context, rsx::texture_dimension_extended type, texture_create_flags flags) = 0; virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, const address_range &rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format, texture_upload_context context, const std::vector& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) = 0; + virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, bool memory_load) = 0; virtual void enforce_surface_creation_type(section_storage_type& section, u32 gcm_format, texture_create_flags expected) = 0; virtual void insert_texture_barrier(commandbuffer_type&, image_storage_type* tex) = 0; virtual image_view_type generate_cubemap_from_images(commandbuffer_type&, u32 gcm_format, u16 size, const std::vector& sources, const texture_channel_remap_t& remap_vector) = 0; @@ -2429,6 +2430,7 @@ namespace rsx // Check if src/dst are parts of render targets typename surface_store_type::surface_overlap_info dst_subres; + bool use_null_region = false; if (dst_address > 0xc0000000) { // TODO: HACK @@ -2442,6 +2444,7 @@ namespace rsx // 1. Invalidate surfaces in range // 2. 
Proceed as normal, blit into a 'normal' surface and any upload routines should catch it m_rtts.invalidate_range(utils::address_range::start_length(dst_address, dst.pitch * dst_h)); + use_null_region = (scale_x == 1.f && scale_y == 1.f); } // TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate @@ -2545,7 +2548,9 @@ namespace rsx if (!dst_is_render_target) { // Check for any available region that will fit this one - auto overlapping_surfaces = find_texture_from_range(address_range::start_length(dst_address, dst.pitch * dst.clip_height), dst.pitch, rsx::texture_upload_context::blit_engine_dst); + const auto required_type = (use_null_region) ? texture_upload_context::dma : texture_upload_context::blit_engine_dst; + const auto dst_range = address_range::start_length(dst_address, dst.pitch * dst.clip_height); + auto overlapping_surfaces = find_texture_from_range(dst_range, dst.pitch, required_type); for (const auto &surface : overlapping_surfaces) { if (!surface->is_locked()) @@ -2561,6 +2566,17 @@ namespace rsx continue; } + if (use_null_region) + { + if (dst_range.inside(surface->get_section_range())) + { + // Attach to existing region + cached_dest = surface; + } + + continue; + } + const auto this_address = surface->get_section_base(); if (this_address > dst_address) { @@ -2609,9 +2625,9 @@ namespace rsx // Check if available target is acceptable // TODO: Check for other types of format mismatch - bool format_mismatch = false; - if (cached_dest) + if (cached_dest && !use_null_region) { + bool format_mismatch = false; if (cached_dest->is_depth_texture() != src_subres.is_depth) { // Dest surface has the wrong 'aspect' @@ -2635,14 +2651,14 @@ namespace rsx break; } } - } - if (format_mismatch) - { - // The invalidate call before creating a new target will remove this section - cached_dest = nullptr; - dest_texture = 0; - dst_area = old_dst_area; + if (format_mismatch) + { + // The invalidate 
call before creating a new target will remove this section + cached_dest = nullptr; + dest_texture = 0; + dst_area = old_dst_area; + } } // Create source texture if does not exist @@ -2795,7 +2811,7 @@ namespace rsx else gcm_format = (dst_is_argb8) ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5; - if (cached_dest) + if (cached_dest && !use_null_region) { // Prep surface auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order : @@ -2847,9 +2863,9 @@ namespace rsx const auto modified_range = utils::address_range::start_length(dst_address, mem_length); - if (dest_texture == 0) + if (!cached_dest && !dst_is_render_target) { - verify(HERE), !dst_is_render_target; + verify(HERE), !dest_texture; // Need to calculate the minium required size that will fit the data, anchored on the rsx_address // If the application starts off with an 'inseted' section, the guessed dimensions may not fit! @@ -2859,55 +2875,72 @@ namespace rsx const u32 section_length = std::max(write_end, expected_end) - dst.rsx_address; dst_dimensions.height = section_length / dst.pitch; - // render target data is already in correct swizzle layout - auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order : - dst_is_argb8 ? 
rsx::texture_create_flags::default_component_order : - rsx::texture_create_flags::swapped_native_component_order; - - // Translate dst_area into the 'full' dst block based on dst.rsx_address as (0, 0) - dst_area.x1 += dst.offset_x; - dst_area.x2 += dst.offset_x; - dst_area.y1 += dst.offset_y; - dst_area.y2 += dst.offset_y; - lock.upgrade(); // NOTE: Write flag set to remove all other overlapping regions (e.g shader_read or blit_src) const auto rsx_range = address_range::start_length(dst.rsx_address, section_length); invalidate_range_impl_base(cmd, rsx_range, invalidation_cause::write, std::forward(extras)...); - if (!dst_area.x1 && !dst_area.y1 && dst_area.x2 == dst_dimensions.width && dst_area.y2 == dst_dimensions.height) + if (LIKELY(use_null_region)) { - cached_dest = create_new_texture(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch, - gcm_format, rsx::texture_upload_context::blit_engine_dst, rsx::texture_dimension_extended::texture_dimension_2d, - channel_order); + bool force_dma_load = false; + if ((dst_w * dst_bpp) != dst.pitch) + { + // Keep Cell from touching the range we need + const auto prot_range = modified_range.to_page_range(); + utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no); + + force_dma_load = true; + } + + cached_dest = create_nul_section(cmd, rsx_range, force_dma_load); } else { - // HACK: workaround for data race with Cell - // Pre-lock the memory range we'll be touching, then load with super_ptr - const auto prot_range = modified_range.to_page_range(); - utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no); + // render target data is already in correct swizzle layout + auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order : + dst_is_argb8 ? 
rsx::texture_create_flags::default_component_order : + rsx::texture_create_flags::swapped_native_component_order; - const u16 pitch_in_block = dst.pitch / dst_bpp; - std::vector subresource_layout; - rsx_subresource_layout subres = {}; - subres.width_in_block = dst_dimensions.width; - subres.height_in_block = dst_dimensions.height; - subres.pitch_in_block = pitch_in_block; - subres.depth = 1; - subres.data = { reinterpret_cast(vm::get_super_ptr(dst.rsx_address)), dst.pitch * dst_dimensions.height }; - subresource_layout.push_back(subres); + // Translate dst_area into the 'full' dst block based on dst.rsx_address as (0, 0) + dst_area.x1 += dst.offset_x; + dst_area.x2 += dst.offset_x; + dst_area.y1 += dst.offset_y; + dst_area.y2 += dst.offset_y; - cached_dest = upload_image_from_cpu(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch, - gcm_format, rsx::texture_upload_context::blit_engine_dst, subresource_layout, - rsx::texture_dimension_extended::texture_dimension_2d, false); + if (!dst_area.x1 && !dst_area.y1 && dst_area.x2 == dst_dimensions.width && dst_area.y2 == dst_dimensions.height) + { + cached_dest = create_new_texture(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch, + gcm_format, rsx::texture_upload_context::blit_engine_dst, rsx::texture_dimension_extended::texture_dimension_2d, + channel_order); + } + else + { + // HACK: workaround for data race with Cell + // Pre-lock the memory range we'll be touching, then load with super_ptr + const auto prot_range = modified_range.to_page_range(); + utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no); - enforce_surface_creation_type(*cached_dest, gcm_format, channel_order); + const u16 pitch_in_block = dst.pitch / dst_bpp; + std::vector subresource_layout; + rsx_subresource_layout subres = {}; + subres.width_in_block = dst_dimensions.width; + subres.height_in_block = dst_dimensions.height; + subres.pitch_in_block = 
pitch_in_block; + subres.depth = 1; + subres.data = { reinterpret_cast(vm::get_super_ptr(dst.rsx_address)), dst.pitch * dst_dimensions.height }; + subresource_layout.push_back(subres); + + cached_dest = upload_image_from_cpu(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch, + gcm_format, rsx::texture_upload_context::blit_engine_dst, subresource_layout, + rsx::texture_dimension_extended::texture_dimension_2d, false); + + enforce_surface_creation_type(*cached_dest, gcm_format, channel_order); + } + + dest_texture = cached_dest->get_raw_texture(); + typeless_info.dst_context = texture_upload_context::blit_engine_dst; } - - dest_texture = cached_dest->get_raw_texture(); - typeless_info.dst_context = texture_upload_context::blit_engine_dst; } verify(HERE), cached_dest || dst_is_render_target; @@ -2979,8 +3012,15 @@ namespace rsx dst_subres.surface->transform_blit_coordinates(rsx::surface_access::transfer, dst_area); } - typeless_info.analyse(); - blitter.scale_image(cmd, vram_texture, dest_texture, src_area, dst_area, interpolate, is_depth_blit, typeless_info); + if (!use_null_region) + { + typeless_info.analyse(); + blitter.scale_image(cmd, vram_texture, dest_texture, src_area, dst_area, interpolate, is_depth_blit, typeless_info); + } + else + { + cached_dest->dma_transfer(cmd, vram_texture, src_area, modified_range, dst.pitch); + } blit_op_result result = true; result.is_depth = is_depth_blit; diff --git a/rpcs3/Emu/RSX/Common/texture_cache_utils.h b/rpcs3/Emu/RSX/Common/texture_cache_utils.h index b5b29910f7..73b67d2e15 100644 --- a/rpcs3/Emu/RSX/Common/texture_cache_utils.h +++ b/rpcs3/Emu/RSX/Common/texture_cache_utils.h @@ -1504,7 +1504,7 @@ namespace rsx void add_flush_exclusion(const address_range& rng) { - AUDIT(exists() && is_locked() && is_flushable()); + AUDIT(is_locked() && is_flushable()); const auto _rng = rng.get_intersect(get_section_range()); flush_exclusions.merge(_rng); } @@ -1710,7 +1710,14 @@ namespace rsx bool exists() 
const { - return derived()->exists(); + if (derived()->exists()) + { + return true; + } + else + { + return (context == rsx::texture_upload_context::dma && is_locked()); + } } }; diff --git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index 4d593b202e..e89e7b662b 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -883,7 +883,7 @@ namespace gl void data(GLsizeiptr size, const void* data_ = nullptr, GLenum usage = GL_STREAM_DRAW) { - verify(HERE), m_memory_type == memory_type::undefined; + verify(HERE), m_memory_type != memory_type::local; target target_ = current_target(); save_binding_state save(target_, *this); diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index f76b46a75c..e4faf28cf0 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -61,72 +61,6 @@ namespace gl texture::format format = texture::format::rgba; texture::type type = texture::type::ubyte; - u8 get_pixel_size(texture::format fmt_, texture::type type_) - { - u8 size = 1; - switch (type_) - { - case texture::type::ubyte: - case texture::type::sbyte: - break; - case texture::type::ushort: - case texture::type::sshort: - case texture::type::f16: - size = 2; - break; - case texture::type::ushort_5_6_5: - case texture::type::ushort_5_6_5_rev: - case texture::type::ushort_4_4_4_4: - case texture::type::ushort_4_4_4_4_rev: - case texture::type::ushort_5_5_5_1: - case texture::type::ushort_1_5_5_5_rev: - return 2; - case texture::type::uint_8_8_8_8: - case texture::type::uint_8_8_8_8_rev: - case texture::type::uint_10_10_10_2: - case texture::type::uint_2_10_10_10_rev: - case texture::type::uint_24_8: - return 4; - case texture::type::f32: - case texture::type::sint: - case texture::type::uint: - size = 4; - break; - default: - LOG_ERROR(RSX, "Unsupported texture type"); - } - - switch (fmt_) - { - case texture::format::r: - break; - case texture::format::rg: - size *= 2; - break; - case 
texture::format::rgb: - case texture::format::bgr: - size *= 3; - break; - case texture::format::rgba: - case texture::format::bgra: - size *= 4; - break; - - //Depth formats.. - case texture::format::depth: - size = 2; - break; - case texture::format::depth_stencil: - size = 4; - break; - default: - LOG_ERROR(RSX, "Unsupported rtt format %d", (GLenum)fmt_); - size = 4; - } - - return size; - } - void init_buffer(const gl::texture* src) { const u32 vram_size = src->pitch() * src->height(); @@ -218,6 +152,61 @@ namespace gl } } + void dma_transfer(gl::command_context& cmd, gl::texture* src, const areai& /*src_area*/, const utils::address_range& /*valid_range*/, u32 pitch) + { + init_buffer(src); + + glGetError(); + glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id); + + if (context == rsx::texture_upload_context::dma) + { + // Determine unpack config dynamically + const auto format_info = gl::get_format_type(src->get_internal_format()); + format = static_cast(std::get<0>(format_info)); + type = static_cast(std::get<1>(format_info)); + + if ((src->aspect() & gl::image_aspect::stencil) == 0) + { + pack_unpack_swap_bytes = std::get<2>(format_info); + } + else + { + // Z24S8 decode is done on the CPU for now + pack_unpack_swap_bytes = false; + } + } + + pixel_pack_settings pack_settings; + pack_settings.alignment(1); + pack_settings.swap_bytes(pack_unpack_swap_bytes); + + src->copy_to(nullptr, format, type, pack_settings); + real_pitch = src->pitch(); + rsx_pitch = pitch; + + if (auto error = glGetError()) + { + if (error == GL_OUT_OF_MEMORY && ::gl::get_driver_caps().vendor_AMD) + { + // AMD driver bug + // Pixel transfer fails with GL_OUT_OF_MEMORY. Usually happens with float textures or operations attempting to swap endianness. + // Failed operations also leak a large amount of memory + LOG_ERROR(RSX, "Memory transfer failure (AMD bug). Please update your driver to Adrenalin 19.4.3 or newer. 
Format=0x%x, Type=0x%x, Swap=%d", (u32)format, (u32)type, pack_unpack_swap_bytes); + } + else + { + LOG_ERROR(RSX, "Memory transfer failed with error 0x%x. Format=0x%x, Type=0x%x", error, (u32)format, (u32)type); + } + } + + glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); + + m_fence.reset(); + synchronized = true; + sync_timestamp = get_system_time(); + } + void copy_texture(gl::command_context& cmd, bool miss) { ASSERT(exists()); @@ -284,38 +273,7 @@ namespace gl } } - init_buffer(target_texture); - - glGetError(); - glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id); - - pixel_pack_settings pack_settings; - pack_settings.alignment(1); - pack_settings.swap_bytes(pack_unpack_swap_bytes); - - target_texture->copy_to(nullptr, format, type, pack_settings); - real_pitch = target_texture->pitch(); - - if (auto error = glGetError()) - { - if (error == GL_OUT_OF_MEMORY && ::gl::get_driver_caps().vendor_AMD) - { - // AMD driver bug - // Pixel transfer fails with GL_OUT_OF_MEMORY. Usually happens with float textures or operations attempting to swap endianness. - // Failed operations also leak a large amount of memory - LOG_ERROR(RSX, "Memory transfer failure (AMD bug). Please update your driver to Adrenalin 19.4.3 or newer. Format=0x%x, Type=0x%x, Swap=%d", (u32)format, (u32)type, pack_unpack_swap_bytes); - } - else - { - LOG_ERROR(RSX, "Memory transfer failed with error 0x%x. 
Format=0x%x, Type=0x%x", error, (u32)format, (u32)type); - } - } - - glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE); - - m_fence.reset(); - synchronized = true; - sync_timestamp = get_system_time(); + dma_transfer(cmd, target_texture, {}, {}, rsx_pitch); } void fill_texture(gl::texture* tex) @@ -889,6 +847,21 @@ namespace gl return &cached; } + cached_texture_section* create_nul_section(gl::command_context& cmd, const utils::address_range& rsx_range, bool memory_load) override + { + auto& cached = *find_cached_texture(rsx_range, RSX_GCM_FORMAT_IGNORED, true, false); + ASSERT(!cached.is_locked()); + + // Prepare section + cached.reset(rsx_range); + cached.set_context(rsx::texture_upload_context::dma); + cached.set_dirty(false); + + no_access_range = cached.get_min_max(no_access_range, rsx::section_bounds::locked_range); + update_cache_tag(); + return &cached; + } + cached_texture_section* upload_image_from_cpu(gl::command_context &cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format, rsx::texture_upload_context context, const std::vector& subresource_layout, rsx::texture_dimension_extended type, bool input_swizzled) override { diff --git a/rpcs3/Emu/RSX/VK/VKCompute.h b/rpcs3/Emu/RSX/VK/VKCompute.h index 7d0939be04..f84862aefc 100644 --- a/rpcs3/Emu/RSX/VK/VKCompute.h +++ b/rpcs3/Emu/RSX/VK/VKCompute.h @@ -443,6 +443,7 @@ namespace vk } }; + template struct cs_gather_d24x8 : cs_interleave_task { cs_gather_d24x8() @@ -456,13 +457,24 @@ namespace vk " stencil_shift = (index % 4) * 8;\n" " stencil = data[stencil_offset + s_offset];\n" " stencil = (stencil >> stencil_shift) & 0xFF;\n" - " value = (depth << 8) | stencil;\n" + " value = (depth << 8) | stencil;\n"; + + if constexpr (!_SwapBytes) + { + work_kernel += " data[index] = value;\n"; + } + else + { + work_kernel += + " data[index] = bswap_u32(value);\n"; + } cs_shuffle_base::build(""); } }; + template struct cs_gather_d32x8 : cs_interleave_task { 
cs_gather_d32x8() @@ -476,8 +488,18 @@ namespace vk " stencil_shift = (index % 4) * 8;\n" " stencil = data[stencil_offset + s_offset];\n" " stencil = (stencil >> stencil_shift) & 0xFF;\n" - " value = (depth << 8) | stencil;\n" + " value = (depth << 8) | stencil;\n"; + + if constexpr (!_SwapBytes) + { + work_kernel += " data[index] = value;\n"; + } + else + { + work_kernel += + " data[index] = bswap_u32(value);\n"; + } cs_shuffle_base::build(""); } diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index 30dd65e6a0..42d48e6769 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -2949,7 +2949,7 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context) const u32 gcm_format = (m_depth_surface_info.depth_format != rsx::surface_depth_format::z16) ? CELL_GCM_TEXTURE_DEPTH16 : CELL_GCM_TEXTURE_DEPTH24_D8; m_texture_cache.lock_memory_region( *m_current_command_buffer, m_rtts.m_bound_depth_stencil.second, surface_range, true, - m_depth_surface_info.width, m_depth_surface_info.height, m_framebuffer_layout.actual_zeta_pitch, gcm_format, false); + m_depth_surface_info.width, m_depth_surface_info.height, m_framebuffer_layout.actual_zeta_pitch, gcm_format, true); } else { diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index 451a4b128b..569b8c0fe5 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -148,7 +148,7 @@ namespace vk void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout, const VkImageSubresourceRange& range); void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout); - void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region); + void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region, bool swap_bytes = false); void 
copy_buffer_to_image(VkCommandBuffer cmd, const vk::buffer* src, const vk::image* dst, const VkBufferImageCopy& region); void copy_image_typeless(const command_buffer &cmd, image *src, image *dst, const areai& src_rect, const areai& dst_rect, diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index ba33ba9d70..7b2e528bf1 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -56,7 +56,7 @@ namespace vk } } - void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region) + void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region, bool swap_bytes) { // Always validate verify("Invalid image layout!" HERE), @@ -66,6 +66,7 @@ namespace vk { default: { + verify("Implicit byteswap option not supported for speficied format" HERE), !swap_bytes; vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, dst->value, 1, ®ion); break; } @@ -83,8 +84,9 @@ namespace vk const auto allocation_end = region.bufferOffset + packed_length + in_depth_size + in_stencil_size; verify(HERE), dst->size() >= allocation_end; - const VkDeviceSize z_offset = align(region.bufferOffset + packed_length, 256); - const VkDeviceSize s_offset = align(z_offset + in_depth_size, 256); + const auto data_offset = u32(region.bufferOffset); + const auto z_offset = align(data_offset + packed_length, 256); + const auto s_offset = align(z_offset + in_depth_size, 256); // 1. Copy the depth and stencil blocks to separate banks VkBufferImageCopy sub_regions[2]; @@ -97,20 +99,34 @@ namespace vk // 2. 
Interleave the separated data blocks with a compute job vk::cs_interleave_task *job; - if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT) + if (LIKELY(!swap_bytes)) { - job = vk::get_compute_task(); + if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT) + { + job = vk::get_compute_task>(); + } + else + { + job = vk::get_compute_task>(); + } } else { - job = vk::get_compute_task(); + if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT) + { + job = vk::get_compute_task>(); + } + else + { + job = vk::get_compute_task>(); + } } vk::insert_buffer_memory_barrier(cmd, dst->value, z_offset, in_depth_size + in_stencil_size, VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); - job->run(cmd, dst, (u32)region.bufferOffset, packed_length, (u32)z_offset, (u32)s_offset); + job->run(cmd, dst, data_offset, packed_length, z_offset, s_offset); vk::insert_buffer_memory_barrier(cmd, dst->value, region.bufferOffset, packed_length, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, @@ -145,8 +161,9 @@ namespace vk const auto allocation_end = region.bufferOffset + packed_length + in_depth_size + in_stencil_size; verify("Out of memory (compute heap). Lower your resolution scale setting." 
HERE), src->size() >= allocation_end; - const VkDeviceSize z_offset = align(region.bufferOffset + packed_length, 256); - const VkDeviceSize s_offset = align(z_offset + in_depth_size, 256); + const auto data_offset = u32(region.bufferOffset); + const auto z_offset = align(data_offset + packed_length, 256); + const auto s_offset = align(z_offset + in_depth_size, 256); // Zero out the stencil block vkCmdFillBuffer(cmd, src->value, s_offset, in_stencil_size, 0); @@ -166,7 +183,7 @@ namespace vk job = vk::get_compute_task(); } - job->run(cmd, src, (u32)region.bufferOffset, packed_length, (u32)z_offset, (u32)s_offset); + job->run(cmd, src, data_offset, packed_length, z_offset, s_offset); vk::insert_buffer_memory_barrier(cmd, src->value, z_offset, in_depth_size + in_stencil_size, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index b444870e40..0258426f35 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -151,8 +151,13 @@ namespace vk VkFormat get_format() const { + if (context == rsx::texture_upload_context::dma) + { + return VK_FORMAT_R32_UINT; + } + ASSERT(vram_texture != nullptr); - return vram_texture->info.format; + return vram_texture->format(); } bool is_flushed() const @@ -161,18 +166,9 @@ namespace vk return flushed; } - void copy_texture(vk::command_buffer& cmd, bool miss) + void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch) { - ASSERT(exists()); - - if (LIKELY(!miss)) - { - baseclass::on_speculative_flush(); - } - else - { - baseclass::on_miss(); - } + verify(HERE), src->samples() == 1; if (m_device == nullptr) { @@ -186,9 +182,146 @@ namespace vk vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence); } + src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); + + const auto internal_bpp = 
vk::get_format_texel_width(src->format()); + const auto transfer_width = (u32)src_area.width(); + const auto transfer_height = (u32)src_area.height(); + real_pitch = internal_bpp * transfer_width; + rsx_pitch = pitch; + + const bool is_depth_stencil = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT); + if (is_depth_stencil || pack_unpack_swap_bytes) + { + const auto section_length = valid_range.length(); + const auto transfer_pitch = real_pitch; + const auto task_length = transfer_pitch * src_area.height(); + + auto working_buffer = vk::get_scratch_buffer(); + auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length); + + VkBufferImageCopy region = {}; + region.imageSubresource = { src->aspect(), 0, 0, 1 }; + region.imageOffset = { src_area.x1, src_area.y1, 0 }; + region.imageExtent = { transfer_width, transfer_height, 1 }; + vk::copy_image_to_buffer(cmd, src, working_buffer, region, (is_depth_stencil && pack_unpack_swap_bytes)); + + // NOTE: For depth-stencil formats, copying to buffer and byteswap are combined into one step above + if (pack_unpack_swap_bytes && !is_depth_stencil) + { + const auto texel_layout = vk::get_format_element_size(src->format()); + const auto elem_size = texel_layout.first; + vk::cs_shuffle_base *shuffle_kernel; + + if (elem_size == 2) + { + shuffle_kernel = vk::get_compute_task(); + } + else if (elem_size == 4) + { + shuffle_kernel = vk::get_compute_task(); + } + else + { + fmt::throw_exception("Unreachable" HERE); + } + + vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); + + shuffle_kernel->run(cmd, working_buffer, task_length); + + vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); + } + + if (LIKELY(rsx_pitch 
== real_pitch)) + { + VkBufferCopy copy = {}; + copy.dstOffset = final_mapping.first; + copy.size = section_length; + vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, ©); + } + else + { + std::vector copy; + copy.reserve(transfer_height); + + u32 dst_offset = final_mapping.first; + u32 src_offset = 0; + + for (unsigned row = 0; row < transfer_height; ++row) + { + copy.push_back({ src_offset, dst_offset, transfer_pitch }); + src_offset += real_pitch; + dst_offset += rsx_pitch; + } + + vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data()); + } + } + else + { + VkBufferImageCopy region = {}; + region.bufferRowLength = (rsx_pitch / internal_bpp); + region.imageSubresource = { src->aspect(), 0, 0, 1 }; + region.imageOffset = { src_area.x1, src_area.y1, 0 }; + region.imageExtent = { transfer_width, transfer_height, 1 }; + + auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length()); + region.bufferOffset = mapping.first; + vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, ®ion); + } + + src->pop_layout(cmd); + + if (UNLIKELY(synchronized)) + { + // Replace the wait event with a new one to avoid premature signaling! 
+ vk::get_resource_manager()->dispose(dma_fence); + + VkEventCreateInfo createInfo = {}; + createInfo.sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO; + vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence); + } + else + { + // If this is speculated, it should only occur once + verify(HERE), vkGetEventStatus(*m_device, dma_fence) == VK_EVENT_RESET; + } + + cmd.set_flag(vk::command_buffer::cb_has_dma_transfer); + vkCmdSetEvent(cmd, dma_fence, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT); + + synchronized = true; + sync_timestamp = get_system_time(); + } + + void copy_texture(vk::command_buffer& cmd, bool miss) + { + ASSERT(exists()); + + if (LIKELY(!miss)) + { + verify(HERE), !synchronized; + baseclass::on_speculative_flush(); + } + else + { + baseclass::on_miss(); + } + + if (m_device == nullptr) + { + m_device = &cmd.get_command_pool().get_owner(); + } + vk::image *locked_resource = vram_texture; u32 transfer_width = width; u32 transfer_height = height; + u32 transfer_x = 0, transfer_y = 0; if (context == rsx::texture_upload_context::framebuffer_storage) { @@ -199,12 +332,7 @@ namespace vk transfer_height *= surface->samples_y; } - verify(HERE), locked_resource->samples() == 1; - vk::image* target = locked_resource; - locked_resource->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); - real_pitch = vk::get_format_texel_width(locked_resource->info.format) * locked_resource->width(); - if (transfer_width != locked_resource->width() || transfer_height != locked_resource->height()) { // TODO: Synchronize access to typeles textures @@ -221,14 +349,9 @@ namespace vk target->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL); } - verify(HERE), target->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL; - - // TODO: Read back stencil values (is this really necessary?) 
const auto internal_bpp = vk::get_format_texel_width(vram_texture->format()); const auto valid_range = get_confirmed_range(); - real_pitch = internal_bpp * transfer_width; - u32 transfer_x = 0, transfer_y = 0; if (const auto section_range = get_section_range(); section_range != valid_range) { if (const auto offset = (valid_range.start - get_section_base())) @@ -250,111 +373,12 @@ namespace vk } } - if ((vram_texture->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) || - pack_unpack_swap_bytes) - { - const auto section_length = valid_range.length(); - const auto transfer_pitch = transfer_width * internal_bpp; - const auto task_length = transfer_pitch * transfer_height; - - auto working_buffer = vk::get_scratch_buffer(); - auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length); - - VkBufferImageCopy region = {}; - region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 }; - region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 }; - region.imageExtent = { transfer_width, transfer_height, 1 }; - vk::copy_image_to_buffer(cmd, target, working_buffer, region); - - const auto texel_layout = vk::get_format_element_size(vram_texture->format()); - const auto elem_size = texel_layout.first; - vk::cs_shuffle_base *shuffle_kernel; - - if (elem_size == 2) - { - shuffle_kernel = vk::get_compute_task(); - } - else if (elem_size == 4) - { - shuffle_kernel = vk::get_compute_task(); - } - else - { - fmt::throw_exception("Unreachable" HERE); - } - - vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length, - VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); - - shuffle_kernel->run(cmd, working_buffer, task_length); - - vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT); - - if (LIKELY(rsx_pitch == real_pitch)) - { - 
VkBufferCopy copy = {};
-				copy.dstOffset = final_mapping.first;
-				copy.size = section_length;
-				vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
-			}
-			else
-			{
-				std::vector<VkBufferCopy> copy;
-				copy.reserve(transfer_height);
-
-				u32 dst_offset = final_mapping.first;
-				u32 src_offset = 0;
-
-				for (unsigned row = 0; row < transfer_height; ++row)
-				{
-					copy.push_back({src_offset, dst_offset, transfer_pitch});
-					src_offset += real_pitch;
-					dst_offset += rsx_pitch;
-				}
-
-				vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
-			}
-		}
-		else
-		{
-			VkBufferImageCopy region = {};
-			region.bufferRowLength = (rsx_pitch / internal_bpp);
-			region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 };
-			region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 };
-			region.imageExtent = { transfer_width, transfer_height, 1 };
-
-			auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
-			region.bufferOffset = mapping.first;
-			vkCmdCopyImageToBuffer(cmd, target->value, target->current_layout, mapping.second->value, 1, &region);
-		}
-
-		locked_resource->pop_layout(cmd);
-
-		if (UNLIKELY(synchronized))
-		{
-			verify(HERE), miss;
-
-			// Replace the wait event with a new one to avoid premature signaling!
- vk::get_resource_manager()->dispose(dma_fence); - - VkEventCreateInfo createInfo = {}; - createInfo.sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO; - vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence); - } - else - { - // If this is speculated, it should only occur once - verify(HERE), vkGetEventStatus(*m_device, dma_fence) == VK_EVENT_RESET; - } - - cmd.set_flag(vk::command_buffer::cb_has_dma_transfer); - vkCmdSetEvent(cmd, dma_fence, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT); - - synchronized = true; - sync_timestamp = get_system_time(); + areai src_area; + src_area.x1 = (s32)transfer_x; + src_area.y1 = (s32)transfer_y; + src_area.x2 = s32(transfer_x + transfer_width); + src_area.y2 = s32(transfer_y + transfer_height); + dma_transfer(cmd, target, src_area, valid_range, rsx_pitch); } /** @@ -1079,24 +1103,51 @@ namespace vk region.create(width, height, section_depth, mipmaps, image, pitch, true, gcm_format); region.set_dirty(false); - //Its not necessary to lock blit dst textures as they are just reused as necessary - if (context != rsx::texture_upload_context::blit_engine_dst) + // Its not necessary to lock blit dst textures as they are just reused as necessary + switch (context) { + case rsx::texture_upload_context::shader_read: + case rsx::texture_upload_context::blit_engine_src: region.protect(utils::protection::ro); read_only_range = region.get_min_max(read_only_range, rsx::section_bounds::locked_range); - } - else - { - //TODO: Confirm byte swap patterns - //NOTE: Protection is handled by the caller - region.set_unpack_swap_bytes((aspect_flags & VK_IMAGE_ASPECT_COLOR_BIT) == VK_IMAGE_ASPECT_COLOR_BIT); + break; + case rsx::texture_upload_context::blit_engine_dst: + region.set_unpack_swap_bytes(true); no_access_range = region.get_min_max(no_access_range, rsx::section_bounds::locked_range); + break; + case rsx::texture_upload_context::dma: + case rsx::texture_upload_context::framebuffer_storage: + // Should not initialized 
with this method
+		default:
+			fmt::throw_exception("Unexpected upload context 0x%x", u32(context));
+		}
 		update_cache_tag();
 		return &region;
 	}
+	cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, bool memory_load) override
+	{
+		auto& region = *find_cached_texture(rsx_range, RSX_GCM_FORMAT_IGNORED, true, false);
+		ASSERT(!region.is_locked());
+
+		// Prepare section
+		region.reset(rsx_range);
+		region.set_context(rsx::texture_upload_context::dma);
+		region.set_dirty(false);
+		region.set_unpack_swap_bytes(true);
+
+		if (memory_load)
+		{
+			vk::map_dma(cmd, rsx_range.start, rsx_range.length());
+			vk::load_dma(rsx_range.start, rsx_range.length());
+		}
+
+		no_access_range = region.get_min_max(no_access_range, rsx::section_bounds::locked_range);
+		update_cache_tag();
+		return &region;
+	}
+
 	cached_texture_section* upload_image_from_cpu(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format, rsx::texture_upload_context context, const std::vector<rsx_subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) override
 	{