diff --git a/rpcs3/Emu/RSX/VK/VKHelpers.h b/rpcs3/Emu/RSX/VK/VKHelpers.h index 3b87ba46da..63973a785a 100644 --- a/rpcs3/Emu/RSX/VK/VKHelpers.h +++ b/rpcs3/Emu/RSX/VK/VKHelpers.h @@ -75,6 +75,7 @@ namespace vk upload_contents_async = 1, initialize_image_layout = 2, preserve_image_layout = 4, + source_is_gpu_resident = 8, // meta-flags upload_contents_inline = 0, diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp index f19908d764..db9a51962e 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.cpp @@ -1,3 +1,5 @@ +#include "VKCompute.h" +#include "VKDMA.h" #include "VKRenderTargets.h" #include "VKResourceManager.h" #include "Emu/RSX/rsx_methods.h" @@ -681,32 +683,75 @@ namespace vk subres.depth = 1; subres.data = { vm::get_super_ptr(base_addr), static_cast::size_type>(rsx_pitch * surface_height * samples_y) }; - // FIXME: Move to GPU queue - std::vector ext_data; const auto range = get_memory_range(); + rsx::flags32_t upload_flags = upload_contents_inline; + u32 heap_align = rsx_pitch; - if (auto region = rsx::get_current_renderer()->get_tiled_memory_region(range)) + if (auto tiled_region = rsx::get_current_renderer()->get_tiled_memory_region(range)) { - auto real_data = vm::get_super_ptr(range.start); - ext_data.resize(region.tile->size); - rsx::tile_texel_data( - ext_data.data(), - real_data, - region.base_address, - range.start - region.base_address, - region.tile->size, - region.tile->bank, - region.tile->pitch, - subres.width_in_block, - subres.height_in_block - ); - subres.data = std::span(ext_data); + const auto available_tile_size = tiled_region.tile->size - (range.start - tiled_region.base_address); + const auto max_content_size = tiled_region.tile->pitch * utils::align(subres.height_in_block, 64); + const auto section_length = std::min(max_content_size, available_tile_size); + + const auto dma_mapping = vk::map_dma(range.start, section_length); + const auto scratch_buf = 
vk::get_scratch_buffer(cmd, section_length * 3); // 0 = linear data, 1 = padding (deswz), 2 = tiled data + const auto tiled_data_scratch_offset = section_length * 2; + const auto linear_data_scratch_offset = 0; + + // Schedule the job + const RSX_detiler_config config = + { + .tile_base_address = tiled_region.base_address, + .tile_base_offset = range.start - tiled_region.base_address, + .tile_size = tiled_region.tile->size, + .tile_pitch = tiled_region.tile->pitch, + .bank = tiled_region.tile->bank, + + .dst = scratch_buf, + .dst_offset = linear_data_scratch_offset, + .src = scratch_buf, + .src_offset = section_length * 2, + + .image_width = subres.width_in_block, + .image_height = subres.height_in_block, + .image_pitch = subres.width_in_block * static_cast(get_bpp()), + .image_bpp = get_bpp() + }; + + // Transfer + VkBufferCopy copy_rgn + { + .srcOffset = dma_mapping.first, + .dstOffset = tiled_data_scratch_offset, + .size = section_length + }; + vkCmdCopyBuffer(cmd, dma_mapping.second->value, scratch_buf->value, 1, &copy_rgn); + + // Barrier + vk::insert_buffer_memory_barrier( + cmd, scratch_buf->value, linear_data_scratch_offset, section_length, + VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT); + + // Detile + vk::get_compute_task>()->run(cmd, config); + + // Barrier + vk::insert_buffer_memory_barrier( + cmd, scratch_buf->value, linear_data_scratch_offset, subres.width_in_block * get_bpp() * subres.height_in_block, + VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, + VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT); + + // FIXME: !!EVIL!! 
+ subres.data = { scratch_buf, linear_data_scratch_offset }; + upload_flags |= source_is_gpu_resident; + heap_align = subres.width_in_block * get_bpp(); } if (g_cfg.video.resolution_scale_percent == 100 && spp == 1) [[likely]] { push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL); - vk::upload_image(cmd, this, { subres }, get_gcm_format(), is_swizzled, 1, aspect(), upload_heap, rsx_pitch, upload_contents_inline); + vk::upload_image(cmd, this, { subres }, get_gcm_format(), is_swizzled, 1, aspect(), upload_heap, heap_align, upload_flags); pop_layout(cmd); } else @@ -735,7 +780,7 @@ namespace vk } // Load Cell data into temp buffer - vk::upload_image(cmd, content, { subres }, get_gcm_format(), is_swizzled, 1, aspect(), upload_heap, rsx_pitch, upload_contents_inline); + vk::upload_image(cmd, content, { subres }, get_gcm_format(), is_swizzled, 1, aspect(), upload_heap, heap_align, upload_flags); // Write into final image if (content != final_dst) diff --git a/rpcs3/Emu/RSX/VK/VKTexture.cpp b/rpcs3/Emu/RSX/VK/VKTexture.cpp index 5de83a08fe..770381767d 100644 --- a/rpcs3/Emu/RSX/VK/VKTexture.cpp +++ b/rpcs3/Emu/RSX/VK/VKTexture.cpp @@ -1009,13 +1009,19 @@ namespace vk { caps.supports_byteswap = (image_linear_size >= 1024); caps.supports_hw_deswizzle = caps.supports_byteswap; - caps.supports_zero_copy = false;// caps.supports_byteswap; + caps.supports_zero_copy = caps.supports_byteswap; caps.supports_vtc_decoding = false; check_caps = false; } auto buf_allocator = [&]() -> std::tuple { + if (image_setup_flags & source_is_gpu_resident) + { + // We should never reach here, unless something is very wrong... 
+ fmt::throw_exception("Cannot allocate CPU memory for GPU-only data"); + } + // Map with extra padding bytes in case of realignment offset_in_upload_buffer = upload_heap.alloc<512>(image_linear_size + 8); void* mapped_buffer = upload_heap.map(offset_in_upload_buffer, image_linear_size + 8); @@ -1026,6 +1032,21 @@ namespace vk opt = upload_texture_subresource(io_buf, layout, format, is_swizzled, caps); upload_heap.unmap(); + if (image_setup_flags & source_is_gpu_resident) + { + // Read from GPU buf if the input is already uploaded. + auto [iobuf, io_offset] = layout.data.raw(); + upload_buffer = static_cast(iobuf); + offset_in_upload_buffer = io_offset; + // Never upload. Data is already resident. + opt.require_upload = false; + } + else + { + // Read from upload buffer + upload_buffer = upload_heap.heap.get(); + } + copy_regions.push_back({}); auto& copy_info = copy_regions.back(); copy_info.bufferOffset = offset_in_upload_buffer; @@ -1038,8 +1059,6 @@ namespace vk copy_info.imageSubresource.mipLevel = layout.level; copy_info.bufferRowLength = upload_pitch_in_texel; - upload_buffer = upload_heap.heap.get(); - if (opt.require_upload) { ensure(!opt.deferred_cmds.empty()); @@ -1117,7 +1136,7 @@ namespace vk copy.size = copy_cmd.length; } } - else + else if (upload_buffer != scratch_buf || offset_in_upload_buffer != scratch_offset) { buffer_copies.push_back({}); auto& copy = buffer_copies.back(); @@ -1163,7 +1182,7 @@ namespace vk range_ptr += op.second; } } - else + else if (!buffer_copies.empty()) { vkCmdCopyBuffer(cmd2, upload_buffer->value, scratch_buf->value, static_cast(buffer_copies.size()), buffer_copies.data()); } diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp index a4abcc4d93..1c49c2b0ec 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.cpp +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.cpp @@ -194,6 +194,7 @@ namespace vk .src = working_buffer, .src_offset = 0, + // TODO: Check interaction with anti-aliasing .image_width = width, 
.image_height = height, .image_pitch = real_pitch, diff --git a/rpcs3/io_buffer.h b/rpcs3/io_buffer.h index 945bdc9c90..876d885f59 100644 --- a/rpcs3/io_buffer.h +++ b/rpcs3/io_buffer.h @@ -22,15 +22,22 @@ namespace rsx mutable void* m_ptr = nullptr; mutable usz m_size = 0; - std::function ()> m_allocator = nullptr; + std::function()> m_allocator{}; public: io_buffer() = default; + io_buffer(const io_buffer& that) + { + m_ptr = that.m_ptr; + m_size = that.m_size; + m_allocator = that.m_allocator; + } + template io_buffer(const T& container) { - m_ptr = reinterpret_cast(container.data()); + m_ptr = const_cast(reinterpret_cast(container.data())); m_size = container.size_bytes(); } @@ -50,6 +57,11 @@ namespace rsx : m_ptr(const_cast(ptr)), m_size(size) {} + std::pair raw() const + { + return { m_ptr, m_size }; + } + template T* data() const {