rsx: Experiments with nul sink

kd-11 2019-09-04 22:19:58 +03:00 committed by kd-11
parent 212ac19c11
commit 858014b718
12 changed files with 420 additions and 301 deletions

View File

@ -12,7 +12,8 @@ namespace rsx
shader_read = 1,
blit_engine_src = 2,
blit_engine_dst = 4,
framebuffer_storage = 8
framebuffer_storage = 8,
dma = 16
};
enum texture_colorspace : u32
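The upload-context values stay powers of two, so the new dma member can coexist with the other flags in mask-style filtering. The lookup later in this commit passes a single required_type, but a minimal illustration of the bitmask idea looks like this (the OR'd mask query and get_context() usage are illustrative, not taken from this diff):

const u32 wanted = rsx::texture_upload_context::blit_engine_dst | rsx::texture_upload_context::dma;
if (section.get_context() & wanted)
{
    // Candidate section for reuse by the blit/DMA destination path
}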

View File

@ -775,6 +775,9 @@ namespace rsx
continue;
auto surface = tex_info.second.get();
if (access == rsx::surface_access::transfer && surface->write_through())
continue;
if (!rsx::pitch_compatible(surface, required_pitch, required_height))
continue;

View File

@ -309,6 +309,11 @@ namespace rsx
return (state_flags != rsx::surface_state_flags::ready) || !old_contents.empty();
}
bool write_through() const
{
return (state_flags & rsx::surface_state_flags::erase_bkgnd) && old_contents.empty();
}
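// A surface still awaiting its background clear (erase_bkgnd) with no pending
// old_contents to merge holds nothing worth reading back, so transfer-access
// lookups can skip it rather than forcing a copy.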
#if (ENABLE_SURFACE_CACHE_DEBUG)
u64 hash_block() const
{

View File

@ -362,6 +362,7 @@ namespace rsx
rsx::texture_upload_context context, rsx::texture_dimension_extended type, texture_create_flags flags) = 0;
virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, const address_range &rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format, texture_upload_context context,
const std::vector<rsx_subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) = 0;
virtual section_storage_type* create_nul_section(commandbuffer_type&, const address_range &rsx_range, bool memory_load) = 0;
virtual void enforce_surface_creation_type(section_storage_type& section, u32 gcm_format, texture_create_flags expected) = 0;
virtual void insert_texture_barrier(commandbuffer_type&, image_storage_type* tex) = 0;
virtual image_view_type generate_cubemap_from_images(commandbuffer_type&, u32 gcm_format, u16 size, const std::vector<copy_region_descriptor>& sources, const texture_channel_remap_t& remap_vector) = 0;
@ -2429,6 +2430,7 @@ namespace rsx
// Check if src/dst are parts of render targets
typename surface_store_type::surface_overlap_info dst_subres;
bool use_null_region = false;
if (dst_address > 0xc0000000)
{
// TODO: HACK
@ -2442,6 +2444,7 @@ namespace rsx
// 1. Invalidate surfaces in range
// 2. Proceed as normal, blit into a 'normal' surface and any upload routines should catch it
m_rtts.invalidate_range(utils::address_range::start_length(dst_address, dst.pitch * dst_h));
use_null_region = (scale_x == 1.f && scale_y == 1.f);
}
// TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate
@ -2545,7 +2548,9 @@ namespace rsx
if (!dst_is_render_target)
{
// Check for any available region that will fit this one
auto overlapping_surfaces = find_texture_from_range(address_range::start_length(dst_address, dst.pitch * dst.clip_height), dst.pitch, rsx::texture_upload_context::blit_engine_dst);
const auto required_type = (use_null_region) ? texture_upload_context::dma : texture_upload_context::blit_engine_dst;
const auto dst_range = address_range::start_length(dst_address, dst.pitch * dst.clip_height);
auto overlapping_surfaces = find_texture_from_range(dst_range, dst.pitch, required_type);
for (const auto &surface : overlapping_surfaces)
{
if (!surface->is_locked())
@ -2561,6 +2566,17 @@ namespace rsx
continue;
}
if (use_null_region)
{
if (dst_range.inside(surface->get_section_range()))
{
// Attach to existing region
cached_dest = surface;
}
continue;
}
const auto this_address = surface->get_section_base();
if (this_address > dst_address)
{
@ -2609,9 +2625,9 @@ namespace rsx
// Check if available target is acceptable
// TODO: Check for other types of format mismatch
bool format_mismatch = false;
if (cached_dest)
if (cached_dest && !use_null_region)
{
bool format_mismatch = false;
if (cached_dest->is_depth_texture() != src_subres.is_depth)
{
// Dest surface has the wrong 'aspect'
@ -2635,14 +2651,14 @@ namespace rsx
break;
}
}
}
if (format_mismatch)
{
// The invalidate call before creating a new target will remove this section
cached_dest = nullptr;
dest_texture = 0;
dst_area = old_dst_area;
if (format_mismatch)
{
// The invalidate call before creating a new target will remove this section
cached_dest = nullptr;
dest_texture = 0;
dst_area = old_dst_area;
}
}
// Create source texture if it does not exist
@ -2795,7 +2811,7 @@ namespace rsx
else
gcm_format = (dst_is_argb8) ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5;
if (cached_dest)
if (cached_dest && !use_null_region)
{
// Prep surface
auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order :
@ -2847,9 +2863,9 @@ namespace rsx
const auto modified_range = utils::address_range::start_length(dst_address, mem_length);
if (dest_texture == 0)
if (!cached_dest && !dst_is_render_target)
{
verify(HERE), !dst_is_render_target;
verify(HERE), !dest_texture;
// Need to calculate the minimum required size that will fit the data, anchored on the rsx_address
// If the application starts off with an 'inset' section, the guessed dimensions may not fit!
@ -2859,55 +2875,72 @@ namespace rsx
const u32 section_length = std::max(write_end, expected_end) - dst.rsx_address;
dst_dimensions.height = section_length / dst.pitch;
// render target data is already in correct swizzle layout
auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order :
dst_is_argb8 ? rsx::texture_create_flags::default_component_order :
rsx::texture_create_flags::swapped_native_component_order;
// Translate dst_area into the 'full' dst block based on dst.rsx_address as (0, 0)
dst_area.x1 += dst.offset_x;
dst_area.x2 += dst.offset_x;
dst_area.y1 += dst.offset_y;
dst_area.y2 += dst.offset_y;
lock.upgrade();
// NOTE: Write flag set to remove all other overlapping regions (e.g. shader_read or blit_src)
const auto rsx_range = address_range::start_length(dst.rsx_address, section_length);
invalidate_range_impl_base(cmd, rsx_range, invalidation_cause::write, std::forward<Args>(extras)...);
if (!dst_area.x1 && !dst_area.y1 && dst_area.x2 == dst_dimensions.width && dst_area.y2 == dst_dimensions.height)
if (LIKELY(use_null_region))
{
cached_dest = create_new_texture(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch,
gcm_format, rsx::texture_upload_context::blit_engine_dst, rsx::texture_dimension_extended::texture_dimension_2d,
channel_order);
bool force_dma_load = false;
if ((dst_w * dst_bpp) != dst.pitch)
{
// Keep Cell from touching the range we need
const auto prot_range = modified_range.to_page_range();
utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no);
force_dma_load = true;
}
cached_dest = create_nul_section(cmd, rsx_range, force_dma_load);
}
else
{
// HACK: workaround for data race with Cell
// Pre-lock the memory range we'll be touching, then load with super_ptr
const auto prot_range = modified_range.to_page_range();
utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no);
// render target data is already in correct swizzle layout
auto channel_order = src_is_render_target ? rsx::texture_create_flags::native_component_order :
dst_is_argb8 ? rsx::texture_create_flags::default_component_order :
rsx::texture_create_flags::swapped_native_component_order;
const u16 pitch_in_block = dst.pitch / dst_bpp;
std::vector<rsx_subresource_layout> subresource_layout;
rsx_subresource_layout subres = {};
subres.width_in_block = dst_dimensions.width;
subres.height_in_block = dst_dimensions.height;
subres.pitch_in_block = pitch_in_block;
subres.depth = 1;
subres.data = { reinterpret_cast<const gsl::byte*>(vm::get_super_ptr(dst.rsx_address)), dst.pitch * dst_dimensions.height };
subresource_layout.push_back(subres);
// Translate dst_area into the 'full' dst block based on dst.rsx_address as (0, 0)
dst_area.x1 += dst.offset_x;
dst_area.x2 += dst.offset_x;
dst_area.y1 += dst.offset_y;
dst_area.y2 += dst.offset_y;
cached_dest = upload_image_from_cpu(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch,
gcm_format, rsx::texture_upload_context::blit_engine_dst, subresource_layout,
rsx::texture_dimension_extended::texture_dimension_2d, false);
if (!dst_area.x1 && !dst_area.y1 && dst_area.x2 == dst_dimensions.width && dst_area.y2 == dst_dimensions.height)
{
cached_dest = create_new_texture(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch,
gcm_format, rsx::texture_upload_context::blit_engine_dst, rsx::texture_dimension_extended::texture_dimension_2d,
channel_order);
}
else
{
// HACK: workaround for data race with Cell
// Pre-lock the memory range we'll be touching, then load with super_ptr
const auto prot_range = modified_range.to_page_range();
utils::memory_protect(vm::base(prot_range.start), prot_range.length(), utils::protection::no);
enforce_surface_creation_type(*cached_dest, gcm_format, channel_order);
const u16 pitch_in_block = dst.pitch / dst_bpp;
std::vector<rsx_subresource_layout> subresource_layout;
rsx_subresource_layout subres = {};
subres.width_in_block = dst_dimensions.width;
subres.height_in_block = dst_dimensions.height;
subres.pitch_in_block = pitch_in_block;
subres.depth = 1;
subres.data = { reinterpret_cast<const gsl::byte*>(vm::get_super_ptr(dst.rsx_address)), dst.pitch * dst_dimensions.height };
subresource_layout.push_back(subres);
cached_dest = upload_image_from_cpu(cmd, rsx_range, dst_dimensions.width, dst_dimensions.height, 1, 1, dst.pitch,
gcm_format, rsx::texture_upload_context::blit_engine_dst, subresource_layout,
rsx::texture_dimension_extended::texture_dimension_2d, false);
enforce_surface_creation_type(*cached_dest, gcm_format, channel_order);
}
dest_texture = cached_dest->get_raw_texture();
typeless_info.dst_context = texture_upload_context::blit_engine_dst;
}
dest_texture = cached_dest->get_raw_texture();
typeless_info.dst_context = texture_upload_context::blit_engine_dst;
}
verify(HERE), cached_dest || dst_is_render_target;
@ -2979,8 +3012,15 @@ namespace rsx
dst_subres.surface->transform_blit_coordinates(rsx::surface_access::transfer, dst_area);
}
typeless_info.analyse();
blitter.scale_image(cmd, vram_texture, dest_texture, src_area, dst_area, interpolate, is_depth_blit, typeless_info);
if (!use_null_region)
{
typeless_info.analyse();
blitter.scale_image(cmd, vram_texture, dest_texture, src_area, dst_area, interpolate, is_depth_blit, typeless_info);
}
else
{
cached_dest->dma_transfer(cmd, vram_texture, src_area, modified_range, dst.pitch);
}
blit_op_result result = true;
result.is_depth = is_depth_blit;
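Condensed, the new destination handling amounts to roughly the following sketch (assembled from the hunks above; dst_is_local stands in for the dst_address > 0xc0000000 heuristic):

bool use_null_region = dst_is_local && scale_x == 1.f && scale_y == 1.f;
if (use_null_region)
{
    // No destination image is needed; attach to or create a DMA-only section...
    if (!cached_dest)
    {
        // Preserve partially covered rows by preloading guest memory when the write is not pitch-tight
        const bool force_dma_load = (dst_w * dst_bpp) != dst.pitch;
        cached_dest = create_nul_section(cmd, rsx_range, force_dma_load);
    }
    // ...and stream the source rows straight back to guest memory
    cached_dest->dma_transfer(cmd, vram_texture, src_area, modified_range, dst.pitch);
}
else
{
    typeless_info.analyse();
    blitter.scale_image(cmd, vram_texture, dest_texture, src_area, dst_area, interpolate, is_depth_blit, typeless_info);
}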

View File

@ -1504,7 +1504,7 @@ namespace rsx
void add_flush_exclusion(const address_range& rng)
{
AUDIT(exists() && is_locked() && is_flushable());
AUDIT(is_locked() && is_flushable());
const auto _rng = rng.get_intersect(get_section_range());
flush_exclusions.merge(_rng);
}
@ -1710,7 +1710,14 @@ namespace rsx
bool exists() const
{
return derived()->exists();
if (derived()->exists())
{
return true;
}
else
{
return (context == rsx::texture_upload_context::dma && is_locked());
}
}
};
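The derived()/baseclass calls rely on the texture cache's usual CRTP layering; roughly (a sketch only, the actual template parameters differ):

template <typename Derived>
struct section_base
{
    Derived* derived() { return static_cast<Derived*>(this); }
    const Derived* derived() const { return static_cast<const Derived*>(this); }
};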

View File

@ -883,7 +883,7 @@ namespace gl
void data(GLsizeiptr size, const void* data_ = nullptr, GLenum usage = GL_STREAM_DRAW)
{
verify(HERE), m_memory_type == memory_type::undefined;
verify(HERE), m_memory_type != memory_type::local;
target target_ = current_target();
save_binding_state save(target_, *this);

View File

@ -61,72 +61,6 @@ namespace gl
texture::format format = texture::format::rgba;
texture::type type = texture::type::ubyte;
u8 get_pixel_size(texture::format fmt_, texture::type type_)
{
u8 size = 1;
switch (type_)
{
case texture::type::ubyte:
case texture::type::sbyte:
break;
case texture::type::ushort:
case texture::type::sshort:
case texture::type::f16:
size = 2;
break;
case texture::type::ushort_5_6_5:
case texture::type::ushort_5_6_5_rev:
case texture::type::ushort_4_4_4_4:
case texture::type::ushort_4_4_4_4_rev:
case texture::type::ushort_5_5_5_1:
case texture::type::ushort_1_5_5_5_rev:
return 2;
case texture::type::uint_8_8_8_8:
case texture::type::uint_8_8_8_8_rev:
case texture::type::uint_10_10_10_2:
case texture::type::uint_2_10_10_10_rev:
case texture::type::uint_24_8:
return 4;
case texture::type::f32:
case texture::type::sint:
case texture::type::uint:
size = 4;
break;
default:
LOG_ERROR(RSX, "Unsupported texture type");
}
switch (fmt_)
{
case texture::format::r:
break;
case texture::format::rg:
size *= 2;
break;
case texture::format::rgb:
case texture::format::bgr:
size *= 3;
break;
case texture::format::rgba:
case texture::format::bgra:
size *= 4;
break;
//Depth formats..
case texture::format::depth:
size = 2;
break;
case texture::format::depth_stencil:
size = 4;
break;
default:
LOG_ERROR(RSX, "Unsupported rtt format %d", (GLenum)fmt_);
size = 4;
}
return size;
}
void init_buffer(const gl::texture* src)
{
const u32 vram_size = src->pitch() * src->height();
@ -218,6 +152,61 @@ namespace gl
}
}
void dma_transfer(gl::command_context& cmd, gl::texture* src, const areai& /*src_area*/, const utils::address_range& /*valid_range*/, u32 pitch)
{
init_buffer(src);
glGetError();
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id);
if (context == rsx::texture_upload_context::dma)
{
// Determine unpack config dynamically
const auto format_info = gl::get_format_type(src->get_internal_format());
format = static_cast<gl::texture::format>(std::get<0>(format_info));
type = static_cast<gl::texture::type>(std::get<1>(format_info));
if ((src->aspect() & gl::image_aspect::stencil) == 0)
{
pack_unpack_swap_bytes = std::get<2>(format_info);
}
else
{
// Z24S8 decode is done on the CPU for now
pack_unpack_swap_bytes = false;
}
}
pixel_pack_settings pack_settings;
pack_settings.alignment(1);
pack_settings.swap_bytes(pack_unpack_swap_bytes);
src->copy_to(nullptr, format, type, pack_settings);
real_pitch = src->pitch();
rsx_pitch = pitch;
if (auto error = glGetError())
{
if (error == GL_OUT_OF_MEMORY && ::gl::get_driver_caps().vendor_AMD)
{
// AMD driver bug
// Pixel transfer fails with GL_OUT_OF_MEMORY. Usually happens with float textures or operations attempting to swap endianness.
// Failed operations also leak a large amount of memory
LOG_ERROR(RSX, "Memory transfer failure (AMD bug). Please update your driver to Adrenalin 19.4.3 or newer. Format=0x%x, Type=0x%x, Swap=%d", (u32)format, (u32)type, pack_unpack_swap_bytes);
}
else
{
LOG_ERROR(RSX, "Memory transfer failed with error 0x%x. Format=0x%x, Type=0x%x", error, (u32)format, (u32)type);
}
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
m_fence.reset();
synchronized = true;
sync_timestamp = get_system_time();
}
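Stripped of the cache bookkeeping, the readback above is the standard pixel-pack-buffer download; copy_to(nullptr, ...) effectively performs a glGetTexImage-style transfer into the bound PBO. A minimal sketch of the idiom (vram_size and swap_bytes stand in for the section's computed size and byte-swap flag):

GLuint pbo;
glGenBuffers(1, &pbo);
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo);
glBufferData(GL_PIXEL_PACK_BUFFER, vram_size, nullptr, GL_STREAM_READ);
glPixelStorei(GL_PACK_ALIGNMENT, 1);
glPixelStorei(GL_PACK_SWAP_BYTES, swap_bytes ? GL_TRUE : GL_FALSE);
glGetTexImage(GL_TEXTURE_2D, 0, GL_RGBA, GL_UNSIGNED_BYTE, nullptr); // writes at offset 0 of the bound PBO
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0);        // signalled once the download completes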
void copy_texture(gl::command_context& cmd, bool miss)
{
ASSERT(exists());
@ -284,38 +273,7 @@ namespace gl
}
}
init_buffer(target_texture);
glGetError();
glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo_id);
pixel_pack_settings pack_settings;
pack_settings.alignment(1);
pack_settings.swap_bytes(pack_unpack_swap_bytes);
target_texture->copy_to(nullptr, format, type, pack_settings);
real_pitch = target_texture->pitch();
if (auto error = glGetError())
{
if (error == GL_OUT_OF_MEMORY && ::gl::get_driver_caps().vendor_AMD)
{
// AMD driver bug
// Pixel transfer fails with GL_OUT_OF_MEMORY. Usually happens with float textures or operations attempting to swap endianness.
// Failed operations also leak a large amount of memory
LOG_ERROR(RSX, "Memory transfer failure (AMD bug). Please update your driver to Adrenalin 19.4.3 or newer. Format=0x%x, Type=0x%x, Swap=%d", (u32)format, (u32)type, pack_unpack_swap_bytes);
}
else
{
LOG_ERROR(RSX, "Memory transfer failed with error 0x%x. Format=0x%x, Type=0x%x", error, (u32)format, (u32)type);
}
}
glBindBuffer(GL_PIXEL_PACK_BUFFER, GL_NONE);
m_fence.reset();
synchronized = true;
sync_timestamp = get_system_time();
dma_transfer(cmd, target_texture, {}, {}, rsx_pitch);
}
void fill_texture(gl::texture* tex)
@ -889,6 +847,21 @@ namespace gl
return &cached;
}
cached_texture_section* create_nul_section(gl::command_context& cmd, const utils::address_range& rsx_range, bool memory_load) override
{
auto& cached = *find_cached_texture(rsx_range, RSX_GCM_FORMAT_IGNORED, true, false);
ASSERT(!cached.is_locked());
// Prepare section
cached.reset(rsx_range);
cached.set_context(rsx::texture_upload_context::dma);
cached.set_dirty(false);
no_access_range = cached.get_min_max(no_access_range, rsx::section_bounds::locked_range);
update_cache_tag();
return &cached;
}
cached_texture_section* upload_image_from_cpu(gl::command_context &cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format,
rsx::texture_upload_context context, const std::vector<rsx_subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool input_swizzled) override
{

View File

@ -443,6 +443,7 @@ namespace vk
}
};
template<bool _SwapBytes = false>
struct cs_gather_d24x8 : cs_interleave_task
{
cs_gather_d24x8()
@ -456,13 +457,24 @@ namespace vk
" stencil_shift = (index % 4) * 8;\n"
" stencil = data[stencil_offset + s_offset];\n"
" stencil = (stencil >> stencil_shift) & 0xFF;\n"
" value = (depth << 8) | stencil;\n"
" value = (depth << 8) | stencil;\n";
if constexpr (!_SwapBytes)
{
work_kernel +=
" data[index] = value;\n";
}
else
{
work_kernel +=
" data[index] = bswap_u32(value);\n";
}
cs_shuffle_base::build("");
}
};
template<bool _SwapBytes = false>
struct cs_gather_d32x8 : cs_interleave_task
{
cs_gather_d32x8()
@ -476,8 +488,18 @@ namespace vk
" stencil_shift = (index % 4) * 8;\n"
" stencil = data[stencil_offset + s_offset];\n"
" stencil = (stencil >> stencil_shift) & 0xFF;\n"
" value = (depth << 8) | stencil;\n"
" value = (depth << 8) | stencil;\n";
if constexpr (!_SwapBytes)
{
work_kernel +=
" data[index] = value;\n";
}
else
{
work_kernel +=
" data[index] = bswap_u32(value);\n";
}
cs_shuffle_base::build("");
}
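For reference, the bswap_u32 emitted by the _SwapBytes variants is a plain 32-bit byte reversal (the GLSL helper itself presumably lives in the shared compute shader boilerplate); a CPU-side equivalent:

static inline u32 bswap_u32(u32 x)
{
    return (x >> 24) | ((x >> 8) & 0x0000ff00u) | ((x << 8) & 0x00ff0000u) | (x << 24);
}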

View File

@ -2949,7 +2949,7 @@ void VKGSRender::prepare_rtts(rsx::framebuffer_creation_context context)
const u32 gcm_format = (m_depth_surface_info.depth_format == rsx::surface_depth_format::z16) ? CELL_GCM_TEXTURE_DEPTH16 : CELL_GCM_TEXTURE_DEPTH24_D8;
m_texture_cache.lock_memory_region(
*m_current_command_buffer, m_rtts.m_bound_depth_stencil.second, surface_range, true,
m_depth_surface_info.width, m_depth_surface_info.height, m_framebuffer_layout.actual_zeta_pitch, gcm_format, false);
m_depth_surface_info.width, m_depth_surface_info.height, m_framebuffer_layout.actual_zeta_pitch, gcm_format, true);
}
else
{

View File

@ -148,7 +148,7 @@ namespace vk
void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout, const VkImageSubresourceRange& range);
void change_image_layout(VkCommandBuffer cmd, vk::image *image, VkImageLayout new_layout);
void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region);
void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region, bool swap_bytes = false);
void copy_buffer_to_image(VkCommandBuffer cmd, const vk::buffer* src, const vk::image* dst, const VkBufferImageCopy& region);
void copy_image_typeless(const command_buffer &cmd, image *src, image *dst, const areai& src_rect, const areai& dst_rect,

View File

@ -56,7 +56,7 @@ namespace vk
}
}
void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region)
void copy_image_to_buffer(VkCommandBuffer cmd, const vk::image* src, const vk::buffer* dst, const VkBufferImageCopy& region, bool swap_bytes)
{
// Always validate
verify("Invalid image layout!" HERE),
@ -66,6 +66,7 @@ namespace vk
{
default:
{
verify("Implicit byteswap option not supported for speficied format" HERE), !swap_bytes;
vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, dst->value, 1, &region);
break;
}
@ -83,8 +84,9 @@ namespace vk
const auto allocation_end = region.bufferOffset + packed_length + in_depth_size + in_stencil_size;
verify(HERE), dst->size() >= allocation_end;
const VkDeviceSize z_offset = align<VkDeviceSize>(region.bufferOffset + packed_length, 256);
const VkDeviceSize s_offset = align<VkDeviceSize>(z_offset + in_depth_size, 256);
const auto data_offset = u32(region.bufferOffset);
const auto z_offset = align<u32>(data_offset + packed_length, 256);
const auto s_offset = align<u32>(z_offset + in_depth_size, 256);
// 1. Copy the depth and stencil blocks to separate banks
VkBufferImageCopy sub_regions[2];
@ -97,20 +99,34 @@ namespace vk
// 2. Interleave the separated data blocks with a compute job
vk::cs_interleave_task *job;
if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT)
if (LIKELY(!swap_bytes))
{
job = vk::get_compute_task<vk::cs_gather_d24x8>();
if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT)
{
job = vk::get_compute_task<vk::cs_gather_d24x8<false>>();
}
else
{
job = vk::get_compute_task<vk::cs_gather_d32x8<false>>();
}
}
else
{
job = vk::get_compute_task<vk::cs_gather_d32x8>();
if (src->format() == VK_FORMAT_D24_UNORM_S8_UINT)
{
job = vk::get_compute_task<vk::cs_gather_d24x8<true>>();
}
else
{
job = vk::get_compute_task<vk::cs_gather_d32x8<true>>();
}
}
vk::insert_buffer_memory_barrier(cmd, dst->value, z_offset, in_depth_size + in_stencil_size,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
job->run(cmd, dst, (u32)region.bufferOffset, packed_length, (u32)z_offset, (u32)s_offset);
job->run(cmd, dst, data_offset, packed_length, z_offset, s_offset);
vk::insert_buffer_memory_barrier(cmd, dst->value, region.bufferOffset, packed_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
@ -145,8 +161,9 @@ namespace vk
const auto allocation_end = region.bufferOffset + packed_length + in_depth_size + in_stencil_size;
verify("Out of memory (compute heap). Lower your resolution scale setting." HERE), src->size() >= allocation_end;
const VkDeviceSize z_offset = align<VkDeviceSize>(region.bufferOffset + packed_length, 256);
const VkDeviceSize s_offset = align<VkDeviceSize>(z_offset + in_depth_size, 256);
const auto data_offset = u32(region.bufferOffset);
const auto z_offset = align<u32>(data_offset + packed_length, 256);
const auto s_offset = align<u32>(z_offset + in_depth_size, 256);
// Zero out the stencil block
vkCmdFillBuffer(cmd, src->value, s_offset, in_stencil_size, 0);
@ -166,7 +183,7 @@ namespace vk
job = vk::get_compute_task<vk::cs_scatter_d32x8>();
}
job->run(cmd, src, (u32)region.bufferOffset, packed_length, (u32)z_offset, (u32)s_offset);
job->run(cmd, src, data_offset, packed_length, z_offset, s_offset);
vk::insert_buffer_memory_barrier(cmd, src->value, z_offset, in_depth_size + in_stencil_size,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,

View File

@ -151,8 +151,13 @@ namespace vk
VkFormat get_format() const
{
if (context == rsx::texture_upload_context::dma)
{
return VK_FORMAT_R32_UINT;
}
ASSERT(vram_texture != nullptr);
return vram_texture->info.format;
return vram_texture->format();
}
bool is_flushed() const
@ -161,18 +166,9 @@ namespace vk
return flushed;
}
void copy_texture(vk::command_buffer& cmd, bool miss)
void dma_transfer(vk::command_buffer& cmd, vk::image* src, const areai& src_area, const utils::address_range& valid_range, u32 pitch)
{
ASSERT(exists());
if (LIKELY(!miss))
{
baseclass::on_speculative_flush();
}
else
{
baseclass::on_miss();
}
verify(HERE), src->samples() == 1;
if (m_device == nullptr)
{
@ -186,9 +182,146 @@ namespace vk
vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence);
}
src->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
const auto internal_bpp = vk::get_format_texel_width(src->format());
const auto transfer_width = (u32)src_area.width();
const auto transfer_height = (u32)src_area.height();
real_pitch = internal_bpp * transfer_width;
rsx_pitch = pitch;
const bool is_depth_stencil = !!(src->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT);
if (is_depth_stencil || pack_unpack_swap_bytes)
{
const auto section_length = valid_range.length();
const auto transfer_pitch = real_pitch;
const auto task_length = transfer_pitch * src_area.height();
auto working_buffer = vk::get_scratch_buffer();
auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
VkBufferImageCopy region = {};
region.imageSubresource = { src->aspect(), 0, 0, 1 };
region.imageOffset = { src_area.x1, src_area.y1, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
vk::copy_image_to_buffer(cmd, src, working_buffer, region, (is_depth_stencil && pack_unpack_swap_bytes));
// NOTE: For depth-stencil formats, copying to buffer and byteswap are combined into one step above
if (pack_unpack_swap_bytes && !is_depth_stencil)
{
const auto texel_layout = vk::get_format_element_size(src->format());
const auto elem_size = texel_layout.first;
vk::cs_shuffle_base *shuffle_kernel;
if (elem_size == 2)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
}
else if (elem_size == 4)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
}
else
{
fmt::throw_exception("Unreachable" HERE);
}
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
shuffle_kernel->run(cmd, working_buffer, task_length);
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
}
if (LIKELY(rsx_pitch == real_pitch))
{
VkBufferCopy copy = {};
copy.dstOffset = final_mapping.first;
copy.size = section_length;
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
}
else
{
std::vector<VkBufferCopy> copy;
copy.reserve(transfer_height);
u32 dst_offset = final_mapping.first;
u32 src_offset = 0;
for (unsigned row = 0; row < transfer_height; ++row)
{
copy.push_back({ src_offset, dst_offset, transfer_pitch });
src_offset += real_pitch;
dst_offset += rsx_pitch;
}
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
}
}
else
{
VkBufferImageCopy region = {};
region.bufferRowLength = (rsx_pitch / internal_bpp);
region.imageSubresource = { src->aspect(), 0, 0, 1 };
region.imageOffset = { src_area.x1, src_area.y1, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
region.bufferOffset = mapping.first;
vkCmdCopyImageToBuffer(cmd, src->value, src->current_layout, mapping.second->value, 1, &region);
}
src->pop_layout(cmd);
if (UNLIKELY(synchronized))
{
// Replace the wait event with a new one to avoid premature signaling!
vk::get_resource_manager()->dispose(dma_fence);
VkEventCreateInfo createInfo = {};
createInfo.sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO;
vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence);
}
else
{
// If this is speculated, it should only occur once
verify(HERE), vkGetEventStatus(*m_device, dma_fence) == VK_EVENT_RESET;
}
cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
vkCmdSetEvent(cmd, dma_fence, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT);
synchronized = true;
sync_timestamp = get_system_time();
}
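Completion of the write-back is tracked by the dma_fence VkEvent set at the end of the recorded commands; on flush, the cache can wait on it along these lines (a sketch, assuming the command buffer has been submitted and device is the owning VkDevice):

while (vkGetEventStatus(device, dma_fence) == VK_EVENT_RESET)
{
    std::this_thread::yield();
}
// The mapped DMA buffer now holds the written-back rows.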
void copy_texture(vk::command_buffer& cmd, bool miss)
{
ASSERT(exists());
if (LIKELY(!miss))
{
verify(HERE), !synchronized;
baseclass::on_speculative_flush();
}
else
{
baseclass::on_miss();
}
if (m_device == nullptr)
{
m_device = &cmd.get_command_pool().get_owner();
}
vk::image *locked_resource = vram_texture;
u32 transfer_width = width;
u32 transfer_height = height;
u32 transfer_x = 0, transfer_y = 0;
if (context == rsx::texture_upload_context::framebuffer_storage)
{
@ -199,12 +332,7 @@ namespace vk
transfer_height *= surface->samples_y;
}
verify(HERE), locked_resource->samples() == 1;
vk::image* target = locked_resource;
locked_resource->push_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
real_pitch = vk::get_format_texel_width(locked_resource->info.format) * locked_resource->width();
if (transfer_width != locked_resource->width() || transfer_height != locked_resource->height())
{
// TODO: Synchronize access to typeless textures
@ -221,14 +349,9 @@ namespace vk
target->change_layout(cmd, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL);
}
verify(HERE), target->current_layout == VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL;
// TODO: Read back stencil values (is this really necessary?)
const auto internal_bpp = vk::get_format_texel_width(vram_texture->format());
const auto valid_range = get_confirmed_range();
real_pitch = internal_bpp * transfer_width;
u32 transfer_x = 0, transfer_y = 0;
if (const auto section_range = get_section_range(); section_range != valid_range)
{
if (const auto offset = (valid_range.start - get_section_base()))
@ -250,111 +373,12 @@ namespace vk
}
}
if ((vram_texture->aspect() & VK_IMAGE_ASPECT_STENCIL_BIT) ||
pack_unpack_swap_bytes)
{
const auto section_length = valid_range.length();
const auto transfer_pitch = transfer_width * internal_bpp;
const auto task_length = transfer_pitch * transfer_height;
auto working_buffer = vk::get_scratch_buffer();
auto final_mapping = vk::map_dma(cmd, valid_range.start, section_length);
VkBufferImageCopy region = {};
region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 };
region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
vk::copy_image_to_buffer(cmd, target, working_buffer, region);
const auto texel_layout = vk::get_format_element_size(vram_texture->format());
const auto elem_size = texel_layout.first;
vk::cs_shuffle_base *shuffle_kernel;
if (elem_size == 2)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_16>();
}
else if (elem_size == 4)
{
shuffle_kernel = vk::get_compute_task<vk::cs_shuffle_32>();
}
else
{
fmt::throw_exception("Unreachable" HERE);
}
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_ACCESS_TRANSFER_WRITE_BIT, VK_ACCESS_SHADER_READ_BIT);
shuffle_kernel->run(cmd, working_buffer, task_length);
vk::insert_buffer_memory_barrier(cmd, working_buffer->value, 0, task_length,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT,
VK_ACCESS_SHADER_WRITE_BIT, VK_ACCESS_TRANSFER_READ_BIT);
if (LIKELY(rsx_pitch == real_pitch))
{
VkBufferCopy copy = {};
copy.dstOffset = final_mapping.first;
copy.size = section_length;
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, 1, &copy);
}
else
{
std::vector<VkBufferCopy> copy;
copy.reserve(transfer_height);
u32 dst_offset = final_mapping.first;
u32 src_offset = 0;
for (unsigned row = 0; row < transfer_height; ++row)
{
copy.push_back({src_offset, dst_offset, transfer_pitch});
src_offset += real_pitch;
dst_offset += rsx_pitch;
}
vkCmdCopyBuffer(cmd, working_buffer->value, final_mapping.second->value, transfer_height, copy.data());
}
}
else
{
VkBufferImageCopy region = {};
region.bufferRowLength = (rsx_pitch / internal_bpp);
region.imageSubresource = { vram_texture->aspect(), 0, 0, 1 };
region.imageOffset = { (s32)transfer_x, (s32)transfer_y, 0 };
region.imageExtent = { transfer_width, transfer_height, 1 };
auto mapping = vk::map_dma(cmd, valid_range.start, valid_range.length());
region.bufferOffset = mapping.first;
vkCmdCopyImageToBuffer(cmd, target->value, target->current_layout, mapping.second->value, 1, &region);
}
locked_resource->pop_layout(cmd);
if (UNLIKELY(synchronized))
{
verify(HERE), miss;
// Replace the wait event with a new one to avoid premature signaling!
vk::get_resource_manager()->dispose(dma_fence);
VkEventCreateInfo createInfo = {};
createInfo.sType = VK_STRUCTURE_TYPE_EVENT_CREATE_INFO;
vkCreateEvent(*m_device, &createInfo, nullptr, &dma_fence);
}
else
{
// If this is speculated, it should only occur once
verify(HERE), vkGetEventStatus(*m_device, dma_fence) == VK_EVENT_RESET;
}
cmd.set_flag(vk::command_buffer::cb_has_dma_transfer);
vkCmdSetEvent(cmd, dma_fence, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT);
synchronized = true;
sync_timestamp = get_system_time();
areai src_area;
src_area.x1 = (s32)transfer_x;
src_area.y1 = (s32)transfer_y;
src_area.x2 = s32(transfer_x + transfer_width);
src_area.y2 = s32(transfer_y + transfer_height);
dma_transfer(cmd, target, src_area, valid_range, rsx_pitch);
}
/**
@ -1079,24 +1103,51 @@ namespace vk
region.create(width, height, section_depth, mipmaps, image, pitch, true, gcm_format);
region.set_dirty(false);
//Its not necessary to lock blit dst textures as they are just reused as necessary
if (context != rsx::texture_upload_context::blit_engine_dst)
// It's not necessary to lock blit dst textures, as they are just reused as necessary
switch (context)
{
case rsx::texture_upload_context::shader_read:
case rsx::texture_upload_context::blit_engine_src:
region.protect(utils::protection::ro);
read_only_range = region.get_min_max(read_only_range, rsx::section_bounds::locked_range);
}
else
{
//TODO: Confirm byte swap patterns
//NOTE: Protection is handled by the caller
region.set_unpack_swap_bytes((aspect_flags & VK_IMAGE_ASPECT_COLOR_BIT) == VK_IMAGE_ASPECT_COLOR_BIT);
break;
case rsx::texture_upload_context::blit_engine_dst:
region.set_unpack_swap_bytes(true);
no_access_range = region.get_min_max(no_access_range, rsx::section_bounds::locked_range);
break;
case rsx::texture_upload_context::dma:
case rsx::texture_upload_context::framebuffer_storage:
// Should not be initialized with this method
default:
fmt::throw_exception("Unexpected upload context 0x%x", u32(context));
}
update_cache_tag();
return &region;
}
cached_texture_section* create_nul_section(vk::command_buffer& cmd, const utils::address_range& rsx_range, bool memory_load) override
{
auto& region = *find_cached_texture(rsx_range, RSX_GCM_FORMAT_IGNORED, true, false);
ASSERT(!region.is_locked());
// Prepare section
region.reset(rsx_range);
region.set_context(rsx::texture_upload_context::dma);
region.set_dirty(false);
region.set_unpack_swap_bytes(true);
if (memory_load)
{
vk::map_dma(cmd, rsx_range.start, rsx_range.length());
vk::load_dma(rsx_range.start, rsx_range.length());
}
no_access_range = region.get_min_max(no_access_range, rsx::section_bounds::locked_range);
update_cache_tag();
return &region;
}
cached_texture_section* upload_image_from_cpu(vk::command_buffer& cmd, const utils::address_range& rsx_range, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, u32 gcm_format,
rsx::texture_upload_context context, const std::vector<rsx_subresource_layout>& subresource_layout, rsx::texture_dimension_extended type, bool swizzled) override
{