From 2033f3f7dcae6fcdb9497d9ed8f46a9e2781c9a0 Mon Sep 17 00:00:00 2001 From: kd-11 Date: Mon, 14 Aug 2017 00:27:19 +0300 Subject: [PATCH] rsx/vk/gl: Refactoring and reimplementation of blit engine Fix rsx offscreen-render-to-display-buffer-blit surface reads - Also, properly scale display output height if reading from compressed tile gl: Fix broken dst height computation - The extra padding is only there to force power-of-2 sizes and isnt used gl: Ignore compression scaling if output is rendered to in a renderpass rsx/gl/vk: Cleanup for GPU texture scaling. Initial impl [WIP] - TODO: Refactor more shared code into RSX/common --- rpcs3/Emu/RSX/Common/surface_store.h | 205 +++++++++++++ rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h | 12 + rpcs3/Emu/RSX/GL/GLGSRender.cpp | 52 +++- rpcs3/Emu/RSX/GL/GLHelpers.h | 6 + rpcs3/Emu/RSX/GL/GLRenderTargets.h | 209 +------------ rpcs3/Emu/RSX/GL/GLTextureCache.h | 107 +++---- rpcs3/Emu/RSX/VK/VKGSRender.cpp | 14 +- rpcs3/Emu/RSX/VK/VKGSRender.h | 1 + rpcs3/Emu/RSX/VK/VKRenderTargets.h | 11 + rpcs3/Emu/RSX/VK/VKTextureCache.h | 312 +++++++++++++++++++- rpcs3/Emu/RSX/rsx_methods.cpp | 31 -- 11 files changed, 664 insertions(+), 296 deletions(-) diff --git a/rpcs3/Emu/RSX/Common/surface_store.h b/rpcs3/Emu/RSX/Common/surface_store.h index eddd1d57f6..46d28bec0f 100644 --- a/rpcs3/Emu/RSX/Common/surface_store.h +++ b/rpcs3/Emu/RSX/Common/surface_store.h @@ -13,6 +13,36 @@ namespace rsx size_t get_packed_pitch(surface_color_format format, u32 width); } + template + struct surface_subresource_storage + { + surface_type surface = nullptr; + + u16 x = 0; + u16 y = 0; + u16 w = 0; + u16 h = 0; + + bool is_bound = false; + bool is_depth_surface = false; + bool is_clipped = false; + + surface_subresource_storage() {} + + surface_subresource_storage(surface_type src, u16 X, u16 Y, u16 W, u16 H, bool _Bound, bool _Depth, bool _Clipped = false) + : surface(src), x(X), y(Y), w(W), h(H), is_bound(_Bound), is_depth_surface(_Depth), 
is_clipped(_Clipped) + {} + }; + + struct surface_format_info + { + u32 surface_width; + u32 surface_height; + u16 native_pitch; + u16 rsx_pitch; + u8 bpp; + }; + /** * Helper for surface (ie color and depth stencil render target) management. * It handles surface creation and storage. Backend should only retrieve pointer to surface. @@ -64,6 +94,7 @@ namespace rsx using surface_type = typename Traits::surface_type; using command_list_type = typename Traits::command_list_type; using download_buffer_object = typename Traits::download_buffer_object; + using surface_subresource = typename surface_subresource_storage; std::unordered_map m_render_targets_storage = {}; std::unordered_map m_depth_stencil_storage = {}; @@ -437,5 +468,179 @@ namespace rsx for (auto &ds : m_depth_stencil_storage) Traits::invalidate_depth_surface_contents(command_list, Traits::get(std::get<1>(ds)), nullptr, true); } + + /** + * Clipping and fitting lookup funcrions + * surface_overlaps - returns true if surface overlaps a given surface address and returns the relative x and y position of the surface address within the surface + * address_is_bound - returns true if the surface at a given address is actively bound + * get_surface_subresource_if_available - returns a sectiion descriptor that allows to crop surfaces stored in memory + */ + bool surface_overlaps_address(surface_type surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y, bool scale_to_fit) + { + bool is_subslice = false; + u16 x_offset = 0; + u16 y_offset = 0; + + if (surface_address > texaddr) + return false; + + u32 offset = texaddr - surface_address; + if (texaddr >= surface_address) + { + + if (offset == 0) + { + is_subslice = true; + } + else + { + surface_format_info info; + Traits::get_surface_info(surface, &info); + + u32 range = info.rsx_pitch * info.surface_height; + if (offset < range) + { + const u32 y = (offset / info.rsx_pitch); + u32 x = (offset % info.rsx_pitch) / info.bpp; + + if (scale_to_fit) + { + const f32 
x_scale = (f32)info.rsx_pitch / info.native_pitch; + x = (u32)((f32)x / x_scale); + } + + x_offset = x; + y_offset = y; + + is_subslice = true; + } + } + + if (is_subslice) + { + *x = x_offset; + *y = y_offset; + + return true; + } + } + + return false; + } + + bool address_is_bound(u32 address, bool is_depth) const + { + if (is_depth) + { + const u32 bound_depth_address = std::get<0>(m_bound_depth_stencil); + return (bound_depth_address == address); + } + + for (auto &surface : m_bound_render_targets) + { + const u32 bound_address = std::get<0>(surface); + if (bound_address == address) + return true; + } + + return false; + } + + inline bool region_fits(u16 region_width, u16 region_height, u16 x_offset, u16 y_offset, u16 width, u16 height) const + { + if ((x_offset + width) > region_width) return false; + if ((y_offset + height) > region_height) return false; + + return true; + } + + surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch, bool scale_to_fit = false, bool crop = false, bool ignore_depth_formats = false) + { + auto test_surface = [&](surface_type surface, u32 this_address, u16 &x_offset, u16 &y_offset, u16 &w, u16 &h, bool &clipped) + { + if (surface_overlaps_address(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit)) + { + surface_format_info info; + Traits::get_surface_info(surface, &info); + + if (info.rsx_pitch != requested_pitch) + return false; + + u16 real_width = requested_width; + + if (scale_to_fit) + { + f32 pitch_scaling = (f32)requested_pitch / info.native_pitch; + real_width = (u16)((f32)requested_width / pitch_scaling); + } + + if (region_fits(info.surface_width, info.surface_height, x_offset, y_offset, real_width, requested_height)) + { + w = info.surface_width; + h = info.surface_height; + clipped = false; + + return true; + } + else + { + if (crop) //Forcefully fit the requested region by clipping and scaling + { + u16 remaining_width = 
info.surface_width - x_offset; + u16 remaining_height = info.surface_height - y_offset; + + w = remaining_width; + h = remaining_height; + clipped = true; + + return true; + } + + if (info.surface_width >= requested_width && info.surface_height >= requested_height) + { + LOG_WARNING(RSX, "Overlapping surface exceeds bounds; returning full surface region"); + w = requested_width; + h = requested_height; + clipped = true; + + return true; + } + } + } + + return false; + }; + + surface_type surface = nullptr; + bool clipped = false; + u16 x_offset = 0; + u16 y_offset = 0; + u16 w; + u16 h; + + for (auto &tex_info : m_render_targets_storage) + { + u32 this_address = std::get<0>(tex_info); + surface = std::get<1>(tex_info).get(); + + if (test_surface(surface, this_address, x_offset, y_offset, w, h, clipped)) + return { surface, x_offset, y_offset, w, h, address_is_bound(this_address, false), false, clipped }; + } + + if (ignore_depth_formats) + return{}; + + //Check depth surfaces for overlap + for (auto &tex_info : m_depth_stencil_storage) + { + u32 this_address = std::get<0>(tex_info); + surface = std::get<1>(tex_info).get(); + + if (test_surface(surface, this_address, x_offset, y_offset, w, h, clipped)) + return { surface, x_offset, y_offset, w, h, address_is_bound(this_address, true), true, clipped }; + } + + return{}; + } }; } diff --git a/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h b/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h index c6a9fbaedb..3c8a9ba157 100644 --- a/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h +++ b/rpcs3/Emu/RSX/D3D12/D3D12RenderTargetSets.h @@ -54,6 +54,18 @@ struct render_target_traits return rtt; } + static + void get_surface_info(ID3D12Resource *surface, rsx::surface_format_info *info) + { + //TODO + auto desc = surface->GetDesc(); + info->rsx_pitch = desc.Width; + info->native_pitch = desc.Width; + info->surface_width = desc.Width; + info->surface_height = desc.Height; + info->bpp = 1; + } + static void prepare_rtt_for_drawing( 
gsl::not_null command_list, diff --git a/rpcs3/Emu/RSX/GL/GLGSRender.cpp b/rpcs3/Emu/RSX/GL/GLGSRender.cpp index 56679938b3..91d24de2de 100644 --- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp +++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp @@ -1028,7 +1028,6 @@ void GLGSRender::flip(int buffer) // Calculate blit coordinates coordi aspect_ratio; - areai screen_area = coordi({}, { (int)buffer_width, (int)buffer_height }); sizei csize(m_frame->client_width(), m_frame->client_height()); sizei new_size = csize; @@ -1055,19 +1054,33 @@ void GLGSRender::flip(int buffer) // Find the source image rsx::tiled_region buffer_region = get_tiled_address(display_buffers[buffer].offset, CELL_GCM_LOCATION_LOCAL); u32 absolute_address = buffer_region.address + buffer_region.base; - gl::texture *render_target_texture = m_rtts.get_texture_from_render_target_if_applicable(absolute_address); m_flip_fbo.recreate(); m_flip_fbo.bind(); - if (render_target_texture) + //The render might have been done offscreen and a blit used to display + //Check the texture cache for a blitted copy + const u32 size = buffer_pitch * buffer_height; + auto surface = m_gl_texture_cache.find_texture_from_range(absolute_address, size); + bool ignore_scaling = false; + + if (surface != nullptr) + { + auto dims = surface->get_dimensions(); + buffer_width = std::get<0>(dims); + buffer_height = std::get<1>(dims); + + m_flip_fbo.color = surface->id(); + m_flip_fbo.read_buffer(m_flip_fbo.color); + } + else if (auto render_target_texture = m_rtts.get_texture_from_render_target_if_applicable(absolute_address)) { buffer_width = render_target_texture->width(); buffer_height = render_target_texture->height(); - __glcheck m_flip_fbo.color = *render_target_texture; - __glcheck m_flip_fbo.read_buffer(m_flip_fbo.color); - + m_flip_fbo.color = *render_target_texture; + m_flip_fbo.read_buffer(m_flip_fbo.color); + ignore_scaling = true; } else { @@ -1077,7 +1090,7 @@ void GLGSRender::flip(int buffer) { 
m_flip_tex_color.recreate(gl::texture::target::texture2D); - __glcheck m_flip_tex_color.config() + m_flip_tex_color.config() .size({ (int)buffer_width, (int)buffer_height }) .type(gl::texture::type::uint_8_8_8_8) .format(gl::texture::format::bgra); @@ -1089,23 +1102,38 @@ void GLGSRender::flip(int buffer) { std::unique_ptr temp(new u8[buffer_height * buffer_pitch]); buffer_region.read(temp.get(), buffer_width, buffer_height, buffer_pitch); - __glcheck m_flip_tex_color.copy_from(temp.get(), gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8); + m_flip_tex_color.copy_from(temp.get(), gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8); } else { - __glcheck m_flip_tex_color.copy_from(buffer_region.ptr, gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8); + m_flip_tex_color.copy_from(buffer_region.ptr, gl::texture::format::bgra, gl::texture::type::uint_8_8_8_8); } m_flip_fbo.color = m_flip_tex_color; - __glcheck m_flip_fbo.read_buffer(m_flip_fbo.color); + m_flip_fbo.read_buffer(m_flip_fbo.color); + ignore_scaling = true; + } + + if (!ignore_scaling && buffer_region.tile && buffer_region.tile->comp != CELL_GCM_COMPMODE_DISABLED) + { + LOG_ERROR(RSX, "Output buffer compression mode = 0x%X", buffer_region.tile->comp); + + switch (buffer_region.tile->comp) + { + case CELL_GCM_COMPMODE_C32_2X2: + case CELL_GCM_COMPMODE_C32_2X1: + buffer_height = display_buffers[buffer].height / 2; + break; + } } // Blit source image to the screen // Disable scissor test (affects blit) glDisable(GL_SCISSOR_TEST); - gl::screen.clear(gl::buffers::color_depth_stencil); - __glcheck m_flip_fbo.blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical(), gl::buffers::color, gl::filter::linear); + areai screen_area = coordi({}, { (int)buffer_width, (int)buffer_height }); + gl::screen.clear(gl::buffers::color); + m_flip_fbo.blit(gl::screen, screen_area, areai(aspect_ratio).flipped_vertical(), gl::buffers::color, gl::filter::linear); if (g_cfg.video.overlay) { diff 
--git a/rpcs3/Emu/RSX/GL/GLHelpers.h b/rpcs3/Emu/RSX/GL/GLHelpers.h index 192e0ba5d1..2bfd55d8ba 100644 --- a/rpcs3/Emu/RSX/GL/GLHelpers.h +++ b/rpcs3/Emu/RSX/GL/GLHelpers.h @@ -1953,6 +1953,12 @@ namespace gl case texture::target::texture3D: glFramebufferTexture3D(GL_FRAMEBUFFER, m_id, GL_TEXTURE_3D, rhs.id(), rhs.level(), 0); break; } } + + void operator = (const GLuint rhs) + { + save_binding_state save(m_parent); + glFramebufferTexture2D(GL_FRAMEBUFFER, m_id, GL_TEXTURE_2D, rhs, 0); + } }; class indexed_attachment : public attachment diff --git a/rpcs3/Emu/RSX/GL/GLRenderTargets.h b/rpcs3/Emu/RSX/GL/GLRenderTargets.h index 608fc90ea3..5e012fcf75 100644 --- a/rpcs3/Emu/RSX/GL/GLRenderTargets.h +++ b/rpcs3/Emu/RSX/GL/GLRenderTargets.h @@ -115,40 +115,6 @@ namespace gl { return compatible_internal_format; } - - // For an address within the texture, extract this sub-section's rect origin - // Checks whether we need to scale the subresource if it is not handled in shader - // NOTE1: When surface->real_pitch < rsx_pitch, the surface is assumed to have been scaled to fill the rsx_region - std::tuple get_texture_subresource(u32 offset, bool scale_to_fit) - { - if (!offset) - { - return std::make_tuple(true, 0, 0); - } - - if (!surface_height) surface_height = height(); - if (!surface_width) surface_width = width(); - - u32 range = rsx_pitch * surface_height; - if (offset < range) - { - if (!surface_pixel_size) - surface_pixel_size = native_pitch / surface_width; - - const u32 y = (offset / rsx_pitch); - u32 x = (offset % rsx_pitch) / surface_pixel_size; - - if (scale_to_fit) - { - const f32 x_scale = (f32)rsx_pitch / native_pitch; - x = (u32)((f32)x / x_scale); - } - - return std::make_tuple(true, (u16)x, (u16)y); - } - else - return std::make_tuple(false, 0, 0); - } }; } @@ -235,6 +201,18 @@ struct gl_render_target_traits return result; } + static + void get_surface_info(gl::render_target *surface, rsx::surface_format_info *info) + { + const auto dims = 
surface->get_dimensions(); + + info->rsx_pitch = surface->get_rsx_pitch(); + info->native_pitch = surface->get_native_pitch(); + info->surface_width = std::get<0>(dims); + info->surface_height = std::get<1>(dims); + info->bpp = static_cast(info->native_pitch / info->surface_width); + } + static void prepare_rtt_for_drawing(void *, gl::render_target*) {} static void prepare_rtt_for_sampling(void *, gl::render_target*) {} @@ -307,169 +285,6 @@ struct gl_render_target_traits } }; -struct surface_subresource -{ - gl::render_target *surface = nullptr; - - u16 x = 0; - u16 y = 0; - u16 w = 0; - u16 h = 0; - - bool is_bound = false; - bool is_depth_surface = false; - bool is_clipped = false; - - surface_subresource() {} - - surface_subresource(gl::render_target *src, u16 X, u16 Y, u16 W, u16 H, bool _Bound, bool _Depth, bool _Clipped = false) - : surface(src), x(X), y(Y), w(W), h(H), is_bound(_Bound), is_depth_surface(_Depth), is_clipped(_Clipped) - {} -}; - class gl_render_targets : public rsx::surface_store { -private: - bool surface_overlaps(gl::render_target *surface, u32 surface_address, u32 texaddr, u16 *x, u16 *y, bool scale_to_fit) - { - bool is_subslice = false; - u16 x_offset = 0; - u16 y_offset = 0; - - if (surface_address > texaddr) - return false; - - u32 offset = texaddr - surface_address; - if (texaddr >= surface_address) - { - std::tie(is_subslice, x_offset, y_offset) = surface->get_texture_subresource(offset, scale_to_fit); - if (is_subslice) - { - *x = x_offset; - *y = y_offset; - - return true; - } - } - - return false; - } - - bool is_bound(u32 address, bool is_depth) - { - if (is_depth) - { - const u32 bound_depth_address = std::get<0>(m_bound_depth_stencil); - return (bound_depth_address == address); - } - - for (auto &surface: m_bound_render_targets) - { - const u32 bound_address = std::get<0>(surface); - if (bound_address == address) - return true; - } - - return false; - } - - bool fits(gl::render_target*, std::pair &dims, u16 x_offset, u16 
y_offset, u16 width, u16 height) const - { - if ((x_offset + width) > dims.first) return false; - if ((y_offset + height) > dims.second) return false; - - return true; - } - -public: - surface_subresource get_surface_subresource_if_applicable(u32 texaddr, u16 requested_width, u16 requested_height, u16 requested_pitch, bool scale_to_fit=false, bool crop=false, bool ignore_depth_formats=false) - { - gl::render_target *surface = nullptr; - u16 x_offset = 0; - u16 y_offset = 0; - - for (auto &tex_info : m_render_targets_storage) - { - u32 this_address = std::get<0>(tex_info); - surface = std::get<1>(tex_info).get(); - - if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit)) - { - if (surface->get_rsx_pitch() != requested_pitch) - continue; - - auto dims = surface->get_dimensions(); - - if (scale_to_fit) - { - f32 pitch_scaling = (f32)requested_pitch / surface->get_native_pitch(); - requested_width = (u16)((f32)requested_width / pitch_scaling); - } - - if (fits(surface, dims, x_offset, y_offset, requested_width, requested_height)) - return{ surface, x_offset, y_offset, requested_width, requested_height, is_bound(this_address, false), false }; - else - { - if (crop) //Forcefully fit the requested region by clipping and scaling - { - u16 remaining_width = dims.first - x_offset; - u16 remaining_height = dims.second - y_offset; - - return{ surface, x_offset, y_offset, remaining_width, remaining_height, is_bound(this_address, false), false, true }; - } - - if (dims.first >= requested_width && dims.second >= requested_height) - { - LOG_WARNING(RSX, "Overlapping surface exceeds bounds; returning full surface region"); - return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, false), false, true }; - } - } - } - } - - if (ignore_depth_formats) - return{}; - - //Check depth surfaces for overlap - for (auto &tex_info : m_depth_stencil_storage) - { - u32 this_address = std::get<0>(tex_info); - surface = 
std::get<1>(tex_info).get(); - - if (surface_overlaps(surface, this_address, texaddr, &x_offset, &y_offset, scale_to_fit)) - { - if (surface->get_rsx_pitch() != requested_pitch) - continue; - - auto dims = surface->get_dimensions(); - - if (scale_to_fit) - { - f32 pitch_scaling = (f32)requested_pitch / surface->get_native_pitch(); - requested_width = (u16)((f32)requested_width / pitch_scaling); - } - - if (fits(surface, dims, x_offset, y_offset, requested_width, requested_height)) - return{ surface, x_offset, y_offset, requested_width, requested_height, is_bound(this_address, true), true }; - else - { - if (crop) //Forcefully fit the requested region by clipping and scaling - { - u16 remaining_width = dims.first - x_offset; - u16 remaining_height = dims.second - y_offset; - - return{ surface, x_offset, y_offset, remaining_width, remaining_height, is_bound(this_address, true), true, true }; - } - - if (dims.first >= requested_width && dims.second >= requested_height) - { - LOG_WARNING(RSX, "Overlapping depth surface exceeds bounds; returning full surface region"); - return{ surface, 0, 0, requested_width, requested_height, is_bound(this_address, true), true, true }; - } - } - } - } - - return {}; - } }; diff --git a/rpcs3/Emu/RSX/GL/GLTextureCache.h b/rpcs3/Emu/RSX/GL/GLTextureCache.h index cd888af0e0..cb57102199 100644 --- a/rpcs3/Emu/RSX/GL/GLTextureCache.h +++ b/rpcs3/Emu/RSX/GL/GLTextureCache.h @@ -460,40 +460,6 @@ namespace gl GLGSRender *m_renderer; std::thread::id m_renderer_thread; - cached_texture_section *find_texture_from_dimensions(u32 texaddr, u32 w, u32 h) - { - reader_lock lock(m_section_mutex); - - for (cached_texture_section &tex : read_only_memory_sections) - { - if (tex.matches(texaddr, w, h) && !tex.is_dirty()) - return &tex; - } - - return nullptr; - } - - /** - * Searches for a texture from read_only memory sections - * Texture origin + size must be a subsection of the existing texture - */ - cached_texture_section *find_texture_from_range(u32 
texaddr, u32 range) - { - reader_lock lock(m_section_mutex); - - auto test = std::make_pair(texaddr, range); - for (cached_texture_section &tex : read_only_memory_sections) - { - if (tex.get_section_base() > texaddr) - continue; - - if (tex.overlaps(test, true) && !tex.is_dirty()) - return &tex; - } - - return nullptr; - } - cached_texture_section& create_texture(u32 id, u32 texaddr, u32 texsize, u32 w, u32 h) { for (cached_texture_section &tex : read_only_memory_sections) @@ -536,19 +502,6 @@ namespace gl clear_temporary_surfaces(); } - cached_texture_section* find_cached_rtt_section(u32 base, u32 size) - { - for (cached_texture_section &rtt : no_access_memory_sections) - { - if (rtt.matches(base, size)) - { - return &rtt; - } - } - - return nullptr; - } - cached_texture_section *create_locked_view_of_section(u32 base, u32 size) { cached_texture_section *region = find_cached_rtt_section(base, size); @@ -647,6 +600,53 @@ namespace gl m_hw_blitter.destroy(); } + cached_texture_section *find_texture_from_dimensions(u32 texaddr, u32 w, u32 h) + { + reader_lock lock(m_section_mutex); + + for (cached_texture_section &tex : read_only_memory_sections) + { + if (tex.matches(texaddr, w, h) && !tex.is_dirty()) + return &tex; + } + + return nullptr; + } + + /** + * Searches for a texture from read_only memory sections + * Texture origin + size must be a subsection of the existing texture + */ + cached_texture_section *find_texture_from_range(u32 texaddr, u32 range) + { + reader_lock lock(m_section_mutex); + + auto test = std::make_pair(texaddr, range); + for (cached_texture_section &tex : read_only_memory_sections) + { + if (tex.get_section_base() > texaddr) + continue; + + if (tex.overlaps(test, true) && !tex.is_dirty()) + return &tex; + } + + return nullptr; + } + + cached_texture_section* find_cached_rtt_section(u32 base, u32 size) + { + for (cached_texture_section &rtt : no_access_memory_sections) + { + if (rtt.matches(base, size)) + { + return &rtt; + } + } + + return 
nullptr; + } + template void upload_texture(int index, RsxTextureType &tex, rsx::gl::texture &gl_texture, gl_render_targets &m_rtts) { @@ -739,7 +739,7 @@ namespace gl const f32 internal_scale = (f32)tex_pitch / native_pitch; const u32 internal_width = (const u32)(tex_width * internal_scale); - const surface_subresource rsc = m_rtts.get_surface_subresource_if_applicable(texaddr, internal_width, tex_height, tex_pitch, true); + const auto rsc = m_rtts.get_surface_subresource_if_applicable(texaddr, internal_width, tex_height, tex_pitch, true); if (rsc.surface) { //Check that this region is not cpu-dirty before doing a copy @@ -1078,7 +1078,7 @@ namespace gl const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0)); //Check if src/dst are parts of render targets - surface_subresource dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true, true); + auto dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true, true); dst_is_render_target = dst_subres.surface != nullptr; u16 max_dst_width = dst.width; @@ -1097,7 +1097,8 @@ namespace gl position2i dst_offset = { dst.offset_x, dst.offset_y }; size2i clip_dimensions = { dst.clip_width, dst.clip_height }; - const size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.height }; + //Dimensions passed are restricted to powers of 2; get real height from clip_height and width from pitch + const size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.clip_height }; //Offset in x and y for src is 0 (it is already accounted for when getting pixels_src) //Reproject final clip onto source... 
@@ -1184,7 +1185,7 @@ namespace gl } //TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate - surface_subresource src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true, true); + auto src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true, true); src_is_render_target = src_subres.surface != nullptr; //Create source texture if does not exist @@ -1283,7 +1284,9 @@ namespace gl //If so, add this texture to the no_access queue not the read_only queue writer_lock lock(m_section_mutex); - cached_texture_section &cached = create_texture(texture_id, dst.rsx_address, dst.pitch * dst.clip_height, dst.width, dst.clip_height); + const u8 bpp = dst_is_argb8 ? 4 : 2; + const u32 real_width = dst.pitch / bpp; + cached_texture_section &cached = create_texture(texture_id, dst.rsx_address, dst.pitch * dst.clip_height, real_width, dst.clip_height); //These textures are completely GPU resident so we dont watch for CPU access //There's no data to be fetched from the CPU //Its is possible for a title to attempt to read from the region, but the CPU path should be used in such cases diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.cpp b/rpcs3/Emu/RSX/VK/VKGSRender.cpp index f7f750ac91..51b7685fdd 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp +++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp @@ -2080,10 +2080,12 @@ void VKGSRender::prepare_rtts() for (u8 index : draw_buffers) { - bound_images.push_back(std::get<1>(m_rtts.m_bound_render_targets[index])); + auto surface = std::get<1>(m_rtts.m_bound_render_targets[index]); + bound_images.push_back(surface); m_surface_info[index].address = surface_addresses[index]; m_surface_info[index].pitch = surface_pitchs[index]; + surface->rsx_pitch = surface_pitchs[index]; if (surface_pitchs[index] <= 64) { @@ -2095,10 +2097,12 @@ void VKGSRender::prepare_rtts() if 
(std::get<0>(m_rtts.m_bound_depth_stencil) != 0) { - bound_images.push_back(std::get<1>(m_rtts.m_bound_depth_stencil)); + auto ds = std::get<1>(m_rtts.m_bound_depth_stencil); + bound_images.push_back(ds); m_depth_surface_info.address = zeta_address; m_depth_surface_info.pitch = rsx::method_registers.surface_z_pitch(); + ds->rsx_pitch = m_depth_surface_info.pitch; if (m_depth_surface_info.pitch <= 64 && clip_width > m_depth_surface_info.pitch) m_depth_surface_info.pitch = 0; @@ -2519,3 +2523,9 @@ void VKGSRender::flip(int buffer) m_uploads_8k = 0; m_uploads_16k = 0; } + +bool VKGSRender::scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) +{ + return m_texture_cache.upload_scaled_image(src, dst, interpolate, (*m_device), *m_current_command_buffer, m_memory_type_mapping, + m_swap_chain->get_present_queue(), m_rtts, m_texture_upload_buffer_ring_info, m_texture_upload_buffer_ring_info.heap.get()); +} diff --git a/rpcs3/Emu/RSX/VK/VKGSRender.h b/rpcs3/Emu/RSX/VK/VKGSRender.h index 4f405728a5..6feb21270a 100644 --- a/rpcs3/Emu/RSX/VK/VKGSRender.h +++ b/rpcs3/Emu/RSX/VK/VKGSRender.h @@ -294,6 +294,7 @@ protected: void flip(int buffer) override; void do_local_task() override; + bool scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) override; bool on_access_violation(u32 address, bool is_writing) override; void on_notify_memory_unmapped(u32 address_base, u32 size) override; diff --git a/rpcs3/Emu/RSX/VK/VKRenderTargets.h b/rpcs3/Emu/RSX/VK/VKRenderTargets.h index 3a99503872..7762fe55e9 100644 --- a/rpcs3/Emu/RSX/VK/VKRenderTargets.h +++ b/rpcs3/Emu/RSX/VK/VKRenderTargets.h @@ -20,6 +20,7 @@ namespace vk { bool dirty = false; u16 native_pitch = 0; + u16 rsx_pitch = 0; VkImageAspectFlags attachment_aspect_flag = VK_IMAGE_ASPECT_COLOR_BIT; std::unique_ptr view; @@ -171,6 +172,16 @@ namespace rsx return ds; } + static + void get_surface_info(vk::render_target *surface, rsx::surface_format_info 
*info) + { + info->rsx_pitch = surface->rsx_pitch; + info->native_pitch = surface->native_pitch; + info->surface_width = surface->info.extent.width; + info->surface_height = surface->info.extent.height; + info->bpp = static_cast(info->native_pitch / info->surface_width); + } + static void prepare_rtt_for_drawing(vk::command_buffer* pcmd, vk::render_target *surface) { VkImageSubresourceRange range = vk::get_image_subresource_range(0, 0, 1, 1, surface->attachment_aspect_flag); diff --git a/rpcs3/Emu/RSX/VK/VKTextureCache.h b/rpcs3/Emu/RSX/VK/VKTextureCache.h index 9fe0bfcfe3..d58e10a5ed 100644 --- a/rpcs3/Emu/RSX/VK/VKTextureCache.h +++ b/rpcs3/Emu/RSX/VK/VKTextureCache.h @@ -94,7 +94,16 @@ namespace vk if (!width && !height && !mipmaps) return true; - return (width == this->width && height == this->height && mipmaps == this->mipmaps); + if (width && width != this->width) + return false; + + if (height && height != this->height) + return false; + + if (mipmaps && mipmaps != this->mipmaps) + return false; + + return true; } return false; @@ -361,6 +370,40 @@ namespace vk const s32 m_max_zombie_objects = 32; //Limit on how many texture objects to keep around for reuse after they are invalidated s32 m_unreleased_texture_objects = 0; //Number of invalidated objects not yet freed from memory + cached_texture_section *find_texture_from_range(u32 rsx_address, u32 range) + { + auto test = std::make_pair(rsx_address, range); + for (auto &address_range : m_cache) + { + auto &range_data = address_range.second; + for (auto &tex : range_data.data) + { + if (!tex.is_dirty() && tex.overlaps(test, true)) + return &tex; + } + } + + return nullptr; + } + + cached_texture_section *find_texture_from_dimensions(u32 rsx_address, u32 rsx_size, u16 width = 0, u16 height = 0, u16 mipmaps = 0) + { + auto found = m_cache.find(rsx_address); + if (found != m_cache.end()) + { + auto &range_data = found->second; + for (auto &tex : range_data.data) + { + if (tex.matches(rsx_address, width, height, 
mipmaps) && !tex.is_dirty()) + { + return &tex; + } + } + } + + return nullptr; + } + cached_texture_section& find_cached_texture(u32 rsx_address, u32 rsx_size, bool confirm_dimensions = false, u16 width = 0, u16 height = 0, u16 mipmaps = 0) { { @@ -565,7 +608,7 @@ namespace vk } template - vk::image_view* upload_texture(command_buffer cmd, RsxTextureType &tex, rsx::vk_render_targets &m_rtts, const vk::memory_type_mapping &memory_type_mapping, vk_data_heap& upload_heap, vk::buffer* upload_buffer) + vk::image_view* upload_texture(command_buffer &cmd, RsxTextureType &tex, rsx::vk_render_targets &m_rtts, const vk::memory_type_mapping &memory_type_mapping, vk_data_heap& upload_heap, vk::buffer* upload_buffer) { const u32 texaddr = rsx::get_address(tex.offset(), tex.location()); const u32 range = (u32)get_texture_size(tex); @@ -1048,5 +1091,270 @@ namespace vk value.misses --; } } + + bool upload_scaled_image(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate, + vk::render_device& dev, vk::command_buffer& cmd, vk::memory_type_mapping& memory_types, VkQueue submit_queue, + rsx::vk_render_targets &m_rtts, vk_data_heap &upload_heap, vk::buffer* upload_buffer) + { + //Since we will have dst in vram, we can 'safely' ignore the swizzle flag + //TODO: Verify correct behavior + + bool src_is_render_target = false; + bool dst_is_render_target = false; + bool dst_is_argb8 = (dst.format == rsx::blit_engine::transfer_destination_format::a8r8g8b8); + bool src_is_argb8 = (src.format == rsx::blit_engine::transfer_source_format::a8r8g8b8); + + VkFormat src_vk_format = src_is_argb8 ? 
VK_FORMAT_B8G8R8A8_UNORM : VK_FORMAT_R5G6B5_UNORM_PACK16; + + vk::image* vram_texture = nullptr; + vk::image* dest_texture = nullptr; + + const u32 src_address = (u32)((u64)src.pixels - (u64)vm::base(0)); + const u32 dst_address = (u32)((u64)dst.pixels - (u64)vm::base(0)); + + //Check if src/dst are parts of render targets + auto dst_subres = m_rtts.get_surface_subresource_if_applicable(dst_address, dst.width, dst.clip_height, dst.pitch, true, true, true); + dst_is_render_target = dst_subres.surface != nullptr; + + u16 max_dst_width = dst.width; + u16 max_dst_height = dst.height; + + //Prepare areas and offsets + //Copy from [src.offset_x, src.offset_y] a region of [clip.width, clip.height] + //Stretch onto [dst.offset_x, y] with clipping performed on the source region + //The implementation here adds the inverse scaled clip dimensions onto the source to completely bypass final clipping step + + float scale_x = (f32)dst.width / src.width; + float scale_y = (f32)dst.height / src.height; + + //Clip offset is unused if the clip offsets are reprojected onto the source + position2i clip_offset = { 0, 0 };//{ dst.clip_x, dst.clip_y }; + position2i dst_offset = { dst.offset_x, dst.offset_y }; + + size2i clip_dimensions = { dst.clip_width, dst.clip_height }; + //Dimensions passed are restricted to powers of 2; get real height from clip_height and width from pitch + const size2i dst_dimensions = { dst.pitch / (dst_is_argb8 ? 4 : 2), dst.clip_height }; + + //Offset in x and y for src is 0 (it is already accounted for when getting pixels_src) + //Reproject final clip onto source... 
+ const u16 src_w = (const u16)((f32)clip_dimensions.width / scale_x); + const u16 src_h = (const u16)((f32)clip_dimensions.height / scale_y); + + areai src_area = { 0, 0, src_w, src_h }; + areai dst_area = { 0, 0, dst.clip_width, dst.clip_height }; + + //If destination is neither a render target nor an existing texture in VRAM + //it's possible that this method is being used to perform a memcpy into RSX memory, so we check + //parameters. Whenever a simple memcpy can get the job done, use it instead. + //Dai-3-ji Super Robot Taisen for example uses this to copy program code to GPU RAM + + bool is_memcpy = false; + u32 memcpy_bytes_length = 0; + if (dst_is_argb8 == src_is_argb8 && !dst.swizzled) + { + if ((src.slice_h == 1 && dst.clip_height == 1) || + (dst.clip_width == src.width && dst.clip_height == src.slice_h && src.pitch == dst.pitch)) + { + const u8 bpp = dst_is_argb8 ? 4 : 2; + is_memcpy = true; + memcpy_bytes_length = dst.clip_width * bpp * dst.clip_height; + } + } + + if (!dst_is_render_target) + { + //First check if this surface exists in VRAM with exact dimensions + //Since scaled GPU resources are not invalidated by the CPU, we need to reuse older surfaces if possible + auto cached_dest = find_texture_from_dimensions(dst.rsx_address, dst.pitch * dst.clip_height, dst_dimensions.width, dst_dimensions.height); + + //Check for any available region that will fit this one + if (!cached_dest) cached_dest = find_texture_from_range(dst.rsx_address, dst.pitch * dst.clip_height); + + if (cached_dest) + { + //TODO: Verify that the new surface will fit + dest_texture = cached_dest->get_texture().get(); + + //TODO: Move this code into utils since it is used a lot + const u32 address_offset = dst.rsx_address - cached_dest->get_section_base(); + + const u16 bpp = dst_is_argb8 ? 
4 : 2; + const u16 offset_y = address_offset / dst.pitch; + const u16 offset_x = address_offset % dst.pitch; + + dst_offset.x += offset_x / bpp; + dst_offset.y += offset_y; + + max_dst_width = cached_dest->get_width(); + max_dst_height = cached_dest->get_height(); + } + else if (is_memcpy) + { + memcpy(dst.pixels, src.pixels, memcpy_bytes_length); + return true; + } + } + else + { + dst_offset.x = dst_subres.x; + dst_offset.y = dst_subres.y; + + dest_texture = dst_subres.surface; + + max_dst_width = dst_subres.surface->width(); + max_dst_height = dst_subres.surface->height(); + + if (is_memcpy) + { + //Some render target descriptions are actually invalid + //Confirm this is a flushable RTT + const auto rsx_pitch = dst_subres.surface->rsx_pitch; + const auto native_pitch = dst_subres.surface->native_pitch; + + if (rsx_pitch <= 64 && native_pitch != rsx_pitch) + { + memcpy(dst.pixels, src.pixels, memcpy_bytes_length); + return true; + } + } + } + + //TODO: Handle cases where src or dst can be a depth texture while the other is a color texture - requires a render pass to emulate + auto src_subres = m_rtts.get_surface_subresource_if_applicable(src_address, src.width, src.height, src.pitch, true, true, true); + src_is_render_target = src_subres.surface != nullptr; + + //Create source texture if does not exist + if (!src_is_render_target) + { + auto preloaded_texture = find_texture_from_dimensions(src_address, src.pitch * src.slice_h, src.width, src.slice_h); + + if (preloaded_texture != nullptr) + { + vram_texture = preloaded_texture->get_texture().get(); + } + else + { + flush_address(src_address, dev, cmd, memory_types, submit_queue); + writer_lock lock(m_cache_mutex); + + //Upload texture from CPU + vk::image *image = new vk::image(*vk::get_current_renderer(), memory_types.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VK_IMAGE_TYPE_2D, + src_vk_format, + src.width, src.slice_h, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, + 
VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, 0); + + vk::image_view *view = new vk::image_view(*vk::get_current_renderer(), image->value, VK_IMAGE_VIEW_TYPE_2D, src_vk_format, + { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }, + { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }); + + cached_texture_section& region = find_cached_texture(dst.rsx_address, src.pitch * src.slice_h, true, src.width, src.slice_h, 1); + region.reset(src.rsx_address, src.pitch * src.slice_h); + region.create(src.width, src.slice_h, 1, 1, view, dest_texture); + region.protect(utils::protection::ro); + region.set_dirty(false); + + read_only_range = region.get_min_max(read_only_range); + + vk::enter_uninterruptible(); + + std::vector layout(1); + auto &subres = layout.back(); + subres.width_in_block = src.width; + subres.height_in_block = src.slice_h; + subres.pitch_in_bytes = src.pitch; + subres.depth = 1; + subres.data = {(const gsl::byte*)src.pixels, src.pitch * src.slice_h}; + + copy_mipmaped_image_using_buffer(cmd, image->value, layout, src_vk_format, false, 1, + upload_heap, upload_buffer); + + vk::leave_uninterruptible(); + } + } + else + { + if (src_subres.w != clip_dimensions.width || + src_subres.h != clip_dimensions.height) + { + f32 subres_scaling_x = (f32)src.pitch / src_subres.surface->native_pitch; + + dst_area.x2 = (int)(src_subres.w * scale_x * subres_scaling_x); + dst_area.y2 = (int)(src_subres.h * scale_y); + } + + src_area.x2 = src_subres.w; + src_area.y2 = src_subres.h; + + src_area.x1 += src_subres.x; + src_area.x2 += src_subres.x; + src_area.y1 += src_subres.y; + src_area.y2 += src_subres.y; + + vram_texture = src_subres.surface; + } + + //Validate clip offsets (Persona 4 Arena at 720p) + //Check if can fit + //NOTE: It is possible that the check is simpler (if (clip_x >= clip_width)) + //Needs verification + if ((dst.offset_x + dst.clip_x + dst.clip_width) > max_dst_width) dst.clip_x = 
0; + if ((dst.offset_y + dst.clip_y + dst.clip_height) > max_dst_height) dst.clip_y = 0; + + if (dst.clip_x || dst.clip_y) + { + //Reproject clip offsets onto source + const u16 scaled_clip_offset_x = (const u16)((f32)dst.clip_x / scale_x); + const u16 scaled_clip_offset_y = (const u16)((f32)dst.clip_y / scale_y); + + src_area.x1 += scaled_clip_offset_x; + src_area.x2 += scaled_clip_offset_x; + src_area.y1 += scaled_clip_offset_y; + src_area.y2 += scaled_clip_offset_y; + } + + bool dest_exists = dest_texture != nullptr; + const VkFormat dst_vk_format = dst_is_argb8 ? VK_FORMAT_R8G8B8A8_UNORM : VK_FORMAT_R5G6B5_UNORM_PACK16; + const u8 bpp = dst_is_argb8 ? 4 : 2; + const u32 real_width = dst.pitch / bpp; + + if (!dest_exists) + { + dest_texture = new vk::image(*vk::get_current_renderer(), memory_types.device_local, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, + VK_IMAGE_TYPE_2D, + dst_vk_format, + real_width, dst.clip_height, 1, 1, 1, VK_SAMPLE_COUNT_1_BIT, VK_IMAGE_LAYOUT_UNDEFINED, + VK_IMAGE_TILING_OPTIMAL, VK_IMAGE_USAGE_TRANSFER_DST_BIT | VK_IMAGE_USAGE_SAMPLED_BIT, 0); + } + + //Copy data + copy_scaled_image(cmd, vram_texture->value, dest_texture->value, vram_texture->current_layout, dest_texture->current_layout, + src_area.x1, src_area.y1, src_w, src_h, dst_area.x1, dst_area.y1, dst.clip_width, dst.clip_height, 1, VK_IMAGE_ASPECT_COLOR_BIT); + + if (dest_exists) + return true; + + //TODO: Verify if any titles ever scale into CPU memory. 
It defeats the purpose of uploading data to the GPU, but it could happen + //If so, add this texture to the no_access queue not the read_only queue + cached_texture_section& region = find_cached_texture(dst.rsx_address, dst.pitch * dst.clip_height, true, real_width, dst.clip_height, 1); + writer_lock lock(m_cache_mutex); + + //These textures are completely GPU resident so we don't watch for CPU access + //There's no data to be fetched from the CPU + //It is possible for a title to attempt to read from the region, but the CPU path should be used in such cases + + vk::image_view *view = new vk::image_view(*vk::get_current_renderer(), dest_texture->value, VK_IMAGE_VIEW_TYPE_2D, dst_vk_format, + { VK_COMPONENT_SWIZZLE_R, VK_COMPONENT_SWIZZLE_G, VK_COMPONENT_SWIZZLE_B, VK_COMPONENT_SWIZZLE_A }, + { VK_IMAGE_ASPECT_COLOR_BIT, 0, 1, 0, 1 }); + + region.reset(dst.rsx_address, dst.pitch * dst.clip_height); + region.create(real_width, dst.clip_height, 1, 1, view, dest_texture); + region.protect(utils::protection::rw); + region.set_dirty(false); + + read_only_range = region.get_min_max(read_only_range); + + return true; + } }; } diff --git a/rpcs3/Emu/RSX/rsx_methods.cpp b/rpcs3/Emu/RSX/rsx_methods.cpp index bc7402e917..1079b93b88 100644 --- a/rpcs3/Emu/RSX/rsx_methods.cpp +++ b/rpcs3/Emu/RSX/rsx_methods.cpp @@ -580,37 +580,6 @@ namespace rsx return; } - if (dst_dma == CELL_GCM_CONTEXT_DMA_MEMORY_FRAME_BUFFER) - { - //HACK: it's extension of the flip-hack. 
remove this when textures cache would be properly implemented - for (int i = 0; i < rsx::limits::color_buffers_count; ++i) - { - u32 begin = rsx->display_buffers[i].offset; - - if (dst_offset < begin || !begin) - { - continue; - } - - if (rsx->display_buffers[i].width < 720 || rsx->display_buffers[i].height < 480) - { - continue; - } - - if (begin == dst_offset) - { - return; - } - - u32 end = begin + rsx->display_buffers[i].height * rsx->display_buffers[i].pitch; - - if (dst_offset < end) - { - return; - } - } - } - const u32 in_bpp = (src_color_format == rsx::blit_engine::transfer_source_format::r5g6b5) ? 2 : 4; // bytes per pixel const u32 out_bpp = (dst_color_format == rsx::blit_engine::transfer_destination_format::r5g6b5) ? 2 : 4;