gl: Minor optimizations

rsx: Texture cache - improvements to locking
rsx: Minor optimizations to get_current_vertex_program and begin-end batch flushes
rsx: Optimize texture cache storage
- Manages storage in blocks of 16MB (a sketch of the bucketing follows the commit metadata below)
rsx/vk/gl: Fix swizzled texture input
gl: Hotfix for compressed texture formats
kd-11 2017-09-14 14:37:14 +03:00
parent e37a2a8f7d
commit 45d0e821dc
9 changed files with 372 additions and 267 deletions
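
The storage change buckets cached sections into 16 MiB blocks keyed by their base address (see get_block_size/get_block_address in the first file below). A minimal sketch of the bucketing arithmetic, assuming the project's u32/u64 aliases; block_walk is a hypothetical helper for illustration and is not part of the commit:

#include <cstdint>
using u32 = std::uint32_t; //matches the project-wide alias
using u64 = std::uint64_t;

constexpr u32 block_size = 0x1000000; //16 MiB, the same constant as get_block_size()
inline u32 block_address(u32 address) { return address & ~0xFFFFFFu; } //as get_block_address()

//A memory range can straddle block boundaries, so a scan must visit every block it touches.
template <typename F>
void block_walk(u32 address, u32 range, F&& visit)
{
    //widen to u64 so address + range cannot wrap around
    for (u64 block = block_address(address); block < u64{address} + range; block += block_size)
        visit(static_cast<u32>(block)); //e.g. key into the unordered_map of ranged storage
}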

View File

@@ -14,6 +14,12 @@ namespace rsx
swapped_native_component_order = 2,
};
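//Records how a section entered the cache; blit engine sources have their swizzle handled differently (see the upload_image_from_cpu overrides further down)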
enum texture_upload_context
{
shader_read = 0,
blit_engine_src = 1
};
template <typename commandbuffer_type, typename section_storage_type, typename image_resource_type, typename image_view_type, typename image_storage_type, typename texture_format>
class texture_cache
{
@@ -34,12 +40,14 @@ namespace rsx
void notify(u32 data_size)
{
verify(HERE), valid_count >= 0;
max_range = std::max(data_size, max_range);
valid_count++;
}
void add(section_storage_type& section, u32 data_size)
{
verify(HERE), valid_count >= 0;
max_range = std::max(data_size, max_range);
valid_count++;
@@ -65,7 +73,7 @@ namespace rsx
std::unordered_map<u32, framebuffer_memory_characteristics> m_cache_miss_statistics_table;
//Memory usage
const s32 m_max_zombie_objects = 32; //Limit on how many texture objects to keep around for reuse after they are invalidated
const s32 m_max_zombie_objects = 128; //Limit on how many texture objects to keep around for reuse after they are invalidated
s32 m_unreleased_texture_objects = 0; //Number of invalidated objects not yet freed from memory
/* Helpers */
@@ -74,11 +82,141 @@ namespace rsx
virtual image_view_type create_temporary_subresource_view(commandbuffer_type&, image_storage_type* src, u32 gcm_format, u16 x, u16 y, u16 w, u16 h) = 0;
virtual section_storage_type* create_new_texture(commandbuffer_type&, u32 rsx_address, u32 rsx_size, u16 width, u16 height, u16 depth, u16 mipmaps, const u32 gcm_format,
const rsx::texture_dimension_extended type, const texture_create_flags flags, std::pair<std::array<u8, 4>, std::array<u8, 4>>& remap_vector) = 0;
virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, u32 rsx_address, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, const u32 gcm_format,
virtual section_storage_type* upload_image_from_cpu(commandbuffer_type&, u32 rsx_address, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, const u32 gcm_format, const texture_upload_context context,
std::vector<rsx_subresource_layout>& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled, std::pair<std::array<u8, 4>, std::array<u8, 4>>& remap_vector) = 0;
virtual void enforce_surface_creation_type(section_storage_type& section, const texture_create_flags expected) = 0;
virtual void insert_texture_barrier() = 0;
private:
//Internal implementation methods
bool invalidate_range_impl(u32 address, u32 range, bool unprotect)
{
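//Scan every block overlapping the trampled range; when a hit widens the range,
//restart from the first block so earlier sections are re-tested against the new bounds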
bool response = false;
u32 last_dirty_block = 0;
std::pair<u32, u32> trampled_range = std::make_pair(address, address + range);
for (auto It = m_cache.begin(); It != m_cache.end(); It++)
{
auto &range_data = It->second;
const u32 base = It->first;
bool range_reset = false;
if (base == last_dirty_block && range_data.valid_count == 0)
continue;
if (trampled_range.first >= (base + get_block_size()) || base >= trampled_range.second)
continue;
for (int i = 0; i < range_data.data.size(); i++)
{
auto &tex = range_data.data[i];
if (tex.is_dirty()) continue;
if (!tex.is_locked()) continue; //flushable sections can be 'clean' but unlocked. TODO: Handle this better
auto overlapped = tex.overlaps_page(trampled_range, address);
if (std::get<0>(overlapped))
{
auto &new_range = std::get<1>(overlapped);
if (new_range.first != trampled_range.first ||
new_range.second != trampled_range.second)
{
i = 0;
trampled_range = new_range;
range_reset = true;
}
if (unprotect)
{
tex.set_dirty(true);
tex.unprotect();
}
else
{
tex.discard();
}
m_unreleased_texture_objects++;
range_data.valid_count--;
response = true;
}
}
if (range_reset)
{
last_dirty_block = base;
It = m_cache.begin();
}
}
return response;
}
template <typename ...Args>
bool flush_address_impl(u32 address, Args&&... extras)
{
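//Same restartable scan as invalidate_range_impl, but flushable sections are written back to CPU memory instead of being discarded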
bool response = false;
u32 last_dirty_block = 0;
std::pair<u32, u32> trampled_range = std::make_pair(0xffffffff, 0x0);
for (auto It = m_cache.begin(); It != m_cache.end(); It++)
{
auto &range_data = It->second;
const u32 base = It->first;
bool range_reset = false;
if (base == last_dirty_block && range_data.valid_count == 0)
continue;
if (trampled_range.first >= (base + get_block_size()) || base >= trampled_range.second)
continue;
for (int i = 0; i < range_data.data.size(); i++)
{
auto &tex = range_data.data[i];
if (tex.is_dirty()) continue;
if (!tex.is_flushable()) continue;
auto overlapped = tex.overlaps_page(trampled_range, address);
if (std::get<0>(overlapped))
{
auto &new_range = std::get<1>(overlapped);
if (new_range.first != trampled_range.first ||
new_range.second != trampled_range.second)
{
i = 0;
trampled_range = new_range;
range_reset = true;
}
//TODO: Map basic host_visible memory without coherent constraint
if (!tex.flush(std::forward<Args>(extras)...))
{
//Missed address, note this
//TODO: Lower severity when successful to keep the cache from overworking
record_cache_miss(tex);
}
response = true;
range_data.valid_count--;
}
}
if (range_reset)
{
It = m_cache.begin();
}
}
return response;
}
constexpr u32 get_block_size() const { return 0x1000000; }
inline u32 get_block_address(u32 address) const { return (address & ~0xFFFFFF); }
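//e.g. get_block_address(0x0C123456) == 0x0C000000; a 16 MiB block spans 0x0C000000..0x0CFFFFFF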
public:
texture_cache() {}
@@ -93,7 +231,9 @@ namespace rsx
auto test = std::make_pair(rsx_address, range);
for (auto &address_range : m_cache)
{
if (address_range.second.valid_count == 0) continue;
auto &range_data = address_range.second;
for (auto &tex : range_data.data)
{
if (tex.get_section_base() > rsx_address)
@@ -109,7 +249,7 @@ namespace rsx
section_storage_type *find_texture_from_dimensions(u32 rsx_address, u16 width = 0, u16 height = 0, u16 mipmaps = 0)
{
auto found = m_cache.find(rsx_address);
auto found = m_cache.find(get_block_address(rsx_address));
if (found != m_cache.end())
{
auto &range_data = found->second;
@@ -127,10 +267,9 @@ namespace rsx
section_storage_type& find_cached_texture(u32 rsx_address, u32 rsx_size, bool confirm_dimensions = false, u16 width = 0, u16 height = 0, u16 mipmaps = 0)
{
{
reader_lock lock(m_cache_mutex);
const u32 block_address = get_block_address(rsx_address);
auto found = m_cache.find(rsx_address);
auto found = m_cache.find(block_address);
if (found != m_cache.end())
{
auto &range_data = found->second;
@@ -166,20 +305,15 @@ namespace rsx
}
}
}
}
writer_lock lock(m_cache_mutex);
section_storage_type tmp;
m_cache[rsx_address].add(tmp, rsx_size);
return m_cache[rsx_address].data.back();
m_cache[block_address].add(tmp, rsx_size);
return m_cache[block_address].data.back();
}
section_storage_type* find_flushable_section(const u32 address, const u32 range)
{
reader_lock lock(m_cache_mutex);
auto found = m_cache.find(address);
auto found = m_cache.find(get_block_address(address));
if (found != m_cache.end())
{
auto &range_data = found->second;
@@ -199,9 +333,8 @@ namespace rsx
template <typename ...Args>
void lock_memory_region(image_storage_type* image, const u32 memory_address, const u32 memory_size, const u32 width, const u32 height, const u32 pitch, Args&&... extras)
{
section_storage_type& region = find_cached_texture(memory_address, memory_size, true, width, height, 1);
writer_lock lock(m_cache_mutex);
section_storage_type& region = find_cached_texture(memory_address, memory_size, true, width, height, 1);
if (!region.is_locked())
{
@@ -217,6 +350,7 @@ namespace rsx
template <typename ...Args>
bool flush_memory_to_cache(const u32 memory_address, const u32 memory_size, bool skip_synchronized, Args&&... extra)
{
writer_lock lock(m_cache_mutex);
section_storage_type* region = find_flushable_section(memory_address, memory_size);
//TODO: Make this an assertion
@@ -236,6 +370,7 @@ namespace rsx
template <typename ...Args>
bool load_memory_from_cache(const u32 memory_address, const u32 memory_size, Args&&... extras)
{
reader_lock lock(m_cache_mutex);
section_storage_type *region = find_flushable_section(memory_address, memory_size);
if (region && !region->is_dirty())
@@ -256,7 +391,7 @@ namespace rsx
reader_lock lock(m_cache_mutex);
auto found = m_cache.find(address);
auto found = m_cache.find(get_block_address(address));
if (found != m_cache.end())
{
auto &range_data = found->second;
@@ -304,74 +439,8 @@ namespace rsx
address > no_access_range.second)
return false;
bool response = false;
std::pair<u32, u32> trampled_range = std::make_pair(0xffffffff, 0x0);
std::unordered_map<u32, bool> processed_ranges;
rsx::conditional_lock<shared_mutex> lock(in_access_violation_handler, m_cache_mutex);
for (auto It = m_cache.begin(); It != m_cache.end(); It++)
{
auto &range_data = It->second;
const u32 base = It->first;
bool range_reset = false;
if (processed_ranges[base] || range_data.valid_count == 0)
continue;
//Quickly discard range
const u32 lock_base = base & ~0xfff;
const u32 lock_limit = align(range_data.max_range + base, 4096);
if ((trampled_range.first >= lock_limit || lock_base >= trampled_range.second) &&
(lock_base > address || lock_limit <= address))
{
processed_ranges[base] = true;
continue;
}
for (int i = 0; i < range_data.data.size(); i++)
{
auto &tex = range_data.data[i];
if (tex.is_dirty()) continue;
if (!tex.is_flushable()) continue;
auto overlapped = tex.overlaps_page(trampled_range, address);
if (std::get<0>(overlapped))
{
auto &new_range = std::get<1>(overlapped);
if (new_range.first != trampled_range.first ||
new_range.second != trampled_range.second)
{
i = 0;
trampled_range = new_range;
range_reset = true;
}
//TODO: Map basic host_visible memory without coherent constraint
if (!tex.flush(std::forward<Args>(extras)...))
{
//Missed address, note this
//TODO: Lower severity when successful to keep the cache from overworking
record_cache_miss(tex);
}
response = true;
}
}
if (range_reset)
{
processed_ranges.clear();
It = m_cache.begin();
}
processed_ranges[base] = true;
}
return response;
return flush_address_impl(address, std::forward<Args>(extras)...);
}
bool invalidate_address(u32 address)
@@ -392,76 +461,8 @@ namespace rsx
return false;
}
bool response = false;
std::unordered_map<u32, bool> processed_ranges;
rsx::conditional_lock<shared_mutex> lock(in_access_violation_handler, m_cache_mutex);
for (auto It = m_cache.begin(); It != m_cache.end(); It++)
{
auto &range_data = It->second;
const u32 base = It->first;
bool range_reset = false;
if (processed_ranges[base] || range_data.valid_count == 0)
continue;
//Quickly discard range
const u32 lock_base = base & ~0xfff;
const u32 lock_limit = align(range_data.max_range + base, 4096);
if (trampled_range.first >= lock_limit || lock_base >= trampled_range.second)
{
processed_ranges[base] = true;
continue;
}
for (int i = 0; i < range_data.data.size(); i++)
{
auto &tex = range_data.data[i];
if (tex.is_dirty()) continue;
if (!tex.is_locked()) continue; //flushable sections can be 'clean' but unlocked. TODO: Handle this better
auto overlapped = tex.overlaps_page(trampled_range, address);
if (std::get<0>(overlapped))
{
auto &new_range = std::get<1>(overlapped);
if (new_range.first != trampled_range.first ||
new_range.second != trampled_range.second)
{
i = 0;
trampled_range = new_range;
range_reset = true;
}
if (unprotect)
{
tex.set_dirty(true);
tex.unprotect();
}
else
{
tex.discard();
}
m_unreleased_texture_objects++;
range_data.valid_count--;
response = true;
}
}
if (range_reset)
{
processed_ranges.clear();
It = m_cache.begin();
}
processed_ranges[base] = true;
}
return response;
return invalidate_range_impl(address, range, unprotect);
}
void record_cache_miss(section_storage_type &tex)
@@ -521,6 +522,8 @@ namespace rsx
void purge_dirty()
{
writer_lock lock(m_cache_mutex);
//Reclaims all graphics memory consumed by dirty textures
std::vector<u32> empty_addresses;
empty_addresses.resize(32);
@@ -611,6 +614,17 @@ namespace rsx
return texptr->get_view();
}
{
//Search in cache and upload/bind
reader_lock lock(m_cache_mutex);
auto cached_texture = find_texture_from_dimensions(texaddr, tex_width, tex_height);
if (cached_texture)
{
return cached_texture->get_raw_view();
}
}
/* Check if we are re-sampling a subresource of an RTV/DSV texture, bound or otherwise
* (Turbo: Super Stunt Squad does this; bypassing the need for a sync object)
* The engine does not read back the texture resource through cell, but specifies a texture location that is
@@ -664,15 +678,6 @@ namespace rsx
}
}
//If all the above failed, then it's probably a generic texture.
//Search in cache and upload/bind
auto cached_texture = find_texture_from_dimensions(texaddr, tex_width, tex_height);
if (cached_texture)
{
return cached_texture->get_raw_view();
}
//Do direct upload from CPU as the last resort
const auto extended_dimension = tex.get_extended_texture_dimension();
u16 height = 0;
@@ -698,12 +703,13 @@ namespace rsx
break;
}
writer_lock lock(m_cache_mutex);
const bool is_swizzled = !(tex.format() & CELL_GCM_TEXTURE_LN);
auto subresources_layout = get_subresources_layout(tex);
auto remap_vector = tex.decoded_remap();
return upload_image_from_cpu(cmd, texaddr, tex_width, height, depth, tex.get_exact_mipmap_count(), tex_pitch, format,
subresources_layout, extended_dimension, is_swizzled, remap_vector)->get_raw_view();
texture_upload_context::shader_read, subresources_layout, extended_dimension, is_swizzled, remap_vector)->get_raw_view();
}
template <typename surface_store_type, typename blitter_type, typename ...Args>
@@ -770,7 +776,9 @@ namespace rsx
}
}
reader_lock lock(m_cache_mutex);
section_storage_type* cached_dest = nullptr;
if (!dst_is_render_target)
{
//First check if this surface exists in VRAM with exact dimensions
@@ -785,7 +793,7 @@ namespace rsx
//Prep surface
enforce_surface_creation_type(*cached_dest, dst.swizzled ? rsx::texture_create_flags::swapped_native_component_order : rsx::texture_create_flags::native_component_order);
//TODO: Move this code into utils since it is used a lot
const auto old_dst_area = dst_area;
if (const u32 address_offset = dst.rsx_address - cached_dest->get_section_base())
{
const u16 bpp = dst_is_argb8 ? 4 : 2;
@@ -809,11 +817,16 @@ namespace rsx
max_dst_height = cached_dest->get_height();
}
else
{
cached_dest = nullptr;
dst_area = old_dst_area;
}
}
if (!cached_dest && is_memcpy)
{
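//Writer access is needed from here: the range is invalidated and host memory is written directly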
lock.upgrade();
invalidate_range_impl(dst_address, memcpy_bytes_length, true);
memcpy(dst.pixels, src.pixels, memcpy_bytes_length);
return true;
}
@@ -839,6 +852,8 @@ namespace rsx
if (rsx_pitch <= 64 && native_pitch != rsx_pitch)
{
lock.upgrade();
invalidate_range_impl(dst_address, memcpy_bytes_length, true);
memcpy(dst.pixels, src.pixels, memcpy_bytes_length);
return true;
}
@@ -856,7 +871,9 @@ namespace rsx
}
else
{
flush_address(src.rsx_address, std::forward<Args>(extras)...);
lock.upgrade();
flush_address_impl(src_address, std::forward<Args>(extras)...);
const u16 pitch_in_block = src_is_argb8 ? src.pitch >> 2 : src.pitch >> 1;
std::vector<rsx_subresource_layout> subresource_layout;
@@ -869,7 +886,7 @@ namespace rsx
subresource_layout.push_back(subres);
const u32 gcm_format = src_is_argb8 ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5;
vram_texture = upload_image_from_cpu(cmd, src_address, src.width, src.slice_h, 1, 1, src.pitch, gcm_format,
vram_texture = upload_image_from_cpu(cmd, src_address, src.width, src.slice_h, 1, 1, src.pitch, gcm_format, texture_upload_context::blit_engine_src,
subresource_layout, rsx::texture_dimension_extended::texture_dimension_2d, dst.swizzled, default_remap_vector)->get_raw_texture();
}
}
@@ -928,7 +945,8 @@ namespace rsx
//TODO: Check for other types of format mismatch
if (format_mismatch)
{
invalidate_range(cached_dest->get_section_base(), cached_dest->get_section_size());
lock.upgrade();
invalidate_range_impl(cached_dest->get_section_base(), cached_dest->get_section_size(), true);
dest_texture = 0;
cached_dest = nullptr;
@@ -958,6 +976,8 @@ namespace rsx
else
gcm_format = (dst_is_argb8) ? CELL_GCM_TEXTURE_A8R8G8B8 : CELL_GCM_TEXTURE_R5G6B5;
lock.upgrade();
dest_texture = create_new_texture(cmd, dst.rsx_address, dst.pitch * dst.clip_height,
dst_dimensions.width, dst_dimensions.height, 1, 1,
gcm_format, rsx::texture_dimension_extended::texture_dimension_2d,

View File

@@ -462,14 +462,15 @@ void GLGSRender::end()
for (int i = 0; i < rsx::limits::fragment_textures_count; ++i)
{
int location;
if (!rsx::method_registers.fragment_textures[i].enabled())
continue;
if (m_program->uniforms.has_location("tex" + std::to_string(i), &location))
if (rsx::method_registers.fragment_textures[i].enabled() && m_program->uniforms.has_location("tex" + std::to_string(i), &location))
{
m_gl_texture_cache.upload_and_bind_texture(i, get_gl_target_for_texture(rsx::method_registers.fragment_textures[i]), rsx::method_registers.fragment_textures[i], m_rtts);
if (m_textures_dirty[i])
m_gl_sampler_states[i].apply(rsx::method_registers.fragment_textures[i]);
}
m_textures_dirty[i] = false;
}
//Vertex textures

View File

@@ -186,8 +186,22 @@ void GLGSRender::init_buffers(bool skip_reading)
draw_fbo.recreate();
bool old_format_found = false;
gl::texture::format old_format;
for (int i = 0; i < rsx::limits::color_buffers_count; ++i)
{
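//When write_color_buffers is enabled, write back surfaces whose memory is statistically likely to be read (per the cache-miss statistics) before the framebuffer is recreated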
if (surface_info[i].pitch && g_cfg.video.write_color_buffers)
{
if (!old_format_found)
{
old_format = rsx::internals::surface_color_format_to_gl(surface_info[i].color_format).format;
old_format_found = true;
}
m_gl_texture_cache.flush_if_cache_miss_likely(old_format, surface_info[i].address, surface_info[i].pitch * surface_info[i].height);
}
if (std::get<0>(m_rtts.m_bound_render_targets[i]))
{
__glcheck draw_fbo.color[i] = *std::get<1>(m_rtts.m_bound_render_targets[i]);

View File

@@ -36,7 +36,7 @@ namespace gl
case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return GL_COMPRESSED_RGBA_S3TC_DXT3_EXT;
case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return GL_COMPRESSED_RGBA_S3TC_DXT5_EXT;
}
fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format);
fmt::throw_exception("Unknown texture format 0x%x" HERE, texture_format);
}
std::tuple<GLenum, GLenum> get_format_type(u32 texture_format)
@@ -63,6 +63,9 @@ namespace gl
case CELL_GCM_TEXTURE_D1R5G5B5: return std::make_tuple(GL_BGRA, GL_UNSIGNED_SHORT_1_5_5_5_REV);
case CELL_GCM_TEXTURE_D8R8G8B8: return std::make_tuple(GL_BGRA, GL_UNSIGNED_INT_8_8_8_8);
case CELL_GCM_TEXTURE_Y16_X16_FLOAT: return std::make_tuple(GL_RG, GL_HALF_FLOAT);
case CELL_GCM_TEXTURE_COMPRESSED_DXT1: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT1_EXT, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_DXT23: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT3_EXT, GL_UNSIGNED_BYTE);
case CELL_GCM_TEXTURE_COMPRESSED_DXT45: return std::make_tuple(GL_COMPRESSED_RGBA_S3TC_DXT5_EXT, GL_UNSIGNED_BYTE);
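//For compressed formats the 'format' slot carries the compressed internal format; GL_UNSIGNED_BYTE is effectively a placeholder, since compressed uploads take an image size rather than a type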
}
fmt::throw_exception("Compressed or unknown texture format 0x%x" HERE, texture_format);
}
@@ -333,7 +336,7 @@ namespace gl
}
void fill_texture(rsx::texture_dimension_extended dim, u16 mipmap_count, int format, u16 width, u16 height, u16 depth,
const std::vector<rsx_subresource_layout> &input_layouts, bool is_swizzled, std::vector<gsl::byte> staging_buffer)
const std::vector<rsx_subresource_layout> &input_layouts, bool is_swizzled, GLenum gl_format, GLenum gl_type, std::vector<gsl::byte> staging_buffer)
{
int mip_level = 0;
if (is_compressed_format(format))
@@ -349,11 +352,10 @@ namespace gl
glTexStorage1D(GL_TEXTURE_1D, mipmap_count, get_sized_internal_format(format), width);
if (!is_compressed_format(format))
{
const auto &format_type = get_format_type(format);
for (const rsx_subresource_layout &layout : input_layouts)
{
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4);
glTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block, std::get<0>(format_type), std::get<1>(format_type), staging_buffer.data());
glTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block, gl_format, gl_type, staging_buffer.data());
}
}
else
@@ -362,7 +364,7 @@ namespace gl
{
u32 size = layout.width_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4);
glCompressedTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block * 4, get_sized_internal_format(format), size, staging_buffer.data());
glCompressedTexSubImage1D(GL_TEXTURE_1D, mip_level++, 0, layout.width_in_block * 4, gl_format, size, staging_buffer.data());
}
}
return;
@@ -372,11 +374,10 @@ namespace gl
{
if (!is_compressed_format(format))
{
const auto &format_type = get_format_type(format);
for (const rsx_subresource_layout &layout : input_layouts)
{
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4);
glTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block, layout.height_in_block, std::get<0>(format_type), std::get<1>(format_type), staging_buffer.data());
glTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
}
}
else
@@ -385,7 +386,7 @@ namespace gl
{
u32 size = layout.width_in_block * layout.height_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4);
glCompressedTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, get_sized_internal_format(format), size, staging_buffer.data());
glCompressedTexSubImage2D(GL_TEXTURE_2D, mip_level++, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, gl_format, size, staging_buffer.data());
}
}
return;
@@ -398,11 +399,10 @@ namespace gl
// mip_level % mipmap_per_layer will always be equal to mip_level
if (!is_compressed_format(format))
{
const auto &format_type = get_format_type(format);
for (const rsx_subresource_layout &layout : input_layouts)
{
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4);
glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block, layout.height_in_block, std::get<0>(format_type), std::get<1>(format_type), staging_buffer.data());
glTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block, layout.height_in_block, gl_format, gl_type, staging_buffer.data());
mip_level++;
}
}
@@ -412,7 +412,7 @@ namespace gl
{
u32 size = layout.width_in_block * layout.height_in_block * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4);
glCompressedTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, get_sized_internal_format(format), size, staging_buffer.data());
glCompressedTexSubImage2D(GL_TEXTURE_CUBE_MAP_POSITIVE_X + mip_level / mipmap_count, mip_level % mipmap_count, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, gl_format, size, staging_buffer.data());
mip_level++;
}
}
@@ -423,11 +423,10 @@ namespace gl
{
if (!is_compressed_format(format))
{
const auto &format_type = get_format_type(format);
for (const rsx_subresource_layout &layout : input_layouts)
{
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4);
glTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block, layout.height_in_block, depth, std::get<0>(format_type), std::get<1>(format_type), staging_buffer.data());
glTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block, layout.height_in_block, depth, gl_format, gl_type, staging_buffer.data());
}
}
else
@@ -436,7 +435,7 @@ namespace gl
{
u32 size = layout.width_in_block * layout.height_in_block * layout.depth * ((format == CELL_GCM_TEXTURE_COMPRESSED_DXT1) ? 8 : 16);
upload_texture_subresource(staging_buffer, layout, format, is_swizzled, 4);
glCompressedTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, layout.depth, get_sized_internal_format(format), size, staging_buffer.data());
glCompressedTexSubImage3D(GL_TEXTURE_3D, mip_level++, 0, 0, 0, layout.width_in_block * 4, layout.height_in_block * 4, layout.depth, gl_format, size, staging_buffer.data());
}
}
return;
@@ -529,6 +528,9 @@ namespace gl
//The rest of sampler state is now handled by sampler state objects
fill_texture(type, mipmaps, gcm_format, width, height, depth, subresources_layout, is_swizzled, data_upload_buf);
const auto format_type = get_format_type(gcm_format);
const GLenum gl_format = std::get<0>(format_type);
const GLenum gl_type = std::get<1>(format_type);
fill_texture(type, mipmaps, gcm_format, width, height, depth, subresources_layout, is_swizzled, gl_format, gl_type, data_upload_buf);
}
}

View File

@@ -17,6 +17,14 @@ namespace gl
GLuint create_texture(u32 gcm_format, u16 width, u16 height, u16 depth, u16 mipmaps, rsx::texture_dimension_extended type);
/**
* is_swizzled - determines whether input bytes are in Morton order
* subresources_layout - descriptor of the mipmap levels in memory
* decoded_remap - two vectors, first one contains the index to read, e.g. if v[0] = 1 then component 0[A] in the texture should read as component 1[R]
* - layout of vector is in A-R-G-B
* - second vector contains overrides to force the value to either 0 or 1 instead of reading from texture
* static_state - set up the texture without consideration for sampler state (useful for vertex textures which have no real sampler state on RSX)
*/
void upload_texture(const GLuint id, const u32 texaddr, const u32 gcm_format, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, bool is_swizzled, rsx::texture_dimension_extended type,
std::vector<rsx_subresource_layout>& subresources_layout, std::pair<std::array<u8, 4>, std::array<u8, 4>>& decoded_remap, bool static_state);
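
A hedged illustration of the decoded_remap layout documented above; the override encoding is not shown in this header, so the second vector's values are assumptions for the example:

#include <array>
#include <cstdint>
#include <utility>
using u8 = std::uint8_t; //project-wide alias

//Component indices are in A-R-G-B order, as described above
std::pair<std::array<u8, 4>, std::array<u8, 4>> remap =
{
    { 1, 2, 3, 0 }, //A samples stored R, R samples G, G samples B, B samples A
    { 0, 0, 0, 0 }  //assumed encoding: 0 = read from the texture, no forced 0/1 constant
};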

View File

@@ -47,6 +47,8 @@ namespace gl
texture::type type = texture::type::ubyte;
bool pack_unpack_swap_bytes = false;
rsx::texture_create_flags view_flags = rsx::texture_create_flags::default_component_order;
u8 get_pixel_size(texture::format fmt_, texture::type type_)
{
u8 size = 1;
@@ -224,6 +226,11 @@ namespace gl
vram_texture = source.id();
}
void set_view_flags(const rsx::texture_create_flags flags)
{
view_flags = flags;
}
void copy_texture(bool=false)
{
if (!glIsTexture(vram_texture))
@@ -306,7 +313,6 @@ namespace gl
glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
protect(utils::protection::ro);
return true;
}
@@ -410,6 +416,11 @@ namespace gl
return (gl::texture::format)fmt == tex->get_internal_format();
}
rsx::texture_create_flags get_view_flags() const
{
return view_flags;
}
};
class texture_cache : public rsx::texture_cache<void*, cached_texture_section, u32, u32, gl::texture, gl::texture::format>
@@ -577,28 +588,54 @@ namespace gl
break;
}
if (flags == rsx::texture_create_flags::swapped_native_component_order)
{
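//With a swapped view, a shader read of R fetches the stored A channel, G fetches R, and so on (ARGB data sampled through an RGBA view)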
glBindTexture(GL_TEXTURE_2D, vram_texture);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ALPHA);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_RED);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_GREEN);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_BLUE);
}
auto& cached = create_texture(vram_texture, rsx_address, rsx_size, width, height);
cached.protect(utils::protection::ro);
cached.set_dirty(false);
cached.set_depth_flag(depth_flag);
cached.set_view_flags(flags);
return &cached;
}
cached_texture_section* upload_image_from_cpu(void*&, u32 rsx_address, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, const u32 gcm_format,
std::vector<rsx_subresource_layout>& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled,
const rsx::texture_upload_context context, std::vector<rsx_subresource_layout>& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled,
std::pair<std::array<u8, 4>, std::array<u8, 4>>& remap_vector) override
{
void* unused = nullptr;
auto section = create_new_texture(unused, rsx_address, pitch * height, width, height, depth, mipmaps, gcm_format, type,
rsx::texture_create_flags::default_component_order, remap_vector);
gl::upload_texture(section->get_raw_texture(), rsx_address, gcm_format, width, height, depth, mipmaps, pitch, swizzled, type, subresource_layout, remap_vector, false);
//Swizzling is ignored for blit engine copy and emulated using remapping
bool input_swizzled = (context == rsx::texture_upload_context::blit_engine_src)? false : swizzled;
gl::upload_texture(section->get_raw_texture(), rsx_address, gcm_format, width, height, depth, mipmaps, pitch, input_swizzled, type, subresource_layout, remap_vector, false);
return section;
}
void enforce_surface_creation_type(cached_texture_section& section, const rsx::texture_create_flags flags) override
{
if (flags == section.get_view_flags())
return;
if (flags == rsx::texture_create_flags::swapped_native_component_order)
{
glBindTexture(GL_TEXTURE_2D, section.get_raw_texture());
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_R, GL_ALPHA);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_G, GL_RED);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_B, GL_GREEN);
glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_SWIZZLE_A, GL_BLUE);
}
section.set_view_flags(flags);
}
void insert_texture_barrier() override
@@ -630,6 +667,8 @@ namespace gl
bool is_depth_texture(const u32 rsx_address) override
{
reader_lock lock(m_cache_mutex);
auto section = find_texture_from_range(rsx_address, 64u);
if (section != nullptr) return section->is_depth_texture();

View File

@@ -403,7 +403,8 @@ namespace rsx
std::vector <std::pair<u32, u32>> split_ranges;
auto first_count_cmds = method_registers.current_draw_clause.first_count_commands;
if (method_registers.current_draw_clause.first_count_commands.size() > 1)
if (method_registers.current_draw_clause.first_count_commands.size() > 1 &&
method_registers.current_draw_clause.is_disjoint_primitive)
{
u32 next = method_registers.current_draw_clause.first_count_commands.front().first;
u32 last_head = 0;
@@ -433,13 +434,18 @@ namespace rsx
{
std::vector<std::pair<u32, u32>> tmp;
auto list_head = first_count_cmds.begin();
bool emit_begin = false;
for (auto &range : split_ranges)
{
tmp.resize(range.second - range.first + 1);
std::copy(list_head + range.first, list_head + range.second, tmp.begin());
if (emit_begin)
methods[NV4097_SET_BEGIN_END](this, NV4097_SET_BEGIN_END, deferred_primitive_type);
else
emit_begin = true;
method_registers.current_draw_clause.first_count_commands = tmp;
methods[NV4097_SET_BEGIN_END](this, NV4097_SET_BEGIN_END, 0);
}
@@ -565,8 +571,14 @@ namespace rsx
deferred_primitive_type = value;
else
{
has_deferred_call = true;
flush_commands_flag = false;
execute_method_call = false;
deferred_call_size++;
if (method_registers.current_draw_clause.is_disjoint_primitive)
{
// Combine all calls since the last one
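// e.g. contiguous ranges (first=0, count=100) and (first=100, count=50) can collapse into one (0, 150) entry (assumed behaviour; the merge body is partly elided in this hunk)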
auto &first_count = method_registers.current_draw_clause.first_count_commands;
if (first_count.size() > deferred_call_size)
@@ -596,10 +608,7 @@ namespace rsx
first_count[deferred_call_size - 1].second = count;
first_count.resize(deferred_call_size);
}
has_deferred_call = true;
flush_commands_flag = false;
execute_method_call = false;
}
}
break;
@@ -1049,24 +1058,33 @@ namespace rsx
void thread::get_current_vertex_program()
{
auto &result = current_vertex_program = {};
const u32 transform_program_start = rsx::method_registers.transform_program_start();
result.data.reserve((512 - transform_program_start) * 4);
result.rsx_vertex_inputs.reserve(rsx::limits::vertex_count);
current_vertex_program.output_mask = rsx::method_registers.vertex_attrib_output_mask();
current_vertex_program.skip_vertex_input_check = false;
current_vertex_program.rsx_vertex_inputs.resize(0);
current_vertex_program.data.resize(512 * 4);
current_vertex_program.rsx_vertex_inputs.reserve(rsx::limits::vertex_count);
u32* ucode_src = rsx::method_registers.transform_program.data() + (transform_program_start * 4);
u32* ucode_dst = current_vertex_program.data.data();
u32 ucode_size = 0;
D3 d3;
for (int i = transform_program_start; i < 512; ++i)
{
result.data.resize((i - transform_program_start) * 4 + 4);
memcpy(result.data.data() + (i - transform_program_start) * 4, rsx::method_registers.transform_program.data() + i * 4, 4 * sizeof(u32));
D3 d3;
d3.HEX = rsx::method_registers.transform_program[i * 4 + 3];
ucode_size += 4;
memcpy(ucode_dst, ucode_src, 4 * sizeof(u32));
d3.HEX = ucode_src[3];
if (d3.end)
break;
ucode_src += 4;
ucode_dst += 4;
}
result.output_mask = rsx::method_registers.vertex_attrib_output_mask();
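//The buffer was over-allocated to the full 512-instruction program space; shrink it to the ucode actually copied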
current_vertex_program.data.resize(ucode_size);
const u32 input_mask = rsx::method_registers.vertex_attrib_input_mask();
const u32 modulo_mask = rsx::method_registers.frequency_divider_operation_mask();
@@ -1079,7 +1097,7 @@ namespace rsx
if (rsx::method_registers.vertex_arrays_info[index].size() > 0)
{
result.rsx_vertex_inputs.push_back(
current_vertex_program.rsx_vertex_inputs.push_back(
{index,
rsx::method_registers.vertex_arrays_info[index].size(),
rsx::method_registers.vertex_arrays_info[index].frequency(),
@@ -1089,7 +1107,7 @@ namespace rsx
}
else if (vertex_push_buffers[index].vertex_count > 1)
{
result.rsx_vertex_inputs.push_back(
current_vertex_program.rsx_vertex_inputs.push_back(
{ index,
rsx::method_registers.register_vertex_info[index].size,
1,
@@ -1099,7 +1117,7 @@ namespace rsx
}
else if (rsx::method_registers.register_vertex_info[index].size > 0)
{
result.rsx_vertex_inputs.push_back(
current_vertex_program.rsx_vertex_inputs.push_back(
{index,
rsx::method_registers.register_vertex_info[index].size,
rsx::method_registers.register_vertex_info[index].frequency,

View File

@@ -565,7 +565,7 @@ namespace vk
}
cached_texture_section* upload_image_from_cpu(vk::command_buffer& cmd, u32 rsx_address, u16 width, u16 height, u16 depth, u16 mipmaps, u16 pitch, const u32 gcm_format,
std::vector<rsx_subresource_layout>& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled,
const rsx::texture_upload_context context, std::vector<rsx_subresource_layout>& subresource_layout, const rsx::texture_dimension_extended type, const bool swizzled,
std::pair<std::array<u8, 4>, std::array<u8, 4>>& remap_vector) override
{
auto section = create_new_texture(cmd, rsx_address, pitch * height, width, height, depth, mipmaps, gcm_format, type,
@@ -578,7 +578,10 @@ namespace vk
vk::enter_uninterruptible();
vk::copy_mipmaped_image_using_buffer(cmd, image->value, subresource_layout, gcm_format, swizzled, mipmaps, subres_range.aspectMask,
//Swizzling is ignored for blit engine copy and emulated using a swapped order image view
bool input_swizzled = (context == rsx::texture_upload_context::blit_engine_src) ? false : swizzled;
vk::copy_mipmaped_image_using_buffer(cmd, image->value, subresource_layout, gcm_format, input_swizzled, mipmaps, subres_range.aspectMask,
*m_texture_upload_heap, m_texture_upload_buffer);
vk::leave_uninterruptible();

View File

@@ -70,7 +70,7 @@ namespace rsx
bool locked = false;
bool dirty = false;
inline bool region_overlaps(u32 base1, u32 limit1, u32 base2, u32 limit2)
inline bool region_overlaps(u32 base1, u32 limit1, u32 base2, u32 limit2) const
{
return (base1 < limit2 && base2 < limit1);
}
@@ -133,12 +133,12 @@ namespace rsx
locked = false;
}
bool overlaps(std::pair<u32, u32> range)
bool overlaps(std::pair<u32, u32> range) const
{
return region_overlaps(locked_address_base, locked_address_base + locked_address_range, range.first, range.first + range.second);
}
bool overlaps(u32 address)
bool overlaps(u32 address) const
{
return (locked_address_base <= address && (address - locked_address_base) < locked_address_range);
}
@@ -148,7 +148,7 @@ namespace rsx
* ignore_protection_range - if true, the test does not check against the aligned protection range;
* it tests against the actual range of contents in memory instead
*/
bool overlaps(std::pair<u32, u32> range, bool ignore_protection_range)
bool overlaps(std::pair<u32, u32> range, bool ignore_protection_range) const
{
if (!ignore_protection_range)
return region_overlaps(locked_address_base, locked_address_base + locked_address_range, range.first, range.first + range.second);
@@ -160,7 +160,7 @@ namespace rsx
* Check if the page containing the address tramples this section, taking a previously trampled page range into account.
* If true, returns the range <min, max> with the updated invalid range
*/
std::tuple<bool, std::pair<u32, u32>> overlaps_page(std::pair<u32, u32> old_range, u32 address)
std::tuple<bool, std::pair<u32, u32>> overlaps_page(std::pair<u32, u32> old_range, u32 address) const
{
const u32 page_base = address & ~4095;
const u32 page_limit = address + 4096;
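//page_base is the start of the 4 KiB page containing the address; note page_limit extends one full page past the address itself, not past page_base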
@@ -204,7 +204,7 @@ namespace rsx
return (cpu_address_base == cpu_address && cpu_address_range == size);
}
std::pair<u32, u32> get_min_max(std::pair<u32, u32> current_min_max)
std::pair<u32, u32> get_min_max(std::pair<u32, u32> current_min_max) const
{
u32 min = std::min(current_min_max.first, locked_address_base);
u32 max = std::max(current_min_max.second, locked_address_base + locked_address_range);