rsx: Minor refactoring RSXThread

- Part 1 of many
2023-01-07 19:20:21 +03:00 · 2023-01-07 19:20:21 +03:00 · 3dba894369
parent 659ee81e80
commit 3dba894369
22 changed files with 637 additions and 525 deletions
--- a/rpcs3/Emu/Cell/PPUThread.cpp
+++ b/rpcs3/Emu/Cell/PPUThread.cpp
@ -9,7 +9,7 @@
 #include "Emu/perf_meter.hpp"
 #include "Emu/Memory/vm_reservation.h"
 #include "Emu/Memory/vm_locking.h"
-#include "Emu/RSX/RSXThread.h"
+#include "Emu/RSX/Core/RSXReservationLock.hpp"
 #include "Emu/VFS.h"
 #include "Emu/system_progress.hpp"
 #include "Emu/system_utils.hpp"
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@ -9,7 +9,6 @@
 #include "Emu/VFS.h"
 #include "Emu/IdManager.h"
 #include "Emu/perf_meter.hpp"
-#include "Emu/RSX/RSXThread.h"
 #include "Emu/Cell/PPUThread.h"
 #include "Emu/Cell/ErrorCodes.h"
 #include "Emu/Cell/lv2/sys_spu.h"
@ -23,6 +22,9 @@
 #include "Emu/Cell/SPURecompiler.h"
 #include "Emu/Cell/timers.hpp"

+#include "Emu/RSX/Core/RSXReservationLock.hpp"
+#include "Emu/RSX/RSXThread.h"
+
 #include <cmath>
 #include <cfenv>
 #include <thread>
--- a/rpcs3/Emu/Cell/lv2/sys_rsx.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_rsx.cpp
@ -5,6 +5,8 @@
 #include "Emu/Cell/ErrorCodes.h"
 #include "Emu/Cell/timers.hpp"
 #include "Emu/Memory/vm_locking.h"
+#include "Emu/RSX/Core/RSXEngLock.hpp"
+#include "Emu/RSX/Core/RSXReservationLock.hpp"
 #include "Emu/RSX/RSXThread.h"
 #include "util/asm.hpp"
 #include "sys_event.h"
--- a/rpcs3/Emu/RSX/Core/RSXDisplay.h
+++ b/rpcs3/Emu/RSX/Core/RSXDisplay.h
@ -0,0 +1,73 @@
+#pragma once
+
+#include <util/types.hpp>
+#include <util/logs.hpp>
+#include <deque>
+
+namespace rsx
+{
+	struct frame_statistics_t
+	{
+		u32 draw_calls;
+		u32 submit_count;
+
+		s64 setup_time;
+		s64 vertex_upload_time;
+		s64 textures_upload_time;
+		s64 draw_exec_time;
+		s64 flip_time;
+	};
+
+	struct display_flip_info_t
+	{
+		std::deque<u32> buffer_queue; // Pending buffer ids; push() appends at the back, pop() consumes from the front
+		u32 buffer;                   // Last buffer id successfully matched by pop()
+		bool skip_frame;
+		bool emu_flip;
+		bool in_progress;
+		frame_statistics_t stats;
+
+		inline void push(u32 _buffer) // Queue _buffer behind any entries already pending
+		{
+			buffer_queue.push_back(_buffer);
+		}
+
+		inline bool pop(u32 _buffer) // Drops entries from the front until one equals _buffer; true on a match, false otherwise
+		{
+			if (buffer_queue.empty())
+			{
+				return false; // Nothing queued, nothing discarded, no error logged
+			}
+
+			do
+			{
+				const auto index = buffer_queue.front();
+				buffer_queue.pop_front(); // Entries in front of the match are discarded for good
+
+				if (index == _buffer)
+				{
+					buffer = _buffer;
+					return true;
+				}
+			} while (!buffer_queue.empty());
+
+			// The whole queue was drained without finding _buffer. Need to observe this happening in the wild
+			rsx_log.error("Display queue was discarded while not empty!");
+			return false;
+		}
+	};
+
+	class vblank_thread
+	{
+		std::shared_ptr<named_thread<std::function<void()>>> m_thread;
+
+	public:
+		vblank_thread() = default;
+		vblank_thread(const vblank_thread&) = delete;
+
+		void set_thread(std::shared_ptr<named_thread<std::function<void()>>> thread);
+
+		vblank_thread& operator=(thread_state);
+		vblank_thread& operator=(const vblank_thread&) = delete;
+	};
+}
--- a/rpcs3/Emu/RSX/Core/RSXEngLock.hpp
+++ b/rpcs3/Emu/RSX/Core/RSXEngLock.hpp
@ -0,0 +1,31 @@
+#pragma once
+
+#include <util/types.hpp>
+#include "../RSXThread.h"
+
+namespace rsx
+{
+	class eng_lock // Scoped guard: pauses the target rsx::thread for the guard's lifetime, unless constructed on that very thread
+	{
+		rsx::thread* pthr; // Thread to unpause on destruction; nullptr when this guard paused nothing
+
+	public:
+		explicit eng_lock(rsx::thread* target) // explicit: a raw pointer must not silently convert into a pausing guard
+			:pthr(target)
+		{
+			if (pthr->is_current_thread()) // target is dereferenced unconditionally, so it must be non-null
+			{
+				pthr = nullptr; // Already running on the target thread: skip pause() here and unpause() later
+			}
+			else
+			{
+				pthr->pause();
+			}
+		}
+
+		~eng_lock() // NOTE(review): copy operations are implicitly defaulted; each copy would call unpause() again - consider deleting them
+		{
+			if (pthr) pthr->unpause();
+		}
+	};
+}
--- a/rpcs3/Emu/RSX/Core/RSXFrameBuffer.h
+++ b/rpcs3/Emu/RSX/Core/RSXFrameBuffer.h
@ -0,0 +1,42 @@
+#pragma once
+
+#include <util/types.hpp>
+#include "../gcm_enums.h"
+#include "../GCM.h"
+
+namespace rsx
+{
+	struct tiled_region
+	{
+		u32 address;
+		u32 base;
+		GcmTileInfo* tile;
+		u8* ptr;
+
+		void write(const void* src, u32 width, u32 height, u32 pitch);
+		void read(void* dst, u32 width, u32 height, u32 pitch);
+	};
+
+	struct framebuffer_layout
+	{
+		ENABLE_BITWISE_SERIALIZATION;
+
+		u16 width;
+		u16 height;
+		std::array<u32, 4> color_addresses;
+		std::array<u32, 4> color_pitch;
+		std::array<u32, 4> actual_color_pitch;
+		std::array<bool, 4> color_write_enabled;
+		u32 zeta_address;
+		u32 zeta_pitch;
+		u32 actual_zeta_pitch;
+		bool zeta_write_enabled;
+		rsx::surface_target target;
+		rsx::surface_color_format color_format;
+		rsx::surface_depth_format2 depth_format;
+		rsx::surface_antialiasing aa_mode;
+		rsx::surface_raster_type raster_type;
+		u32 aa_factors[2];
+		bool ignore_change;
+	};
+}
--- a/rpcs3/Emu/RSX/Core/RSXIOMap.hpp
+++ b/rpcs3/Emu/RSX/Core/RSXIOMap.hpp
@ -0,0 +1,86 @@
+#pragma once
+
+#include <util/types.hpp>
+#include "Utilities/mutex.h"
+#include "Emu/CPU/CPUThread.h"
+
+namespace rsx
+{
+	struct rsx_iomap_table
+	{
+		static constexpr u32 c_lock_stride = 8192; // Bytes of address space guarded by one entry of 'rs'
+
+		std::array<atomic_t<u32>, 4096> ea;
+		std::array<atomic_t<u32>, 4096> io;
+		std::array<shared_mutex, 0x1'0000'0000 / c_lock_stride> rs; // One lock per c_lock_stride-sized block of the 32-bit address space
+
+		rsx_iomap_table() noexcept;
+
+		// Try to get the real address given a mapped address
+		// Returns -1 on failure
+		u32 get_addr(u32 offs) const noexcept
+		{
+			return this->ea[offs >> 20] | (offs & 0xFFFFF); // Upper bits select the 'ea' entry, the low 20 bits are kept as the offset
+		}
+
+		template <bool IsFullLock, uint Stride> // IsFullLock: exclusive instead of shared locking; Stride: step, in blocks, between locked entries
+		bool lock(u32 addr, u32 len, cpu_thread* self = nullptr) noexcept // Returns false (nothing locked) when len <= 1, true otherwise
+		{
+			if (len <= 1) return false;
+			const u32 end = addr + len - 1; // Last byte covered by the request
+
+			bool added_wait = false;
+
+			for (u32 block = addr / c_lock_stride; block <= (end / c_lock_stride); block += Stride)
+			{
+				auto& mutex_ = rs[block];
+
+				if (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared()) [[ unlikely ]] // Contended: fall back to a waiting acquire
+				{
+					if (self)
+					{
+						added_wait |= !self->state.test_and_set(cpu_flag::wait); // Presumably true only if this call newly raised the wait flag - confirm test_and_set semantics
+					}
+
+					if (!self || self->id_type() != 0x55u) // NOTE(review): 0x55 presumably identifies the RSX thread type - confirm
+					{
+						IsFullLock ? mutex_.lock() : mutex_.lock_shared();
+					}
+					else
+					{
+						while (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared())
+						{
+							self->cpu_wait({});
+						}
+					}
+				}
+			}
+
+			if (added_wait)
+			{
+				self->check_state(); // added_wait can only be set when self is non-null
+			}
+
+			return true;
+		}
+
+		template <bool IsFullLock, uint Stride> // Must match the template arguments used for the corresponding lock()
+		void unlock(u32 addr, u32 len) noexcept // Releases every block that lock(addr, len) visited
+		{
+			ensure(len >= 1);
+			const u32 end = addr + len - 1;
+
+			for (u32 block = (addr / c_lock_stride); block <= (end / c_lock_stride); block += Stride) // Same block walk as lock(), driven by the same constant
+			{
+				if constexpr (IsFullLock)
+				{
+					rs[block].unlock();
+				}
+				else
+				{
+					rs[block].unlock_shared();
+				}
+			}
+		}
+	};
+}
--- a/rpcs3/Emu/RSX/Core/RSXReservationLock.hpp
+++ b/rpcs3/Emu/RSX/Core/RSXReservationLock.hpp
@ -0,0 +1,106 @@
+#pragma once
+
+#include <util/types.hpp>
+#include "../RSXThread.h"
+
+namespace rsx
+{
+	template<bool IsFullLock = false, uint Stride = 128>
+	class reservation_lock // Scoped holder of an iomap_table range lock on the current renderer; released by the destructor
+	{
+		u32 addr = 0;
+		u32 length = 0; // 0 means no range is currently held
+
+		inline void lock_range(u32 addr, u32 length) // Parameters shadow the members of the same name
+		{
+			if (!get_current_renderer()->iomap_table.lock<IsFullLock, Stride>(addr, length, get_current_cpu_thread()))
+			{
+				length = 0; // Resets the parameter, so this->length below records that nothing was locked
+			}
+
+			this->addr = addr;
+			this->length = length;
+		}
+
+	public:
+		reservation_lock(u32 addr, u32 length) // Locks only if rsx_accurate_res_access is enabled and addr is below local_mem_base
+		{
+			if (g_cfg.core.rsx_accurate_res_access &&
+				addr < constants::local_mem_base)
+			{
+				lock_range(addr, length);
+			}
+		}
+
+		reservation_lock(u32 addr, u32 length, bool setting) // Locks whenever 'setting' is true; no config or address check
+		{
+			if (setting)
+			{
+				lock_range(addr, length);
+			}
+		}
+
+		// Multi-range lock. If ranges overlap, the combined range will be acquired.
+		// If ranges do not overlap, the first range that is in main memory will be acquired.
+		reservation_lock(u32 dst_addr, u32 dst_length, u32 src_addr, u32 src_length)
+		{
+			if (g_cfg.core.rsx_accurate_res_access)
+			{
+				const auto range1 = utils::address_range::start_length(dst_addr, dst_length);
+				const auto range2 = utils::address_range::start_length(src_addr, src_length);
+				utils::address_range target_range;
+
+				if (!range1.overlaps(range2)) [[likely]]
+				{
+					target_range = (dst_addr < constants::local_mem_base) ? range1 : range2; // dst is preferred; src is only the fallback
+				}
+				else
+				{
+					// Very unlikely
+					target_range = range1.get_min_max(range2);
+				}
+
+				if (target_range.start < constants::local_mem_base) // Nothing is locked when the chosen range starts at or above local_mem_base
+				{
+					lock_range(target_range.start, target_range.length());
+				}
+			}
+		}
+
+		// Very special utility for batched transfers (SPU related)
+		template <typename T = void>
+		void update_if_enabled(u32 addr, u32 _length, const std::add_pointer_t<T>& lock_release = std::add_pointer_t<void>{})
+		{
+			// This check is not perfect but it covers the important cases fast (this check is only an optimization - forcing true disables it)
+			if (length && (this->addr / rsx_iomap_table::c_lock_stride != addr / rsx_iomap_table::c_lock_stride || (addr % rsx_iomap_table::c_lock_stride + _length) > rsx_iomap_table::c_lock_stride) && _length > 1)
+			{
+				if constexpr (!std::is_void_v<T>)
+				{
+					// See SPUThread.cpp
+					lock_release->release(0);
+				}
+
+				unlock(); // Drop the currently held range before acquiring the new one
+				lock_range(addr, _length);
+			}
+		}
+
+		void unlock(bool destructor = false) // Releases the held range, if any
+		{
+			if (length)
+			{
+				get_current_renderer()->iomap_table.unlock<IsFullLock, Stride>(addr, length);
+
+				if (!destructor)
+				{
+					length = 0; // Skipped on the destructor path, where the object is going away anyway
+				}
+			}
+		}
+
+		~reservation_lock() // NOTE(review): copy operations are implicitly defaulted; a copy would unlock the same range again - consider deleting them
+		{
+			unlock(true);
+		}
+	};
+}
--- a/rpcs3/Emu/RSX/Core/RSXVertexTypes.h
+++ b/rpcs3/Emu/RSX/Core/RSXVertexTypes.h
@ -0,0 +1,168 @@
+#pragma once
+
+#include <util/types.hpp>
+#include "../Common/simple_array.hpp"
+#include "../gcm_enums.h"
+
+#include <span>
+
+namespace rsx
+{
+	struct vertex_array_buffer
+	{
+		rsx::vertex_base_type type;
+		u8 attribute_size;
+		u8 stride;
+		std::span<const std::byte> data;
+		u8 index;
+		bool is_be;
+	};
+
+	struct vertex_array_register
+	{
+		rsx::vertex_base_type type;
+		u8 attribute_size;
+		std::array<u32, 4> data;
+		u8 index;
+	};
+
+	struct empty_vertex_array
+	{
+		u8 index;
+	};
+
+	struct draw_array_command
+	{
+		u32 __dummy;
+	};
+
+	struct draw_indexed_array_command
+	{
+		std::span<const std::byte> raw_index_buffer;
+	};
+
+	struct draw_inlined_array
+	{
+		u32 __dummy;
+		u32 __dummy2;
+	};
+
+	struct interleaved_attribute_t
+	{
+		u8 index;
+		bool modulo;
+		u16 frequency;
+	};
+
+	struct interleaved_range_info
+	{
+		bool interleaved = false;
+		bool single_vertex = false;
+		u32  base_offset = 0;
+		u32  real_offset_address = 0;
+		u8   memory_location = 0;
+		u8   attribute_stride = 0;
+
+		rsx::simple_array<interleaved_attribute_t> locations;
+
+		// Check if we need to upload a full unoptimized range, i.e [0-max_index]
+		std::pair<u32, u32> calculate_required_range(u32 first, u32 count) const;
+	};
+
+	enum attribute_buffer_placement : u8
+	{
+		none = 0,
+		persistent = 1,
+		transient = 2
+	};
+
+	class vertex_input_layout
+	{
+		int m_num_used_blocks = 0;
+		std::array<interleaved_range_info, 16> m_blocks_data{};
+
+	public:
+		rsx::simple_array<interleaved_range_info*> interleaved_blocks{};  // Interleaved blocks to be uploaded as-is
+		std::vector<std::pair<u8, u32>> volatile_blocks{};                // Volatile data blocks (immediate draw vertex data for example)
+		rsx::simple_array<u8> referenced_registers{};                     // Volatile register data
+
+		std::array<attribute_buffer_placement, 16> attribute_placement = fill_array(attribute_buffer_placement::none);
+
+		vertex_input_layout() = default;
+
+		interleaved_range_info* alloc_interleaved_block()
+		{
+			auto result = &m_blocks_data[m_num_used_blocks++];
+			result->attribute_stride = 0;
+			result->base_offset = 0;
+			result->memory_location = 0;
+			result->real_offset_address = 0;
+			result->single_vertex = false;
+			result->locations.clear();
+			result->interleaved = true;
+			return result;
+		}
+
+		void clear()
+		{
+			m_num_used_blocks = 0;
+			interleaved_blocks.clear();
+			volatile_blocks.clear();
+			referenced_registers.clear();
+		}
+
+		bool validate() const
+		{
+			// Criteria: At least one array stream has to be defined to feed vertex positions
+			// This stream cannot be a const register as the vertices cannot create a zero-area primitive
+
+			if (!interleaved_blocks.empty() && interleaved_blocks[0]->attribute_stride != 0)
+				return true;
+
+			if (!volatile_blocks.empty())
+				return true;
+
+			for (u8 index = 0; index < limits::vertex_count; ++index)
+			{
+				switch (attribute_placement[index])
+				{
+				case attribute_buffer_placement::transient:
+				{
+					// Ignore register reference
+					if (std::find(referenced_registers.begin(), referenced_registers.end(), index) != referenced_registers.end())
+						continue;
+
+					// The source is inline array or immediate draw push buffer
+					return true;
+				}
+				case attribute_buffer_placement::persistent:
+				{
+					return true;
+				}
+				case attribute_buffer_placement::none:
+				{
+					continue;
+				}
+				default:
+				{
+					fmt::throw_exception("Unreachable");
+				}
+				}
+			}
+
+			return false;
+		}
+
+		u32 calculate_interleaved_memory_requirements(u32 first_vertex, u32 vertex_count) const
+		{
+			u32 mem = 0;
+			for (auto& block : interleaved_blocks)
+			{
+				const auto range = block->calculate_required_range(first_vertex, vertex_count);
+				mem += range.second * block->attribute_stride;
+			}
+
+			return mem;
+		}
+	};
+}
--- a/rpcs3/Emu/RSX/GL/GLGSRender.cpp
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.cpp
@ -1037,11 +1037,11 @@ void GLGSRender::on_semaphore_acquire_wait()
 	if (!work_queue.empty() ||
 		(async_flip_requested & flip_request::emu_requested))
 	{
-		do_local_task(rsx::FIFO_state::lock_wait);
+		do_local_task(rsx::FIFO::state::lock_wait);
 	}
 }

-void GLGSRender::do_local_task(rsx::FIFO_state state)
+void GLGSRender::do_local_task(rsx::FIFO::state state)
 {
 	if (!work_queue.empty())
 	{
@ -1058,7 +1058,7 @@ void GLGSRender::do_local_task(rsx::FIFO_state state)
 			q.processed = true;
 		}
 	}
-	else if (!in_begin_end && state != rsx::FIFO_state::lock_wait)
+	else if (!in_begin_end && state != rsx::FIFO::state::lock_wait)
 	{
 		if (m_graphics_state & rsx::pipeline_state::framebuffer_reads_dirty)
 		{
@ -1071,7 +1071,7 @@ void GLGSRender::do_local_task(rsx::FIFO_state state)

 	rsx::thread::do_local_task(state);

-	if (state == rsx::FIFO_state::lock_wait)
+	if (state == rsx::FIFO::state::lock_wait)
 	{
 		// Critical check finished
 		return;
--- a/rpcs3/Emu/RSX/GL/GLGSRender.h
+++ b/rpcs3/Emu/RSX/GL/GLGSRender.h
@ -193,7 +193,7 @@ protected:
 	void on_exit() override;
 	void flip(const rsx::display_flip_info_t& info) override;

-	void do_local_task(rsx::FIFO_state state) override;
+	void do_local_task(rsx::FIFO::state state) override;

 	bool on_access_violation(u32 address, bool is_writing) override;
 	void on_invalidate_memory_range(const utils::address_range &range, rsx::invalidation_cause cause) override;
--- a/rpcs3/Emu/RSX/RSXFIFO.cpp
+++ b/rpcs3/Emu/RSX/RSXFIFO.cpp
@ -4,6 +4,7 @@
 #include "RSXThread.h"
 #include "Capture/rsx_capture.h"
 #include "Common/time.hpp"
+#include "Core/RSXReservationLock.hpp"
 #include "Emu/Memory/vm_reservation.h"
 #include "Emu/Cell/lv2/sys_rsx.h"
 #include "util/asm.hpp"
@ -613,20 +614,20 @@ namespace rsx
 			{
 			case FIFO::FIFO_NOP:
 			{
-				if (performance_counters.state == FIFO_state::running)
+				if (performance_counters.state == FIFO::state::running)
 				{
 					performance_counters.FIFO_idle_timestamp = rsx::uclock();
-					performance_counters.state = FIFO_state::nop;
+					performance_counters.state = FIFO::state::nop;
 				}

 				return;
 			}
 			case FIFO::FIFO_EMPTY:
 			{
-				if (performance_counters.state == FIFO_state::running)
+				if (performance_counters.state == FIFO::state::running)
 				{
 					performance_counters.FIFO_idle_timestamp = rsx::uclock();
-					performance_counters.state = FIFO_state::empty;
+					performance_counters.state = FIFO::state::empty;
 				}
 				else
 				{
@ -658,13 +659,13 @@ namespace rsx
 				if (offs == fifo_ctrl->get_pos())
 				{
 					//Jump to self. Often preceded by NOP
-					if (performance_counters.state == FIFO_state::running)
+					if (performance_counters.state == FIFO::state::running)
 					{
 						performance_counters.FIFO_idle_timestamp = rsx::uclock();
 						sync_point_request.release(true);
 					}

-					performance_counters.state = FIFO_state::spinning;
+					performance_counters.state = FIFO::state::spinning;
 				}
 				else
 				{
@ -710,14 +711,14 @@ namespace rsx
 		}

 		if (const auto state = performance_counters.state;
-			state != FIFO_state::running)
+			state != FIFO::state::running)
 		{
-			performance_counters.state = FIFO_state::running;
+			performance_counters.state = FIFO::state::running;

 			// Hack: Delay FIFO wake-up according to setting
 			// NOTE: The typical spin setup is a NOP followed by a jump-to-self
 			// NOTE: There is a small delay when the jump address is dynamically edited by cell
-			if (state != FIFO_state::nop)
+			if (state != FIFO::state::nop)
 			{
 				fifo_wake_delay();
 			}
--- a/rpcs3/Emu/RSX/RSXFIFO.h
+++ b/rpcs3/Emu/RSX/RSXFIFO.h
@ -32,6 +32,22 @@ namespace rsx
 			EMIT_BARRIER = 2
 		};

+		enum class state : u8
+		{
+			running = 0,
+			empty = 1,    // PUT == GET
+			spinning = 2, // Puller continuously jumps to self addr (synchronization technique)
+			nop = 3,      // Puller is processing a NOP command
+			lock_wait = 4,// Puller is processing a lock acquire
+			paused = 5,   // Puller is paused externally
+		};
+
+		enum class interrupt_hint : u8
+		{
+			conditional_render_eval = 1,
+			zcull_sync = 2
+		};
+
 		struct register_pair
 		{
 			u32 reg;
--- a/rpcs3/Emu/RSX/RSXOffload.cpp
+++ b/rpcs3/Emu/RSX/RSXOffload.cpp
@ -2,6 +2,7 @@

 #include "Emu/Memory/vm.h"
 #include "Common/BufferUtils.h"
+#include "Core/RSXReservationLock.hpp"
 #include "RSXOffload.h"
 #include "RSXThread.h"

--- a/rpcs3/Emu/RSX/RSXThread.cpp
+++ b/rpcs3/Emu/RSX/RSXThread.cpp
@ -5,12 +5,14 @@
 #include "Emu/Cell/SPUThread.h"
 #include "Emu/Cell/timers.hpp"

+#include "Capture/rsx_capture.h"
 #include "Common/BufferUtils.h"
 #include "Common/buffer_stream.hpp"
 #include "Common/texture_cache.h"
 #include "Common/surface_store.h"
 #include "Common/time.hpp"
-#include "Capture/rsx_capture.h"
+#include "Core/RSXReservationLock.hpp"
+#include "Core/RSXEngLock.hpp"
 #include "rsx_methods.h"
 #include "gcm_printing.h"
 #include "RSXDisAsm.h"
@ -733,7 +735,7 @@ namespace rsx
 		if ((state & (cpu_flag::dbg_global_pause + cpu_flag::exit)) == cpu_flag::dbg_global_pause)
 		{
 			// Wait 16ms during emulation pause. This reduces cpu load while still giving us the chance to render overlays.
-			do_local_task(rsx::FIFO_state::paused);
+			do_local_task(rsx::FIFO::state::paused);
 			thread_ctrl::wait_on(state, old, 16000);
 		}
 		else
@ -803,7 +805,7 @@ namespace rsx
 		check_zcull_status(false);
 		nv4097::set_render_mode(this, 0, method_registers.registers[NV4097_SET_RENDER_ENABLE]);

-		performance_counters.state = FIFO_state::empty;
+		performance_counters.state = FIFO::state::empty;

 		const u64 event_flags = unsent_gcm_events.exchange(0);

@ -832,7 +834,7 @@ namespace rsx
 			thread_ctrl::wait_for(1000);
 		}

-		performance_counters.state = FIFO_state::running;
+		performance_counters.state = FIFO::state::running;

 		fifo_ctrl = std::make_unique<::rsx::FIFO::FIFO_control>(this);
 		fifo_ctrl->set_get(ctrl->get);
@ -994,7 +996,7 @@ namespace rsx

 		// Clear any pending flush requests to release threads
 		std::this_thread::sleep_for(10ms);
-		do_local_task(rsx::FIFO_state::lock_wait);
+		do_local_task(rsx::FIFO::state::lock_wait);

 		g_fxo->get<rsx::dma_manager>().join();
 		g_fxo->get<vblank_thread>() = thread_state::finished;
@ -1261,7 +1263,7 @@ namespace rsx
 		fmt::throw_exception("ill-formed draw command");
 	}

-	void thread::do_local_task(FIFO_state state)
+	void thread::do_local_task(FIFO::state state)
 	{
 		m_eng_interrupt_mask.clear(rsx::backend_interrupt);

@ -1272,7 +1274,7 @@ namespace rsx
 			handle_emu_flip(async_flip_buffer);
 		}

-		if (!in_begin_end && state != FIFO_state::lock_wait)
+		if (!in_begin_end && state != FIFO::state::lock_wait)
 		{
 			if (atomic_storage<u32>::load(m_invalidated_memory_range.end) != 0)
 			{
@ -2845,7 +2847,7 @@ namespace rsx
 			if (!result.queries.empty())
 			{
 				cond_render_ctrl.set_eval_sources(result.queries);
-				sync_hint(FIFO_hint::hint_conditional_render_eval, { .query = cond_render_ctrl.eval_sources.front(), .address = ref });
+				sync_hint(FIFO::interrupt_hint::conditional_render_eval, { .query = cond_render_ctrl.eval_sources.front(), .address = ref });
 			}
 			else
 			{
@ -2895,7 +2897,7 @@ namespace rsx
 		//ensure(async_tasks_pending.load() == 0);
 	}

-	void thread::sync_hint(FIFO_hint /*hint*/, rsx::reports::sync_hint_payload_t payload)
+	void thread::sync_hint(FIFO::interrupt_hint /*hint*/, rsx::reports::sync_hint_payload_t payload)
 	{
 		zcull_ctrl->on_sync_hint(payload);
 	}
--- a/rpcs3/Emu/RSX/RSXThread.h
+++ b/rpcs3/Emu/RSX/RSXThread.h
@ -28,6 +28,11 @@
 #include "Emu/IdManager.h"
 #include "Emu/system_config.h"

+#include "Core/RSXDisplay.h"
+#include "Core/RSXFrameBuffer.h"
+#include "Core/RSXIOMap.hpp"
+#include "Core/RSXVertexTypes.h"
+
 extern atomic_t<bool> g_user_asked_for_frame_capture;
 extern atomic_t<bool> g_disable_frame_limit;
 extern rsx::frame_trace_data frame_debug;
@ -40,84 +45,6 @@ namespace rsx
 		class display_manager;
 	}

-	struct rsx_iomap_table
-	{
-		static constexpr u32 c_lock_stride = 8192;
-
-		std::array<atomic_t<u32>, 4096> ea;
-		std::array<atomic_t<u32>, 4096> io;
-		std::array<shared_mutex, 0x1'0000'0000 / c_lock_stride> rs;
-
-		rsx_iomap_table() noexcept;
-
-		// Try to get the real address given a mapped address
-		// Returns -1 on failure
-		u32 get_addr(u32 offs) const noexcept
-		{
-			return this->ea[offs >> 20] | (offs & 0xFFFFF);
-		}
-
-		template <bool IsFullLock, uint Stride>
-		bool lock(u32 addr, u32 len, cpu_thread* self = nullptr) noexcept
-		{
-			if (len <= 1) return false;
-			const u32 end = addr + len - 1;
-
-			bool added_wait = false;
-
-			for (u32 block = addr / c_lock_stride; block <= (end / c_lock_stride); block += Stride)
-			{
-				auto& mutex_ = rs[block];
-
-				if (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared()) [[ unlikely ]]
-				{
-					if (self)
-					{
-						added_wait |= !self->state.test_and_set(cpu_flag::wait);
-					}
-
-					if (!self || self->id_type() != 0x55u)
-					{
-						IsFullLock ? mutex_.lock() : mutex_.lock_shared();
-					}
-					else
-					{
-						while (IsFullLock ? !mutex_.try_lock() : !mutex_.try_lock_shared())
-						{
-							self->cpu_wait({});
-						}
-					}
-				}
-			}
-
-			if (added_wait)
-			{
-				self->check_state();
-			}
-
-			return true;
-		}
-
-		template <bool IsFullLock, uint Stride>
-		void unlock(u32 addr, u32 len) noexcept
-		{
-			ensure(len >= 1);
-			const u32 end = addr + len - 1;
-
-			for (u32 block = (addr / 8192); block <= (end / 8192); block += Stride)
-			{
-				if constexpr (IsFullLock)
-				{
-					rs[block].unlock();
-				}
-				else
-				{
-					rs[block].unlock_shared();
-				}
-			}
-		}
-	};
-
 	enum framebuffer_creation_context : u8
 	{
 		context_draw = 0,
@ -175,22 +102,6 @@ namespace rsx
 		all_interrupt_bits = memory_config_interrupt | backend_interrupt | display_interrupt | pipe_flush_interrupt
 	};

-	enum FIFO_state : u8
-	{
-		running = 0,
-		empty = 1,    // PUT == GET
-		spinning = 2, // Puller continuously jumps to self addr (synchronization technique)
-		nop = 3,      // Puller is processing a NOP command
-		lock_wait = 4,// Puller is processing a lock acquire
-		paused = 5,   // Puller is paused externallly
-	};
-
-	enum FIFO_hint : u8
-	{
-		hint_conditional_render_eval = 1,
-		hint_zcull_sync = 2
-	};
-
 	enum result_flags: u8
 	{
 		result_none = 0,
@ -206,264 +117,6 @@ namespace rsx
 		const char* file = __builtin_FILE(),
 		const char* func = __builtin_FUNCTION());

-	struct tiled_region
-	{
-		u32 address;
-		u32 base;
-		GcmTileInfo *tile;
-		u8 *ptr;
-
-		void write(const void *src, u32 width, u32 height, u32 pitch);
-		void read(void *dst, u32 width, u32 height, u32 pitch);
-	};
-
-	struct vertex_array_buffer
-	{
-		rsx::vertex_base_type type;
-		u8 attribute_size;
-		u8 stride;
-		std::span<const std::byte> data;
-		u8 index;
-		bool is_be;
-	};
-
-	struct vertex_array_register
-	{
-		rsx::vertex_base_type type;
-		u8 attribute_size;
-		std::array<u32, 4> data;
-		u8 index;
-	};
-
-	struct empty_vertex_array
-	{
-		u8 index;
-	};
-
-	struct draw_array_command
-	{
-		u32 __dummy;
-	};
-
-	struct draw_indexed_array_command
-	{
-		std::span<const std::byte> raw_index_buffer;
-	};
-
-	struct draw_inlined_array
-	{
-		u32 __dummy;
-		u32 __dummy2;
-	};
-
-	struct interleaved_attribute_t
-	{
-		u8 index;
-		bool modulo;
-		u16 frequency;
-	};
-
-	struct interleaved_range_info
-	{
-		bool interleaved = false;
-		bool single_vertex = false;
-		u32  base_offset = 0;
-		u32  real_offset_address = 0;
-		u8   memory_location = 0;
-		u8   attribute_stride = 0;
-
-		rsx::simple_array<interleaved_attribute_t> locations;
-
-		// Check if we need to upload a full unoptimized range, i.e [0-max_index]
-		std::pair<u32, u32> calculate_required_range(u32 first, u32 count) const;
-	};
-
-	enum attribute_buffer_placement : u8
-	{
-		none = 0,
-		persistent = 1,
-		transient = 2
-	};
-
-	class vertex_input_layout
-	{
-		int m_num_used_blocks = 0;
-		std::array<interleaved_range_info, 16> m_blocks_data{};
-
-	public:
-		rsx::simple_array<interleaved_range_info*> interleaved_blocks{};  // Interleaved blocks to be uploaded as-is
-		std::vector<std::pair<u8, u32>> volatile_blocks{};                // Volatile data blocks (immediate draw vertex data for example)
-		rsx::simple_array<u8> referenced_registers{};                     // Volatile register data
-
-		std::array<attribute_buffer_placement, 16> attribute_placement = fill_array(attribute_buffer_placement::none);
-
-		vertex_input_layout() = default;
-
-		interleaved_range_info* alloc_interleaved_block()
-		{
-			auto result = &m_blocks_data[m_num_used_blocks++];
-			result->attribute_stride = 0;
-			result->base_offset = 0;
-			result->memory_location = 0;
-			result->real_offset_address = 0;
-			result->single_vertex = false;
-			result->locations.clear();
-			result->interleaved = true;
-			return result;
-		}
-
-		void clear()
-		{
-			m_num_used_blocks = 0;
-			interleaved_blocks.clear();
-			volatile_blocks.clear();
-			referenced_registers.clear();
-		}
-
-		bool validate() const
-		{
-			// Criteria: At least one array stream has to be defined to feed vertex positions
-			// This stream cannot be a const register as the vertices cannot create a zero-area primitive
-
-			if (!interleaved_blocks.empty() && interleaved_blocks[0]->attribute_stride != 0)
-				return true;
-
-			if (!volatile_blocks.empty())
-				return true;
-
-			for (u8 index = 0; index < limits::vertex_count; ++index)
-			{
-				switch (attribute_placement[index])
-				{
-				case attribute_buffer_placement::transient:
-				{
-					// Ignore register reference
-					if (std::find(referenced_registers.begin(), referenced_registers.end(), index) != referenced_registers.end())
-						continue;
-
-					// The source is inline array or immediate draw push buffer
-					return true;
-				}
-				case attribute_buffer_placement::persistent:
-				{
-					return true;
-				}
-				case attribute_buffer_placement::none:
-				{
-					continue;
-				}
-				default:
-				{
-					fmt::throw_exception("Unreachable");
-				}
-				}
-			}
-
-			return false;
-		}
-
-		u32 calculate_interleaved_memory_requirements(u32 first_vertex, u32 vertex_count) const
-		{
-			u32 mem = 0;
-			for (auto &block : interleaved_blocks)
-			{
-				const auto range = block->calculate_required_range(first_vertex, vertex_count);
-				mem += range.second * block->attribute_stride;
-			}
-
-			return mem;
-		}
-	};
-
-	struct framebuffer_layout
-	{
-		ENABLE_BITWISE_SERIALIZATION;
-
-		u16 width;
-		u16 height;
-		std::array<u32, 4> color_addresses;
-		std::array<u32, 4> color_pitch;
-		std::array<u32, 4> actual_color_pitch;
-		std::array<bool, 4> color_write_enabled;
-		u32 zeta_address;
-		u32 zeta_pitch;
-		u32 actual_zeta_pitch;
-		bool zeta_write_enabled;
-		rsx::surface_target target;
-		rsx::surface_color_format color_format;
-		rsx::surface_depth_format2 depth_format;
-		rsx::surface_antialiasing aa_mode;
-		rsx::surface_raster_type raster_type;
-		u32 aa_factors[2];
-		bool ignore_change;
-	};
-
-	struct frame_statistics_t
-	{
-		u32 draw_calls;
-		u32 submit_count;
-
-		s64 setup_time;
-		s64 vertex_upload_time;
-		s64 textures_upload_time;
-		s64 draw_exec_time;
-		s64 flip_time;
-	};
-
-	struct display_flip_info_t
-	{
-		std::deque<u32> buffer_queue;
-		u32 buffer;
-		bool skip_frame;
-		bool emu_flip;
-		bool in_progress;
-		frame_statistics_t stats;
-
-		inline void push(u32 _buffer)
-		{
-			buffer_queue.push_back(_buffer);
-		}
-
-		inline bool pop(u32 _buffer)
-		{
-			if (buffer_queue.empty())
-			{
-				return false;
-			}
-
-			do
-			{
-				const auto index = buffer_queue.front();
-				buffer_queue.pop_front();
-
-				if (index == _buffer)
-				{
-					buffer = _buffer;
-					return true;
-				}
-			}
-			while (!buffer_queue.empty());
-
-			// Need to observe this happening in the wild
-			rsx_log.error("Display queue was discarded while not empty!");
-			return false;
-		}
-	};
-
-	class vblank_thread
-	{
-		std::shared_ptr<named_thread<std::function<void()>>> m_thread;
-
-	public:
-		vblank_thread() = default;
-		vblank_thread(const vblank_thread&) = delete;
-
-		void set_thread(std::shared_ptr<named_thread<std::function<void()>>> thread);
-
-		vblank_thread& operator=(thread_state);
-		vblank_thread& operator=(const vblank_thread&) = delete;
-	};
-
 	struct backend_configuration
 	{
 		bool supports_multidraw;               // Draw call batching
@ -493,6 +146,7 @@ namespace rsx
 		u64 tsc;
 	};

+	// TODO: This class is a mess; it needs to be broken into smaller chunks, like I did for RSXFIFO and RSXZCULL (kd)
 	class thread : public cpu_thread
 	{
 		u64 timestamp_ctrl = 0;
@ -586,7 +240,7 @@ namespace rsx
 			atomic_t<u64> idle_time{ 0 };  // Time spent idling in microseconds
 			u64 last_update_timestamp = 0; // Timestamp of last load update
 			u64 FIFO_idle_timestamp = 0;   // Timestamp of when FIFO queue becomes idle
-			FIFO_state state = FIFO_state::running;
+			FIFO::state state = FIFO::state::running;
 			u32 approximate_load = 0;
 			u32 sampled_frames = 0;
 		}
@ -736,7 +390,7 @@ namespace rsx
 		/**
 		 * Execute a backend local task queue
 		 */
-		virtual void do_local_task(FIFO_state state);
+		virtual void do_local_task(FIFO::state state);

 		virtual void emit_geometry(u32) {}

@ -778,7 +432,7 @@ namespace rsx
 		// sync
 		void sync();
 		flags32_t read_barrier(u32 memory_address, u32 memory_range, bool unconditional);
-		virtual void sync_hint(FIFO_hint hint, reports::sync_hint_payload_t payload);
+		virtual void sync_hint(FIFO::interrupt_hint hint, reports::sync_hint_payload_t payload);
 		virtual bool release_GCM_label(u32 /*address*/, u32 /*value*/) { return false; }

 		std::span<const std::byte> get_raw_index_array(const draw_clause& draw_indexed_clause) const;
@ -899,126 +553,4 @@ namespace rsx
 	{
 		return g_fxo->try_get<rsx::thread>();
 	}
-
-	template<bool IsFullLock = false, uint Stride = 128>
-	class reservation_lock
-	{
-		u32 addr = 0;
-		u32 length = 0;
-
-		inline void lock_range(u32 addr, u32 length)
-		{
-			if (!get_current_renderer()->iomap_table.lock<IsFullLock, Stride>(addr, length, get_current_cpu_thread()))
-			{
-				length = 0;
-			}
-
-			this->addr = addr;
-			this->length = length;
-		}
-
-	public:
-		reservation_lock(u32 addr, u32 length)
-		{
-			if (g_cfg.core.rsx_accurate_res_access &&
-				addr < constants::local_mem_base)
-			{
-				lock_range(addr, length);
-			}
-		}
-
-		reservation_lock(u32 addr, u32 length, bool setting)
-		{
-			if (setting)
-			{
-				lock_range(addr, length);
-			}
-		}
-
-		// Multi-range lock. If ranges overlap, the combined range will be acquired.
-		// If ranges do not overlap, the first range that is in main memory will be acquired.
-		reservation_lock(u32 dst_addr, u32 dst_length, u32 src_addr, u32 src_length)
-		{
-			if (g_cfg.core.rsx_accurate_res_access)
-			{
-				const auto range1 = utils::address_range::start_length(dst_addr, dst_length);
-				const auto range2 = utils::address_range::start_length(src_addr, src_length);
-				utils::address_range target_range;
-
-				if (!range1.overlaps(range2)) [[likely]]
-				{
-					target_range = (dst_addr < constants::local_mem_base) ? range1 : range2;
-				}
-				else
-				{
-					// Very unlikely
-					target_range = range1.get_min_max(range2);
-				}
-
-				if (target_range.start < constants::local_mem_base)
-				{
-					lock_range(target_range.start, target_range.length());
-				}
-			}
-		}
-
-		// Very special utility for batched transfers (SPU related)
-		template <typename T = void>
-		void update_if_enabled(u32 addr, u32 _length, const std::add_pointer_t<T>& lock_release = std::add_pointer_t<void>{})
-		{
-			// This check is not perfect but it covers the important cases fast (this check is only an optimization - forcing true disables it)
-			if (length && (this->addr / rsx_iomap_table::c_lock_stride != addr / rsx_iomap_table::c_lock_stride || (addr % rsx_iomap_table::c_lock_stride + _length) > rsx_iomap_table::c_lock_stride) && _length > 1)
-			{
-				if constexpr (!std::is_void_v<T>)
-				{
-					// See SPUThread.cpp
-					lock_release->release(0);
-				}
-
-				unlock();
-				lock_range(addr, _length);
-			}
-		}
-
-		void unlock(bool destructor = false)
-		{
-			if (length)
-			{
-				get_current_renderer()->iomap_table.unlock<IsFullLock, Stride>(addr, length);
-
-				if (!destructor)
-				{
-					length = 0;
-				}
-			}
-		}
-
-		~reservation_lock()
-		{
-			unlock(true);
-		}
-	};
-
-	class eng_lock
-	{
-		rsx::thread* pthr;
-	public:
-		eng_lock(rsx::thread* target)
-			:pthr(target)
-		{
-			if (pthr->is_current_thread())
-			{
-				pthr = nullptr;
-			}
-			else
-			{
-				pthr->pause();
-			}
-		}
-
-		~eng_lock()
-		{
-			if (pthr) pthr->unpause();
-		}
-	};
 }
--- a/rpcs3/Emu/RSX/RSXZCULL.cpp
+++ b/rpcs3/Emu/RSX/RSXZCULL.cpp
@ -1,4 +1,6 @@
 #include "stdafx.h"
+#include "Core/RSXEngLock.hpp"
+#include "Core/RSXReservationLock.hpp"
 #include "RSXThread.h"

 namespace rsx
@ -422,7 +424,7 @@ namespace rsx
 					if (It->query->sync_tag > m_sync_tag)
 					{
 						// rsx_log.trace("[Performance warning] Query hint emit during sync command.");
-						ptimer->sync_hint(FIFO_hint::hint_zcull_sync, { .query = It->query });
+						ptimer->sync_hint(FIFO::interrupt_hint::zcull_sync, { .query = It->query });
 					}

 					break;
@ -531,7 +533,7 @@ namespace rsx
 						{
 							if (It->query->num_draws && It->query->sync_tag > m_sync_tag)
 							{
-								ptimer->sync_hint(FIFO_hint::hint_zcull_sync, { .query = It->query });
+								ptimer->sync_hint(FIFO::interrupt_hint::zcull_sync, { .query = It->query });
 								ensure(It->query->sync_tag <= m_sync_tag);
 							}

@ -556,7 +558,7 @@ namespace rsx
 						const auto elapsed = m_tsc - front.query->timestamp;
 						if (elapsed > max_zcull_delay_us)
 						{
-							ptimer->sync_hint(FIFO_hint::hint_zcull_sync, { .query = front.query });
+							ptimer->sync_hint(FIFO::interrupt_hint::zcull_sync, { .query = front.query });
 							ensure(front.query->sync_tag <= m_sync_tag);
 						}

@ -704,7 +706,7 @@ namespace rsx
 				{
 					if (query->sync_tag > m_sync_tag) [[unlikely]]
 					{
-						ptimer->sync_hint(FIFO_hint::hint_zcull_sync, { .query = query });
+						ptimer->sync_hint(FIFO::interrupt_hint::zcull_sync, { .query = query });
 						ensure(m_sync_tag >= query->sync_tag);
 					}
 				}
--- a/rpcs3/Emu/RSX/VK/VKGSRender.cpp
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.cpp
@ -673,7 +673,7 @@ VKGSRender::~VKGSRender()
 	// Flush DMA queue
 	while (!g_fxo->get<rsx::dma_manager>().sync())
 	{
-		do_local_task(rsx::FIFO_state::lock_wait);
+		do_local_task(rsx::FIFO::state::lock_wait);
 	}

 	//Wait for device to finish up with resources
@ -895,7 +895,7 @@ void VKGSRender::on_semaphore_acquire_wait()
 		(async_flip_requested & flip_request::emu_requested) ||
 		(m_queue_status & flush_queue_state::deadlock))
 	{
-		do_local_task(rsx::FIFO_state::lock_wait);
+		do_local_task(rsx::FIFO::state::lock_wait);
 	}
 }

@ -1602,7 +1602,7 @@ bool VKGSRender::release_GCM_label(u32 address, u32 args)
 	return true;
 }

-void VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_t payload)
+void VKGSRender::sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hint_payload_t payload)
 {
 	rsx::thread::sync_hint(hint, payload);

@ -1615,7 +1615,7 @@ void VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_
 	// Occlusion test result evaluation is coming up, avoid a hard sync
 	switch (hint)
 	{
-	case rsx::FIFO_hint::hint_conditional_render_eval:
+	case rsx::FIFO::interrupt_hint::conditional_render_eval:
 	{
 		// If a flush request is already enqueued, do nothing
 		if (m_flush_requests.pending())
@ -1645,7 +1645,7 @@ void VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_
 		m_last_cond_render_eval_hint = now;
 		break;
 	}
-	case rsx::FIFO_hint::hint_zcull_sync:
+	case rsx::FIFO::interrupt_hint::zcull_sync:
 	{
 		// Check if the required report is synced to this CB
 		auto& data = m_occlusion_map[payload.query->driver_handle];
@ -1672,7 +1672,7 @@ void VKGSRender::sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_
 	}
 }

-void VKGSRender::do_local_task(rsx::FIFO_state state)
+void VKGSRender::do_local_task(rsx::FIFO::state state)
 {
 	if (m_queue_status & flush_queue_state::deadlock)
 	{
@ -1702,7 +1702,7 @@ void VKGSRender::do_local_task(rsx::FIFO_state state)
 			m_flush_queue_mutex.unlock();
 		}
 	}
-	else if (!in_begin_end && state != rsx::FIFO_state::lock_wait)
+	else if (!in_begin_end && state != rsx::FIFO::state::lock_wait)
 	{
 		if (m_graphics_state & rsx::pipeline_state::framebuffer_reads_dirty)
 		{
@ -1717,11 +1717,11 @@ void VKGSRender::do_local_task(rsx::FIFO_state state)

 	switch (state)
 	{
-	case rsx::FIFO_state::lock_wait:
+	case rsx::FIFO::state::lock_wait:
 		// Critical check finished
 		return;
-	//case rsx::FIFO_state::spinning:
-	//case rsx::FIFO_state::empty:
+	//case rsx::FIFO::state::spinning:
+	//case rsx::FIFO::state::empty:
 		// We have some time, check the present queue
 		//check_present_status();
 		//break;
--- a/rpcs3/Emu/RSX/VK/VKGSRender.h
+++ b/rpcs3/Emu/RSX/VK/VKGSRender.h
@ -248,7 +248,7 @@ public:
 	void set_scissor(bool clip_viewport);
 	void bind_viewport();

-	void sync_hint(rsx::FIFO_hint hint, rsx::reports::sync_hint_payload_t payload) override;
+	void sync_hint(rsx::FIFO::interrupt_hint hint, rsx::reports::sync_hint_payload_t payload) override;
 	bool release_GCM_label(u32 address, u32 data) override;

 	void begin_occlusion_query(rsx::reports::occlusion_query_info* query) override;
@ -282,7 +282,7 @@ protected:

 	void renderctl(u32 request_code, void* args) override;

-	void do_local_task(rsx::FIFO_state state) override;
+	void do_local_task(rsx::FIFO::state state) override;
 	bool scaled_image_from_memory(rsx::blit_src_info& src, rsx::blit_dst_info& dst, bool interpolate) override;
 	void notify_tile_unbound(u32 tile) override;

--- a/rpcs3/Emu/RSX/rsx_methods.cpp
+++ b/rpcs3/Emu/RSX/rsx_methods.cpp
@ -4,6 +4,7 @@
 #include "rsx_utils.h"
 #include "rsx_decode.h"
 #include "Common/time.hpp"
+#include "Core/RSXReservationLock.hpp"
 #include "Emu/Cell/PPUCallback.h"
 #include "Emu/Cell/lv2/sys_rsx.h"
 #include "Emu/RSX/Common/BufferUtils.h"
@ -1278,6 +1279,11 @@ namespace rsx
 				out_pitch = out_bpp * out_w;
 			}

+			if (in_pitch == 0)
+			{
+				in_pitch = in_bpp * in_w;
+			}
+
 			if (in_bpp != out_bpp)
 			{
 				is_block_transfer = false;
@ -1680,12 +1686,6 @@ namespace rsx
 			const u8 in_format = method_registers.nv0039_input_format();
 			const u32 notify = arg;

-			// The existing GCM commands use only the value 0x1 for inFormat and outFormat
-			if (in_format != 0x01 || out_format != 0x01)
-			{
-				rsx_log.error("NV0039_BUFFER_NOTIFY: Unsupported format: inFormat=%d, outFormat=%d", in_format, out_format);
-			}
-
 			if (!line_count || !line_length)
 			{
 				rsx_log.warning("NV0039_BUFFER_NOTIFY NOPed out: pitch(in=0x%x, out=0x%x), line(len=0x%x, cnt=0x%x), fmt(in=0x%x, out=0x%x), notify=0x%x",
@ -1734,7 +1734,28 @@ namespace rsx
 				 (dst_offset >= src_offset && dst_offset < src_max);
 			}();

-			if (is_overlapping)
+			if (in_format > 1 || out_format > 1) [[ unlikely ]]
+			{
+				// The formats are just channel strides (in_format steps the source, out_format the destination).
+				// You can use this to do cool tricks like gathering channels. Very rare, only seen in use by Destiny
+				// TODO: Hw accel
+				for (u32 row = 0; row < line_count; ++row)
+				{
+					auto dst_ptr = dst;
+					auto src_ptr = src;
+					while (src_ptr < src + line_length)
+					{
+						*dst_ptr = *src_ptr;
+						// in_format == 0 would never advance src_ptr (endless loop, runaway dst writes); step by 1 instead
+						src_ptr += in_format ? in_format : 1;
+						dst_ptr += out_format;
+					}
+
+					dst += out_pitch;
+					src += in_pitch;
+				}
+			}
+			else if (is_overlapping) [[ unlikely ]]
 			{
 				if (is_block_transfer)
 				{
--- a/rpcs3/emucore.vcxproj
+++ b/rpcs3/emucore.vcxproj
@ -525,6 +525,12 @@
    <ClInclude Include="Emu\RSX\Common\simple_array.hpp" />
    <ClInclude Include="Emu\RSX\Common\surface_cache_dma.hpp" />
    <ClInclude Include="Emu\RSX\Common\time.hpp" />
+    <ClInclude Include="Emu\RSX\Core\RSXEngLock.hpp" />
+    <ClInclude Include="Emu\RSX\Core\RSXFrameBuffer.h" />
+    <ClInclude Include="Emu\RSX\Core\RSXIOMap.hpp" />
+    <ClInclude Include="Emu\RSX\Core\RSXDisplay.h" />
+    <ClInclude Include="Emu\RSX\Core\RSXReservationLock.hpp" />
+    <ClInclude Include="Emu\RSX\Core\RSXVertexTypes.h" />
    <ClInclude Include="Emu\RSX\Overlays\overlay_cursor.h" />
    <ClInclude Include="Emu\RSX\Overlays\overlay_edit_text.hpp" />
    <ClInclude Include="Emu\RSX\Overlays\overlay_list_view.hpp" />
--- a/rpcs3/emucore.vcxproj.filters
+++ b/rpcs3/emucore.vcxproj.filters
@ -76,6 +76,9 @@
    <Filter Include="Emu\GPU\RSX\Program\Interpreter">
      <UniqueIdentifier>{bc97b324-1eea-445a-8fa9-6fc49e3df47c}</UniqueIdentifier>
    </Filter>
+    <Filter Include="Emu\GPU\RSX\Core">
+      <UniqueIdentifier>{99b3a1c9-93ea-4498-86b0-1000793013fa}</UniqueIdentifier>
+    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="Crypto\aes.cpp">
@ -2206,6 +2209,24 @@
    <ClInclude Include="Emu\Io\recording_config.h">
      <Filter>Emu\Io</Filter>
    </ClInclude>
+    <ClInclude Include="Emu\RSX\Core\RSXIOMap.hpp">
+      <Filter>Emu\GPU\RSX\Core</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\Core\RSXDisplay.h">
+      <Filter>Emu\GPU\RSX\Core</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\Core\RSXVertexTypes.h">
+      <Filter>Emu\GPU\RSX\Core</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\Core\RSXFrameBuffer.h">
+      <Filter>Emu\GPU\RSX\Core</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\Core\RSXReservationLock.hpp">
+      <Filter>Emu\GPU\RSX\Core</Filter>
+    </ClInclude>
+    <ClInclude Include="Emu\RSX\Core\RSXEngLock.hpp">
+      <Filter>Emu\GPU\RSX\Core</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="Emu\RSX\Program\GLSLSnippets\GPUDeswizzle.glsl">