Core: ARM64 compatibility

This commit is contained in:
Stenzek 2024-03-21 17:06:25 +10:00 committed by Connor McLaughlin
parent 7d098674f2
commit 71036c95a4
26 changed files with 411 additions and 361 deletions

View File

@ -954,7 +954,6 @@ set(pcsx2x86Sources
x86/ix86-32/iR5900Templates.cpp
x86/ix86-32/recVTLB.cpp
x86/newVif_Dynarec.cpp
x86/newVif_Unpack.cpp
x86/newVif_UnpackSSE.cpp
)
@ -995,7 +994,6 @@ set(pcsx2x86Headers
x86/microVU_Tables.inl
x86/microVU_Upper.inl
x86/newVif.h
x86/newVif_HashBucket.h
x86/newVif_UnpackSSE.h
x86/R5900_Profiler.h
)

View File

@ -5,7 +5,6 @@
#include "GS.h"
#include "Gif_Unit.h"
#include "Vif_Dma.h"
#include "x86/iR5900.h"
// A three-way toggle used to determine if the GIF is stalling (transferring) or done (finished).
// Should be a gifstate_t rather than int, but I don't feel like possibly interfering with savestates right now.

View File

@ -118,6 +118,22 @@ struct Gif_Tag
// write out unpacked registers
_mm_storeu_si128(reinterpret_cast<__m128i*>(regs), vregs);
#elif defined(_M_ARM64)
// zero out bits for registers which shouldn't be tested
u64 REGS64;
std::memcpy(&REGS64, tag.REGS, sizeof(u64));
REGS64 &= (0xFFFFFFFFFFFFFFFFULL >> (64 - nRegs * 4));
uint8x16_t vregs = vreinterpretq_u8_u64(vsetq_lane_u64(REGS64, vdupq_n_u64(0), 0));
// get upper nibbles, interleave with lower nibbles, clear upper bits from low nibbles
vregs = vandq_u8(vzip1q_u8(vregs, vshrq_n_u8(vregs, 4)), vdupq_n_u8(0x0F));
// compare with GIF_REG_A_D, set hasAD if any lanes passed
const uint8x16_t comp = vceqq_u8(vregs, vdupq_n_u8(GIF_REG_A_D));
hasAD = vmaxvq_u8(comp) & 1;
// write out unpacked registers
vst1q_u8(regs, vregs);
#else
// Reference C implementation.
hasAD = false;
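The rest of the scalar branch is cut off by this hunk. A minimal sketch of the loop it presumably contains (one 4-bit register descriptor per iteration, low nibble first, matching the NEON path above; the local REGS64 is an assumption):
u64 REGS64;
std::memcpy(&REGS64, tag.REGS, sizeof(u64));
for (u32 i = 0; i < nRegs; i++)
{
    const u8 reg = (REGS64 >> (i * 4)) & 0x0F; // next descriptor nibble
    regs[i] = reg;
    if (reg == GIF_REG_A_D)
        hasAD = true;
}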

View File

@ -8,7 +8,6 @@
#include "R3000A.h"
#include "R5900.h"
#include "ps2/BiosTools.h"
#include "x86/iR3000A.h"
#include "VMManager.h"
#include <ctype.h>

View File

@ -11,7 +11,6 @@
#include "IopHw.h"
#include "Mdec.h"
#include "R3000A.h"
#include "x86/iR5900.h"
// NOTE: Any modifications to read/write fns should also go into their const counterparts
// found in iPsxHw.cpp.

View File

@ -14,7 +14,7 @@ const uptr *psxMemRLUT = nullptr;
IopVM_MemoryAllocMess* iopMem = nullptr;
alignas(__pagesize) u8 iopHw[Ps2MemSize::IopHardware];
alignas(__pagealignsize) u8 iopHw[Ps2MemSize::IopHardware];
void iopMemAlloc()
{
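Why the rename matters: alignas is evaluated at compile time, so statically allocated blocks that must start on a page boundary have to be aligned to the largest page size the binary may run under, not whatever the build machine uses. An illustrative sketch (assumed values; the real constants live in the common headers):
#if defined(_M_ARM64)
static constexpr size_t __pagealignsize_sketch = 0x4000; // covers 16KB pages (Apple Silicon)
#else
static constexpr size_t __pagealignsize_sketch = 0x1000; // 4KB pages
#endif
alignas(__pagealignsize_sketch) static u8 example_block[0x4000]; // page-aligned under either page size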

View File

@ -5,7 +5,7 @@
#include "Gif_Unit.h"
#include "MTVU.h"
#include "VMManager.h"
#include "x86/newVif.h"
#include "Vif_Dynarec.h"
#include <thread>

View File

@ -98,7 +98,7 @@ u8* SysMemory::TryAllocateVirtualMemory(const char* name, void* file_handle, upt
if (!baseptr)
return nullptr;
if ((uptr)baseptr != base)
if (base != 0 && (uptr)baseptr != base)
{
if (file_handle)
{
@ -122,6 +122,8 @@ u8* SysMemory::TryAllocateVirtualMemory(const char* name, void* file_handle, upt
u8* SysMemory::AllocateVirtualMemory(const char* name, void* file_handle, size_t size, size_t offset_from_base)
{
// ARM64 does not need the rec areas to be in +/- 2GB.
#ifdef _M_X86
pxAssertRel(Common::IsAlignedPow2(size, __pagesize), "Virtual memory size is page aligned");
// Everything looks nicer when the start of all the sections is a nice round looking number.
@ -148,6 +150,9 @@ u8* SysMemory::AllocateVirtualMemory(const char* name, void* file_handle, size_t
DevCon.Warning("%s: host memory @ 0x%016" PRIXPTR " -> 0x%016" PRIXPTR " is unavailable; attempting to map elsewhere...", name,
base, base + size);
}
#else
return TryAllocateVirtualMemory(name, file_handle, 0, size);
#endif
return nullptr;
}
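The new #ifdef _M_X86 guard reflects an addressing difference: the x86-64 recompilers reference globals rip-relatively, and that displacement is a signed 32-bit field, so every section has to land within +/-2GB of a fixed base. AArch64 code can materialize full 64-bit addresses, so base 0 (any placement) works and the fixed-base search is skipped. A sketch of the x86 constraint (illustrative helper, not this file's code):
#include <cstdint>
static bool RipRelativeReachable_sketch(const void* code, const void* target)
{
    const intptr_t delta = reinterpret_cast<intptr_t>(target) - reinterpret_cast<intptr_t>(code);
    return delta >= INT32_MIN && delta <= INT32_MAX; // the +/-2GB window
}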
@ -986,8 +991,8 @@ void memClearPageAddr(u32 vaddr)
///////////////////////////////////////////////////////////////////////////
// PS2 Memory Init / Reset / Shutdown
EEVM_MemoryAllocMess* eeMem = NULL;
alignas(__pagesize) u8 eeHw[Ps2MemSize::Hardware];
EEVM_MemoryAllocMess* eeMem = nullptr;
alignas(__pagealignsize) u8 eeHw[Ps2MemSize::Hardware];
void memBindConditionalHandlers()

View File

@ -59,8 +59,8 @@ struct IopVM_MemoryAllocMess
// order to allow for simpler macros and reference handles to be defined (we can safely use
// compile-time references to registers instead of having to use instance variables).
alignas(__pagesize) extern u8 eeHw[Ps2MemSize::Hardware];
alignas(__pagesize) extern u8 iopHw[Ps2MemSize::IopHardware];
alignas(__pagealignsize) extern u8 eeHw[Ps2MemSize::Hardware];
alignas(__pagealignsize) extern u8 iopHw[Ps2MemSize::IopHardware];
extern EEVM_MemoryAllocMess* eeMem;

View File

@ -34,8 +34,7 @@ using namespace R5900; // for R5900 disasm tools
s32 EEsCycle; // used to sync the IOP to the EE
u32 EEoCycle;
alignas(16) cpuRegisters cpuRegs;
alignas(16) fpuRegisters fpuRegs;
alignas(16) cpuRegistersPack _cpuRegistersPack;
alignas(16) tlbs tlb[48];
R5900cpu *Cpu = NULL;

View File

@ -202,10 +202,18 @@ struct tlbs
#endif
alignas(16) extern cpuRegisters cpuRegs;
alignas(16) extern fpuRegisters fpuRegs;
struct cpuRegistersPack
{
alignas(16) cpuRegisters cpuRegs;
alignas(16) fpuRegisters fpuRegs;
};
alignas(16) extern cpuRegistersPack _cpuRegistersPack;
alignas(16) extern tlbs tlb[48];
static cpuRegisters& cpuRegs = _cpuRegistersPack.cpuRegs;
static fpuRegisters& fpuRegs = _cpuRegistersPack.fpuRegs;
extern bool eeEventTestIsActive;
void intUpdateCPUCycles();
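Grouping both register files into one struct fixes their relative offsets, which presumably lets the ARM64 recompiler reach either file from a single pinned base pointer via immediate load/store offsets; the two static references keep every existing cpuRegs./fpuRegs. use compiling unchanged. Illustrative only (the emitter call is hypothetical):
#include <cstddef>
static_assert(offsetof(cpuRegistersPack, cpuRegs) == 0, "cpuRegs leads the pack");
// e.g. an AArch64 emitter could address fpuRegs off the same base register:
//   emitLdr(dst, REG_STATE_BASE, offsetof(cpuRegistersPack, fpuRegs) + reg_offset);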

View File

@ -11,7 +11,7 @@
#include "DebugTools/Debug.h"
#include "R3000A.h"
#include "x86/iR5900.h"
#include "R5900.h"
#include "fmt/core.h"

View File

@ -38,6 +38,7 @@
#include "SIO/Sio2.h"
#include "SPU2/spu2.h"
#include "USB/USB.h"
#include "Vif_Dynarec.h"
#include "VMManager.h"
#include "ps2/BiosTools.h"
#include "svnrev.h"
@ -76,10 +77,6 @@
#include "common/Darwin/DarwinMisc.h"
#endif
#ifdef _M_X86
#include "x86/newVif.h"
#endif
namespace VMManager
{
static void SetDefaultLoggingSettings(SettingsInterface& si);
@ -230,6 +227,14 @@ bool VMManager::PerformEarlyHardwareChecks(const char** error)
return false;
}
#endif
#elif defined(_M_ARM64)
// Check page size. If it doesn't match, it is a fatal error.
const size_t runtime_host_page_size = HostSys::GetRuntimePageSize();
if (__pagesize != runtime_host_page_size)
{
*error = "Page size mismatch. This build cannot run on your Mac.\n\n" COMMON_DOWNLOAD_MESSAGE;
return false;
}
#endif
#undef COMMON_DOWNLOAD_MESSAGE
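A plausible shape for the runtime query on POSIX systems (the real HostSys::GetRuntimePageSize() lives in the common library; this is only a sketch):
#include <unistd.h>
static size_t GetRuntimePageSize_sketch()
{
    const long sz = sysconf(_SC_PAGESIZE);
    return sz > 0 ? static_cast<size_t>(sz) : 0;
}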
@ -2502,6 +2507,7 @@ void VMManager::LogCPUCapabilities()
LogUserPowerPlan();
#endif
#ifdef _M_X86
std::string features;
if (cpuinfo_has_x86_avx())
features += "AVX ";
@ -2513,6 +2519,18 @@ void VMManager::LogCPUCapabilities()
Console.WriteLn(Color_StrongBlack, "x86 Features Detected:");
Console.WriteLnFmt(" {}", features);
Console.WriteLn();
#endif
#ifdef _M_ARM64
const size_t runtime_cache_line_size = HostSys::GetRuntimeCacheLineSize();
if (__cachelinesize != runtime_cache_line_size)
{
// Not fatal, but does have performance implications.
WARNING_LOG(
"Cache line size mismatch. This build was compiled with {} byte lines, but the system has {} byte lines.",
__cachelinesize, runtime_cache_line_size);
}
#endif
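A hedged sketch of how such a query can look on macOS (the actual HostSys::GetRuntimeCacheLineSize() may differ):
#include <sys/sysctl.h>
static size_t GetRuntimeCacheLineSize_sketch()
{
    size_t line = 0;
    size_t len = sizeof(line);
    if (sysctlbyname("hw.cachelinesize", &line, &len, nullptr, 0) != 0)
        return 0;
    return line;
}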
#if 0
LogGPUCapabilities();
@ -3197,6 +3215,8 @@ void VMManager::WarnAboutUnsafeSettings()
append(ICON_FA_EXCLAMATION_CIRCLE,
TRANSLATE_SV("VMManager", "INTC Spin Detection is not enabled, this may reduce performance."));
}
if (!EmuConfig.Cpu.Recompiler.EnableFastmem)
append(ICON_FA_EXCLAMATION_CIRCLE, TRANSLATE_SV("VMManager", "Fastmem is not enabled, this will reduce performance."));
if (!EmuConfig.Speedhacks.vu1Instant)
{
append(ICON_FA_EXCLAMATION_CIRCLE,
@ -3322,6 +3342,12 @@ static u32 GetProcessorIdForProcessor(const cpuinfo_processor* proc)
static void InitializeProcessorList()
{
if (!cpuinfo_initialize())
{
Console.Error("cpuinfo_initialize() failed");
return;
}
const u32 cluster_count = cpuinfo_get_clusters_count();
if (cluster_count == 0)
{
@ -3448,6 +3474,10 @@ static void InitializeProcessorList()
static void SetMTVUAndAffinityControlDefault(SettingsInterface& si)
{
#ifdef __APPLE__
// Everything we support Mac-wise has enough cores for MTVU.
si.SetBoolValue("EmuCore/Speedhacks", "vuThread", true);
#endif
}
#endif

View File

@ -29,6 +29,8 @@ static __fi void vu0SetMicroFlags(u32* flags, u32 value)
{
#ifdef _M_X86
_mm_store_si128(reinterpret_cast<__m128i*>(flags), _mm_set1_epi32(value));
#elif defined(_M_ARM64)
vst1q_u32(flags, vdupq_n_u32(value));
#else
flags[0] = flags[1] = flags[2] = flags[3] = value;
#endif

View File

@ -8,7 +8,7 @@
#include "MTVU.h"
#include "Vif.h"
#include "Vif_Dma.h"
#include "x86/newVif.h"
#include "Vif_Dynarec.h"
alignas(16) vifStruct vif0, vif1;

View File

@ -3,8 +3,8 @@
#include "Common.h"
#include "Vif_Dma.h"
#include "Vif_Dynarec.h"
#include "VUmicro.h"
#include "x86/newVif.h"
u32 g_vif0Cycles = 0;

View File

@ -7,7 +7,7 @@
#include "MTVU.h"
#include "VUmicro.h"
#include "Vif_Dma.h"
#include "x86/newVif.h"
#include "Vif_Dynarec.h"
u32 g_vif1Cycles = 0;

View File

@ -7,7 +7,7 @@
#include "MTVU.h"
#include "VUmicro.h"
#include "Vif_Dma.h"
#include "x86/newVif.h"
#include "Vif_Dynarec.h"
#define vifOp(vifCodeName) _vifT int vifCodeName(int pass, const u32* data)
#define pass1 if (pass == 0)

pcsx2/Vif_Dynarec.h Normal file (46 lines added)
View File

@ -0,0 +1,46 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
#pragma once
#include "Vif.h"
#include "Vif_HashBucket.h"
#include "VU.h"
typedef u32 (*nVifCall)(void*, const void*);
typedef void (*nVifrecCall)(uptr dest, uptr src);
extern void _nVifUnpack(int idx, const u8* data, uint mode, bool isFill);
extern void dVifReset(int idx);
extern void dVifRelease(int idx);
extern void VifUnpackSSE_Init();
_vifT extern void dVifUnpack(const u8* data, bool isFill);
struct nVifStruct
{
// Buffer for partial transfers (should always be first to ensure alignment)
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
alignas(16) u8 buffer[256*16];
u32 bSize; // Size of 'buffer'
// VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused.
// (templates are used for most or all VIF indexing)
u32 idx;
u8* recWritePtr; // current write pos into the reserve
u8* recEndPtr;
HashBucket vifBlocks; // Vif Blocks
nVifStruct() = default;
};
extern void resetNewVif(int idx);
alignas(16) extern nVifStruct nVif[2];
alignas(16) extern nVifCall nVifUpk[(2 * 2 * 16) * 4]; // ([USN][Masking][Unpack Type]) [curCycle]
alignas(16) extern u32 nVifMask[3][4][4]; // [MaskNumber][CycleNumber][Vector]
static constexpr bool newVifDynaRec = 1; // Use code in newVif_Dynarec.inl
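For orientation, the dispatch slot implied by that comment (and by the lookup in _nVifUnpackLoop in Vif_Unpack.cpp, later in this diff) decomposes as follows; bit 4 of the VIF command is the masking flag and the low four bits select the unpack type:
// slot = ((usn * 2 * 16) + (cmd & 0x1f)) * 4 + curCycle;  // curCycle clamped to 0..3
// const nVifCall fn = nVifUpk[slot];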

View File

@ -3,7 +3,7 @@
#include "Common.h"
#include "Vif_Dma.h"
#include "x86/newVif.h"
#include "Vif_Dynarec.h"
//------------------------------------------------------------------
// VifCode Transfer Interpreter (Vif0/Vif1)

View File

@ -4,6 +4,7 @@
#include "Common.h"
#include "Vif.h"
#include "Vif_Dma.h"
#include "Vif_Dynarec.h"
#include "MTVU.h"
enum UnpackOffset {
@ -244,3 +245,277 @@ _vifT void vifUnpackSetup(const u32 *data) {
template void vifUnpackSetup<0>(const u32 *data);
template void vifUnpackSetup<1>(const u32 *data);
alignas(16) nVifStruct nVif[2];
// Interpreter-style SSE unpacks. Array layout matches the interpreter C unpacks.
// ([USN][Masking][Unpack Type]) [curCycle]
alignas(16) nVifCall nVifUpk[(2 * 2 * 16) * 4];
// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks
// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
// [MaskNumber][CycleNumber][Vector]
alignas(16) u32 nVifMask[3][4][4] = {};
// Number of bytes of data in the source stream needed for each vector.
// [equivalent to ((32 >> VL) * (VN+1)) / 8]
alignas(16) const u8 nVifT[16] = {
4, // S-32
2, // S-16
1, // S-8
0, // ----
8, // V2-32
4, // V2-16
2, // V2-8
0, // ----
12,// V3-32
6, // V3-16
3, // V3-8
0, // ----
16,// V4-32
8, // V4-16
4, // V4-8
2, // V4-5
};
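A sanity sketch (illustrative, not part of this file): every valid entry follows the bracketed formula, while the three reserved slots are hardwired to zero.
constexpr u8 vifVectorBytes_sketch(unsigned upk) // upk = low 4 bits of the UNPACK number
{
    const unsigned vl = upk & 0x03;        // element width selector (32 >> vl bits)
    const unsigned vn = (upk >> 2) & 0x03; // elements per vector, minus one
    return static_cast<u8>(((32u >> vl) * (vn + 1)) / 8);
}
static_assert(vifVectorBytes_sketch(0) == 4);   // S-32
static_assert(vifVectorBytes_sketch(8) == 12);  // V3-32
static_assert(vifVectorBytes_sketch(15) == 2);  // V4-5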
// ----------------------------------------------------------------------------
template <int idx, bool doMode, bool isFill>
__ri void _nVifUnpackLoop(const u8* data);
typedef void FnType_VifUnpackLoop(const u8* data);
typedef FnType_VifUnpackLoop* Fnptr_VifUnpackLoop;
// Unpacks Until 'Num' is 0
alignas(16) static const Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
{
{_nVifUnpackLoop<0, 0, 0>, _nVifUnpackLoop<0, 0, 1>},
{_nVifUnpackLoop<0, 1, 0>, _nVifUnpackLoop<0, 1, 1>},
},
{
{_nVifUnpackLoop<1, 0, 0>, _nVifUnpackLoop<1, 0, 1>},
{_nVifUnpackLoop<1, 1, 0>, _nVifUnpackLoop<1, 1, 1>},
},
};
// ----------------------------------------------------------------------------
void resetNewVif(int idx)
{
// Safety Reset : Reassign all VIF structure info, just in case the VU1 pointers have
// changed for some reason.
nVif[idx].idx = idx;
nVif[idx].bSize = 0;
std::memset(nVif[idx].buffer, 0, sizeof(nVif[idx].buffer));
if (newVifDynaRec)
dVifReset(idx);
}
void releaseNewVif(int idx)
{
}
static __fi u8* getVUptr(uint idx, int offset)
{
return (u8*)(vuRegs[idx].Mem + (offset & (idx ? 0x3ff0 : 0xff0)));
}
_vifT int nVifUnpack(const u8* data)
{
nVifStruct& v = nVif[idx];
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
const uint wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
const uint ret = std::min(vif.vifpacketsize, vif.tag.size);
const bool isFill = (vifRegs.cycle.cl < wl);
s32 size = ret << 2;
if (ret == vif.tag.size) // Full Transfer
{
if (v.bSize) // Last transfer was partial
{
memcpy(&v.buffer[v.bSize], data, size);
v.bSize += size;
size = v.bSize;
data = v.buffer;
vif.cl = 0;
vifRegs.num = (vifXRegs.code >> 16) & 0xff; // grab NUM from the original VIFcode input.
if (!vifRegs.num)
vifRegs.num = 256;
}
if (!idx || !THREAD_VU1)
{
if (newVifDynaRec)
dVifUnpack<idx>(data, isFill);
else
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
else
vu1Thread.VifUnpack(vif, vifRegs, (u8*)data, (size + 4) & ~0x3);
vif.pass = 0;
vif.tag.size = 0;
vif.cmd = 0;
vifRegs.num = 0;
v.bSize = 0;
}
else // Partial Transfer
{
memcpy(&v.buffer[v.bSize], data, size);
v.bSize += size;
vif.tag.size -= ret;
const u8& vSize = nVifT[vif.cmd & 0x0f];
// We need to provide accurate accounting of the NUM register, in case games decided
// to read back from it mid-transfer. Since so few games actually use partial transfers
// of VIF unpacks, this code should not be any bottleneck.
if (!isFill)
{
vifRegs.num -= (size / vSize);
}
else
{
int dataSize = (size / vSize);
vifRegs.num = vifRegs.num - (((dataSize / vifRegs.cycle.cl) * (vifRegs.cycle.wl - vifRegs.cycle.cl)) + dataSize);
}
}
return ret;
}
template int nVifUnpack<0>(const u8* data);
template int nVifUnpack<1>(const u8* data);
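A worked example of the fill-mode bookkeeping above, with illustrative numbers: at cl=1, wl=4, each input vector yields one data write plus three fill writes, so two arriving vectors consume eight NUM counts.
constexpr int cl = 1, wl = 4, dataSize = 2; // two vectors in this fragment
static_assert(((dataSize / cl) * (wl - cl)) + dataSize == 8, "2 data writes + 6 fill writes");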
// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks
// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
static void setMasks(const vifStruct& vif, const VIFregisters& v)
{
for (int i = 0; i < 16; i++)
{
int m = (v.mask >> (i * 2)) & 3;
switch (m)
{
case 0: // Data
nVifMask[0][i / 4][i % 4] = 0xffffffff;
nVifMask[1][i / 4][i % 4] = 0;
nVifMask[2][i / 4][i % 4] = 0;
break;
case 1: // MaskRow
nVifMask[0][i / 4][i % 4] = 0;
nVifMask[1][i / 4][i % 4] = 0;
nVifMask[2][i / 4][i % 4] = vif.MaskRow._u32[i % 4];
break;
case 2: // MaskCol
nVifMask[0][i / 4][i % 4] = 0;
nVifMask[1][i / 4][i % 4] = 0;
nVifMask[2][i / 4][i % 4] = vif.MaskCol._u32[i / 4];
break;
case 3: // Write Protect
nVifMask[0][i / 4][i % 4] = 0;
nVifMask[1][i / 4][i % 4] = 0xffffffff;
nVifMask[2][i / 4][i % 4] = 0;
break;
}
}
}
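For orientation, the three planes written here presumably combine per 32-bit element roughly like this in the interpreted unpackers (a sketch; the recompiled path reads MaskRow/MaskCol directly):
// out = (unpacked & nVifMask[0][cycle][n])   // m=0: pass source data through
//     | (previous & nVifMask[1][cycle][n])   // m=3: write-protect keeps the old value
//     |             nVifMask[2][cycle][n];   // m=1/2: constant row/col value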
// ----------------------------------------------------------------------------
// Unpacking Optimization notes:
// ----------------------------------------------------------------------------
// Some games send a LOT of single-cycle packets (God of War, SotC, TriAce games, etc),
// so we always need to be wary of keeping loop setup code optimized. It's not always
// a "win" to move code outside the loop, as it normally is in most other loop scenarios.
//
// The biggest bottleneck of the current code is the call/ret needed to invoke the SSE
// unpackers. A better option is to generate the entire vifRegs.num loop code as part
// of the SSE template, and inline the SSE code into the heart of it. This both avoids
// the call/ret and opens the door for resolving some register dependency chains in the
// current emitted functions. (this is what zero's SSE does to get its final bit of
// speed advantage over the new vif). --air
//
// The BEST optimization strategy here is to use data available to us from the UNPACK dispatch
// -- namely the unpack type and mask flag -- in combination with mode and usn values -- to
// generate ~600 special versions of this function. But since it's an interpreter, who gives
// a crap? Really? :p
//
// size - size of the packet fragment incoming from DMAC.
template <int idx, bool doMode, bool isFill>
__ri void _nVifUnpackLoop(const u8* data)
{
vifStruct& vif = MTVU_VifX;
VIFregisters& vifRegs = MTVU_VifXRegs;
// skipSize used for skipping writes only
const int skipSize = (vifRegs.cycle.cl - vifRegs.cycle.wl) * 16;
//DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs.num, upkNum, vif.cl, blockSize, skipSize);
if (!doMode && (vif.cmd & 0x10))
setMasks(vif, vifRegs);
const int usn = !!vif.usn;
const int upkNum = vif.cmd & 0x1f;
const u8& vSize = nVifT[upkNum & 0x0f];
//uint vl = vif.cmd & 0x03;
//uint vn = (vif.cmd >> 2) & 0x3;
//uint vSize = ((32 >> vl) * (vn+1)) / 8; // size of data (in bytes) used for each write cycle
const nVifCall* fnbase = &nVifUpk[((usn * 2 * 16) + upkNum) * (4 * 1)];
const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][((usn * 2 * 16) + upkNum)];
pxAssume(vif.cl == 0);
//pxAssume (vifRegs.cycle.wl > 0);
do
{
u8* dest = getVUptr(idx, vif.tag.addr);
if (doMode)
{
//if (1) {
ft(dest, data);
}
else
{
//DevCon.WriteLn("SSE Unpack!");
uint cl3 = std::min(vif.cl, 3);
fnbase[cl3](dest, data);
}
vif.tag.addr += 16;
--vifRegs.num;
++vif.cl;
if (isFill)
{
//DevCon.WriteLn("isFill!");
if (vif.cl <= vifRegs.cycle.cl)
data += vSize;
else if (vif.cl == vifRegs.cycle.wl)
vif.cl = 0;
}
else
{
data += vSize;
if (vif.cl >= vifRegs.cycle.wl)
{
vif.tag.addr += skipSize;
vif.cl = 0;
}
}
} while (vifRegs.num);
}
__fi void _nVifUnpack(int idx, const u8* data, uint mode, bool isFill)
{
UnpackLoopTable[idx][!!mode][isFill](data);
}

View File

@ -425,9 +425,6 @@
<ClCompile Include="Vif_Codes.cpp" />
<ClCompile Include="Vif_Transfer.cpp" />
<ClCompile Include="Vif_Unpack.cpp" />
<ClCompile Include="x86\newVif_Unpack.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="x86\newVif_Dynarec.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='x64'">true</ExcludedFromBuild>
</ClCompile>
@ -819,6 +816,8 @@
<ClInclude Include="ps2\HwInternal.h" />
<ClInclude Include="Cache.h" />
<ClInclude Include="Memory.h" />
<ClInclude Include="Vif_Dynarec.h" />
<ClInclude Include="Vif_HashBucket.h" />
<ClInclude Include="VMManager.h" />
<ClInclude Include="vtlb.h" />
<ClInclude Include="MTVU.h" />
@ -838,7 +837,6 @@
<ClInclude Include="Vif_Dma.h" />
<ClInclude Include="Vif_Unpack.h" />
<ClInclude Include="x86\newVif.h" />
<ClInclude Include="x86\newVif_HashBucket.h" />
<ClInclude Include="x86\newVif_UnpackSSE.h" />
<ClInclude Include="SPR.h" />
<ClInclude Include="Gif.h" />

View File

@ -512,9 +512,6 @@
<ClCompile Include="Vif_Unpack.cpp">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack</Filter>
</ClCompile>
<ClCompile Include="x86\newVif_Unpack.cpp">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif</Filter>
</ClCompile>
<ClCompile Include="x86\newVif_Dynarec.cpp">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec</Filter>
</ClCompile>
@ -1484,9 +1481,6 @@
<ClInclude Include="x86\newVif.h">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif</Filter>
</ClInclude>
<ClInclude Include="x86\newVif_HashBucket.h">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif</Filter>
</ClInclude>
<ClInclude Include="x86\newVif_UnpackSSE.h">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec</Filter>
</ClInclude>
@ -2303,6 +2297,12 @@
<ClInclude Include="CDVD\FlatFileReader.h">
<Filter>System\ISO</Filter>
</ClInclude>
<ClInclude Include="Vif_Dynarec.h">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec</Filter>
</ClInclude>
<ClInclude Include="Vif_HashBucket.h">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="rdebug\deci2.h">

View File

@ -3,28 +3,14 @@
#pragma once
#include "Vif.h"
#include "VU.h"
#include "Vif_Dynarec.h"
#include "common/emitter/x86emitter.h"
using namespace x86Emitter;
// newVif_HashBucket.h uses this typedef, so it has to be declared first.
typedef u32 (*nVifCall)(void*, const void*);
typedef void (*nVifrecCall)(uptr dest, uptr src);
#include "newVif_HashBucket.h"
extern void mVUmergeRegs(const xRegisterSSE& dest, const xRegisterSSE& src, int xyzw, bool modXYZW = 0);
extern void mVUsaveReg(const xRegisterSSE& reg, xAddressVoid ptr, int xyzw, bool modXYZW);
extern void _nVifUnpack (int idx, const u8* data, uint mode, bool isFill);
extern void dVifReset (int idx);
extern void dVifClose (int idx);
extern void dVifRelease (int idx);
extern void VifUnpackSSE_Init();
_vifT extern void dVifUnpack(const u8* data, bool isFill);
#define VUFT VIFUnpackFuncTable
#define _v0 0
@ -37,31 +23,3 @@ _vifT extern void dVifUnpack(const u8* data, bool isFill);
#define xmmCol3 xmm5
#define xmmRow xmm6
#define xmmTemp xmm7
struct nVifStruct
{
// Buffer for partial transfers (should always be first to ensure alignment)
// Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword)
alignas(16) u8 buffer[256*16];
u32 bSize; // Size of 'buffer'
// VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused.
// (templates are used for most or all VIF indexing)
u32 idx;
u8* recWritePtr; // current write pos into the reserve
u8* recEndPtr;
HashBucket vifBlocks; // Vif Blocks
nVifStruct() = default;
};
extern void resetNewVif(int idx);
alignas(16) extern nVifStruct nVif[2];
alignas(16) extern nVifCall nVifUpk[(2 * 2 * 16) * 4]; // ([USN][Masking][Unpack Type]) [curCycle]
alignas(16) extern u32 nVifMask[3][4][4]; // [MaskNumber][CycleNumber][Vector]
static constexpr bool newVifDynaRec = 1; // Use code in newVif_Dynarec.inl

View File

@ -1,282 +0,0 @@
// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team
// SPDX-License-Identifier: LGPL-3.0+
#include "Common.h"
#include "Vif_Dma.h"
#include "newVif.h"
#include "MTVU.h"
alignas(16) nVifStruct nVif[2];
// Interpreter-style SSE unpacks. Array layout matches the interpreter C unpacks.
// ([USN][Masking][Unpack Type]) [curCycle]
alignas(16) nVifCall nVifUpk[(2 * 2 * 16) * 4];
// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks
// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
// [MaskNumber][CycleNumber][Vector]
alignas(16) u32 nVifMask[3][4][4] = {};
// Number of bytes of data in the source stream needed for each vector.
// [equivalent to ((32 >> VL) * (VN+1)) / 8]
alignas(16) const u8 nVifT[16] = {
4, // S-32
2, // S-16
1, // S-8
0, // ----
8, // V2-32
4, // V2-16
2, // V2-8
0, // ----
12,// V3-32
6, // V3-16
3, // V3-8
0, // ----
16,// V4-32
8, // V4-16
4, // V4-8
2, // V4-5
};
// ----------------------------------------------------------------------------
template <int idx, bool doMode, bool isFill>
__ri void _nVifUnpackLoop(const u8* data);
typedef void FnType_VifUnpackLoop(const u8* data);
typedef FnType_VifUnpackLoop* Fnptr_VifUnpackLoop;
// Unpacks Until 'Num' is 0
alignas(16) static const Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = {
{
{_nVifUnpackLoop<0, 0, 0>, _nVifUnpackLoop<0, 0, 1>},
{_nVifUnpackLoop<0, 1, 0>, _nVifUnpackLoop<0, 1, 1>},
},
{
{_nVifUnpackLoop<1, 0, 0>, _nVifUnpackLoop<1, 0, 1>},
{_nVifUnpackLoop<1, 1, 0>, _nVifUnpackLoop<1, 1, 1>},
},
};
// ----------------------------------------------------------------------------
void resetNewVif(int idx)
{
// Safety Reset : Reassign all VIF structure info, just in case the VU1 pointers have
// changed for some reason.
nVif[idx].idx = idx;
nVif[idx].bSize = 0;
std::memset(nVif[idx].buffer, 0, sizeof(nVif[idx].buffer));
if (newVifDynaRec)
dVifReset(idx);
}
void releaseNewVif(int idx)
{
}
static __fi u8* getVUptr(uint idx, int offset)
{
return (u8*)(vuRegs[idx].Mem + (offset & (idx ? 0x3ff0 : 0xff0)));
}
_vifT int nVifUnpack(const u8* data)
{
nVifStruct& v = nVif[idx];
vifStruct& vif = GetVifX;
VIFregisters& vifRegs = vifXRegs;
const uint wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256;
const uint ret = std::min(vif.vifpacketsize, vif.tag.size);
const bool isFill = (vifRegs.cycle.cl < wl);
s32 size = ret << 2;
if (ret == vif.tag.size) // Full Transfer
{
if (v.bSize) // Last transfer was partial
{
memcpy(&v.buffer[v.bSize], data, size);
v.bSize += size;
size = v.bSize;
data = v.buffer;
vif.cl = 0;
vifRegs.num = (vifXRegs.code >> 16) & 0xff; // grab NUM from the original VIFcode input.
if (!vifRegs.num)
vifRegs.num = 256;
}
if (!idx || !THREAD_VU1)
{
if (newVifDynaRec)
dVifUnpack<idx>(data, isFill);
else
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
else
vu1Thread.VifUnpack(vif, vifRegs, (u8*)data, (size + 4) & ~0x3);
vif.pass = 0;
vif.tag.size = 0;
vif.cmd = 0;
vifRegs.num = 0;
v.bSize = 0;
}
else // Partial Transfer
{
memcpy(&v.buffer[v.bSize], data, size);
v.bSize += size;
vif.tag.size -= ret;
const u8& vSize = nVifT[vif.cmd & 0x0f];
// We need to provide accurate accounting of the NUM register, in case games decided
// to read back from it mid-transfer. Since so few games actually use partial transfers
// of VIF unpacks, this code should not be any bottleneck.
if (!isFill)
{
vifRegs.num -= (size / vSize);
}
else
{
int dataSize = (size / vSize);
vifRegs.num = vifRegs.num - (((dataSize / vifRegs.cycle.cl) * (vifRegs.cycle.wl - vifRegs.cycle.cl)) + dataSize);
}
}
return ret;
}
template int nVifUnpack<0>(const u8* data);
template int nVifUnpack<1>(const u8* data);
// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks
// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
static void setMasks(const vifStruct& vif, const VIFregisters& v)
{
for (int i = 0; i < 16; i++)
{
int m = (v.mask >> (i * 2)) & 3;
switch (m)
{
case 0: // Data
nVifMask[0][i / 4][i % 4] = 0xffffffff;
nVifMask[1][i / 4][i % 4] = 0;
nVifMask[2][i / 4][i % 4] = 0;
break;
case 1: // MaskRow
nVifMask[0][i / 4][i % 4] = 0;
nVifMask[1][i / 4][i % 4] = 0;
nVifMask[2][i / 4][i % 4] = vif.MaskRow._u32[i % 4];
break;
case 2: // MaskCol
nVifMask[0][i / 4][i % 4] = 0;
nVifMask[1][i / 4][i % 4] = 0;
nVifMask[2][i / 4][i % 4] = vif.MaskCol._u32[i / 4];
break;
case 3: // Write Protect
nVifMask[0][i / 4][i % 4] = 0;
nVifMask[1][i / 4][i % 4] = 0xffffffff;
nVifMask[2][i / 4][i % 4] = 0;
break;
}
}
}
// ----------------------------------------------------------------------------
// Unpacking Optimization notes:
// ----------------------------------------------------------------------------
// Some games send a LOT of single-cycle packets (God of War, SotC, TriAce games, etc),
// so we always need to be wary of keeping loop setup code optimized. It's not always
// a "win" to move code outside the loop, as it normally is in most other loop scenarios.
//
// The biggest bottleneck of the current code is the call/ret needed to invoke the SSE
// unpackers. A better option is to generate the entire vifRegs.num loop code as part
// of the SSE template, and inline the SSE code into the heart of it. This both avoids
// the call/ret and opens the door for resolving some register dependency chains in the
// current emitted functions. (this is what zero's SSE does to get its final bit of
// speed advantage over the new vif). --air
//
// The BEST optimization strategy here is to use data available to us from the UNPACK dispatch
// -- namely the unpack type and mask flag -- in combination with mode and usn values -- to
// generate ~600 special versions of this function. But since it's an interpreter, who gives
// a crap? Really? :p
//
// size - size of the packet fragment incoming from DMAC.
template <int idx, bool doMode, bool isFill>
__ri void _nVifUnpackLoop(const u8* data)
{
vifStruct& vif = MTVU_VifX;
VIFregisters& vifRegs = MTVU_VifXRegs;
// skipSize used for skipping writes only
const int skipSize = (vifRegs.cycle.cl - vifRegs.cycle.wl) * 16;
//DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs.num, upkNum, vif.cl, blockSize, skipSize);
if (!doMode && (vif.cmd & 0x10))
setMasks(vif, vifRegs);
const int usn = !!vif.usn;
const int upkNum = vif.cmd & 0x1f;
const u8& vSize = nVifT[upkNum & 0x0f];
//uint vl = vif.cmd & 0x03;
//uint vn = (vif.cmd >> 2) & 0x3;
//uint vSize = ((32 >> vl) * (vn+1)) / 8; // size of data (in bytes) used for each write cycle
const nVifCall* fnbase = &nVifUpk[((usn * 2 * 16) + upkNum) * (4 * 1)];
const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][((usn * 2 * 16) + upkNum)];
pxAssume(vif.cl == 0);
//pxAssume (vifRegs.cycle.wl > 0);
do
{
u8* dest = getVUptr(idx, vif.tag.addr);
if (doMode)
{
//if (1) {
ft(dest, data);
}
else
{
//DevCon.WriteLn("SSE Unpack!");
uint cl3 = std::min(vif.cl, 3);
fnbase[cl3](dest, data);
}
vif.tag.addr += 16;
--vifRegs.num;
++vif.cl;
if (isFill)
{
//DevCon.WriteLn("isFill!");
if (vif.cl <= vifRegs.cycle.cl)
data += vSize;
else if (vif.cl == vifRegs.cycle.wl)
vif.cl = 0;
}
else
{
data += vSize;
if (vif.cl >= vifRegs.cycle.wl)
{
vif.tag.addr += skipSize;
vif.cl = 0;
}
}
} while (vifRegs.num);
}
__fi void _nVifUnpack(int idx, const u8* data, uint mode, bool isFill)
{
UnpackLoopTable[idx][!!mode][isFill](data);
}