arm64: Add VIF dynarec

Stenzek 2024-03-21 19:24:35 +10:00 committed by Connor McLaughlin
parent 0a4c037898
commit fe9399612d
8 changed files with 1818 additions and 2 deletions

pcsx2/CMakeLists.txt

@@ -1018,20 +1018,37 @@ set(pcsx2x86Headers
x86/R5900_Profiler.h
)
# ARM64
set(pcsx2arm64Sources
arm64/AsmHelpers.cpp
arm64/newVif_Dynarec.cpp
arm64/newVif_UnpackNEON.cpp
)
set(pcsx2arm64Headers
arm64/AsmHelpers.h
)
# These ones benefit a lot from LTO
set(pcsx2LTOSources
${pcsx2Sources}
${pcsx2Headers}
${pcsx2IPUSources}
${pcsx2IPUHeaders}
${pcsx2x86Sources}
${pcsx2x86Headers}
${pcsx2SPU2Sources}
${pcsx2SPU2Headers}
${pcsx2GSSources}
${pcsx2GSHeaders}
)
if(_M_X86)
list(APPEND pcsx2LTOSources ${pcsx2x86Sources} ${pcsx2x86Headers})
target_link_libraries(PCSX2_FLAGS INTERFACE zydis)
elseif(_M_ARM64)
list(APPEND pcsx2LTOSources ${pcsx2arm64Sources} ${pcsx2arm64Headers})
target_link_libraries(PCSX2_FLAGS INTERFACE vixl)
endif()
if(LTO_PCSX2_CORE)
add_library(PCSX2_LTO ${pcsx2LTOSources})
if (DISABLE_ADVANCE_SIMD)

pcsx2/arm64/AsmHelpers.cpp

@@ -0,0 +1,461 @@
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#include "arm64/AsmHelpers.h"
#include "common/Assertions.h"
#include "common/BitUtils.h"
#include "common/Console.h"
#include "common/HostSys.h"
const vixl::aarch64::Register& armWRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const Register* regs[32] = {&w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, &w8, &w9, &w10,
&w11, &w12, &w13, &w14, &w15, &w16, &w17, &w18, &w19, &w20, &w21, &w22, &w23, &w24, &w25, &w26, &w27, &w28,
&w29, &w30, &w31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
const vixl::aarch64::Register& armXRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const Register* regs[32] = {&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8, &x9, &x10,
&x11, &x12, &x13, &x14, &x15, &x16, &x17, &x18, &x19, &x20, &x21, &x22, &x23, &x24, &x25, &x26, &x27, &x28,
&x29, &x30, &x31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
const vixl::aarch64::VRegister& armSRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const VRegister* regs[32] = {&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &vixl::aarch64::s8, &s9, &s10,
&s11, &s12, &s13, &s14, &s15, &vixl::aarch64::s16, &s17, &s18, &s19, &s20, &s21, &s22, &s23, &s24, &s25, &s26, &s27, &s28,
&s29, &s30, &s31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
const vixl::aarch64::VRegister& armDRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const VRegister* regs[32] = {&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &d8, &d9, &d10,
&d11, &d12, &d13, &d14, &d15, &d16, &d17, &d18, &d19, &d20, &d21, &d22, &d23, &d24, &d25, &d26, &d27, &d28,
&d29, &d30, &d31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
const vixl::aarch64::VRegister& armQRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const VRegister* regs[32] = {&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7, &q8, &q9, &q10,
&q11, &q12, &q13, &q14, &q15, &q16, &q17, &q18, &q19, &q20, &q21, &q22, &q23, &q24, &q25, &q26, &q27, &q28,
&q29, &q30, &q31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
//#define INCLUDE_DISASSEMBLER
#ifdef INCLUDE_DISASSEMBLER
#include "vixl/aarch64/disasm-aarch64.h"
#endif
namespace a64 = vixl::aarch64;
thread_local a64::MacroAssembler* armAsm;
thread_local u8* armAsmPtr;
thread_local size_t armAsmCapacity;
thread_local ArmConstantPool* armConstantPool;
#ifdef INCLUDE_DISASSEMBLER
static std::mutex armDisasmMutex;
static std::unique_ptr<a64::PrintDisassembler> armDisasm;
static std::unique_ptr<a64::Decoder> armDisasmDecoder;
#endif
void armSetAsmPtr(void* ptr, size_t capacity, ArmConstantPool* pool)
{
pxAssert(!armAsm);
armAsmPtr = static_cast<u8*>(ptr);
armAsmCapacity = capacity;
armConstantPool = pool;
}
// Align to 16 bytes, apparently ARM likes that.
void armAlignAsmPtr()
{
static constexpr uintptr_t ALIGNMENT = 16;
u8* new_ptr = reinterpret_cast<u8*>((reinterpret_cast<uintptr_t>(armAsmPtr) + (ALIGNMENT - 1)) & ~(ALIGNMENT - 1));
pxAssert(static_cast<size_t>(new_ptr - armAsmPtr) <= armAsmCapacity);
armAsmCapacity -= (new_ptr - armAsmPtr);
armAsmPtr = new_ptr;
}
u8* armStartBlock()
{
armAlignAsmPtr();
HostSys::BeginCodeWrite();
pxAssert(!armAsm);
armAsm = new vixl::aarch64::MacroAssembler(static_cast<vixl::byte*>(armAsmPtr), armAsmCapacity);
armAsm->GetScratchVRegisterList()->Remove(31);
armAsm->GetScratchRegisterList()->Remove(RSCRATCHADDR.GetCode());
return armAsmPtr;
}
u8* armEndBlock()
{
pxAssert(armAsm);
armAsm->FinalizeCode();
const u32 size = static_cast<u32>(armAsm->GetSizeOfCodeGenerated());
pxAssert(size < armAsmCapacity);
delete armAsm;
armAsm = nullptr;
HostSys::EndCodeWrite();
HostSys::FlushInstructionCache(armAsmPtr, size);
armAsmPtr = armAsmPtr + size;
armAsmCapacity -= size;
return armAsmPtr;
}
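For orientation, here is a minimal sketch of how a caller drives this emitter; `code_buffer`, `code_capacity` and `emitReturnStub` are hypothetical names, not part of this commit:
// Sketch only: emit a tiny block that returns 42 in w0.
u8* emitReturnStub()
{
    armSetAsmPtr(code_buffer, code_capacity, nullptr); // no trampoline/literal pool
    u8* const start = armStartBlock();                 // aligns armAsmPtr, constructs armAsm
    armAsm->Mov(RWRET, 42);                            // RWRET is w0 (see AsmHelpers.h)
    armAsm->Ret();
    armEndBlock();                                     // finalize, flush i-cache, advance armAsmPtr
    return start;
}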
void armDisassembleAndDumpCode(const void* ptr, size_t size)
{
#ifdef INCLUDE_DISASSEMBLER
std::unique_lock lock(armDisasmMutex);
if (!armDisasm)
{
armDisasm = std::make_unique<a64::PrintDisassembler>(stderr);
armDisasmDecoder = std::make_unique<a64::Decoder>();
armDisasmDecoder->AppendVisitor(armDisasm.get());
}
armDisasmDecoder->Decode(static_cast<const vixl::aarch64::Instruction*>(ptr), static_cast<const vixl::aarch64::Instruction*>(ptr) + size);
#else
Console.Error("Not compiled with INCLUDE_DISASSEMBLER");
#endif
}
void armEmitJmp(const void* ptr, bool force_inline)
{
s64 displacement = GetPCDisplacement(armGetCurrentCodePointer(), ptr);
bool use_blr = !vixl::IsInt26(displacement);
if (use_blr && armConstantPool && !force_inline)
{
if (u8* trampoline = armConstantPool->GetJumpTrampoline(ptr); trampoline)
{
displacement = GetPCDisplacement(armGetCurrentCodePointer(), trampoline);
use_blr = !vixl::IsInt26(displacement);
}
}
if (use_blr)
{
armAsm->Mov(RXVIXLSCRATCH, reinterpret_cast<uintptr_t>(ptr));
armAsm->Br(RXVIXLSCRATCH);
}
else
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->b(displacement);
}
}
void armEmitCall(const void* ptr, bool force_inline)
{
s64 displacement = GetPCDisplacement(armGetCurrentCodePointer(), ptr);
bool use_blr = !vixl::IsInt26(displacement);
if (use_blr && armConstantPool && !force_inline)
{
if (u8* trampoline = armConstantPool->GetJumpTrampoline(ptr); trampoline)
{
displacement = GetPCDisplacement(armGetCurrentCodePointer(), trampoline);
use_blr = !vixl::IsInt26(displacement);
}
}
if (use_blr)
{
armAsm->Mov(RXVIXLSCRATCH, reinterpret_cast<uintptr_t>(ptr));
armAsm->Blr(RXVIXLSCRATCH);
}
else
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->bl(displacement);
}
}
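A short aside on the range check above (not part of the original file): B/BL take a signed 26-bit word offset, which is why IsInt26() is tested on the word displacement.
// 2^25 words * 4 bytes = 134,217,728 bytes, i.e. +/-128 MiB of direct reach.
// Targets beyond that go through a constant-pool trampoline, or Mov + Blr/Br as a last resort.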
void armEmitCbnz(const vixl::aarch64::Register& reg, const void* ptr)
{
const s64 jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(ptr) - reinterpret_cast<intptr_t>(armGetCurrentCodePointer()));
//pxAssert(Common::IsAligned(jump_distance, 4));
if (a64::Instruction::IsValidImmPCOffset(a64::CompareBranchType, jump_distance >> 2))
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->cbnz(reg, jump_distance >> 2);
}
else
{
a64::MacroEmissionCheckScope guard(armAsm);
a64::Label branch_not_taken;
armAsm->cbz(reg, &branch_not_taken);
const s64 new_jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(ptr) - reinterpret_cast<intptr_t>(armGetCurrentCodePointer()));
armAsm->b(new_jump_distance >> 2);
armAsm->bind(&branch_not_taken);
}
}
void armEmitCondBranch(a64::Condition cond, const void* ptr)
{
const s64 jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(ptr) - reinterpret_cast<intptr_t>(armGetCurrentCodePointer()));
//pxAssert(Common::IsAligned(jump_distance, 4));
if (a64::Instruction::IsValidImmPCOffset(a64::CondBranchType, jump_distance >> 2))
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->b(jump_distance >> 2, cond);
}
else
{
a64::MacroEmissionCheckScope guard(armAsm);
a64::Label branch_not_taken;
armAsm->b(&branch_not_taken, a64::InvertCondition(cond));
const s64 new_jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(ptr) - reinterpret_cast<intptr_t>(armGetCurrentCodePointer()));
armAsm->b(new_jump_distance >> 2);
armAsm->bind(&branch_not_taken);
}
}
void armMoveAddressToReg(const vixl::aarch64::Register& reg, const void* addr)
{
// psxAsm->Mov(reg, static_cast<u64>(reinterpret_cast<uintptr_t>(addr)));
pxAssert(reg.IsX());
const void* current_code_ptr_page = reinterpret_cast<const void*>(
reinterpret_cast<uintptr_t>(armGetCurrentCodePointer()) & ~static_cast<uintptr_t>(0xFFF));
const void* ptr_page =
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
const s64 page_displacement = GetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmAddSub(page_offset))
{
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->adrp(reg, page_displacement);
}
armAsm->Add(reg, reg, page_offset);
}
else if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64))
{
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->adrp(reg, page_displacement);
}
armAsm->Orr(reg, reg, page_offset);
}
else
{
armAsm->Mov(reg, reinterpret_cast<uintptr_t>(addr));
}
}
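The adrp paths above split the target address into a 4 KiB page base plus a 12-bit page offset; a small illustration with a hypothetical `target` pointer:
// What adrp materializes vs. what the following Add/Orr applies:
const uintptr_t page_base = reinterpret_cast<uintptr_t>(target) & ~uintptr_t(0xFFF);
const uintptr_t page_off  = reinterpret_cast<uintptr_t>(target) & uintptr_t(0xFFF);
// adrp encodes (target_page - pc_page) as a signed 21-bit page count, hence the IsInt21
// check on page_displacement above.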
void armLoadPtr(const vixl::aarch64::CPURegister& reg, const void* addr)
{
armMoveAddressToReg(RSCRATCHADDR, addr);
armAsm->Ldr(reg, a64::MemOperand(RSCRATCHADDR));
}
void armStorePtr(const vixl::aarch64::CPURegister& reg, const void* addr)
{
armMoveAddressToReg(RSCRATCHADDR, addr);
armAsm->Str(reg, a64::MemOperand(RSCRATCHADDR));
}
void armBeginStackFrame(bool save_fpr)
{
// save x19 through x28, x29 could also be used
armAsm->Sub(a64::sp, a64::sp, save_fpr ? 192 : 144);
armAsm->Stp(a64::x19, a64::x20, a64::MemOperand(a64::sp, 32));
armAsm->Stp(a64::x21, a64::x22, a64::MemOperand(a64::sp, 48));
armAsm->Stp(a64::x23, a64::x24, a64::MemOperand(a64::sp, 64));
armAsm->Stp(a64::x25, a64::x26, a64::MemOperand(a64::sp, 80));
armAsm->Stp(a64::x27, a64::x28, a64::MemOperand(a64::sp, 96));
armAsm->Stp(a64::x29, a64::lr, a64::MemOperand(a64::sp, 112));
if (save_fpr)
{
armAsm->Stp(a64::d8, a64::d9, a64::MemOperand(a64::sp, 128));
armAsm->Stp(a64::d10, a64::d11, a64::MemOperand(a64::sp, 144));
armAsm->Stp(a64::d12, a64::d13, a64::MemOperand(a64::sp, 160));
armAsm->Stp(a64::d14, a64::d15, a64::MemOperand(a64::sp, 176));
}
}
void armEndStackFrame(bool save_fpr)
{
if (save_fpr)
{
armAsm->Ldp(a64::d14, a64::d15, a64::MemOperand(a64::sp, 176));
armAsm->Ldp(a64::d12, a64::d13, a64::MemOperand(a64::sp, 160));
armAsm->Ldp(a64::d10, a64::d11, a64::MemOperand(a64::sp, 144));
armAsm->Ldp(a64::d8, a64::d9, a64::MemOperand(a64::sp, 128));
}
armAsm->Ldp(a64::x29, a64::lr, a64::MemOperand(a64::sp, 112));
armAsm->Ldp(a64::x27, a64::x28, a64::MemOperand(a64::sp, 96));
armAsm->Ldp(a64::x25, a64::x26, a64::MemOperand(a64::sp, 80));
armAsm->Ldp(a64::x23, a64::x24, a64::MemOperand(a64::sp, 64));
armAsm->Ldp(a64::x21, a64::x22, a64::MemOperand(a64::sp, 48));
armAsm->Ldp(a64::x19, a64::x20, a64::MemOperand(a64::sp, 32));
armAsm->Add(a64::sp, a64::sp, save_fpr ? 192 : 144);
}
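For reference, the frame built by armBeginStackFrame()/armEndStackFrame() lays out as follows (offsets from the adjusted sp):
// [sp +   0..31 ]  scratch area (cf. SP_SCRATCH_OFFSET in AsmHelpers.h)
// [sp +  32..111]  x19/x20, x21/x22, x23/x24, x25/x26, x27/x28
// [sp + 112..127]  x29 and lr
// [sp + 128..191]  d8/d9, d10/d11, d12/d13, d14/d15 (only when save_fpr)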
bool armIsCalleeSavedRegister(int reg)
{
// same on both linux and windows
return (reg >= 19);
}
vixl::aarch64::MemOperand armOffsetMemOperand(const vixl::aarch64::MemOperand& op, s64 offset)
{
pxAssert(op.GetBaseRegister().IsValid() && op.GetAddrMode() == vixl::aarch64::Offset && op.GetShift() == vixl::aarch64::NO_SHIFT);
return vixl::aarch64::MemOperand(op.GetBaseRegister(), op.GetOffset() + offset, op.GetAddrMode());
}
void armGetMemOperandInRegister(const vixl::aarch64::Register& addr_reg, const vixl::aarch64::MemOperand& op, s64 extra_offset /*= 0*/)
{
pxAssert(addr_reg.IsX());
pxAssert(op.GetBaseRegister().IsValid() && op.GetAddrMode() == vixl::aarch64::Offset && op.GetShift() == vixl::aarch64::NO_SHIFT);
armAsm->Add(addr_reg, op.GetBaseRegister(), op.GetOffset() + extra_offset);
}
void armLoadConstant128(const vixl::aarch64::VRegister& reg, const void* ptr)
{
u64 low, high;
memcpy(&low, ptr, sizeof(low));
memcpy(&high, static_cast<const u8*>(ptr) + sizeof(low), sizeof(high));
armAsm->Ldr(reg, high, low);
}
void armEmitVTBL(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src1, const vixl::aarch64::VRegister& src2, const vixl::aarch64::VRegister& tbl)
{
pxAssert(src1.GetCode() != RQSCRATCH.GetCode() && src2.GetCode() != RQSCRATCH2.GetCode());
pxAssert(tbl.GetCode() != RQSCRATCH.GetCode() && tbl.GetCode() != RQSCRATCH2.GetCode());
// must be consecutive
if (src2.GetCode() == (src1.GetCode() + 1))
{
armAsm->Tbl(dst.V16B(), src1.V16B(), src2.V16B(), tbl.V16B());
return;
}
armAsm->Mov(RQSCRATCH.Q(), src1.Q());
armAsm->Mov(RQSCRATCH2.Q(), src2.Q());
armAsm->Tbl(dst.V16B(), RQSCRATCH.V16B(), RQSCRATCH2.V16B(), tbl.V16B());
}
void ArmConstantPool::Init(void* ptr, u32 capacity)
{
m_base_ptr = static_cast<u8*>(ptr);
m_capacity = capacity;
m_used = 0;
m_jump_targets.clear();
m_literals.clear();
}
void ArmConstantPool::Destroy()
{
m_base_ptr = nullptr;
m_capacity = 0;
m_used = 0;
m_jump_targets.clear();
m_literals.clear();
}
void ArmConstantPool::Reset()
{
m_used = 0;
m_jump_targets.clear();
m_literals.clear();
}
u8* ArmConstantPool::GetJumpTrampoline(const void* target)
{
auto it = m_jump_targets.find(target);
if (it != m_jump_targets.end())
return m_base_ptr + it->second;
// align to 16 bytes?
const u32 offset = Common::AlignUpPow2(m_used, 16);
// 4 movs plus a jump
if ((m_capacity - offset) < 20)
{
Console.Error("Ran out of space in constant pool");
return nullptr;
}
a64::MacroAssembler masm(static_cast<vixl::byte*>(m_base_ptr + offset), m_capacity - offset);
masm.Mov(RXVIXLSCRATCH, reinterpret_cast<intptr_t>(target));
masm.Br(RXVIXLSCRATCH);
masm.FinalizeCode();
pxAssert(masm.GetSizeOfCodeGenerated() < 20);
m_jump_targets.emplace(target, offset);
m_used = offset + static_cast<u32>(masm.GetSizeOfCodeGenerated());
HostSys::FlushInstructionCache(reinterpret_cast<void*>(m_base_ptr + offset), m_used - offset);
return m_base_ptr + offset;
}
u8* ArmConstantPool::GetLiteral(u64 value)
{
return GetLiteral(u128::From64(value));
}
u8* ArmConstantPool::GetLiteral(const u128& value)
{
auto it = m_literals.find(value);
if (it != m_literals.end())
return m_base_ptr + it->second;
if (GetRemainingCapacity() < 8)
return nullptr;
const u32 offset = Common::AlignUpPow2(m_used, 16);
std::memcpy(&m_base_ptr[offset], &value, sizeof(value));
m_used = offset + sizeof(value);
return m_base_ptr + offset;
}
u8* ArmConstantPool::GetLiteral(const u8* bytes, size_t len)
{
pxAssertMsg(len <= 16, "literal length is at most 16 bytes");
u128 table_u128 = {};
std::memcpy(table_u128._u8, bytes, len);
return GetLiteral(table_u128);
}
void ArmConstantPool::EmitLoadLiteral(const vixl::aarch64::CPURegister& reg, const u8* literal) const
{
armMoveAddressToReg(RXVIXLSCRATCH, literal);
armAsm->Ldr(reg, a64::MemOperand(RXVIXLSCRATCH));
}

pcsx2/arm64/AsmHelpers.h

@@ -0,0 +1,146 @@
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#pragma once
#include "common/Pcsx2Defs.h"
#include "common/HashCombine.h"
#include "vixl/aarch64/constants-aarch64.h"
#include "vixl/aarch64/macro-assembler-aarch64.h"
#include <unordered_map>
#define RWRET vixl::aarch64::w0
#define RXRET vixl::aarch64::x0
#define RQRET vixl::aarch64::q0
#define RWARG1 vixl::aarch64::w0
#define RWARG2 vixl::aarch64::w1
#define RWARG3 vixl::aarch64::w2
#define RWARG4 vixl::aarch64::w3
#define RXARG1 vixl::aarch64::x0
#define RXARG2 vixl::aarch64::x1
#define RXARG3 vixl::aarch64::x2
#define RXARG4 vixl::aarch64::x3
#define RXVIXLSCRATCH vixl::aarch64::x16
#define RWVIXLSCRATCH vixl::aarch64::w16
#define RSCRATCHADDR vixl::aarch64::x17
#define RQSCRATCH vixl::aarch64::q30
#define RDSCRATCH vixl::aarch64::d30
#define RSSCRATCH vixl::aarch64::s30
#define RQSCRATCH2 vixl::aarch64::q31
#define RDSCRATCH2 vixl::aarch64::d31
#define RSSCRATCH2 vixl::aarch64::s31
#define RQSCRATCH3 vixl::aarch64::q29
#define RDSCRATCH3 vixl::aarch64::d29
#define RSSCRATCH3 vixl::aarch64::s29
#define RQSCRATCHI vixl::aarch64::VRegister(30, 128, 16)
#define RQSCRATCHF vixl::aarch64::VRegister(30, 128, 4)
#define RQSCRATCHD vixl::aarch64::VRegister(30, 128, 2)
#define RQSCRATCH2I vixl::aarch64::VRegister(31, 128, 16)
#define RQSCRATCH2F vixl::aarch64::VRegister(31, 128, 4)
#define RQSCRATCH2D vixl::aarch64::VRegister(31, 128, 2)
static inline s64 GetPCDisplacement(const void* current, const void* target)
{
return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
}
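The displacement returned here is measured in 32-bit instruction words (hence the >> 2), matching how AArch64 branch immediates are encoded; a quick illustration:
// If `target` sits 64 bytes past `current`, GetPCDisplacement(current, target) == 16,
// i.e. sixteen 4-byte instructions.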
const vixl::aarch64::Register& armWRegister(int n);
const vixl::aarch64::Register& armXRegister(int n);
const vixl::aarch64::VRegister& armSRegister(int n);
const vixl::aarch64::VRegister& armDRegister(int n);
const vixl::aarch64::VRegister& armQRegister(int n);
class ArmConstantPool;
static const u32 SP_SCRATCH_OFFSET = 0;
extern thread_local vixl::aarch64::MacroAssembler* armAsm;
extern thread_local u8* armAsmPtr;
extern thread_local size_t armAsmCapacity;
extern thread_local ArmConstantPool* armConstantPool;
static __fi bool armHasBlock()
{
return (armAsm != nullptr);
}
static __fi u8* armGetCurrentCodePointer()
{
return static_cast<u8*>(armAsmPtr) + armAsm->GetCursorOffset();
}
__fi static u8* armGetAsmPtr()
{
return armAsmPtr;
}
void armSetAsmPtr(void* ptr, size_t capacity, ArmConstantPool* pool);
void armAlignAsmPtr();
u8* armStartBlock();
u8* armEndBlock();
void armDisassembleAndDumpCode(const void* ptr, size_t size);
void armEmitJmp(const void* ptr, bool force_inline = false);
void armEmitCall(const void* ptr, bool force_inline = false);
void armEmitCbnz(const vixl::aarch64::Register& reg, const void* ptr);
void armEmitCondBranch(vixl::aarch64::Condition cond, const void* ptr);
void armMoveAddressToReg(const vixl::aarch64::Register& reg, const void* addr);
void armLoadPtr(const vixl::aarch64::CPURegister& reg, const void* addr);
void armStorePtr(const vixl::aarch64::CPURegister& reg, const void* addr);
void armBeginStackFrame(bool save_fpr);
void armEndStackFrame(bool save_fpr);
bool armIsCalleeSavedRegister(int reg);
vixl::aarch64::MemOperand armOffsetMemOperand(const vixl::aarch64::MemOperand& op, s64 offset);
void armGetMemOperandInRegister(const vixl::aarch64::Register& addr_reg,
const vixl::aarch64::MemOperand& op, s64 extra_offset = 0);
void armLoadConstant128(const vixl::aarch64::VRegister& reg, const void* ptr);
// may clobber RSCRATCH/RSCRATCH2. they shouldn't be inputs.
void armEmitVTBL(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src1,
const vixl::aarch64::VRegister& src2, const vixl::aarch64::VRegister& tbl);
//////////////////////////////////////////////////////////////////////////
class ArmConstantPool
{
public:
void Init(void* ptr, u32 capacity);
void Destroy();
void Reset();
u8* GetJumpTrampoline(const void* target);
u8* GetLiteral(u64 value);
u8* GetLiteral(const u128& value);
u8* GetLiteral(const u8* bytes, size_t len);
void EmitLoadLiteral(const vixl::aarch64::CPURegister& reg, const u8* literal) const;
private:
__fi u32 GetRemainingCapacity() const { return m_capacity - m_used; }
struct u128_hash
{
std::size_t operator()(const u128& v) const
{
std::size_t s = 0;
HashCombine(s, v.lo, v.hi);
return s;
}
};
std::unordered_map<const void*, u32> m_jump_targets;
std::unordered_map<u128, u32, u128_hash> m_literals;
u8* m_base_ptr = nullptr;
u32 m_capacity = 0;
u32 m_used = 0;
};
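As a usage sketch (assuming `pool_mem` and `pool_size` describe storage reachable from the code buffer, and an emitter block is currently open; both names are hypothetical):
ArmConstantPool pool;
pool.Init(pool_mem, pool_size);
// Build a 128-bit literal from a 64-bit value; repeated requests return the same slot.
if (const u8* lit = pool.GetLiteral(u128::From64(0x3F8000003F800000ULL)))
    pool.EmitLoadLiteral(RQSCRATCH, lit); // armMoveAddressToReg + Ldr into q30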

pcsx2/arm64/newVif_Dynarec.cpp

@@ -0,0 +1,585 @@
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#include "arm64/newVif_UnpackNEON.h"
#include "arm64/AsmHelpers.h"
#include "MTVU.h"
#include "common/Assertions.h"
#include "common/Perf.h"
#include "common/StringUtil.h"
namespace a64 = vixl::aarch64;
static void mVUmergeRegs(const vixl::aarch64::VRegister& dest, const vixl::aarch64::VRegister& src, int xyzw, bool modXYZW = false, bool canModifySrc = false)
{
xyzw &= 0xf;
if ((dest.GetCode() != src.GetCode()) && (xyzw != 0))
{
if (xyzw == 0x8)
armAsm->Mov(dest.V4S(), 0, src.V4S(), 0);
else if (xyzw == 0xf)
armAsm->Mov(dest.Q(), src.Q());
else
{
if (modXYZW)
{
if (xyzw == 1)
{
armAsm->Ins(dest.V4S(), 3, src.V4S(), 0);
return;
}
else if (xyzw == 2)
{
armAsm->Ins(dest.V4S(), 2, src.V4S(), 0);
return;
}
else if (xyzw == 4)
{
armAsm->Ins(dest.V4S(), 1, src.V4S(), 0);
return;
}
}
if (xyzw == 0)
return;
if (xyzw == 15)
{
armAsm->Mov(dest, src);
return;
}
if (xyzw == 14 && canModifySrc)
{
// xyz - we can get rid of the mov if we swap the RA around
armAsm->Mov(src.V4S(), 3, dest.V4S(), 3);
armAsm->Mov(dest.V16B(), src.V16B());
return;
}
// reverse
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
if ((xyzw & 3) == 3)
{
// xy
armAsm->Mov(dest.V2D(), 0, src.V2D(), 0);
xyzw &= ~3;
}
else if ((xyzw & 12) == 12)
{
// zw
armAsm->Mov(dest.V2D(), 1, src.V2D(), 1);
xyzw &= ~12;
}
// xyzw
for (u32 i = 0; i < 4; i++)
{
if (xyzw & (1u << i))
armAsm->Mov(dest.V4S(), i, src.V4S(), i);
}
}
}
}
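The xyzw parameter is a 4-bit lane mask with X in bit 3 down to W in bit 0 (the routine reverses it internally into per-lane indices); two quick examples:
// mVUmergeRegs(q0, q1, 0x8) copies only lane X of q1 into q0 (the single-lane fast path).
// mVUmergeRegs(q0, q1, 0xA) copies lanes X and Z of q1 into q0, leaving Y and W of q0 intact.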
static void maskedVecWrite(const a64::VRegister& reg, const a64::MemOperand& addr, int xyzw)
{
switch (xyzw)
{
case 5: // YW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 4);
armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR)); // Y
armGetMemOperandInRegister(RSCRATCHADDR, addr, 12);
armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR)); // W
break;
case 9: // XW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 12);
armAsm->Str(reg.S(), addr); // X
armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR)); // W
break;
case 10: //XZ
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->Str(reg.S(), addr); // X
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR)); // Z
break;
case 3: // ZW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR));
break;
case 11: //XZW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->Str(reg.S(), addr); // X
armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR)); // ZW
break;
case 13: // XYW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 12);
armAsm->Str(reg.D(), addr);
armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR));
break;
case 6: // YZ
armGetMemOperandInRegister(RSCRATCHADDR, addr, 4);
armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR, 4, a64::PostIndex));
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR));
break;
case 7: // YZW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 4);
armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR, 4, a64::PostIndex));
armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR));
break;
case 12: // XY
armAsm->Str(reg.D(), addr);
break;
case 14: // XYZ
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->Str(reg.D(), addr);
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR)); // Z
break;
case 4:
armGetMemOperandInRegister(RSCRATCHADDR, addr, 4);
armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR));
break; // Y
case 2:
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR));
break; // Z
case 1:
armGetMemOperandInRegister(RSCRATCHADDR, addr, 12);
armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR));
break; // W
case 8:
armAsm->Str(reg.S(), addr);
break; // X
case 0:
Console.Error("maskedVecWrite case 0!");
break;
default:
armAsm->Str(reg.Q(), addr);
break; // XYZW
}
}
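Here xyzw uses the same layout (X = bit 3 ... W = bit 0) and selects which lanes are actually stored; for example:
// maskedVecWrite(reg, addr, 10 /* 0b1010 = XZ */) stores lane X at addr+0 and lane Z at addr+8,
// leaving the Y and W words in memory untouched (the case 10 path above).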
void dVifReset(int idx)
{
nVif[idx].vifBlocks.reset();
const size_t offset = idx ? HostMemoryMap::VIF1recOffset : HostMemoryMap::VIF0recOffset;
const size_t size = idx ? HostMemoryMap::VIF1recSize : HostMemoryMap::VIF0recSize;
nVif[idx].recWritePtr = SysMemory::GetCodePtr(offset);
nVif[idx].recEndPtr = nVif[idx].recWritePtr + (size - _256kb);
}
void dVifRelease(int idx)
{
nVif[idx].vifBlocks.clear();
}
VifUnpackNEON_Dynarec::VifUnpackNEON_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
: v(vif_)
, vB(vifBlock_)
{
const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
isFill = (vB.cl < wl);
usn = (vB.upkType >> 5) & 1;
doMask = (vB.upkType >> 4) & 1;
doMode = vB.mode & 3;
IsAligned = vB.aligned;
vCL = 0;
}
__fi void makeMergeMask(u32& x)
{
x = ((x & 0x40) >> 6) | ((x & 0x10) >> 3) | (x & 4) | ((x & 1) << 3);
}
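makeMergeMask() repacks one bit per channel from the VIF mask layout (X in bit 0, Y in bit 2, Z in bit 4, W in bit 6) into the X-in-bit-3 merge layout used by the routines above; for instance:
// u32 m = 0x41;      // W (bit 6) and X (bit 0) selected
// makeMergeMask(m);  // m == 0x9: X (bit 3) and W (bit 0) in merge-mask form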
__fi void VifUnpackNEON_Dynarec::SetMasks(int cS) const
{
const int idx = v.idx;
const vifStruct& vif = MTVU_VifX;
//This could have ended up copying the row when there was no row to write.
u32 m0 = vB.mask; //The actual mask example 0x03020100
u32 m3 = ((m0 & 0xaaaaaaaa) >> 1) & ~m0; //all the upper bits, so our example 0x01010000 & 0xFCFDFEFF = 0x00010000 just the cols (shifted right for maskmerge)
u32 m2 = (m0 & 0x55555555) & (~m0 >> 1); // 0x1000100 & 0xFE7EFF7F = 0x00000100 Just the row
if ((doMask && m2) || doMode)
{
armLoadPtr(xmmRow, &vif.MaskRow);
MSKPATH3_LOG("Moving row");
}
if (doMask && m3)
{
VIF_LOG("Merging Cols");
armLoadPtr(xmmCol0, &vif.MaskCol);
if ((cS >= 2) && (m3 & 0x0000ff00))
armAsm->Dup(xmmCol1.V4S(), xmmCol0.V4S(), 1);
if ((cS >= 3) && (m3 & 0x00ff0000))
armAsm->Dup(xmmCol2.V4S(), xmmCol0.V4S(), 2);
if ((cS >= 4) && (m3 & 0xff000000))
armAsm->Dup(xmmCol3.V4S(), xmmCol0.V4S(), 3);
if ((cS >= 1) && (m3 & 0x000000ff))
armAsm->Dup(xmmCol0.V4S(), xmmCol0.V4S(), 0);
}
//if (doMask||doMode) loadRowCol((nVifStruct&)v);
}
void VifUnpackNEON_Dynarec::doMaskWrite(const vixl::aarch64::VRegister& regX) const
{
pxAssertMsg(regX.GetCode() <= 1, "Reg Overflow! q2 through q7 are reserved for masking.");
const int cc = std::min(vCL, 3);
u32 m0 = (vB.mask >> (cc * 8)) & 0xff; //The actual mask example 0xE4 (protect, col, row, clear)
u32 m3 = ((m0 & 0xaa) >> 1) & ~m0; //all the upper bits (cols shifted right) cancelling out any write protects 0x10
u32 m2 = (m0 & 0x55) & (~m0 >> 1); // all the lower bits (rows)cancelling out any write protects 0x04
u32 m4 = (m0 & ~((m3 << 1) | m2)) & 0x55; // = 0xC0 & 0x55 = 0x40 (for merge mask)
makeMergeMask(m2);
makeMergeMask(m3);
makeMergeMask(m4);
if (doMask && m2) // Merge MaskRow
{
mVUmergeRegs(regX, xmmRow, m2);
}
if (doMask && m3) // Merge MaskCol
{
mVUmergeRegs(regX, armQRegister(xmmCol0.GetCode() + cc), m3);
}
if (doMode)
{
u32 m5 = ~(m2 | m3 | m4) & 0xf;
if (!doMask)
m5 = 0xf;
if (m5 < 0xf)
{
armAsm->Movi(xmmTemp.V4S(), 0);
if (doMode == 3)
{
mVUmergeRegs(xmmRow, regX, m5, false, false);
}
else
{
mVUmergeRegs(xmmTemp, xmmRow, m5, false, false);
armAsm->Add(regX.V4S(), regX.V4S(), xmmTemp.V4S());
if (doMode == 2)
mVUmergeRegs(xmmRow, regX, m5, false, false);
}
}
else
{
if (doMode == 3)
{
armAsm->Mov(xmmRow, regX);
}
else
{
armAsm->Add(regX.V4S(), regX.V4S(), xmmRow.V4S());
if (doMode == 2)
{
armAsm->Mov(xmmRow, regX);
}
}
}
}
if (doMask && m4)
maskedVecWrite(regX, dstIndirect, m4 ^ 0xf);
else
armAsm->Str(regX, dstIndirect);
}
void VifUnpackNEON_Dynarec::writeBackRow() const
{
const int idx = v.idx;
armStorePtr(xmmRow, &(MTVU_VifX.MaskRow));
VIF_LOG("nVif: writing back row reg! [doMode = %d]", doMode);
}
void VifUnpackNEON_Dynarec::ModUnpack(int upknum, bool PostOp)
{
switch (upknum)
{
case 0:
case 1:
case 2:
if (PostOp)
{
UnpkLoopIteration++;
UnpkLoopIteration = UnpkLoopIteration & 0x3;
}
break;
case 4:
case 5:
case 6:
if (PostOp)
{
UnpkLoopIteration++;
UnpkLoopIteration = UnpkLoopIteration & 0x1;
}
break;
case 8:
if (PostOp)
{
UnpkLoopIteration++;
UnpkLoopIteration = UnpkLoopIteration & 0x1;
}
break;
case 9:
if (!PostOp)
{
UnpkLoopIteration++;
}
break;
case 10:
if (!PostOp)
{
UnpkLoopIteration++;
}
break;
case 12:
break;
case 13:
break;
case 14:
break;
case 15:
break;
case 3:
case 7:
case 11:
pxFailRel(fmt::format("Vpu/Vif - Invalid Unpack! [{}]", upknum).c_str());
break;
}
}
void VifUnpackNEON_Dynarec::ProcessMasks()
{
skipProcessing = false;
inputMasked = false;
if (!doMask)
return;
const int cc = std::min(vCL, 3);
const u32 full_mask = (vB.mask >> (cc * 8)) & 0xff;
const u32 rowcol_mask = ((full_mask >> 1) | full_mask) & 0x55; // Rows or Cols being written instead of data, or protected.
// Every channel is write protected for this cycle, no need to process anything.
skipProcessing = full_mask == 0xff;
// All channels are masked, no reason to process anything here.
inputMasked = rowcol_mask == 0x55;
}
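Each channel occupies a 2-bit field of the per-cycle mask (0 = data, 1 = MaskRow, 2 = MaskCol, 3 = write-protect, cf. the 0xE4 example in doMaskWrite); a worked case with an illustrative value:
// full_mask == 0x1B (W = 0, Z = 1, Y = 2, X = 3):
//   rowcol_mask = ((0x1B >> 1) | 0x1B) & 0x55 = 0x15   -> != 0x55, so W still consumes input data
//   skipProcessing = false (full_mask != 0xFF), inputMasked = false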
void VifUnpackNEON_Dynarec::CompileRoutine()
{
const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
const int upkNum = vB.upkType & 0xf;
const u8& vift = nVifT[upkNum];
const int cycleSize = isFill ? vB.cl : wl;
const int blockSize = isFill ? wl : vB.cl;
const int skipSize = blockSize - cycleSize;
uint vNum = vB.num ? vB.num : 256;
doMode = (upkNum == 0xf) ? 0 : doMode; // V4_5 has no mode feature.
UnpkNoOfIterations = 0;
VIF_LOG("Compiling new block, unpack number %x, mode %x, masking %x, vNum %x", upkNum, doMode, doMask, vNum);
pxAssume(vCL == 0);
// Value passed determines # of col regs we need to load
SetMasks(isFill ? blockSize : cycleSize);
while (vNum)
{
// Determine if reads/processing can be skipped.
ProcessMasks();
if (vCL < cycleSize)
{
ModUnpack(upkNum, false);
xUnpack(upkNum);
xMovDest();
ModUnpack(upkNum, true);
dstIndirect = armOffsetMemOperand(dstIndirect, 16);
srcIndirect = armOffsetMemOperand(srcIndirect, vift);
vNum--;
if (++vCL == blockSize)
vCL = 0;
}
else if (isFill)
{
xUnpack(upkNum);
xMovDest();
// dstIndirect += 16;
dstIndirect = armOffsetMemOperand(dstIndirect, 16);
vNum--;
if (++vCL == blockSize)
vCL = 0;
}
else
{
// dstIndirect += (16 * skipSize);
dstIndirect = armOffsetMemOperand(dstIndirect, 16 * skipSize);
vCL = 0;
}
}
if (doMode >= 2)
writeBackRow();
armAsm->Ret();
}
static u16 dVifComputeLength(uint cl, uint wl, u8 num, bool isFill)
{
uint length = (num > 0) ? (num * 16) : 4096; // 0 = 256
if (!isFill)
{
uint skipSize = (cl - wl) * 16;
uint blocks = (num + (wl - 1)) / wl; //Need to round up num's to calculate skip size correctly.
length += (blocks - 1) * skipSize;
}
return std::min(length, 0xFFFFu);
}
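A worked example of the length computation, with values chosen purely for illustration (cl = 4, wl = 1, num = 8, skipping/non-fill mode):
//   length   = 8 * 16         = 128  (unpacked data)
//   skipSize = (4 - 1) * 16   = 48   (bytes skipped after each write block)
//   blocks   = (8 + 0) / 1    = 8
//   length  += (8 - 1) * 48   = 336  -> 464 bytes of VU memory touched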
_vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill)
{
nVifStruct& v = nVif[idx];
// Check size before the compilation
if (v.recWritePtr >= v.recEndPtr)
{
DevCon.WriteLn("nVif Recompiler Cache Reset! [0x%016" PRIXPTR " > 0x%016" PRIXPTR "]",
v.recWritePtr, v.recEndPtr);
dVifReset(idx);
}
// Compile the block now
armSetAsmPtr(v.recWritePtr, v.recEndPtr - v.recWritePtr, nullptr);
block.startPtr = (uptr)armStartBlock();
block.length = dVifComputeLength(block.cl, block.wl, block.num, isFill);
v.vifBlocks.add(block);
VifUnpackNEON_Dynarec(v, block).CompileRoutine();
Perf::vif.RegisterPC(v.recWritePtr, armGetCurrentCodePointer() - v.recWritePtr, block.upkType /* FIXME ideally a key*/);
v.recWritePtr = armEndBlock();
return &block;
}
_vifT __fi void dVifUnpack(const u8* data, bool isFill)
{
nVifStruct& v = nVif[idx];
vifStruct& vif = MTVU_VifX;
VIFregisters& vifRegs = MTVU_VifXRegs;
const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5);
const int doMask = isFill ? 1 : (vif.cmd & 0x10);
nVifBlock block;
// Performance note: initial code was using u8/u16 field of the struct
// directly. However reading back the data (as u32) in HashBucket.find
// leads to various memory stalls. So it is way faster to manually build the data
// in u32 (aka x86 register).
//
// Warning the order of data in hash_key/key0/key1 depends on the nVifBlock struct
u32 hash_key = (u32)(upkType & 0xFF) << 8 | (vifRegs.num & 0xFF);
u32 key1 = ((u32)vifRegs.cycle.wl << 24) | ((u32)vifRegs.cycle.cl << 16) | ((u32)(vif.start_aligned & 0xFF) << 8) | ((u32)vifRegs.mode & 0xFF);
if ((upkType & 0xf) != 9)
key1 &= 0xFFFF01FF;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
u32 key0 = doMask ? vifRegs.mask : 0;
block.hash_key = hash_key;
block.key0 = key0;
block.key1 = key1;
//DevCon.WriteLn("nVif%d: Recompiled Block!", idx);
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
// block.num, block.upkType, block.scl, block.cl, block.wl, block.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", block.mask ).c_str() : L"ignored"
//);
// Search in cache before trying to compile the block
nVifBlock* b = v.vifBlocks.find(block);
if (!b) [[unlikely]]
{
b = dVifCompile<idx>(block, isFill);
}
{ // Execute the block
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit - 0x10));
u8* endmem = VU.Mem + vuMemLimit;
if ((startmem + b->length) <= endmem) [[likely]]
{
#if 1
// No wrapping, you can run the fast dynarec
((nVifrecCall)b->startPtr)((uptr)startmem, (uptr)data);
#else
// comparison mode
static u8 tmpbuf[512 * 1024];
((nVifrecCall)b->startPtr)((uptr)tmpbuf, (uptr)data);
_nVifUnpack(idx, data, vifRegs.mode, isFill);
const u32 words = b->length / 4;
for (u32 i = 0; i < words; i++)
{
if (*((u32*)tmpbuf + i) != *((u32*)startmem + i))
{
// fprintf(stderr, "%08X %08X @ %u\n", *((u32*)tmpbuf + i), *((u32*)startmem + i), i);
pauseCCC(*((u32*)tmpbuf + i), *((u32*)startmem + i), i);
((nVifrecCall)b->startPtr)((uptr)tmpbuf, (uptr)data);
break;
}
}
#endif
}
else
{
VIF_LOG("Running Interpreter Block: nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x",
v.idx, vif.tag.addr, vif.tag.addr + (block.num * 16), block.num, block.wl, block.cl);
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
}
}
template void dVifUnpack<0>(const u8* data, bool isFill);
template void dVifUnpack<1>(const u8* data, bool isFill);

pcsx2/arm64/newVif_UnpackNEON.cpp

@@ -0,0 +1,425 @@
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#include "newVif_UnpackNEON.h"
#include "common/Perf.h"
namespace a64 = vixl::aarch64;
// =====================================================================================================
// VifUnpackSSE_Base Section
// =====================================================================================================
VifUnpackNEON_Base::VifUnpackNEON_Base()
: usn(false)
, doMask(false)
, UnpkLoopIteration(0)
, UnpkNoOfIterations(0)
, IsAligned(0)
, dstIndirect(a64::MemOperand(RXARG1))
, srcIndirect(a64::MemOperand(RXARG2))
, workReg(a64::q1)
, destReg(a64::q0)
, workGprW(a64::w4)
{
}
void VifUnpackNEON_Base::xMovDest() const
{
if (!IsWriteProtectedOp())
{
if (IsUnmaskedOp())
armAsm->Str(destReg, dstIndirect);
else
doMaskWrite(destReg);
}
}
void VifUnpackNEON_Base::xShiftR(const vixl::aarch64::VRegister& regX, int n) const
{
if (usn)
armAsm->Ushr(regX.V4S(), regX.V4S(), n);
else
armAsm->Sshr(regX.V4S(), regX.V4S(), n);
}
void VifUnpackNEON_Base::xPMOVXX8(const vixl::aarch64::VRegister& regX) const
{
// TODO(Stenzek): Check this
armAsm->Ldr(regX.S(), srcIndirect);
if (usn)
{
armAsm->Ushll(regX.V8H(), regX.V8B(), 0);
armAsm->Ushll(regX.V4S(), regX.V4H(), 0);
}
else
{
armAsm->Sshll(regX.V8H(), regX.V8B(), 0);
armAsm->Sshll(regX.V4S(), regX.V4H(), 0);
}
}
void VifUnpackNEON_Base::xPMOVXX16(const vixl::aarch64::VRegister& regX) const
{
armAsm->Ldr(regX.D(), srcIndirect);
if (usn)
armAsm->Ushll(regX.V4S(), regX.V4H(), 0);
else
armAsm->Sshll(regX.V4S(), regX.V4H(), 0);
}
void VifUnpackNEON_Base::xUPK_S_32() const
{
if (UnpkLoopIteration == 0)
armAsm->Ldr(workReg, srcIndirect);
if (IsInputMasked())
return;
switch (UnpkLoopIteration)
{
case 0:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 0);
break;
case 1:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 1);
break;
case 2:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 2);
break;
case 3:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 3);
break;
}
}
void VifUnpackNEON_Base::xUPK_S_16() const
{
if (UnpkLoopIteration == 0)
xPMOVXX16(workReg);
if (IsInputMasked())
return;
switch (UnpkLoopIteration)
{
case 0:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 0);
break;
case 1:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 1);
break;
case 2:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 2);
break;
case 3:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 3);
break;
}
}
void VifUnpackNEON_Base::xUPK_S_8() const
{
if (UnpkLoopIteration == 0)
xPMOVXX8(workReg);
if (IsInputMasked())
return;
switch (UnpkLoopIteration)
{
case 0:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 0);
break;
case 1:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 1);
break;
case 2:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 2);
break;
case 3:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 3);
break;
}
}
// The V2 + V3 unpacks have freaky behaviour, the manual claims "indeterminate".
// After testing on the PS2, it's very much determinate in 99% of cases
// and games like Lemmings, And1 Streetball rely on this data to be like this!
// I have commented after each shuffle to show what data is going where - Ref
void VifUnpackNEON_Base::xUPK_V2_32() const
{
if (UnpkLoopIteration == 0)
{
armAsm->Ldr(workReg, srcIndirect);
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0
if (IsAligned)
armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word - tested on ps2
}
else
{
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2
if (IsAligned)
armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word - tested on ps2
}
}
void VifUnpackNEON_Base::xUPK_V2_16() const
{
if (UnpkLoopIteration == 0)
{
xPMOVXX16(workReg);
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0
}
else
{
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2
}
}
void VifUnpackNEON_Base::xUPK_V2_8() const
{
if (UnpkLoopIteration == 0)
{
xPMOVXX8(workReg);
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0
}
else
{
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2
}
}
void VifUnpackNEON_Base::xUPK_V3_32() const
{
if (IsInputMasked())
return;
armAsm->Ldr(destReg, srcIndirect);
if (UnpkLoopIteration != IsAligned)
armAsm->Ins(destReg.V4S(), 3, a64::wzr);
}
void VifUnpackNEON_Base::xUPK_V3_16() const
{
if (IsInputMasked())
return;
xPMOVXX16(destReg);
//With V3-16, it takes the first vector from the next position as the W vector
//However - IF the end of this iteration of the unpack falls on a quadword boundary, W becomes 0
//IsAligned is the position through the current QW in the vif packet
//Iteration counts where we are in the packet.
int result = (((UnpkLoopIteration / 4) + 1 + (4 - IsAligned)) & 0x3);
if ((UnpkLoopIteration & 0x1) == 0 && result == 0)
armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word on QW boundary if whole 32bit word is used - tested on ps2
}
void VifUnpackNEON_Base::xUPK_V3_8() const
{
if (IsInputMasked())
return;
xPMOVXX8(destReg);
if (UnpkLoopIteration != IsAligned)
armAsm->Ins(destReg.V4S(), 3, a64::wzr);
}
void VifUnpackNEON_Base::xUPK_V4_32() const
{
if (IsInputMasked())
return;
armAsm->Ldr(destReg.Q(), a64::MemOperand(srcIndirect));
}
void VifUnpackNEON_Base::xUPK_V4_16() const
{
if (IsInputMasked())
return;
xPMOVXX16(destReg);
}
void VifUnpackNEON_Base::xUPK_V4_8() const
{
if (IsInputMasked())
return;
xPMOVXX8(destReg);
}
void VifUnpackNEON_Base::xUPK_V4_5() const
{
if (IsInputMasked())
return;
armAsm->Ldrh(workGprW, srcIndirect);
armAsm->Lsl(workGprW, workGprW, 3); // ABG|R5.000
armAsm->Dup(destReg.V4S(), workGprW); // x|x|x|R
armAsm->Lsr(workGprW, workGprW, 8); // ABG
armAsm->Lsl(workGprW, workGprW, 3); // AB|G5.000
armAsm->Ins(destReg.V4S(), 1, workGprW); // x|x|G|R
armAsm->Lsr(workGprW, workGprW, 8); // AB
armAsm->Lsl(workGprW, workGprW, 3); // A|B5.000
armAsm->Ins(destReg.V4S(), 2, workGprW); // x|B|G|R
armAsm->Lsr(workGprW, workGprW, 8); // A
armAsm->Lsl(workGprW, workGprW, 7); // A.0000000
armAsm->Ins(destReg.V4S(), 3, workGprW); // A|B|G|R
armAsm->Shl(destReg.V4S(), destReg.V4S(), 24); // can optimize to
armAsm->Ushr(destReg.V4S(), destReg.V4S(), 24); // single AND...
}
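V4-5 expands a 16-bit RGBA5551 value into four byte lanes; tracing the shift sequence above with the illustrative input 0x801F (A = 1, B = 0, G = 0, R = 31):
//   R lane: ( v        & 0x1F) << 3 = 0xF8
//   G lane: ((v >>  5) & 0x1F) << 3 = 0x00
//   B lane: ((v >> 10) & 0x1F) << 3 = 0x00
//   A lane: ( v >> 15        ) << 7 = 0x80
// The final Shl/Ushr by 24 truncates each 32-bit lane to that low byte.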
void VifUnpackNEON_Base::xUnpack(int upknum) const
{
switch (upknum)
{
case 0:
xUPK_S_32();
break;
case 1:
xUPK_S_16();
break;
case 2:
xUPK_S_8();
break;
case 4:
xUPK_V2_32();
break;
case 5:
xUPK_V2_16();
break;
case 6:
xUPK_V2_8();
break;
case 8:
xUPK_V3_32();
break;
case 9:
xUPK_V3_16();
break;
case 10:
xUPK_V3_8();
break;
case 12:
xUPK_V4_32();
break;
case 13:
xUPK_V4_16();
break;
case 14:
xUPK_V4_8();
break;
case 15:
xUPK_V4_5();
break;
case 3:
case 7:
case 11:
pxFailRel(fmt::format("Vpu/Vif - Invalid Unpack! [{}]", upknum).c_str());
break;
}
}
// =====================================================================================================
// VifUnpackSSE_Simple
// =====================================================================================================
VifUnpackNEON_Simple::VifUnpackNEON_Simple(bool usn_, bool domask_, int curCycle_)
{
curCycle = curCycle_;
usn = usn_;
doMask = domask_;
IsAligned = true;
}
void VifUnpackNEON_Simple::doMaskWrite(const vixl::aarch64::VRegister& regX) const
{
armAsm->Ldr(a64::q7, dstIndirect);
int offX = std::min(curCycle, 3);
armMoveAddressToReg(RXVIXLSCRATCH, nVifMask);
armAsm->Ldr(a64::q29, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast<const u8*>(nVifMask[0][offX]) - reinterpret_cast<const u8*>(nVifMask)));
armAsm->Ldr(a64::q30, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast<const u8*>(nVifMask[1][offX]) - reinterpret_cast<const u8*>(nVifMask)));
armAsm->Ldr(a64::q31, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast<const u8*>(nVifMask[2][offX]) - reinterpret_cast<const u8*>(nVifMask)));
armAsm->And(regX.V16B(), regX.V16B(), a64::q29.V16B());
armAsm->And(a64::q7.V16B(), a64::q7.V16B(), a64::q30.V16B());
armAsm->Orr(regX.V16B(), regX.V16B(), a64::q31.V16B());
armAsm->Orr(regX.V16B(), regX.V16B(), a64::q7.V16B());
armAsm->Str(regX, dstIndirect);
}
// x0 = dest, x1 = src
static void nVifGen(int usn, int mask, int curCycle)
{
int usnpart = usn * 2 * 16;
int maskpart = mask * 16;
VifUnpackNEON_Simple vpugen(!!usn, !!mask, curCycle);
for (int i = 0; i < 16; ++i)
{
nVifCall& ucall(nVifUpk[((usnpart + maskpart + i) * 4) + curCycle]);
ucall = NULL;
if (nVifT[i] == 0)
continue;
ucall = (nVifCall)armStartBlock();
vpugen.xUnpack(i);
vpugen.xMovDest();
armAsm->Ret();
armEndBlock();
}
}
void VifUnpackSSE_Init()
{
DevCon.WriteLn("Generating NEON-optimized unpacking functions for VIF interpreters...");
HostSys::BeginCodeWrite();
armSetAsmPtr(SysMemory::GetVIFUnpackRec(), SysMemory::GetVIFUnpackRecEnd() - SysMemory::GetVIFUnpackRec(), nullptr);
for (int a = 0; a < 2; a++)
{
for (int b = 0; b < 2; b++)
{
for (int c = 0; c < 4; c++)
{
nVifGen(a, b, c);
}
}
}
Perf::any.Register(SysMemory::GetVIFUnpackRec(), armGetAsmPtr() - SysMemory::GetVIFUnpackRec(), "VIF Unpack");
HostSys::EndCodeWrite();
}
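The generated stubs land in nVifUpk at index ((usn * 2 + mask) * 16 + upkType) * 4 + curCycle, matching the usnpart/maskpart arithmetic in nVifGen(); an illustrative case:
// usn = 1, mask = 0, upkType = 12 (V4-32), curCycle = 2:
//   usnpart = 1 * 2 * 16 = 32, maskpart = 0
//   index   = (32 + 0 + 12) * 4 + 2 = 178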

pcsx2/arm64/newVif_UnpackNEON.h

@@ -0,0 +1,146 @@
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#pragma once
#include "Common.h"
#include "Vif_Dma.h"
#include "Vif_Dynarec.h"
#include "arm64/AsmHelpers.h"
#define xmmCol0 vixl::aarch64::q2
#define xmmCol1 vixl::aarch64::q3
#define xmmCol2 vixl::aarch64::q4
#define xmmCol3 vixl::aarch64::q5
#define xmmRow vixl::aarch64::q6
#define xmmTemp vixl::aarch64::q7
// --------------------------------------------------------------------------------------
// VifUnpackSSE_Base
// --------------------------------------------------------------------------------------
class VifUnpackNEON_Base
{
public:
bool usn; // unsigned flag
bool doMask; // masking write enable flag
int UnpkLoopIteration;
int UnpkNoOfIterations;
int IsAligned;
protected:
vixl::aarch64::MemOperand dstIndirect;
vixl::aarch64::MemOperand srcIndirect;
vixl::aarch64::VRegister workReg;
vixl::aarch64::VRegister destReg;
vixl::aarch64::WRegister workGprW;
public:
VifUnpackNEON_Base();
virtual ~VifUnpackNEON_Base() = default;
virtual void xUnpack(int upktype) const;
virtual bool IsWriteProtectedOp() const = 0;
virtual bool IsInputMasked() const = 0;
virtual bool IsUnmaskedOp() const = 0;
virtual void xMovDest() const;
protected:
virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const = 0;
virtual void xShiftR(const vixl::aarch64::VRegister& regX, int n) const;
virtual void xPMOVXX8(const vixl::aarch64::VRegister& regX) const;
virtual void xPMOVXX16(const vixl::aarch64::VRegister& regX) const;
virtual void xUPK_S_32() const;
virtual void xUPK_S_16() const;
virtual void xUPK_S_8() const;
virtual void xUPK_V2_32() const;
virtual void xUPK_V2_16() const;
virtual void xUPK_V2_8() const;
virtual void xUPK_V3_32() const;
virtual void xUPK_V3_16() const;
virtual void xUPK_V3_8() const;
virtual void xUPK_V4_32() const;
virtual void xUPK_V4_16() const;
virtual void xUPK_V4_8() const;
virtual void xUPK_V4_5() const;
};
// --------------------------------------------------------------------------------------
// VifUnpackSSE_Simple
// --------------------------------------------------------------------------------------
class VifUnpackNEON_Simple : public VifUnpackNEON_Base
{
typedef VifUnpackNEON_Base _parent;
public:
int curCycle;
public:
VifUnpackNEON_Simple(bool usn_, bool domask_, int curCycle_);
virtual ~VifUnpackNEON_Simple() = default;
virtual bool IsWriteProtectedOp() const { return false; }
virtual bool IsInputMasked() const { return false; }
virtual bool IsUnmaskedOp() const { return !doMask; }
protected:
virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const;
};
// --------------------------------------------------------------------------------------
// VifUnpackSSE_Dynarec
// --------------------------------------------------------------------------------------
class VifUnpackNEON_Dynarec : public VifUnpackNEON_Base
{
typedef VifUnpackNEON_Base _parent;
public:
bool isFill;
int doMode; // two bit value representing difference mode
bool skipProcessing;
bool inputMasked;
protected:
const nVifStruct& v; // vif0 or vif1
const nVifBlock& vB; // some pre-collected data from VifStruct
int vCL; // internal copy of vif->cl
public:
VifUnpackNEON_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_);
VifUnpackNEON_Dynarec(const VifUnpackNEON_Dynarec& src) // copy constructor
: _parent(src)
, v(src.v)
, vB(src.vB)
{
isFill = src.isFill;
vCL = src.vCL;
}
virtual ~VifUnpackNEON_Dynarec() = default;
virtual bool IsWriteProtectedOp() const { return skipProcessing; }
virtual bool IsInputMasked() const { return inputMasked; }
virtual bool IsUnmaskedOp() const { return !doMode && !doMask; }
void ModUnpack(int upknum, bool PostOp);
void ProcessMasks();
void CompileRoutine();
protected:
virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const;
void SetMasks(int cS) const;
void writeBackRow() const;
static VifUnpackNEON_Dynarec FillingWrite(const VifUnpackNEON_Dynarec& src)
{
VifUnpackNEON_Dynarec fillingWrite(src);
fillingWrite.doMask = true;
fillingWrite.doMode = 0;
return fillingWrite;
}
};

pcsx2/pcsx2.vcxproj

@@ -112,6 +112,15 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="Achievements.cpp" />
<ClCompile Include="arm64\AsmHelpers.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="arm64\newVif_Dynarec.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="arm64\newVif_UnpackNEON.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="CDVD\BlockdumpFileReader.cpp" />
<ClCompile Include="CDVD\CDVDdiscReader.cpp" />
<ClCompile Include="CDVD\CDVDdiscThread.cpp" />
@@ -554,6 +563,12 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="Achievements.h" />
<ClInclude Include="arm64\AsmHelpers.h">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="arm64\newVif_UnpackNEON.h">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="CDVD\BlockdumpFileReader.h" />
<ClInclude Include="CDVD\CDVDdiscReader.h" />
<ClInclude Include="CDVD\CsoFileReader.h" />

pcsx2/pcsx2.vcxproj.filters

@@ -280,6 +280,12 @@
<Filter Include="Misc\Host">
<UniqueIdentifier>{9f0d3bda-76d4-42d3-87e9-ce65db9163ef}</UniqueIdentifier>
</Filter>
<Filter Include="System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64">
<UniqueIdentifier>{8aea3ae6-9722-463a-94ac-34f3738a3153}</UniqueIdentifier>
</Filter>
<Filter Include="Tools\arm64">
<UniqueIdentifier>{cf847f4e-744e-4c27-a7ac-8564726fb4e6}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="Docs\License.txt">
@@ -1398,6 +1404,15 @@
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="arm64\newVif_Dynarec.cpp">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64</Filter>
</ClCompile>
<ClCompile Include="arm64\newVif_UnpackNEON.cpp">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64</Filter>
</ClCompile>
<ClCompile Include="arm64\AsmHelpers.cpp">
<Filter>Tools\arm64</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Patch.h">
@@ -2321,6 +2336,12 @@
<ClInclude Include="GS\GSVector4i_arm64.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="arm64\newVif_UnpackNEON.h">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64</Filter>
</ClInclude>
<ClInclude Include="arm64\AsmHelpers.h">
<Filter>Tools\arm64</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="rdebug\deci2.h">