From fe9399612dc075090b285699b188fbc298ca1fc8 Mon Sep 17 00:00:00 2001 From: Stenzek Date: Thu, 21 Mar 2024 19:24:35 +1000 Subject: [PATCH] arm64: Add VIF dynarec --- pcsx2/CMakeLists.txt | 21 +- pcsx2/arm64/AsmHelpers.cpp | 461 +++++++++++++++++++++++ pcsx2/arm64/AsmHelpers.h | 146 ++++++++ pcsx2/arm64/newVif_Dynarec.cpp | 585 ++++++++++++++++++++++++++++++ pcsx2/arm64/newVif_UnpackNEON.cpp | 425 ++++++++++++++++++++++ pcsx2/arm64/newVif_UnpackNEON.h | 146 ++++++++ pcsx2/pcsx2.vcxproj | 15 + pcsx2/pcsx2.vcxproj.filters | 21 ++ 8 files changed, 1818 insertions(+), 2 deletions(-) create mode 100644 pcsx2/arm64/AsmHelpers.cpp create mode 100644 pcsx2/arm64/AsmHelpers.h create mode 100644 pcsx2/arm64/newVif_Dynarec.cpp create mode 100644 pcsx2/arm64/newVif_UnpackNEON.cpp create mode 100644 pcsx2/arm64/newVif_UnpackNEON.h diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index d350fdcfe7..359d35b70c 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -1018,20 +1018,37 @@ set(pcsx2x86Headers x86/R5900_Profiler.h ) +# ARM64 +set(pcsx2arm64Sources + arm64/AsmHelpers.cpp + arm64/newVif_Dynarec.cpp + arm64/newVif_UnpackNEON.cpp + ) + +set(pcsx2arm64Headers + arm64/AsmHelpers.h +) + # These ones benefit a lot from LTO set(pcsx2LTOSources ${pcsx2Sources} ${pcsx2Headers} ${pcsx2IPUSources} ${pcsx2IPUHeaders} - ${pcsx2x86Sources} - ${pcsx2x86Headers} ${pcsx2SPU2Sources} ${pcsx2SPU2Headers} ${pcsx2GSSources} ${pcsx2GSHeaders} ) +if(_M_X86) + list(APPEND pcsx2LTOSources ${pcsx2x86Sources} ${pcsx2x86Headers}) + target_link_libraries(PCSX2_FLAGS INTERFACE zydis) +elseif(_M_ARM64) + list(APPEND pcsx2LTOSources ${pcsx2arm64Sources} ${pcsx2arm64Headers}) + target_link_libraries(PCSX2_FLAGS INTERFACE vixl) +endif() + if(LTO_PCSX2_CORE) add_library(PCSX2_LTO ${pcsx2LTOSources}) if (DISABLE_ADVANCE_SIMD) diff --git a/pcsx2/arm64/AsmHelpers.cpp b/pcsx2/arm64/AsmHelpers.cpp new file mode 100644 index 0000000000..ddfe163e2f --- /dev/null +++ b/pcsx2/arm64/AsmHelpers.cpp @@ -0,0 +1,461 @@ +// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#include "arm64/AsmHelpers.h" + +#include "common/Assertions.h" +#include "common/BitUtils.h" +#include "common/Console.h" +#include "common/HostSys.h" + +const vixl::aarch64::Register& armWRegister(int n) +{ + using namespace vixl::aarch64; + static constexpr const Register* regs[32] = {&w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, &w8, &w9, &w10, + &w11, &w12, &w13, &w14, &w15, &w16, &w17, &w18, &w19, &w20, &w21, &w22, &w23, &w24, &w25, &w26, &w27, &w28, + &w29, &w30, &w31}; + pxAssert(static_cast(n) < std::size(regs)); + return *regs[n]; +} + +const vixl::aarch64::Register& armXRegister(int n) +{ + using namespace vixl::aarch64; + static constexpr const Register* regs[32] = {&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8, &x9, &x10, + &x11, &x12, &x13, &x14, &x15, &x16, &x17, &x18, &x19, &x20, &x21, &x22, &x23, &x24, &x25, &x26, &x27, &x28, + &x29, &x30, &x31}; + pxAssert(static_cast(n) < std::size(regs)); + return *regs[n]; +} + +const vixl::aarch64::VRegister& armSRegister(int n) +{ + using namespace vixl::aarch64; + static constexpr const VRegister* regs[32] = {&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &vixl::aarch64::s8, &s9, &s10, + &s11, &s12, &s13, &s14, &s15, &vixl::aarch64::s16, &s17, &s18, &s19, &s20, &s21, &s22, &s23, &s24, &s25, &s26, &s27, &s28, + &s29, &s30, &s31}; + pxAssert(static_cast(n) < std::size(regs)); + return *regs[n]; +} + +const vixl::aarch64::VRegister& armDRegister(int n) +{ + using 
namespace vixl::aarch64; + static constexpr const VRegister* regs[32] = {&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &d8, &d9, &d10, + &d11, &d12, &d13, &d14, &d15, &d16, &d17, &d18, &d19, &d20, &d21, &d22, &d23, &d24, &d25, &d26, &d27, &d28, + &d29, &d30, &d31}; + pxAssert(static_cast(n) < std::size(regs)); + return *regs[n]; +} + +const vixl::aarch64::VRegister& armQRegister(int n) +{ + using namespace vixl::aarch64; + static constexpr const VRegister* regs[32] = {&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7, &q8, &q9, &q10, + &q11, &q12, &q13, &q14, &q15, &q16, &q17, &q18, &q19, &q20, &q21, &q22, &q23, &q24, &q25, &q26, &q27, &q28, + &q29, &q30, &q31}; + pxAssert(static_cast(n) < std::size(regs)); + return *regs[n]; +} + + +//#define INCLUDE_DISASSEMBLER + +#ifdef INCLUDE_DISASSEMBLER +#include "vixl/aarch64/disasm-aarch64.h" +#endif + +namespace a64 = vixl::aarch64; + +thread_local a64::MacroAssembler* armAsm; +thread_local u8* armAsmPtr; +thread_local size_t armAsmCapacity; +thread_local ArmConstantPool* armConstantPool; + +#ifdef INCLUDE_DISASSEMBLER +static std::mutex armDisasmMutex; +static std::unique_ptr armDisasm; +static std::unique_ptr armDisasmDecoder; +#endif + +void armSetAsmPtr(void* ptr, size_t capacity, ArmConstantPool* pool) +{ + pxAssert(!armAsm); + armAsmPtr = static_cast(ptr); + armAsmCapacity = capacity; + armConstantPool = pool; +} + +// Align to 16 bytes, apparently ARM likes that. +void armAlignAsmPtr() +{ + static constexpr uintptr_t ALIGNMENT = 16; + u8* new_ptr = reinterpret_cast((reinterpret_cast(armAsmPtr) + (ALIGNMENT - 1)) & ~(ALIGNMENT - 1)); + pxAssert(static_cast(new_ptr - armAsmPtr) <= armAsmCapacity); + armAsmCapacity -= (new_ptr - armAsmPtr); + armAsmPtr = new_ptr; +} + +u8* armStartBlock() +{ + armAlignAsmPtr(); + + HostSys::BeginCodeWrite(); + + pxAssert(!armAsm); + armAsm = new vixl::aarch64::MacroAssembler(static_cast(armAsmPtr), armAsmCapacity); + armAsm->GetScratchVRegisterList()->Remove(31); + armAsm->GetScratchRegisterList()->Remove(RSCRATCHADDR.GetCode()); + return armAsmPtr; +} + +u8* armEndBlock() +{ + pxAssert(armAsm); + + armAsm->FinalizeCode(); + + const u32 size = static_cast(armAsm->GetSizeOfCodeGenerated()); + pxAssert(size < armAsmCapacity); + + delete armAsm; + armAsm = nullptr; + + HostSys::EndCodeWrite(); + + HostSys::FlushInstructionCache(armAsmPtr, size); + + armAsmPtr = armAsmPtr + size; + armAsmCapacity -= size; + return armAsmPtr; +} + +void armDisassembleAndDumpCode(const void* ptr, size_t size) +{ +#ifdef INCLUDE_DISASSEMBLER + std::unique_lock lock(armDisasmMutex); + if (!armDisasm) + { + armDisasm = std::make_unique(stderr); + armDisasmDecoder = std::make_unique(); + armDisasmDecoder->AppendVisitor(armDisasm.get()); + } + + armDisasmDecoder->Decode(static_cast(ptr), static_cast(ptr) + size); +#else + Console.Error("Not compiled with INCLUDE_DISASSEMBLER"); +#endif +} + +void armEmitJmp(const void* ptr, bool force_inline) +{ + s64 displacement = GetPCDisplacement(armGetCurrentCodePointer(), ptr); + bool use_blr = !vixl::IsInt26(displacement); + if (use_blr && armConstantPool && !force_inline) + { + if (u8* trampoline = armConstantPool->GetJumpTrampoline(ptr); trampoline) + { + displacement = GetPCDisplacement(armGetCurrentCodePointer(), trampoline); + use_blr = !vixl::IsInt26(displacement); + } + } + + if (use_blr) + { + armAsm->Mov(RXVIXLSCRATCH, reinterpret_cast(ptr)); + armAsm->Br(RXVIXLSCRATCH); + } + else + { + a64::SingleEmissionCheckScope guard(armAsm); + armAsm->b(displacement); + } +} + +void armEmitCall(const void* ptr, 
bool force_inline) +{ + s64 displacement = GetPCDisplacement(armGetCurrentCodePointer(), ptr); + bool use_blr = !vixl::IsInt26(displacement); + if (use_blr && armConstantPool && !force_inline) + { + if (u8* trampoline = armConstantPool->GetJumpTrampoline(ptr); trampoline) + { + displacement = GetPCDisplacement(armGetCurrentCodePointer(), trampoline); + use_blr = !vixl::IsInt26(displacement); + } + } + + if (use_blr) + { + armAsm->Mov(RXVIXLSCRATCH, reinterpret_cast(ptr)); + armAsm->Blr(RXVIXLSCRATCH); + } + else + { + a64::SingleEmissionCheckScope guard(armAsm); + armAsm->bl(displacement); + } +} + +void armEmitCbnz(const vixl::aarch64::Register& reg, const void* ptr) +{ + const s64 jump_distance = + static_cast(reinterpret_cast(ptr) - reinterpret_cast(armGetCurrentCodePointer())); + //pxAssert(Common::IsAligned(jump_distance, 4)); + if (a64::Instruction::IsValidImmPCOffset(a64::CompareBranchType, jump_distance >> 2)) + { + a64::SingleEmissionCheckScope guard(armAsm); + armAsm->cbnz(reg, jump_distance >> 2); + } + else + { + a64::MacroEmissionCheckScope guard(armAsm); + a64::Label branch_not_taken; + armAsm->cbz(reg, &branch_not_taken); + + const s64 new_jump_distance = + static_cast(reinterpret_cast(ptr) - reinterpret_cast(armGetCurrentCodePointer())); + armAsm->b(new_jump_distance >> 2); + armAsm->bind(&branch_not_taken); + } +} + +void armEmitCondBranch(a64::Condition cond, const void* ptr) +{ + const s64 jump_distance = + static_cast(reinterpret_cast(ptr) - reinterpret_cast(armGetCurrentCodePointer())); + //pxAssert(Common::IsAligned(jump_distance, 4)); + + if (a64::Instruction::IsValidImmPCOffset(a64::CondBranchType, jump_distance >> 2)) + { + a64::SingleEmissionCheckScope guard(armAsm); + armAsm->b(jump_distance >> 2, cond); + } + else + { + a64::MacroEmissionCheckScope guard(armAsm); + a64::Label branch_not_taken; + armAsm->b(&branch_not_taken, a64::InvertCondition(cond)); + + const s64 new_jump_distance = + static_cast(reinterpret_cast(ptr) - reinterpret_cast(armGetCurrentCodePointer())); + armAsm->b(new_jump_distance >> 2); + armAsm->bind(&branch_not_taken); + } +} + +void armMoveAddressToReg(const vixl::aarch64::Register& reg, const void* addr) +{ + // psxAsm->Mov(reg, static_cast(reinterpret_cast(addr))); + pxAssert(reg.IsX()); + + const void* current_code_ptr_page = reinterpret_cast( + reinterpret_cast(armGetCurrentCodePointer()) & ~static_cast(0xFFF)); + const void* ptr_page = + reinterpret_cast(reinterpret_cast(addr) & ~static_cast(0xFFF)); + const s64 page_displacement = GetPCDisplacement(current_code_ptr_page, ptr_page) >> 10; + const u32 page_offset = static_cast(reinterpret_cast(addr) & 0xFFFu); + if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmAddSub(page_offset)) + { + { + a64::SingleEmissionCheckScope guard(armAsm); + armAsm->adrp(reg, page_displacement); + } + armAsm->Add(reg, reg, page_offset); + } + else if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64)) + { + { + a64::SingleEmissionCheckScope guard(armAsm); + armAsm->adrp(reg, page_displacement); + } + armAsm->Orr(reg, reg, page_offset); + } + else + { + armAsm->Mov(reg, reinterpret_cast(addr)); + } +} + +void armLoadPtr(const vixl::aarch64::CPURegister& reg, const void* addr) +{ + armMoveAddressToReg(RSCRATCHADDR, addr); + armAsm->Ldr(reg, a64::MemOperand(RSCRATCHADDR)); +} + +void armStorePtr(const vixl::aarch64::CPURegister& reg, const void* addr) +{ + armMoveAddressToReg(RSCRATCHADDR, addr); + armAsm->Str(reg, a64::MemOperand(RSCRATCHADDR)); +} + +void 
armBeginStackFrame(bool save_fpr) +{ + // save x19 through x28, x29 could also be used + armAsm->Sub(a64::sp, a64::sp, save_fpr ? 192 : 144); + armAsm->Stp(a64::x19, a64::x20, a64::MemOperand(a64::sp, 32)); + armAsm->Stp(a64::x21, a64::x22, a64::MemOperand(a64::sp, 48)); + armAsm->Stp(a64::x23, a64::x24, a64::MemOperand(a64::sp, 64)); + armAsm->Stp(a64::x25, a64::x26, a64::MemOperand(a64::sp, 80)); + armAsm->Stp(a64::x27, a64::x28, a64::MemOperand(a64::sp, 96)); + armAsm->Stp(a64::x29, a64::lr, a64::MemOperand(a64::sp, 112)); + if (save_fpr) + { + armAsm->Stp(a64::d8, a64::d9, a64::MemOperand(a64::sp, 128)); + armAsm->Stp(a64::d10, a64::d11, a64::MemOperand(a64::sp, 144)); + armAsm->Stp(a64::d12, a64::d13, a64::MemOperand(a64::sp, 160)); + armAsm->Stp(a64::d14, a64::d15, a64::MemOperand(a64::sp, 176)); + } +} + +void armEndStackFrame(bool save_fpr) +{ + if (save_fpr) + { + armAsm->Ldp(a64::d14, a64::d15, a64::MemOperand(a64::sp, 176)); + armAsm->Ldp(a64::d12, a64::d13, a64::MemOperand(a64::sp, 160)); + armAsm->Ldp(a64::d10, a64::d11, a64::MemOperand(a64::sp, 144)); + armAsm->Ldp(a64::d8, a64::d9, a64::MemOperand(a64::sp, 128)); + } + armAsm->Ldp(a64::x29, a64::lr, a64::MemOperand(a64::sp, 112)); + armAsm->Ldp(a64::x27, a64::x28, a64::MemOperand(a64::sp, 96)); + armAsm->Ldp(a64::x25, a64::x26, a64::MemOperand(a64::sp, 80)); + armAsm->Ldp(a64::x23, a64::x24, a64::MemOperand(a64::sp, 64)); + armAsm->Ldp(a64::x21, a64::x22, a64::MemOperand(a64::sp, 48)); + armAsm->Ldp(a64::x19, a64::x20, a64::MemOperand(a64::sp, 32)); + armAsm->Add(a64::sp, a64::sp, save_fpr ? 192 : 144); +} + +bool armIsCalleeSavedRegister(int reg) +{ + // same on both linux and windows + return (reg >= 19); +} + +vixl::aarch64::MemOperand armOffsetMemOperand(const vixl::aarch64::MemOperand& op, s64 offset) +{ + pxAssert(op.GetBaseRegister().IsValid() && op.GetAddrMode() == vixl::aarch64::Offset && op.GetShift() == vixl::aarch64::NO_SHIFT); + return vixl::aarch64::MemOperand(op.GetBaseRegister(), op.GetOffset() + offset, op.GetAddrMode()); +} + +void armGetMemOperandInRegister(const vixl::aarch64::Register& addr_reg, const vixl::aarch64::MemOperand& op, s64 extra_offset /*= 0*/) +{ + pxAssert(addr_reg.IsX()); + pxAssert(op.GetBaseRegister().IsValid() && op.GetAddrMode() == vixl::aarch64::Offset && op.GetShift() == vixl::aarch64::NO_SHIFT); + armAsm->Add(addr_reg, op.GetBaseRegister(), op.GetOffset() + extra_offset); +} + +void armLoadConstant128(const vixl::aarch64::VRegister& reg, const void* ptr) +{ + u64 low, high; + memcpy(&low, ptr, sizeof(low)); + memcpy(&high, static_cast(ptr) + sizeof(low), sizeof(high)); + armAsm->Ldr(reg, high, low); +} + +void armEmitVTBL(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src1, const vixl::aarch64::VRegister& src2, const vixl::aarch64::VRegister& tbl) +{ + pxAssert(src1.GetCode() != RQSCRATCH.GetCode() && src2.GetCode() != RQSCRATCH2.GetCode()); + pxAssert(tbl.GetCode() != RQSCRATCH.GetCode() && tbl.GetCode() != RQSCRATCH2.GetCode()); + + // must be consecutive + if (src2.GetCode() == (src1.GetCode() + 1)) + { + armAsm->Tbl(dst.V16B(), src1.V16B(), src2.V16B(), tbl.V16B()); + return; + } + + armAsm->Mov(RQSCRATCH.Q(), src1.Q()); + armAsm->Mov(RQSCRATCH2.Q(), src2.Q()); + armAsm->Tbl(dst.V16B(), RQSCRATCH.V16B(), RQSCRATCH2.V16B(), tbl.V16B()); +} + + +void ArmConstantPool::Init(void* ptr, u32 capacity) +{ + m_base_ptr = static_cast(ptr); + m_capacity = capacity; + m_used = 0; + m_jump_targets.clear(); + m_literals.clear(); +} + +void ArmConstantPool::Destroy() +{ 
+ m_base_ptr = nullptr; + m_capacity = 0; + m_used = 0; + m_jump_targets.clear(); + m_literals.clear(); +} + +void ArmConstantPool::Reset() +{ + m_used = 0; + m_jump_targets.clear(); + m_literals.clear(); +} + +u8* ArmConstantPool::GetJumpTrampoline(const void* target) +{ + auto it = m_jump_targets.find(target); + if (it != m_jump_targets.end()) + return m_base_ptr + it->second; + + // align to 16 bytes? + const u32 offset = Common::AlignUpPow2(m_used, 16); + + // 4 movs plus a jump + if ((m_capacity - offset) < 20) + { + Console.Error("Ran out of space in constant pool"); + return nullptr; + } + + a64::MacroAssembler masm(static_cast(m_base_ptr + offset), m_capacity - offset); + masm.Mov(RXVIXLSCRATCH, reinterpret_cast(target)); + masm.Br(RXVIXLSCRATCH); + masm.FinalizeCode(); + + pxAssert(masm.GetSizeOfCodeGenerated() < 20); + m_jump_targets.emplace(target, offset); + m_used = offset + static_cast(masm.GetSizeOfCodeGenerated()); + + HostSys::FlushInstructionCache(reinterpret_cast(m_base_ptr + offset), m_used - offset); + + return m_base_ptr + offset; +} + +u8* ArmConstantPool::GetLiteral(u64 value) +{ + return GetLiteral(u128::From64(value)); +} + +u8* ArmConstantPool::GetLiteral(const u128& value) +{ + auto it = m_literals.find(value); + if (it != m_literals.end()) + return m_base_ptr + it->second; + + if (GetRemainingCapacity() < 8) + return nullptr; + + const u32 offset = Common::AlignUpPow2(m_used, 16); + std::memcpy(&m_base_ptr[offset], &value, sizeof(value)); + m_used = offset + sizeof(value); + return m_base_ptr + offset; +} + +u8* ArmConstantPool::GetLiteral(const u8* bytes, size_t len) +{ + pxAssertMsg(len <= 16, "literal length is less than 16 bytes"); + u128 table_u128 = {}; + std::memcpy(table_u128._u8, bytes, len); + return GetLiteral(table_u128); +} + +void ArmConstantPool::EmitLoadLiteral(const vixl::aarch64::CPURegister& reg, const u8* literal) const +{ + armMoveAddressToReg(RXVIXLSCRATCH, literal); + armAsm->Ldr(reg, a64::MemOperand(RXVIXLSCRATCH)); +} diff --git a/pcsx2/arm64/AsmHelpers.h b/pcsx2/arm64/AsmHelpers.h new file mode 100644 index 0000000000..04ae4ae0a3 --- /dev/null +++ b/pcsx2/arm64/AsmHelpers.h @@ -0,0 +1,146 @@ +// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#pragma once + +#include "common/Pcsx2Defs.h" +#include "common/HashCombine.h" + +#include "vixl/aarch64/constants-aarch64.h" +#include "vixl/aarch64/macro-assembler-aarch64.h" + +#include + +#define RWRET vixl::aarch64::w0 +#define RXRET vixl::aarch64::x0 +#define RQRET vixl::aarch64::q0 + +#define RWARG1 vixl::aarch64::w0 +#define RWARG2 vixl::aarch64::w1 +#define RWARG3 vixl::aarch64::w2 +#define RWARG4 vixl::aarch64::w3 +#define RXARG1 vixl::aarch64::x0 +#define RXARG2 vixl::aarch64::x1 +#define RXARG3 vixl::aarch64::x2 +#define RXARG4 vixl::aarch64::x3 + +#define RXVIXLSCRATCH vixl::aarch64::x16 +#define RWVIXLSCRATCH vixl::aarch64::w16 +#define RSCRATCHADDR vixl::aarch64::x17 + +#define RQSCRATCH vixl::aarch64::q30 +#define RDSCRATCH vixl::aarch64::d30 +#define RSSCRATCH vixl::aarch64::s30 +#define RQSCRATCH2 vixl::aarch64::q31 +#define RDSCRATCH2 vixl::aarch64::d31 +#define RSSCRATCH2 vixl::aarch64::s31 +#define RQSCRATCH3 vixl::aarch64::q29 +#define RDSCRATCH3 vixl::aarch64::d29 +#define RSSCRATCH3 vixl::aarch64::s29 + +#define RQSCRATCHI vixl::aarch64::VRegister(30, 128, 16) +#define RQSCRATCHF vixl::aarch64::VRegister(30, 128, 4) +#define RQSCRATCHD vixl::aarch64::VRegister(30, 128, 2) + +#define RQSCRATCH2I 
vixl::aarch64::VRegister(31, 128, 16) +#define RQSCRATCH2F vixl::aarch64::VRegister(31, 128, 4) +#define RQSCRATCH2D vixl::aarch64::VRegister(31, 128, 2) + +static inline s64 GetPCDisplacement(const void* current, const void* target) +{ + return static_cast((reinterpret_cast(target) - reinterpret_cast(current)) >> 2); +} + +const vixl::aarch64::Register& armWRegister(int n); +const vixl::aarch64::Register& armXRegister(int n); +const vixl::aarch64::VRegister& armSRegister(int n); +const vixl::aarch64::VRegister& armDRegister(int n); +const vixl::aarch64::VRegister& armQRegister(int n); + +class ArmConstantPool; + +static const u32 SP_SCRATCH_OFFSET = 0; + +extern thread_local vixl::aarch64::MacroAssembler* armAsm; +extern thread_local u8* armAsmPtr; +extern thread_local size_t armAsmCapacity; +extern thread_local ArmConstantPool* armConstantPool; + +static __fi bool armHasBlock() +{ + return (armAsm != nullptr); +} + +static __fi u8* armGetCurrentCodePointer() +{ + return static_cast(armAsmPtr) + armAsm->GetCursorOffset(); +} + +__fi static u8* armGetAsmPtr() +{ + return armAsmPtr; +} + +void armSetAsmPtr(void* ptr, size_t capacity, ArmConstantPool* pool); +void armAlignAsmPtr(); +u8* armStartBlock(); +u8* armEndBlock(); + +void armDisassembleAndDumpCode(const void* ptr, size_t size); +void armEmitJmp(const void* ptr, bool force_inline = false); +void armEmitCall(const void* ptr, bool force_inline = false); +void armEmitCbnz(const vixl::aarch64::Register& reg, const void* ptr); +void armEmitCondBranch(vixl::aarch64::Condition cond, const void* ptr); +void armMoveAddressToReg(const vixl::aarch64::Register& reg, const void* addr); +void armLoadPtr(const vixl::aarch64::CPURegister& reg, const void* addr); +void armStorePtr(const vixl::aarch64::CPURegister& reg, const void* addr); +void armBeginStackFrame(bool save_fpr); +void armEndStackFrame(bool save_fpr); +bool armIsCalleeSavedRegister(int reg); + +vixl::aarch64::MemOperand armOffsetMemOperand(const vixl::aarch64::MemOperand& op, s64 offset); +void armGetMemOperandInRegister(const vixl::aarch64::Register& addr_reg, + const vixl::aarch64::MemOperand& op, s64 extra_offset = 0); + +void armLoadConstant128(const vixl::aarch64::VRegister& reg, const void* ptr); + +// may clobber RSCRATCH/RSCRATCH2. they shouldn't be inputs. 
+void armEmitVTBL(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src1, + const vixl::aarch64::VRegister& src2, const vixl::aarch64::VRegister& tbl); + +////////////////////////////////////////////////////////////////////////// + +class ArmConstantPool +{ +public: + void Init(void* ptr, u32 capacity); + void Destroy(); + void Reset(); + + u8* GetJumpTrampoline(const void* target); + u8* GetLiteral(u64 value); + u8* GetLiteral(const u128& value); + u8* GetLiteral(const u8* bytes, size_t len); + + void EmitLoadLiteral(const vixl::aarch64::CPURegister& reg, const u8* literal) const; + +private: + __fi u32 GetRemainingCapacity() const { return m_capacity - m_used; } + + struct u128_hash + { + std::size_t operator()(const u128& v) const + { + std::size_t s = 0; + HashCombine(s, v.lo, v.hi); + return s; + } + }; + + std::unordered_map m_jump_targets; + std::unordered_map m_literals; + + u8* m_base_ptr = nullptr; + u32 m_capacity = 0; + u32 m_used = 0; +}; diff --git a/pcsx2/arm64/newVif_Dynarec.cpp b/pcsx2/arm64/newVif_Dynarec.cpp new file mode 100644 index 0000000000..6410ee8c11 --- /dev/null +++ b/pcsx2/arm64/newVif_Dynarec.cpp @@ -0,0 +1,585 @@ +// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#include "arm64/newVif_UnpackNEON.h" +#include "arm64/AsmHelpers.h" +#include "MTVU.h" + +#include "common/Assertions.h" +#include "common/Perf.h" +#include "common/StringUtil.h" + +namespace a64 = vixl::aarch64; + +static void mVUmergeRegs(const vixl::aarch64::VRegister& dest, const vixl::aarch64::VRegister& src, int xyzw, bool modXYZW = false, bool canModifySrc = false) +{ + xyzw &= 0xf; + if ((dest.GetCode() != src.GetCode()) && (xyzw != 0)) + { + if (xyzw == 0x8) + armAsm->Mov(dest.V4S(), 0, src.V4S(), 0); + else if (xyzw == 0xf) + armAsm->Mov(dest.Q(), src.Q()); + else + { + if (modXYZW) + { + if (xyzw == 1) + { + armAsm->Ins(dest.V4S(), 3, src.V4S(), 0); + return; + } + else if (xyzw == 2) + { + armAsm->Ins(dest.V4S(), 2, src.V4S(), 0); + return; + } + else if (xyzw == 4) + { + armAsm->Ins(dest.V4S(), 1, src.V4S(), 0); + return; + } + } + + if (xyzw == 0) + return; + if (xyzw == 15) + { + armAsm->Mov(dest, src); + return; + } + if (xyzw == 14 && canModifySrc) + { + // xyz - we can get rid of the mov if we swap the RA around + armAsm->Mov(src.V4S(), 3, dest.V4S(), 3); + armAsm->Mov(dest.V16B(), src.V16B()); + return; + } + + // reverse + xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3); + + if ((xyzw & 3) == 3) + { + // xy + armAsm->Mov(dest.V2D(), 0, src.V2D(), 0); + xyzw &= ~3; + } + else if ((xyzw & 12) == 12) + { + // zw + armAsm->Mov(dest.V2D(), 1, src.V2D(), 1); + xyzw &= ~12; + } + + // xyzw + for (u32 i = 0; i < 4; i++) + { + if (xyzw & (1u << i)) + armAsm->Mov(dest.V4S(), i, src.V4S(), i); + } + } + } +} + +static void maskedVecWrite(const a64::VRegister& reg, const a64::MemOperand& addr, int xyzw) +{ + switch (xyzw) + { + case 5: // YW + armGetMemOperandInRegister(RSCRATCHADDR, addr, 4); + armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR)); // Y + armGetMemOperandInRegister(RSCRATCHADDR, addr, 12); + armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR)); // W + break; + + case 9: // XW + armGetMemOperandInRegister(RSCRATCHADDR, addr, 12); + armAsm->Str(reg.S(), addr); // X + armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR)); // W + break; + + case 10: //XZ + armGetMemOperandInRegister(RSCRATCHADDR, addr, 8); + armAsm->Str(reg.S(), addr); // X + 
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR)); // Z + break; + + case 3: // ZW + armGetMemOperandInRegister(RSCRATCHADDR, addr, 8); + armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR)); + break; + + case 11: //XZW + armGetMemOperandInRegister(RSCRATCHADDR, addr, 8); + armAsm->Str(reg.S(), addr); // X + armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR)); // ZW + break; + + case 13: // XYW + armGetMemOperandInRegister(RSCRATCHADDR, addr, 12); + armAsm->Str(reg.D(), addr); + armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR)); + break; + + case 6: // YZ + armGetMemOperandInRegister(RSCRATCHADDR, addr, 4); + armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR, 4, a64::PostIndex)); + armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR)); + break; + + case 7: // YZW + armGetMemOperandInRegister(RSCRATCHADDR, addr, 4); + armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR, 4, a64::PostIndex)); + armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR)); + break; + + case 12: // XY + armAsm->Str(reg.D(), addr); + break; + + case 14: // XYZ + armGetMemOperandInRegister(RSCRATCHADDR, addr, 8); + armAsm->Str(reg.D(), addr); + armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR)); // Z + break; + + case 4: + armGetMemOperandInRegister(RSCRATCHADDR, addr, 4); + armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR)); + break; // Y + case 2: + armGetMemOperandInRegister(RSCRATCHADDR, addr, 8); + armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR)); + break; // Z + case 1: + armGetMemOperandInRegister(RSCRATCHADDR, addr, 12); + armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR)); + break; // W + case 8: + armAsm->Str(reg.S(), addr); + break; // X + + case 0: + Console.Error("maskedVecWrite case 0!"); + break; + + default: + armAsm->Str(reg.Q(), addr); + break; // XYZW + } +} + +void dVifReset(int idx) +{ + nVif[idx].vifBlocks.reset(); + + const size_t offset = idx ? HostMemoryMap::VIF1recOffset : HostMemoryMap::VIF0recOffset; + const size_t size = idx ? HostMemoryMap::VIF1recSize : HostMemoryMap::VIF0recSize; + nVif[idx].recWritePtr = SysMemory::GetCodePtr(offset); + nVif[idx].recEndPtr = nVif[idx].recWritePtr + (size - _256kb); +} + +void dVifRelease(int idx) +{ + nVif[idx].vifBlocks.clear(); +} + +VifUnpackNEON_Dynarec::VifUnpackNEON_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_) + : v(vif_) + , vB(vifBlock_) +{ + const int wl = vB.wl ? 
vB.wl : 256; //0 is taken as 256 (KH2) + isFill = (vB.cl < wl); + usn = (vB.upkType >> 5) & 1; + doMask = (vB.upkType >> 4) & 1; + doMode = vB.mode & 3; + IsAligned = vB.aligned; + vCL = 0; +} + +__fi void makeMergeMask(u32& x) +{ + x = ((x & 0x40) >> 6) | ((x & 0x10) >> 3) | (x & 4) | ((x & 1) << 3); +} + +__fi void VifUnpackNEON_Dynarec::SetMasks(int cS) const +{ + const int idx = v.idx; + const vifStruct& vif = MTVU_VifX; + + //This could have ended up copying the row when there was no row to write.1810080 + u32 m0 = vB.mask; //The actual mask example 0x03020100 + u32 m3 = ((m0 & 0xaaaaaaaa) >> 1) & ~m0; //all the upper bits, so our example 0x01010000 & 0xFCFDFEFF = 0x00010000 just the cols (shifted right for maskmerge) + u32 m2 = (m0 & 0x55555555) & (~m0 >> 1); // 0x1000100 & 0xFE7EFF7F = 0x00000100 Just the row + + if ((doMask && m2) || doMode) + { + armLoadPtr(xmmRow, &vif.MaskRow); + MSKPATH3_LOG("Moving row"); + } + if (doMask && m3) + { + VIF_LOG("Merging Cols"); + armLoadPtr(xmmCol0, &vif.MaskCol); + if ((cS >= 2) && (m3 & 0x0000ff00)) + armAsm->Dup(xmmCol1.V4S(), xmmCol0.V4S(), 1); + if ((cS >= 3) && (m3 & 0x00ff0000)) + armAsm->Dup(xmmCol2.V4S(), xmmCol0.V4S(), 2); + if ((cS >= 4) && (m3 & 0xff000000)) + armAsm->Dup(xmmCol3.V4S(), xmmCol0.V4S(), 3); + if ((cS >= 1) && (m3 & 0x000000ff)) + armAsm->Dup(xmmCol0.V4S(), xmmCol0.V4S(), 0); + } + //if (doMask||doMode) loadRowCol((nVifStruct&)v); +} + +void VifUnpackNEON_Dynarec::doMaskWrite(const vixl::aarch64::VRegister& regX) const +{ + pxAssertMsg(regX.GetCode() <= 1, "Reg Overflow! XMM2 thru XMM6 are reserved for masking."); + + const int cc = std::min(vCL, 3); + u32 m0 = (vB.mask >> (cc * 8)) & 0xff; //The actual mask example 0xE4 (protect, col, row, clear) + u32 m3 = ((m0 & 0xaa) >> 1) & ~m0; //all the upper bits (cols shifted right) cancelling out any write protects 0x10 + u32 m2 = (m0 & 0x55) & (~m0 >> 1); // all the lower bits (rows)cancelling out any write protects 0x04 + u32 m4 = (m0 & ~((m3 << 1) | m2)) & 0x55; // = 0xC0 & 0x55 = 0x40 (for merge mask) + + makeMergeMask(m2); + makeMergeMask(m3); + makeMergeMask(m4); + + if (doMask && m2) // Merge MaskRow + { + mVUmergeRegs(regX, xmmRow, m2); + } + + if (doMask && m3) // Merge MaskCol + { + mVUmergeRegs(regX, armQRegister(xmmCol0.GetCode() + cc), m3); + } + + if (doMode) + { + u32 m5 = ~(m2 | m3 | m4) & 0xf; + + if (!doMask) + m5 = 0xf; + + if (m5 < 0xf) + { + armAsm->Movi(xmmTemp.V4S(), 0); + if (doMode == 3) + { + mVUmergeRegs(xmmRow, regX, m5, false, false); + } + else + { + mVUmergeRegs(xmmTemp, xmmRow, m5, false, false); + armAsm->Add(regX.V4S(), regX.V4S(), xmmTemp.V4S()); + if (doMode == 2) + mVUmergeRegs(xmmRow, regX, m5, false, false); + } + } + else + { + if (doMode == 3) + { + armAsm->Mov(xmmRow, regX); + } + else + { + armAsm->Add(regX.V4S(), regX.V4S(), xmmRow.V4S()); + if (doMode == 2) + { + armAsm->Mov(xmmRow, regX); + } + } + } + } + + if (doMask && m4) + maskedVecWrite(regX, dstIndirect, m4 ^ 0xf); + else + armAsm->Str(regX, dstIndirect); +} + +void VifUnpackNEON_Dynarec::writeBackRow() const +{ + const int idx = v.idx; + armStorePtr(xmmRow, &(MTVU_VifX.MaskRow)); + + VIF_LOG("nVif: writing back row reg! 
[doMode = %d]", doMode); +} + +void VifUnpackNEON_Dynarec::ModUnpack(int upknum, bool PostOp) +{ + switch (upknum) + { + case 0: + case 1: + case 2: + if (PostOp) + { + UnpkLoopIteration++; + UnpkLoopIteration = UnpkLoopIteration & 0x3; + } + break; + + case 4: + case 5: + case 6: + if (PostOp) + { + UnpkLoopIteration++; + UnpkLoopIteration = UnpkLoopIteration & 0x1; + } + break; + + case 8: + if (PostOp) + { + UnpkLoopIteration++; + UnpkLoopIteration = UnpkLoopIteration & 0x1; + } + break; + case 9: + if (!PostOp) + { + UnpkLoopIteration++; + } + break; + case 10: + if (!PostOp) + { + UnpkLoopIteration++; + } + break; + + case 12: + break; + case 13: + break; + case 14: + break; + case 15: + break; + + case 3: + case 7: + case 11: + pxFailRel(fmt::format("Vpu/Vif - Invalid Unpack! [{}]", upknum).c_str()); + break; + } +} + +void VifUnpackNEON_Dynarec::ProcessMasks() +{ + skipProcessing = false; + inputMasked = false; + + if (!doMask) + return; + + const int cc = std::min(vCL, 3); + const u32 full_mask = (vB.mask >> (cc * 8)) & 0xff; + const u32 rowcol_mask = ((full_mask >> 1) | full_mask) & 0x55; // Rows or Cols being written instead of data, or protected. + + // Every channel is write protected for this cycle, no need to process anything. + skipProcessing = full_mask == 0xff; + + // All channels are masked, no reason to process anything here. + inputMasked = rowcol_mask == 0x55; +} + +void VifUnpackNEON_Dynarec::CompileRoutine() +{ + const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2) + const int upkNum = vB.upkType & 0xf; + const u8& vift = nVifT[upkNum]; + const int cycleSize = isFill ? vB.cl : wl; + const int blockSize = isFill ? wl : vB.cl; + const int skipSize = blockSize - cycleSize; + + uint vNum = vB.num ? vB.num : 256; + doMode = (upkNum == 0xf) ? 0 : doMode; // V4_5 has no mode feature. + UnpkNoOfIterations = 0; + VIF_LOG("Compiling new block, unpack number %x, mode %x, masking %x, vNum %x", upkNum, doMode, doMask, vNum); + + pxAssume(vCL == 0); + + // Value passed determines # of col regs we need to load + SetMasks(isFill ? blockSize : cycleSize); + + while (vNum) + { + // Determine if reads/processing can be skipped. + ProcessMasks(); + + if (vCL < cycleSize) + { + ModUnpack(upkNum, false); + xUnpack(upkNum); + xMovDest(); + ModUnpack(upkNum, true); + + dstIndirect = armOffsetMemOperand(dstIndirect, 16); + srcIndirect = armOffsetMemOperand(srcIndirect, vift); + + vNum--; + if (++vCL == blockSize) + vCL = 0; + } + else if (isFill) + { + xUnpack(upkNum); + xMovDest(); + + // dstIndirect += 16; + dstIndirect = armOffsetMemOperand(dstIndirect, 16); + + vNum--; + if (++vCL == blockSize) + vCL = 0; + } + else + { + // dstIndirect += (16 * skipSize); + dstIndirect = armOffsetMemOperand(dstIndirect, 16 * skipSize); + vCL = 0; + } + } + + if (doMode >= 2) + writeBackRow(); + + armAsm->Ret(); +} + +static u16 dVifComputeLength(uint cl, uint wl, u8 num, bool isFill) +{ + uint length = (num > 0) ? (num * 16) : 4096; // 0 = 256 + + if (!isFill) + { + uint skipSize = (cl - wl) * 16; + uint blocks = (num + (wl - 1)) / wl; //Need to round up num's to calculate skip size correctly. + length += (blocks - 1) * skipSize; + } + + return std::min(length, 0xFFFFu); +} + +_vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill) +{ + nVifStruct& v = nVif[idx]; + + // Check size before the compilation + if (v.recWritePtr >= v.recEndPtr) + { + DevCon.WriteLn("nVif Recompiler Cache Reset! 
[0x%016" PRIXPTR " > 0x%016" PRIXPTR "]", + v.recWritePtr, v.recEndPtr); + dVifReset(idx); + } + + // Compile the block now + armSetAsmPtr(v.recWritePtr, v.recEndPtr - v.recWritePtr, nullptr); + + block.startPtr = (uptr)armStartBlock(); + block.length = dVifComputeLength(block.cl, block.wl, block.num, isFill); + v.vifBlocks.add(block); + + VifUnpackNEON_Dynarec(v, block).CompileRoutine(); + + Perf::vif.RegisterPC(v.recWritePtr, armGetCurrentCodePointer() - v.recWritePtr, block.upkType /* FIXME ideally a key*/); + v.recWritePtr = armEndBlock(); + + return █ +} + +_vifT __fi void dVifUnpack(const u8* data, bool isFill) +{ + nVifStruct& v = nVif[idx]; + vifStruct& vif = MTVU_VifX; + VIFregisters& vifRegs = MTVU_VifXRegs; + + const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5); + const int doMask = isFill ? 1 : (vif.cmd & 0x10); + + nVifBlock block; + + // Performance note: initial code was using u8/u16 field of the struct + // directly. However reading back the data (as u32) in HashBucket.find + // leads to various memory stalls. So it is way faster to manually build the data + // in u32 (aka x86 register). + // + // Warning the order of data in hash_key/key0/key1 depends on the nVifBlock struct + u32 hash_key = (u32)(upkType & 0xFF) << 8 | (vifRegs.num & 0xFF); + + u32 key1 = ((u32)vifRegs.cycle.wl << 24) | ((u32)vifRegs.cycle.cl << 16) | ((u32)(vif.start_aligned & 0xFF) << 8) | ((u32)vifRegs.mode & 0xFF); + if ((upkType & 0xf) != 9) + key1 &= 0xFFFF01FF; + + // Zero out the mask parameter if it's unused -- games leave random junk + // values here which cause false recblock cache misses. + u32 key0 = doMask ? vifRegs.mask : 0; + + block.hash_key = hash_key; + block.key0 = key0; + block.key1 = key1; + + //DevCon.WriteLn("nVif%d: Recompiled Block!", idx); + //DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]", + // block.num, block.upkType, block.scl, block.cl, block.wl, block.mode, + // doMask >> 4, doMask ? wxsFormat( L"0x%08x", block.mask ).c_str() : L"ignored" + //); + + // Seach in cache before trying to compile the block + nVifBlock* b = v.vifBlocks.find(block); + if (!b) [[unlikely]] + { + b = dVifCompile(block, isFill); + } + + { // Execute the block + const VURegs& VU = vuRegs[idx]; + const uint vuMemLimit = idx ? 0x4000 : 0x1000; + + u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit - 0x10)); + u8* endmem = VU.Mem + vuMemLimit; + + if ((startmem + b->length) <= endmem) [[likely]] + { +#if 1 + // No wrapping, you can run the fast dynarec + ((nVifrecCall)b->startPtr)((uptr)startmem, (uptr)data); +#else + // comparison mode + static u8 tmpbuf[512 * 1024]; + ((nVifrecCall)b->startPtr)((uptr)tmpbuf, (uptr)data); + + _nVifUnpack(idx, data, vifRegs.mode, isFill); + + const u32 words = b->length / 4; + for (u32 i = 0; i < words; i++) + { + if (*((u32*)tmpbuf + i) != *((u32*)startmem + i)) + { + // fprintf(stderr, "%08X %08X @ %u\n", *((u32*)tmpbuf + i), *((u32*)startmem + i), i); + pauseCCC(*((u32*)tmpbuf + i), *((u32*)startmem + i), i); + ((nVifrecCall)b->startPtr)((uptr)tmpbuf, (uptr)data); + break; + } + } +#endif + } + else + { + VIF_LOG("Running Interpreter Block: nVif%x - VU Mem Ptr Overflow; falling back to interpreter. 
Start = %x End = %x num = %x, wl = %x, cl = %x", + v.idx, vif.tag.addr, vif.tag.addr + (block.num * 16), block.num, block.wl, block.cl); + _nVifUnpack(idx, data, vifRegs.mode, isFill); + } + } +} + +template void dVifUnpack<0>(const u8* data, bool isFill); +template void dVifUnpack<1>(const u8* data, bool isFill); diff --git a/pcsx2/arm64/newVif_UnpackNEON.cpp b/pcsx2/arm64/newVif_UnpackNEON.cpp new file mode 100644 index 0000000000..2cb603bbd0 --- /dev/null +++ b/pcsx2/arm64/newVif_UnpackNEON.cpp @@ -0,0 +1,425 @@ +// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#include "newVif_UnpackNEON.h" +#include "common/Perf.h" + +namespace a64 = vixl::aarch64; + +// ===================================================================================================== +// VifUnpackSSE_Base Section +// ===================================================================================================== +VifUnpackNEON_Base::VifUnpackNEON_Base() + : usn(false) + , doMask(false) + , UnpkLoopIteration(0) + , UnpkNoOfIterations(0) + , IsAligned(0) + , dstIndirect(a64::MemOperand(RXARG1)) + , srcIndirect(a64::MemOperand(RXARG2)) + , workReg(a64::q1) + , destReg(a64::q0) + , workGprW(a64::w4) +{ +} + +void VifUnpackNEON_Base::xMovDest() const +{ + if (!IsWriteProtectedOp()) + { + if (IsUnmaskedOp()) + armAsm->Str(destReg, dstIndirect); + else + doMaskWrite(destReg); + } +} + +void VifUnpackNEON_Base::xShiftR(const vixl::aarch64::VRegister& regX, int n) const +{ + if (usn) + armAsm->Ushr(regX.V4S(), regX.V4S(), n); + else + armAsm->Sshr(regX.V4S(), regX.V4S(), n); +} + +void VifUnpackNEON_Base::xPMOVXX8(const vixl::aarch64::VRegister& regX) const +{ + // TODO(Stenzek): Check this + armAsm->Ldr(regX.S(), srcIndirect); + + if (usn) + { + armAsm->Ushll(regX.V8H(), regX.V8B(), 0); + armAsm->Ushll(regX.V4S(), regX.V4H(), 0); + } + else + { + armAsm->Sshll(regX.V8H(), regX.V8B(), 0); + armAsm->Sshll(regX.V4S(), regX.V4H(), 0); + } +} + +void VifUnpackNEON_Base::xPMOVXX16(const vixl::aarch64::VRegister& regX) const +{ + armAsm->Ldr(regX.D(), srcIndirect); + + if (usn) + armAsm->Ushll(regX.V4S(), regX.V4H(), 0); + else + armAsm->Sshll(regX.V4S(), regX.V4H(), 0); +} + +void VifUnpackNEON_Base::xUPK_S_32() const +{ + if (UnpkLoopIteration == 0) + armAsm->Ldr(workReg, srcIndirect); + + if (IsInputMasked()) + return; + + switch (UnpkLoopIteration) + { + case 0: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 0); + break; + case 1: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 1); + break; + case 2: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 2); + break; + case 3: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 3); + break; + } +} + +void VifUnpackNEON_Base::xUPK_S_16() const +{ + if (UnpkLoopIteration == 0) + xPMOVXX16(workReg); + + if (IsInputMasked()) + return; + + switch (UnpkLoopIteration) + { + case 0: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 0); + break; + case 1: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 1); + break; + case 2: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 2); + break; + case 3: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 3); + break; + } +} + +void VifUnpackNEON_Base::xUPK_S_8() const +{ + if (UnpkLoopIteration == 0) + xPMOVXX8(workReg); + + if (IsInputMasked()) + return; + + switch (UnpkLoopIteration) + { + case 0: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 0); + break; + case 1: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 1); + break; + case 2: + armAsm->Dup(destReg.V4S(), workReg.V4S(), 2); + break; + case 3: + armAsm->Dup(destReg.V4S(), 
workReg.V4S(), 3); + break; + } +} + +// The V2 + V3 unpacks have freaky behaviour, the manual claims "indeterminate". +// After testing on the PS2, it's very much determinate in 99% of cases +// and games like Lemmings, And1 Streetball rely on this data to be like this! +// I have commented after each shuffle to show what data is going where - Ref + +void VifUnpackNEON_Base::xUPK_V2_32() const +{ + if (UnpkLoopIteration == 0) + { + armAsm->Ldr(workReg, srcIndirect); + + if (IsInputMasked()) + return; + + armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0 + if (IsAligned) + armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word - tested on ps2 + } + else + { + if (IsInputMasked()) + return; + + armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2 + if (IsAligned) + armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word - tested on ps2 + } +} + +void VifUnpackNEON_Base::xUPK_V2_16() const +{ + if (UnpkLoopIteration == 0) + { + xPMOVXX16(workReg); + + if (IsInputMasked()) + return; + + armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0 + } + else + { + if (IsInputMasked()) + return; + + armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2 + } +} + +void VifUnpackNEON_Base::xUPK_V2_8() const +{ + if (UnpkLoopIteration == 0) + { + xPMOVXX8(workReg); + + if (IsInputMasked()) + return; + + armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0 + } + else + { + if (IsInputMasked()) + return; + + armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2 + } +} + +void VifUnpackNEON_Base::xUPK_V3_32() const +{ + if (IsInputMasked()) + return; + + armAsm->Ldr(destReg, srcIndirect); + if (UnpkLoopIteration != IsAligned) + armAsm->Ins(destReg.V4S(), 3, a64::wzr); +} + +void VifUnpackNEON_Base::xUPK_V3_16() const +{ + if (IsInputMasked()) + return; + + xPMOVXX16(destReg); + + //With V3-16, it takes the first vector from the next position as the W vector + //However - IF the end of this iteration of the unpack falls on a quadword boundary, W becomes 0 + //IsAligned is the position through the current QW in the vif packet + //Iteration counts where we are in the packet. 
+ int result = (((UnpkLoopIteration / 4) + 1 + (4 - IsAligned)) & 0x3); + + if ((UnpkLoopIteration & 0x1) == 0 && result == 0) + armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word on QW boundary if whole 32bit word is used - tested on ps2 +} + +void VifUnpackNEON_Base::xUPK_V3_8() const +{ + if (IsInputMasked()) + return; + + xPMOVXX8(destReg); + if (UnpkLoopIteration != IsAligned) + armAsm->Ins(destReg.V4S(), 3, a64::wzr); +} + +void VifUnpackNEON_Base::xUPK_V4_32() const +{ + if (IsInputMasked()) + return; + + armAsm->Ldr(destReg.Q(), a64::MemOperand(srcIndirect)); +} + +void VifUnpackNEON_Base::xUPK_V4_16() const +{ + if (IsInputMasked()) + return; + + xPMOVXX16(destReg); +} + +void VifUnpackNEON_Base::xUPK_V4_8() const +{ + if (IsInputMasked()) + return; + + xPMOVXX8(destReg); +} + +void VifUnpackNEON_Base::xUPK_V4_5() const +{ + if (IsInputMasked()) + return; + + armAsm->Ldrh(workGprW, srcIndirect); + armAsm->Lsl(workGprW, workGprW, 3); // ABG|R5.000 + armAsm->Dup(destReg.V4S(), workGprW); // x|x|x|R + armAsm->Lsr(workGprW, workGprW, 8); // ABG + armAsm->Lsl(workGprW, workGprW, 3); // AB|G5.000 + armAsm->Ins(destReg.V4S(), 1, workGprW); // x|x|G|R + armAsm->Lsr(workGprW, workGprW, 8); // AB + armAsm->Lsl(workGprW, workGprW, 3); // A|B5.000 + armAsm->Ins(destReg.V4S(), 2, workGprW); // x|B|G|R + armAsm->Lsr(workGprW, workGprW, 8); // A + armAsm->Lsl(workGprW, workGprW, 7); // A.0000000 + armAsm->Ins(destReg.V4S(), 3, workGprW); // A|B|G|R + armAsm->Shl(destReg.V4S(), destReg.V4S(), 24); // can optimize to + armAsm->Ushr(destReg.V4S(), destReg.V4S(), 24); // single AND... +} + +void VifUnpackNEON_Base::xUnpack(int upknum) const +{ + switch (upknum) + { + case 0: + xUPK_S_32(); + break; + case 1: + xUPK_S_16(); + break; + case 2: + xUPK_S_8(); + break; + + case 4: + xUPK_V2_32(); + break; + case 5: + xUPK_V2_16(); + break; + case 6: + xUPK_V2_8(); + break; + + case 8: + xUPK_V3_32(); + break; + case 9: + xUPK_V3_16(); + break; + case 10: + xUPK_V3_8(); + break; + + case 12: + xUPK_V4_32(); + break; + case 13: + xUPK_V4_16(); + break; + case 14: + xUPK_V4_8(); + break; + case 15: + xUPK_V4_5(); + break; + + case 3: + case 7: + case 11: + pxFailRel(fmt::format("Vpu/Vif - Invalid Unpack! 
[{}]", upknum).c_str()); + break; + } +} + +// ===================================================================================================== +// VifUnpackSSE_Simple +// ===================================================================================================== + +VifUnpackNEON_Simple::VifUnpackNEON_Simple(bool usn_, bool domask_, int curCycle_) +{ + curCycle = curCycle_; + usn = usn_; + doMask = domask_; + IsAligned = true; +} + +void VifUnpackNEON_Simple::doMaskWrite(const vixl::aarch64::VRegister& regX) const +{ + armAsm->Ldr(a64::q7, dstIndirect); + + int offX = std::min(curCycle, 3); + armMoveAddressToReg(RXVIXLSCRATCH, nVifMask); + armAsm->Ldr(a64::q29, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast(nVifMask[0][offX]) - reinterpret_cast(nVifMask))); + armAsm->Ldr(a64::q30, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast(nVifMask[1][offX]) - reinterpret_cast(nVifMask))); + armAsm->Ldr(a64::q31, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast(nVifMask[2][offX]) - reinterpret_cast(nVifMask))); + armAsm->And(regX.V16B(), regX.V16B(), a64::q29.V16B()); + armAsm->And(a64::q7.V16B(), a64::q7.V16B(), a64::q30.V16B()); + armAsm->Orr(regX.V16B(), regX.V16B(), a64::q31.V16B()); + armAsm->Orr(regX.V16B(), regX.V16B(), a64::q7.V16B()); + armAsm->Str(regX, dstIndirect); +} + +// ecx = dest, edx = src +static void nVifGen(int usn, int mask, int curCycle) +{ + + int usnpart = usn * 2 * 16; + int maskpart = mask * 16; + + VifUnpackNEON_Simple vpugen(!!usn, !!mask, curCycle); + + for (int i = 0; i < 16; ++i) + { + nVifCall& ucall(nVifUpk[((usnpart + maskpart + i) * 4) + curCycle]); + ucall = NULL; + if (nVifT[i] == 0) + continue; + + ucall = (nVifCall)armStartBlock(); + vpugen.xUnpack(i); + vpugen.xMovDest(); + armAsm->Ret(); + armEndBlock(); + } +} + +void VifUnpackSSE_Init() +{ + DevCon.WriteLn("Generating NEON-optimized unpacking functions for VIF interpreters..."); + + HostSys::BeginCodeWrite(); + armSetAsmPtr(SysMemory::GetVIFUnpackRec(), SysMemory::GetVIFUnpackRecEnd() - SysMemory::GetVIFUnpackRec(), nullptr); + + for (int a = 0; a < 2; a++) + { + for (int b = 0; b < 2; b++) + { + for (int c = 0; c < 4; c++) + { + nVifGen(a, b, c); + } + } + } + + Perf::any.Register(SysMemory::GetVIFUnpackRec(), armGetAsmPtr() - SysMemory::GetVIFUnpackRec(), "VIF Unpack"); + HostSys::EndCodeWrite(); +} diff --git a/pcsx2/arm64/newVif_UnpackNEON.h b/pcsx2/arm64/newVif_UnpackNEON.h new file mode 100644 index 0000000000..3f6ea4bd0b --- /dev/null +++ b/pcsx2/arm64/newVif_UnpackNEON.h @@ -0,0 +1,146 @@ +// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin , PCSX2 Team +// SPDX-License-Identifier: GPL-3.0 + +#pragma once + +#include "Common.h" +#include "Vif_Dma.h" +#include "Vif_Dynarec.h" +#include "arm64/AsmHelpers.h" + +#define xmmCol0 vixl::aarch64::q2 +#define xmmCol1 vixl::aarch64::q3 +#define xmmCol2 vixl::aarch64::q4 +#define xmmCol3 vixl::aarch64::q5 +#define xmmRow vixl::aarch64::q6 +#define xmmTemp vixl::aarch64::q7 + +// -------------------------------------------------------------------------------------- +// VifUnpackSSE_Base +// -------------------------------------------------------------------------------------- +class VifUnpackNEON_Base +{ +public: + bool usn; // unsigned flag + bool doMask; // masking write enable flag + int UnpkLoopIteration; + int UnpkNoOfIterations; + int IsAligned; + + +protected: + vixl::aarch64::MemOperand dstIndirect; + vixl::aarch64::MemOperand srcIndirect; + vixl::aarch64::VRegister workReg; + vixl::aarch64::VRegister destReg; + vixl::aarch64::WRegister 
workGprW; + +public: + VifUnpackNEON_Base(); + virtual ~VifUnpackNEON_Base() = default; + + virtual void xUnpack(int upktype) const; + virtual bool IsWriteProtectedOp() const = 0; + virtual bool IsInputMasked() const = 0; + virtual bool IsUnmaskedOp() const = 0; + virtual void xMovDest() const; + +protected: + virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const = 0; + + virtual void xShiftR(const vixl::aarch64::VRegister& regX, int n) const; + virtual void xPMOVXX8(const vixl::aarch64::VRegister& regX) const; + virtual void xPMOVXX16(const vixl::aarch64::VRegister& regX) const; + + virtual void xUPK_S_32() const; + virtual void xUPK_S_16() const; + virtual void xUPK_S_8() const; + + virtual void xUPK_V2_32() const; + virtual void xUPK_V2_16() const; + virtual void xUPK_V2_8() const; + + virtual void xUPK_V3_32() const; + virtual void xUPK_V3_16() const; + virtual void xUPK_V3_8() const; + + virtual void xUPK_V4_32() const; + virtual void xUPK_V4_16() const; + virtual void xUPK_V4_8() const; + virtual void xUPK_V4_5() const; +}; + +// -------------------------------------------------------------------------------------- +// VifUnpackSSE_Simple +// -------------------------------------------------------------------------------------- +class VifUnpackNEON_Simple : public VifUnpackNEON_Base +{ + typedef VifUnpackNEON_Base _parent; + +public: + int curCycle; + +public: + VifUnpackNEON_Simple(bool usn_, bool domask_, int curCycle_); + virtual ~VifUnpackNEON_Simple() = default; + + virtual bool IsWriteProtectedOp() const { return false; } + virtual bool IsInputMasked() const { return false; } + virtual bool IsUnmaskedOp() const { return !doMask; } + +protected: + virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const; +}; + +// -------------------------------------------------------------------------------------- +// VifUnpackSSE_Dynarec +// -------------------------------------------------------------------------------------- +class VifUnpackNEON_Dynarec : public VifUnpackNEON_Base +{ + typedef VifUnpackNEON_Base _parent; + +public: + bool isFill; + int doMode; // two bit value representing difference mode + bool skipProcessing; + bool inputMasked; + +protected: + const nVifStruct& v; // vif0 or vif1 + const nVifBlock& vB; // some pre-collected data from VifStruct + int vCL; // internal copy of vif->cl + +public: + VifUnpackNEON_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_); + VifUnpackNEON_Dynarec(const VifUnpackNEON_Dynarec& src) // copy constructor + : _parent(src) + , v(src.v) + , vB(src.vB) + { + isFill = src.isFill; + vCL = src.vCL; + } + + virtual ~VifUnpackNEON_Dynarec() = default; + + virtual bool IsWriteProtectedOp() const { return skipProcessing; } + virtual bool IsInputMasked() const { return inputMasked; } + virtual bool IsUnmaskedOp() const { return !doMode && !doMask; } + + void ModUnpack(int upknum, bool PostOp); + void ProcessMasks(); + void CompileRoutine(); + +protected: + virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const; + void SetMasks(int cS) const; + void writeBackRow() const; + + static VifUnpackNEON_Dynarec FillingWrite(const VifUnpackNEON_Dynarec& src) + { + VifUnpackNEON_Dynarec fillingWrite(src); + fillingWrite.doMask = true; + fillingWrite.doMode = 0; + return fillingWrite; + } +}; diff --git a/pcsx2/pcsx2.vcxproj b/pcsx2/pcsx2.vcxproj index a49f77ff60..0f324f8c66 100644 --- a/pcsx2/pcsx2.vcxproj +++ b/pcsx2/pcsx2.vcxproj @@ -112,6 +112,15 @@ + + true + + + true + + + true + @@ -554,6 +563,12 @@ + + 
true + + + true + diff --git a/pcsx2/pcsx2.vcxproj.filters b/pcsx2/pcsx2.vcxproj.filters index a4e1643a36..b0a4fe4a0a 100644 --- a/pcsx2/pcsx2.vcxproj.filters +++ b/pcsx2/pcsx2.vcxproj.filters @@ -280,6 +280,12 @@ {9f0d3bda-76d4-42d3-87e9-ce65db9163ef} + + {8aea3ae6-9722-463a-94ac-34f3738a3153} + + + {cf847f4e-744e-4c27-a7ac-8564726fb4e6} + @@ -1398,6 +1404,15 @@ System\Ps2\GS\Renderers\Software + + System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64 + + + System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64 + + + Tools\arm64 + @@ -2321,6 +2336,12 @@ System\Ps2\GS + + System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64 + + + Tools\arm64 +
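
Reviewer note (not part of the patch): a minimal scalar sketch of the V4-5 unpack that xUPK_V4_5() emits with NEON above, useful for sanity-checking the shift/mask sequence. The 16-bit RGBA5551 source expands to four 32-bit channels; field widths follow the emitted shifts (5-bit R/G/B scaled by 8, 1-bit A scaled by 128). Function and variable names here are illustrative only, not part of the PCSX2 code.

#include <cstdint>
#include <cstdio>

// Scalar equivalent of the Lsl/Lsr/Ins sequence in xUPK_V4_5, assuming the
// trailing Shl 24 / Ushr 24 pair acts as a per-lane "& 0xFF".
static void unpackV4_5_reference(uint16_t src, uint32_t out[4])
{
    out[0] = (src & 0x1Fu) << 3;          // R: bits 0-4,   scaled by 8
    out[1] = ((src >> 5) & 0x1Fu) << 3;   // G: bits 5-9,   scaled by 8
    out[2] = ((src >> 10) & 0x1Fu) << 3;  // B: bits 10-14, scaled by 8
    out[3] = ((src >> 15) & 0x1u) << 7;   // A: bit 15,     scaled by 128
}

int main()
{
    uint32_t rgba[4];
    unpackV4_5_reference(0x801F, rgba);                                // A=1, B=0, G=0, R=31
    std::printf("%u %u %u %u\n", rgba[0], rgba[1], rgba[2], rgba[3]);  // prints: 248 0 0 128
}

As the "can optimize to single AND" comment in the patch notes, the final Shl/Ushr pair is just this per-lane & 0xFF and could be folded into one And against a splatted 0x000000FF constant.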