arm64: Add VIF dynarec

Stenzek 2024-03-21 19:24:35 +10:00 committed by Connor McLaughlin
parent 0a4c037898
commit fe9399612d
8 changed files with 1818 additions and 2 deletions

pcsx2/CMakeLists.txt

@@ -1018,20 +1018,37 @@ set(pcsx2x86Headers
x86/R5900_Profiler.h
)
# ARM64
set(pcsx2arm64Sources
arm64/AsmHelpers.cpp
arm64/newVif_Dynarec.cpp
arm64/newVif_UnpackNEON.cpp
)
set(pcsx2arm64Headers
arm64/AsmHelpers.h
)
# These ones benefit a lot from LTO
set(pcsx2LTOSources
${pcsx2Sources}
${pcsx2Headers}
${pcsx2IPUSources}
${pcsx2IPUHeaders}
${pcsx2x86Sources}
${pcsx2x86Headers}
${pcsx2SPU2Sources}
${pcsx2SPU2Headers}
${pcsx2GSSources}
${pcsx2GSHeaders}
)
if(_M_X86)
list(APPEND pcsx2LTOSources ${pcsx2x86Sources} ${pcsx2x86Headers})
target_link_libraries(PCSX2_FLAGS INTERFACE zydis)
elseif(_M_ARM64)
list(APPEND pcsx2LTOSources ${pcsx2arm64Sources} ${pcsx2arm64Headers})
target_link_libraries(PCSX2_FLAGS INTERFACE vixl)
endif()
if(LTO_PCSX2_CORE)
add_library(PCSX2_LTO ${pcsx2LTOSources})
if (DISABLE_ADVANCE_SIMD)

pcsx2/arm64/AsmHelpers.cpp

@@ -0,0 +1,461 @@
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#include "arm64/AsmHelpers.h"
#include "common/Assertions.h"
#include "common/BitUtils.h"
#include "common/Console.h"
#include "common/HostSys.h"
const vixl::aarch64::Register& armWRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const Register* regs[32] = {&w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, &w8, &w9, &w10,
&w11, &w12, &w13, &w14, &w15, &w16, &w17, &w18, &w19, &w20, &w21, &w22, &w23, &w24, &w25, &w26, &w27, &w28,
&w29, &w30, &w31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
const vixl::aarch64::Register& armXRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const Register* regs[32] = {&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &x8, &x9, &x10,
&x11, &x12, &x13, &x14, &x15, &x16, &x17, &x18, &x19, &x20, &x21, &x22, &x23, &x24, &x25, &x26, &x27, &x28,
&x29, &x30, &x31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
const vixl::aarch64::VRegister& armSRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const VRegister* regs[32] = {&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &vixl::aarch64::s8, &s9, &s10,
&s11, &s12, &s13, &s14, &s15, &vixl::aarch64::s16, &s17, &s18, &s19, &s20, &s21, &s22, &s23, &s24, &s25, &s26, &s27, &s28,
&s29, &s30, &s31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
const vixl::aarch64::VRegister& armDRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const VRegister* regs[32] = {&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &d8, &d9, &d10,
&d11, &d12, &d13, &d14, &d15, &d16, &d17, &d18, &d19, &d20, &d21, &d22, &d23, &d24, &d25, &d26, &d27, &d28,
&d29, &d30, &d31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
const vixl::aarch64::VRegister& armQRegister(int n)
{
using namespace vixl::aarch64;
static constexpr const VRegister* regs[32] = {&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7, &q8, &q9, &q10,
&q11, &q12, &q13, &q14, &q15, &q16, &q17, &q18, &q19, &q20, &q21, &q22, &q23, &q24, &q25, &q26, &q27, &q28,
&q29, &q30, &q31};
pxAssert(static_cast<size_t>(n) < std::size(regs));
return *regs[n];
}
//#define INCLUDE_DISASSEMBLER
#ifdef INCLUDE_DISASSEMBLER
#include "vixl/aarch64/disasm-aarch64.h"
#endif
namespace a64 = vixl::aarch64;
thread_local a64::MacroAssembler* armAsm;
thread_local u8* armAsmPtr;
thread_local size_t armAsmCapacity;
thread_local ArmConstantPool* armConstantPool;
#ifdef INCLUDE_DISASSEMBLER
static std::mutex armDisasmMutex;
static std::unique_ptr<a64::PrintDisassembler> armDisasm;
static std::unique_ptr<a64::Decoder> armDisasmDecoder;
#endif
void armSetAsmPtr(void* ptr, size_t capacity, ArmConstantPool* pool)
{
pxAssert(!armAsm);
armAsmPtr = static_cast<u8*>(ptr);
armAsmCapacity = capacity;
armConstantPool = pool;
}
// Align to 16 bytes, apparently ARM likes that.
void armAlignAsmPtr()
{
static constexpr uintptr_t ALIGNMENT = 16;
u8* new_ptr = reinterpret_cast<u8*>((reinterpret_cast<uintptr_t>(armAsmPtr) + (ALIGNMENT - 1)) & ~(ALIGNMENT - 1));
pxAssert(static_cast<size_t>(new_ptr - armAsmPtr) <= armAsmCapacity);
armAsmCapacity -= (new_ptr - armAsmPtr);
armAsmPtr = new_ptr;
}
u8* armStartBlock()
{
armAlignAsmPtr();
HostSys::BeginCodeWrite();
pxAssert(!armAsm);
armAsm = new vixl::aarch64::MacroAssembler(static_cast<vixl::byte*>(armAsmPtr), armAsmCapacity);
armAsm->GetScratchVRegisterList()->Remove(31);
armAsm->GetScratchRegisterList()->Remove(RSCRATCHADDR.GetCode());
return armAsmPtr;
}
u8* armEndBlock()
{
pxAssert(armAsm);
armAsm->FinalizeCode();
const u32 size = static_cast<u32>(armAsm->GetSizeOfCodeGenerated());
pxAssert(size < armAsmCapacity);
delete armAsm;
armAsm = nullptr;
HostSys::EndCodeWrite();
HostSys::FlushInstructionCache(armAsmPtr, size);
armAsmPtr = armAsmPtr + size;
armAsmCapacity -= size;
return armAsmPtr;
}
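For orientation, here is a minimal sketch of how a caller drives this emitter; `code_buffer`, `code_capacity` and `emitReturnStub` are hypothetical names, not part of this commit:
// Sketch only: emit a tiny block that returns 42 in w0.
u8* emitReturnStub()
{
    armSetAsmPtr(code_buffer, code_capacity, nullptr); // no trampoline/literal pool
    u8* const start = armStartBlock();                 // aligns armAsmPtr, constructs armAsm
    armAsm->Mov(RWRET, 42);                            // RWRET is w0 (see AsmHelpers.h)
    armAsm->Ret();
    armEndBlock();                                     // finalize, flush i-cache, advance armAsmPtr
    return start;
}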
void armDisassembleAndDumpCode(const void* ptr, size_t size)
{
#ifdef INCLUDE_DISASSEMBLER
std::unique_lock lock(armDisasmMutex);
if (!armDisasm)
{
armDisasm = std::make_unique<a64::PrintDisassembler>(stderr);
armDisasmDecoder = std::make_unique<a64::Decoder>();
armDisasmDecoder->AppendVisitor(armDisasm.get());
}
armDisasmDecoder->Decode(static_cast<const vixl::aarch64::Instruction*>(ptr), static_cast<const vixl::aarch64::Instruction*>(ptr) + size);
#else
Console.Error("Not compiled with INCLUDE_DISASSEMBLER");
#endif
}
void armEmitJmp(const void* ptr, bool force_inline)
{
s64 displacement = GetPCDisplacement(armGetCurrentCodePointer(), ptr);
bool use_blr = !vixl::IsInt26(displacement);
if (use_blr && armConstantPool && !force_inline)
{
if (u8* trampoline = armConstantPool->GetJumpTrampoline(ptr); trampoline)
{
displacement = GetPCDisplacement(armGetCurrentCodePointer(), trampoline);
use_blr = !vixl::IsInt26(displacement);
}
}
if (use_blr)
{
armAsm->Mov(RXVIXLSCRATCH, reinterpret_cast<uintptr_t>(ptr));
armAsm->Br(RXVIXLSCRATCH);
}
else
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->b(displacement);
}
}
void armEmitCall(const void* ptr, bool force_inline)
{
s64 displacement = GetPCDisplacement(armGetCurrentCodePointer(), ptr);
bool use_blr = !vixl::IsInt26(displacement);
if (use_blr && armConstantPool && !force_inline)
{
if (u8* trampoline = armConstantPool->GetJumpTrampoline(ptr); trampoline)
{
displacement = GetPCDisplacement(armGetCurrentCodePointer(), trampoline);
use_blr = !vixl::IsInt26(displacement);
}
}
if (use_blr)
{
armAsm->Mov(RXVIXLSCRATCH, reinterpret_cast<uintptr_t>(ptr));
armAsm->Blr(RXVIXLSCRATCH);
}
else
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->bl(displacement);
}
}
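A short aside on the range check above (not part of the original file): B/BL take a signed 26-bit word offset, which is why IsInt26() is tested on the word displacement.
// 2^25 words * 4 bytes = 134,217,728 bytes, i.e. +/-128 MiB of direct reach.
// Targets beyond that go through a constant-pool trampoline, or Mov + Blr/Br as a last resort.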
void armEmitCbnz(const vixl::aarch64::Register& reg, const void* ptr)
{
const s64 jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(ptr) - reinterpret_cast<intptr_t>(armGetCurrentCodePointer()));
//pxAssert(Common::IsAligned(jump_distance, 4));
if (a64::Instruction::IsValidImmPCOffset(a64::CompareBranchType, jump_distance >> 2))
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->cbnz(reg, jump_distance >> 2);
}
else
{
a64::MacroEmissionCheckScope guard(armAsm);
a64::Label branch_not_taken;
armAsm->cbz(reg, &branch_not_taken);
const s64 new_jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(ptr) - reinterpret_cast<intptr_t>(armGetCurrentCodePointer()));
armAsm->b(new_jump_distance >> 2);
armAsm->bind(&branch_not_taken);
}
}
void armEmitCondBranch(a64::Condition cond, const void* ptr)
{
const s64 jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(ptr) - reinterpret_cast<intptr_t>(armGetCurrentCodePointer()));
//pxAssert(Common::IsAligned(jump_distance, 4));
if (a64::Instruction::IsValidImmPCOffset(a64::CondBranchType, jump_distance >> 2))
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->b(jump_distance >> 2, cond);
}
else
{
a64::MacroEmissionCheckScope guard(armAsm);
a64::Label branch_not_taken;
armAsm->b(&branch_not_taken, a64::InvertCondition(cond));
const s64 new_jump_distance =
static_cast<s64>(reinterpret_cast<intptr_t>(ptr) - reinterpret_cast<intptr_t>(armGetCurrentCodePointer()));
armAsm->b(new_jump_distance >> 2);
armAsm->bind(&branch_not_taken);
}
}
void armMoveAddressToReg(const vixl::aarch64::Register& reg, const void* addr)
{
// psxAsm->Mov(reg, static_cast<u64>(reinterpret_cast<uintptr_t>(addr)));
pxAssert(reg.IsX());
const void* current_code_ptr_page = reinterpret_cast<const void*>(
reinterpret_cast<uintptr_t>(armGetCurrentCodePointer()) & ~static_cast<uintptr_t>(0xFFF));
const void* ptr_page =
reinterpret_cast<const void*>(reinterpret_cast<uintptr_t>(addr) & ~static_cast<uintptr_t>(0xFFF));
const s64 page_displacement = GetPCDisplacement(current_code_ptr_page, ptr_page) >> 10;
const u32 page_offset = static_cast<u32>(reinterpret_cast<uintptr_t>(addr) & 0xFFFu);
if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmAddSub(page_offset))
{
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->adrp(reg, page_displacement);
}
armAsm->Add(reg, reg, page_offset);
}
else if (vixl::IsInt21(page_displacement) && a64::Assembler::IsImmLogical(page_offset, 64))
{
{
a64::SingleEmissionCheckScope guard(armAsm);
armAsm->adrp(reg, page_displacement);
}
armAsm->Orr(reg, reg, page_offset);
}
else
{
armAsm->Mov(reg, reinterpret_cast<uintptr_t>(addr));
}
}
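The adrp paths above split the target address into a 4 KiB page base plus a 12-bit page offset; a small illustration with a hypothetical `target` pointer:
// What adrp materializes vs. what the following Add/Orr applies:
const uintptr_t page_base = reinterpret_cast<uintptr_t>(target) & ~uintptr_t(0xFFF);
const uintptr_t page_off  = reinterpret_cast<uintptr_t>(target) & uintptr_t(0xFFF);
// adrp encodes (target_page - pc_page) as a signed 21-bit page count, hence the IsInt21
// check on page_displacement above.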
void armLoadPtr(const vixl::aarch64::CPURegister& reg, const void* addr)
{
armMoveAddressToReg(RSCRATCHADDR, addr);
armAsm->Ldr(reg, a64::MemOperand(RSCRATCHADDR));
}
void armStorePtr(const vixl::aarch64::CPURegister& reg, const void* addr)
{
armMoveAddressToReg(RSCRATCHADDR, addr);
armAsm->Str(reg, a64::MemOperand(RSCRATCHADDR));
}
void armBeginStackFrame(bool save_fpr)
{
// save x19 through x28, x29 could also be used
armAsm->Sub(a64::sp, a64::sp, save_fpr ? 192 : 144);
armAsm->Stp(a64::x19, a64::x20, a64::MemOperand(a64::sp, 32));
armAsm->Stp(a64::x21, a64::x22, a64::MemOperand(a64::sp, 48));
armAsm->Stp(a64::x23, a64::x24, a64::MemOperand(a64::sp, 64));
armAsm->Stp(a64::x25, a64::x26, a64::MemOperand(a64::sp, 80));
armAsm->Stp(a64::x27, a64::x28, a64::MemOperand(a64::sp, 96));
armAsm->Stp(a64::x29, a64::lr, a64::MemOperand(a64::sp, 112));
if (save_fpr)
{
armAsm->Stp(a64::d8, a64::d9, a64::MemOperand(a64::sp, 128));
armAsm->Stp(a64::d10, a64::d11, a64::MemOperand(a64::sp, 144));
armAsm->Stp(a64::d12, a64::d13, a64::MemOperand(a64::sp, 160));
armAsm->Stp(a64::d14, a64::d15, a64::MemOperand(a64::sp, 176));
}
}
void armEndStackFrame(bool save_fpr)
{
if (save_fpr)
{
armAsm->Ldp(a64::d14, a64::d15, a64::MemOperand(a64::sp, 176));
armAsm->Ldp(a64::d12, a64::d13, a64::MemOperand(a64::sp, 160));
armAsm->Ldp(a64::d10, a64::d11, a64::MemOperand(a64::sp, 144));
armAsm->Ldp(a64::d8, a64::d9, a64::MemOperand(a64::sp, 128));
}
armAsm->Ldp(a64::x29, a64::lr, a64::MemOperand(a64::sp, 112));
armAsm->Ldp(a64::x27, a64::x28, a64::MemOperand(a64::sp, 96));
armAsm->Ldp(a64::x25, a64::x26, a64::MemOperand(a64::sp, 80));
armAsm->Ldp(a64::x23, a64::x24, a64::MemOperand(a64::sp, 64));
armAsm->Ldp(a64::x21, a64::x22, a64::MemOperand(a64::sp, 48));
armAsm->Ldp(a64::x19, a64::x20, a64::MemOperand(a64::sp, 32));
armAsm->Add(a64::sp, a64::sp, save_fpr ? 192 : 144);
}
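For reference, the frame built by armBeginStackFrame()/armEndStackFrame() lays out as follows (offsets from the adjusted sp):
// [sp +   0..31 ]  scratch area (cf. SP_SCRATCH_OFFSET in AsmHelpers.h)
// [sp +  32..111]  x19/x20, x21/x22, x23/x24, x25/x26, x27/x28
// [sp + 112..127]  x29 and lr
// [sp + 128..191]  d8/d9, d10/d11, d12/d13, d14/d15 (only when save_fpr)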
bool armIsCalleeSavedRegister(int reg)
{
// same on both linux and windows
return (reg >= 19);
}
vixl::aarch64::MemOperand armOffsetMemOperand(const vixl::aarch64::MemOperand& op, s64 offset)
{
pxAssert(op.GetBaseRegister().IsValid() && op.GetAddrMode() == vixl::aarch64::Offset && op.GetShift() == vixl::aarch64::NO_SHIFT);
return vixl::aarch64::MemOperand(op.GetBaseRegister(), op.GetOffset() + offset, op.GetAddrMode());
}
void armGetMemOperandInRegister(const vixl::aarch64::Register& addr_reg, const vixl::aarch64::MemOperand& op, s64 extra_offset /*= 0*/)
{
pxAssert(addr_reg.IsX());
pxAssert(op.GetBaseRegister().IsValid() && op.GetAddrMode() == vixl::aarch64::Offset && op.GetShift() == vixl::aarch64::NO_SHIFT);
armAsm->Add(addr_reg, op.GetBaseRegister(), op.GetOffset() + extra_offset);
}
void armLoadConstant128(const vixl::aarch64::VRegister& reg, const void* ptr)
{
u64 low, high;
memcpy(&low, ptr, sizeof(low));
memcpy(&high, static_cast<const u8*>(ptr) + sizeof(low), sizeof(high));
armAsm->Ldr(reg, high, low);
}
void armEmitVTBL(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src1, const vixl::aarch64::VRegister& src2, const vixl::aarch64::VRegister& tbl)
{
pxAssert(src1.GetCode() != RQSCRATCH.GetCode() && src2.GetCode() != RQSCRATCH2.GetCode());
pxAssert(tbl.GetCode() != RQSCRATCH.GetCode() && tbl.GetCode() != RQSCRATCH2.GetCode());
// must be consecutive
if (src2.GetCode() == (src1.GetCode() + 1))
{
armAsm->Tbl(dst.V16B(), src1.V16B(), src2.V16B(), tbl.V16B());
return;
}
armAsm->Mov(RQSCRATCH.Q(), src1.Q());
armAsm->Mov(RQSCRATCH2.Q(), src2.Q());
armAsm->Tbl(dst.V16B(), RQSCRATCH.V16B(), RQSCRATCH2.V16B(), tbl.V16B());
}
void ArmConstantPool::Init(void* ptr, u32 capacity)
{
m_base_ptr = static_cast<u8*>(ptr);
m_capacity = capacity;
m_used = 0;
m_jump_targets.clear();
m_literals.clear();
}
void ArmConstantPool::Destroy()
{
m_base_ptr = nullptr;
m_capacity = 0;
m_used = 0;
m_jump_targets.clear();
m_literals.clear();
}
void ArmConstantPool::Reset()
{
m_used = 0;
m_jump_targets.clear();
m_literals.clear();
}
u8* ArmConstantPool::GetJumpTrampoline(const void* target)
{
auto it = m_jump_targets.find(target);
if (it != m_jump_targets.end())
return m_base_ptr + it->second;
// align to 16 bytes?
const u32 offset = Common::AlignUpPow2(m_used, 16);
// 4 movs plus a jump
if ((m_capacity - offset) < 20)
{
Console.Error("Ran out of space in constant pool");
return nullptr;
}
a64::MacroAssembler masm(static_cast<vixl::byte*>(m_base_ptr + offset), m_capacity - offset);
masm.Mov(RXVIXLSCRATCH, reinterpret_cast<intptr_t>(target));
masm.Br(RXVIXLSCRATCH);
masm.FinalizeCode();
pxAssert(masm.GetSizeOfCodeGenerated() < 20);
m_jump_targets.emplace(target, offset);
m_used = offset + static_cast<u32>(masm.GetSizeOfCodeGenerated());
HostSys::FlushInstructionCache(reinterpret_cast<void*>(m_base_ptr + offset), m_used - offset);
return m_base_ptr + offset;
}
u8* ArmConstantPool::GetLiteral(u64 value)
{
return GetLiteral(u128::From64(value));
}
u8* ArmConstantPool::GetLiteral(const u128& value)
{
auto it = m_literals.find(value);
if (it != m_literals.end())
return m_base_ptr + it->second;
if (GetRemainingCapacity() < 8)
return nullptr;
const u32 offset = Common::AlignUpPow2(m_used, 16);
std::memcpy(&m_base_ptr[offset], &value, sizeof(value));
m_used = offset + sizeof(value);
return m_base_ptr + offset;
}
u8* ArmConstantPool::GetLiteral(const u8* bytes, size_t len)
{
pxAssertMsg(len <= 16, "literal length is at most 16 bytes");
u128 table_u128 = {};
std::memcpy(table_u128._u8, bytes, len);
return GetLiteral(table_u128);
}
void ArmConstantPool::EmitLoadLiteral(const vixl::aarch64::CPURegister& reg, const u8* literal) const
{
armMoveAddressToReg(RXVIXLSCRATCH, literal);
armAsm->Ldr(reg, a64::MemOperand(RXVIXLSCRATCH));
}

pcsx2/arm64/AsmHelpers.h

@@ -0,0 +1,146 @@
// SPDX-FileCopyrightText: 2021-2024 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#pragma once
#include "common/Pcsx2Defs.h"
#include "common/HashCombine.h"
#include "vixl/aarch64/constants-aarch64.h"
#include "vixl/aarch64/macro-assembler-aarch64.h"
#include <unordered_map>
#define RWRET vixl::aarch64::w0
#define RXRET vixl::aarch64::x0
#define RQRET vixl::aarch64::q0
#define RWARG1 vixl::aarch64::w0
#define RWARG2 vixl::aarch64::w1
#define RWARG3 vixl::aarch64::w2
#define RWARG4 vixl::aarch64::w3
#define RXARG1 vixl::aarch64::x0
#define RXARG2 vixl::aarch64::x1
#define RXARG3 vixl::aarch64::x2
#define RXARG4 vixl::aarch64::x3
#define RXVIXLSCRATCH vixl::aarch64::x16
#define RWVIXLSCRATCH vixl::aarch64::w16
#define RSCRATCHADDR vixl::aarch64::x17
#define RQSCRATCH vixl::aarch64::q30
#define RDSCRATCH vixl::aarch64::d30
#define RSSCRATCH vixl::aarch64::s30
#define RQSCRATCH2 vixl::aarch64::q31
#define RDSCRATCH2 vixl::aarch64::d31
#define RSSCRATCH2 vixl::aarch64::s31
#define RQSCRATCH3 vixl::aarch64::q29
#define RDSCRATCH3 vixl::aarch64::d29
#define RSSCRATCH3 vixl::aarch64::s29
#define RQSCRATCHI vixl::aarch64::VRegister(30, 128, 16)
#define RQSCRATCHF vixl::aarch64::VRegister(30, 128, 4)
#define RQSCRATCHD vixl::aarch64::VRegister(30, 128, 2)
#define RQSCRATCH2I vixl::aarch64::VRegister(31, 128, 16)
#define RQSCRATCH2F vixl::aarch64::VRegister(31, 128, 4)
#define RQSCRATCH2D vixl::aarch64::VRegister(31, 128, 2)
static inline s64 GetPCDisplacement(const void* current, const void* target)
{
return static_cast<s64>((reinterpret_cast<ptrdiff_t>(target) - reinterpret_cast<ptrdiff_t>(current)) >> 2);
}
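The displacement returned here is measured in 32-bit instruction words (hence the >> 2), matching how AArch64 branch immediates are encoded; a quick illustration:
// If `target` sits 64 bytes past `current`, GetPCDisplacement(current, target) == 16,
// i.e. sixteen 4-byte instructions.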
const vixl::aarch64::Register& armWRegister(int n);
const vixl::aarch64::Register& armXRegister(int n);
const vixl::aarch64::VRegister& armSRegister(int n);
const vixl::aarch64::VRegister& armDRegister(int n);
const vixl::aarch64::VRegister& armQRegister(int n);
class ArmConstantPool;
static const u32 SP_SCRATCH_OFFSET = 0;
extern thread_local vixl::aarch64::MacroAssembler* armAsm;
extern thread_local u8* armAsmPtr;
extern thread_local size_t armAsmCapacity;
extern thread_local ArmConstantPool* armConstantPool;
static __fi bool armHasBlock()
{
return (armAsm != nullptr);
}
static __fi u8* armGetCurrentCodePointer()
{
return static_cast<u8*>(armAsmPtr) + armAsm->GetCursorOffset();
}
__fi static u8* armGetAsmPtr()
{
return armAsmPtr;
}
void armSetAsmPtr(void* ptr, size_t capacity, ArmConstantPool* pool);
void armAlignAsmPtr();
u8* armStartBlock();
u8* armEndBlock();
void armDisassembleAndDumpCode(const void* ptr, size_t size);
void armEmitJmp(const void* ptr, bool force_inline = false);
void armEmitCall(const void* ptr, bool force_inline = false);
void armEmitCbnz(const vixl::aarch64::Register& reg, const void* ptr);
void armEmitCondBranch(vixl::aarch64::Condition cond, const void* ptr);
void armMoveAddressToReg(const vixl::aarch64::Register& reg, const void* addr);
void armLoadPtr(const vixl::aarch64::CPURegister& reg, const void* addr);
void armStorePtr(const vixl::aarch64::CPURegister& reg, const void* addr);
void armBeginStackFrame(bool save_fpr);
void armEndStackFrame(bool save_fpr);
bool armIsCalleeSavedRegister(int reg);
vixl::aarch64::MemOperand armOffsetMemOperand(const vixl::aarch64::MemOperand& op, s64 offset);
void armGetMemOperandInRegister(const vixl::aarch64::Register& addr_reg,
const vixl::aarch64::MemOperand& op, s64 extra_offset = 0);
void armLoadConstant128(const vixl::aarch64::VRegister& reg, const void* ptr);
// may clobber RSCRATCH/RSCRATCH2. they shouldn't be inputs.
void armEmitVTBL(const vixl::aarch64::VRegister& dst, const vixl::aarch64::VRegister& src1,
const vixl::aarch64::VRegister& src2, const vixl::aarch64::VRegister& tbl);
//////////////////////////////////////////////////////////////////////////
class ArmConstantPool
{
public:
void Init(void* ptr, u32 capacity);
void Destroy();
void Reset();
u8* GetJumpTrampoline(const void* target);
u8* GetLiteral(u64 value);
u8* GetLiteral(const u128& value);
u8* GetLiteral(const u8* bytes, size_t len);
void EmitLoadLiteral(const vixl::aarch64::CPURegister& reg, const u8* literal) const;
private:
__fi u32 GetRemainingCapacity() const { return m_capacity - m_used; }
struct u128_hash
{
std::size_t operator()(const u128& v) const
{
std::size_t s = 0;
HashCombine(s, v.lo, v.hi);
return s;
}
};
std::unordered_map<const void*, u32> m_jump_targets;
std::unordered_map<u128, u32, u128_hash> m_literals;
u8* m_base_ptr = nullptr;
u32 m_capacity = 0;
u32 m_used = 0;
};
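As a usage sketch (assuming `pool_mem` and `pool_size` describe storage reachable from the code buffer, and an emitter block is currently open; both names are hypothetical):
ArmConstantPool pool;
pool.Init(pool_mem, pool_size);
// Build a 128-bit literal from a 64-bit value; repeated requests return the same slot.
if (const u8* lit = pool.GetLiteral(u128::From64(0x3F8000003F800000ULL)))
    pool.EmitLoadLiteral(RQSCRATCH, lit); // armMoveAddressToReg + Ldr into q30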

pcsx2/arm64/newVif_Dynarec.cpp

@@ -0,0 +1,585 @@
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#include "arm64/newVif_UnpackNEON.h"
#include "arm64/AsmHelpers.h"
#include "MTVU.h"
#include "common/Assertions.h"
#include "common/Perf.h"
#include "common/StringUtil.h"
namespace a64 = vixl::aarch64;
static void mVUmergeRegs(const vixl::aarch64::VRegister& dest, const vixl::aarch64::VRegister& src, int xyzw, bool modXYZW = false, bool canModifySrc = false)
{
xyzw &= 0xf;
if ((dest.GetCode() != src.GetCode()) && (xyzw != 0))
{
if (xyzw == 0x8)
armAsm->Mov(dest.V4S(), 0, src.V4S(), 0);
else if (xyzw == 0xf)
armAsm->Mov(dest.Q(), src.Q());
else
{
if (modXYZW)
{
if (xyzw == 1)
{
armAsm->Ins(dest.V4S(), 3, src.V4S(), 0);
return;
}
else if (xyzw == 2)
{
armAsm->Ins(dest.V4S(), 2, src.V4S(), 0);
return;
}
else if (xyzw == 4)
{
armAsm->Ins(dest.V4S(), 1, src.V4S(), 0);
return;
}
}
if (xyzw == 0)
return;
if (xyzw == 15)
{
armAsm->Mov(dest, src);
return;
}
if (xyzw == 14 && canModifySrc)
{
// xyz - we can get rid of the mov if we swap the RA around
armAsm->Mov(src.V4S(), 3, dest.V4S(), 3);
armAsm->Mov(dest.V16B(), src.V16B());
return;
}
// reverse
xyzw = ((xyzw & 1) << 3) | ((xyzw & 2) << 1) | ((xyzw & 4) >> 1) | ((xyzw & 8) >> 3);
if ((xyzw & 3) == 3)
{
// xy
armAsm->Mov(dest.V2D(), 0, src.V2D(), 0);
xyzw &= ~3;
}
else if ((xyzw & 12) == 12)
{
// zw
armAsm->Mov(dest.V2D(), 1, src.V2D(), 1);
xyzw &= ~12;
}
// xyzw
for (u32 i = 0; i < 4; i++)
{
if (xyzw & (1u << i))
armAsm->Mov(dest.V4S(), i, src.V4S(), i);
}
}
}
}
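The xyzw parameter is a 4-bit lane mask with X in bit 3 down to W in bit 0 (the routine reverses it internally into per-lane indices); two quick examples:
// mVUmergeRegs(q0, q1, 0x8) copies only lane X of q1 into q0 (the single-lane fast path).
// mVUmergeRegs(q0, q1, 0xA) copies lanes X and Z of q1 into q0, leaving Y and W of q0 intact.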
static void maskedVecWrite(const a64::VRegister& reg, const a64::MemOperand& addr, int xyzw)
{
switch (xyzw)
{
case 5: // YW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 4);
armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR)); // Y
armGetMemOperandInRegister(RSCRATCHADDR, addr, 12);
armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR)); // W
break;
case 9: // XW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 12);
armAsm->Str(reg.S(), addr); // X
armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR)); // W
break;
case 10: //XZ
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->Str(reg.S(), addr); // X
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR)); // Z
break;
case 3: // ZW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR));
break;
case 11: //XZW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->Str(reg.S(), addr); // X
armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR)); // ZW
break;
case 13: // XYW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 12);
armAsm->Str(reg.D(), addr);
armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR));
break;
case 6: // YZ
armGetMemOperandInRegister(RSCRATCHADDR, addr, 4);
armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR, 4, a64::PostIndex));
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR));
break;
case 7: // YZW
armGetMemOperandInRegister(RSCRATCHADDR, addr, 4);
armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR, 4, a64::PostIndex));
armAsm->St1(reg.V2D(), 1, a64::MemOperand(RSCRATCHADDR));
break;
case 12: // XY
armAsm->Str(reg.D(), addr);
break;
case 14: // XYZ
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->Str(reg.D(), addr);
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR)); // Z
break;
case 4:
armGetMemOperandInRegister(RSCRATCHADDR, addr, 4);
armAsm->St1(reg.V4S(), 1, a64::MemOperand(RSCRATCHADDR));
break; // Y
case 2:
armGetMemOperandInRegister(RSCRATCHADDR, addr, 8);
armAsm->St1(reg.V4S(), 2, a64::MemOperand(RSCRATCHADDR));
break; // Z
case 1:
armGetMemOperandInRegister(RSCRATCHADDR, addr, 12);
armAsm->St1(reg.V4S(), 3, a64::MemOperand(RSCRATCHADDR));
break; // W
case 8:
armAsm->Str(reg.S(), addr);
break; // X
case 0:
Console.Error("maskedVecWrite case 0!");
break;
default:
armAsm->Str(reg.Q(), addr);
break; // XYZW
}
}
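Here xyzw uses the same layout (X = bit 3 ... W = bit 0) and selects which lanes are actually stored; for example:
// maskedVecWrite(reg, addr, 10 /* 0b1010 = XZ */) stores lane X at addr+0 and lane Z at addr+8,
// leaving the Y and W words in memory untouched (the case 10 path above).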
void dVifReset(int idx)
{
nVif[idx].vifBlocks.reset();
const size_t offset = idx ? HostMemoryMap::VIF1recOffset : HostMemoryMap::VIF0recOffset;
const size_t size = idx ? HostMemoryMap::VIF1recSize : HostMemoryMap::VIF0recSize;
nVif[idx].recWritePtr = SysMemory::GetCodePtr(offset);
nVif[idx].recEndPtr = nVif[idx].recWritePtr + (size - _256kb);
}
void dVifRelease(int idx)
{
nVif[idx].vifBlocks.clear();
}
VifUnpackNEON_Dynarec::VifUnpackNEON_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_)
: v(vif_)
, vB(vifBlock_)
{
const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
isFill = (vB.cl < wl);
usn = (vB.upkType >> 5) & 1;
doMask = (vB.upkType >> 4) & 1;
doMode = vB.mode & 3;
IsAligned = vB.aligned;
vCL = 0;
}
__fi void makeMergeMask(u32& x)
{
x = ((x & 0x40) >> 6) | ((x & 0x10) >> 3) | (x & 4) | ((x & 1) << 3);
}
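makeMergeMask() repacks one bit per channel from the VIF mask layout (X in bit 0, Y in bit 2, Z in bit 4, W in bit 6) into the X-in-bit-3 merge layout used by the routines above; for instance:
// u32 m = 0x41;      // W (bit 6) and X (bit 0) selected
// makeMergeMask(m);  // m == 0x9: X (bit 3) and W (bit 0) in merge-mask form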
__fi void VifUnpackNEON_Dynarec::SetMasks(int cS) const
{
const int idx = v.idx;
const vifStruct& vif = MTVU_VifX;
//This could have ended up copying the row when there was no row to write.
u32 m0 = vB.mask; //The actual mask example 0x03020100
u32 m3 = ((m0 & 0xaaaaaaaa) >> 1) & ~m0; //all the upper bits, so our example 0x01010000 & 0xFCFDFEFF = 0x00010000 just the cols (shifted right for maskmerge)
u32 m2 = (m0 & 0x55555555) & (~m0 >> 1); // 0x1000100 & 0xFE7EFF7F = 0x00000100 Just the row
if ((doMask && m2) || doMode)
{
armLoadPtr(xmmRow, &vif.MaskRow);
MSKPATH3_LOG("Moving row");
}
if (doMask && m3)
{
VIF_LOG("Merging Cols");
armLoadPtr(xmmCol0, &vif.MaskCol);
if ((cS >= 2) && (m3 & 0x0000ff00))
armAsm->Dup(xmmCol1.V4S(), xmmCol0.V4S(), 1);
if ((cS >= 3) && (m3 & 0x00ff0000))
armAsm->Dup(xmmCol2.V4S(), xmmCol0.V4S(), 2);
if ((cS >= 4) && (m3 & 0xff000000))
armAsm->Dup(xmmCol3.V4S(), xmmCol0.V4S(), 3);
if ((cS >= 1) && (m3 & 0x000000ff))
armAsm->Dup(xmmCol0.V4S(), xmmCol0.V4S(), 0);
}
//if (doMask||doMode) loadRowCol((nVifStruct&)v);
}
void VifUnpackNEON_Dynarec::doMaskWrite(const vixl::aarch64::VRegister& regX) const
{
pxAssertMsg(regX.GetCode() <= 1, "Reg Overflow! q2 through q7 are reserved for masking.");
const int cc = std::min(vCL, 3);
u32 m0 = (vB.mask >> (cc * 8)) & 0xff; //The actual mask example 0xE4 (protect, col, row, clear)
u32 m3 = ((m0 & 0xaa) >> 1) & ~m0; //all the upper bits (cols shifted right) cancelling out any write protects 0x10
u32 m2 = (m0 & 0x55) & (~m0 >> 1); // all the lower bits (rows)cancelling out any write protects 0x04
u32 m4 = (m0 & ~((m3 << 1) | m2)) & 0x55; // = 0xC0 & 0x55 = 0x40 (for merge mask)
makeMergeMask(m2);
makeMergeMask(m3);
makeMergeMask(m4);
if (doMask && m2) // Merge MaskRow
{
mVUmergeRegs(regX, xmmRow, m2);
}
if (doMask && m3) // Merge MaskCol
{
mVUmergeRegs(regX, armQRegister(xmmCol0.GetCode() + cc), m3);
}
if (doMode)
{
u32 m5 = ~(m2 | m3 | m4) & 0xf;
if (!doMask)
m5 = 0xf;
if (m5 < 0xf)
{
armAsm->Movi(xmmTemp.V4S(), 0);
if (doMode == 3)
{
mVUmergeRegs(xmmRow, regX, m5, false, false);
}
else
{
mVUmergeRegs(xmmTemp, xmmRow, m5, false, false);
armAsm->Add(regX.V4S(), regX.V4S(), xmmTemp.V4S());
if (doMode == 2)
mVUmergeRegs(xmmRow, regX, m5, false, false);
}
}
else
{
if (doMode == 3)
{
armAsm->Mov(xmmRow, regX);
}
else
{
armAsm->Add(regX.V4S(), regX.V4S(), xmmRow.V4S());
if (doMode == 2)
{
armAsm->Mov(xmmRow, regX);
}
}
}
}
if (doMask && m4)
maskedVecWrite(regX, dstIndirect, m4 ^ 0xf);
else
armAsm->Str(regX, dstIndirect);
}
void VifUnpackNEON_Dynarec::writeBackRow() const
{
const int idx = v.idx;
armStorePtr(xmmRow, &(MTVU_VifX.MaskRow));
VIF_LOG("nVif: writing back row reg! [doMode = %d]", doMode);
}
void VifUnpackNEON_Dynarec::ModUnpack(int upknum, bool PostOp)
{
switch (upknum)
{
case 0:
case 1:
case 2:
if (PostOp)
{
UnpkLoopIteration++;
UnpkLoopIteration = UnpkLoopIteration & 0x3;
}
break;
case 4:
case 5:
case 6:
if (PostOp)
{
UnpkLoopIteration++;
UnpkLoopIteration = UnpkLoopIteration & 0x1;
}
break;
case 8:
if (PostOp)
{
UnpkLoopIteration++;
UnpkLoopIteration = UnpkLoopIteration & 0x1;
}
break;
case 9:
if (!PostOp)
{
UnpkLoopIteration++;
}
break;
case 10:
if (!PostOp)
{
UnpkLoopIteration++;
}
break;
case 12:
break;
case 13:
break;
case 14:
break;
case 15:
break;
case 3:
case 7:
case 11:
pxFailRel(fmt::format("Vpu/Vif - Invalid Unpack! [{}]", upknum).c_str());
break;
}
}
void VifUnpackNEON_Dynarec::ProcessMasks()
{
skipProcessing = false;
inputMasked = false;
if (!doMask)
return;
const int cc = std::min(vCL, 3);
const u32 full_mask = (vB.mask >> (cc * 8)) & 0xff;
const u32 rowcol_mask = ((full_mask >> 1) | full_mask) & 0x55; // Rows or Cols being written instead of data, or protected.
// Every channel is write protected for this cycle, no need to process anything.
skipProcessing = full_mask == 0xff;
// All channels are masked, no reason to process anything here.
inputMasked = rowcol_mask == 0x55;
}
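Each channel occupies a 2-bit field of the per-cycle mask (0 = data, 1 = MaskRow, 2 = MaskCol, 3 = write-protect, cf. the 0xE4 example in doMaskWrite); a worked case with an illustrative value:
// full_mask == 0x1B (W = 0, Z = 1, Y = 2, X = 3):
//   rowcol_mask = ((0x1B >> 1) | 0x1B) & 0x55 = 0x15   -> != 0x55, so W still consumes input data
//   skipProcessing = false (full_mask != 0xFF), inputMasked = false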
void VifUnpackNEON_Dynarec::CompileRoutine()
{
const int wl = vB.wl ? vB.wl : 256; //0 is taken as 256 (KH2)
const int upkNum = vB.upkType & 0xf;
const u8& vift = nVifT[upkNum];
const int cycleSize = isFill ? vB.cl : wl;
const int blockSize = isFill ? wl : vB.cl;
const int skipSize = blockSize - cycleSize;
uint vNum = vB.num ? vB.num : 256;
doMode = (upkNum == 0xf) ? 0 : doMode; // V4_5 has no mode feature.
UnpkNoOfIterations = 0;
VIF_LOG("Compiling new block, unpack number %x, mode %x, masking %x, vNum %x", upkNum, doMode, doMask, vNum);
pxAssume(vCL == 0);
// Value passed determines # of col regs we need to load
SetMasks(isFill ? blockSize : cycleSize);
while (vNum)
{
// Determine if reads/processing can be skipped.
ProcessMasks();
if (vCL < cycleSize)
{
ModUnpack(upkNum, false);
xUnpack(upkNum);
xMovDest();
ModUnpack(upkNum, true);
dstIndirect = armOffsetMemOperand(dstIndirect, 16);
srcIndirect = armOffsetMemOperand(srcIndirect, vift);
vNum--;
if (++vCL == blockSize)
vCL = 0;
}
else if (isFill)
{
xUnpack(upkNum);
xMovDest();
// dstIndirect += 16;
dstIndirect = armOffsetMemOperand(dstIndirect, 16);
vNum--;
if (++vCL == blockSize)
vCL = 0;
}
else
{
// dstIndirect += (16 * skipSize);
dstIndirect = armOffsetMemOperand(dstIndirect, 16 * skipSize);
vCL = 0;
}
}
if (doMode >= 2)
writeBackRow();
armAsm->Ret();
}
static u16 dVifComputeLength(uint cl, uint wl, u8 num, bool isFill)
{
uint length = (num > 0) ? (num * 16) : 4096; // 0 = 256
if (!isFill)
{
uint skipSize = (cl - wl) * 16;
uint blocks = (num + (wl - 1)) / wl; //Need to round up num's to calculate skip size correctly.
length += (blocks - 1) * skipSize;
}
return std::min(length, 0xFFFFu);
}
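A worked example of the length computation, with values chosen purely for illustration (cl = 4, wl = 1, num = 8, skipping/non-fill mode):
//   length   = 8 * 16         = 128  (unpacked data)
//   skipSize = (4 - 1) * 16   = 48   (bytes skipped after each write block)
//   blocks   = (8 + 0) / 1    = 8
//   length  += (8 - 1) * 48   = 336  -> 464 bytes of VU memory touched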
_vifT __fi nVifBlock* dVifCompile(nVifBlock& block, bool isFill)
{
nVifStruct& v = nVif[idx];
// Check size before the compilation
if (v.recWritePtr >= v.recEndPtr)
{
DevCon.WriteLn("nVif Recompiler Cache Reset! [0x%016" PRIXPTR " > 0x%016" PRIXPTR "]",
v.recWritePtr, v.recEndPtr);
dVifReset(idx);
}
// Compile the block now
armSetAsmPtr(v.recWritePtr, v.recEndPtr - v.recWritePtr, nullptr);
block.startPtr = (uptr)armStartBlock();
block.length = dVifComputeLength(block.cl, block.wl, block.num, isFill);
v.vifBlocks.add(block);
VifUnpackNEON_Dynarec(v, block).CompileRoutine();
Perf::vif.RegisterPC(v.recWritePtr, armGetCurrentCodePointer() - v.recWritePtr, block.upkType /* FIXME ideally a key*/);
v.recWritePtr = armEndBlock();
return &block;
}
_vifT __fi void dVifUnpack(const u8* data, bool isFill)
{
nVifStruct& v = nVif[idx];
vifStruct& vif = MTVU_VifX;
VIFregisters& vifRegs = MTVU_VifXRegs;
const u8 upkType = (vif.cmd & 0x1f) | (vif.usn << 5);
const int doMask = isFill ? 1 : (vif.cmd & 0x10);
nVifBlock block;
// Performance note: initial code was using u8/u16 field of the struct
// directly. However reading back the data (as u32) in HashBucket.find
// leads to various memory stalls. So it is way faster to manually build the data
// in u32 (aka x86 register).
//
// Warning the order of data in hash_key/key0/key1 depends on the nVifBlock struct
u32 hash_key = (u32)(upkType & 0xFF) << 8 | (vifRegs.num & 0xFF);
u32 key1 = ((u32)vifRegs.cycle.wl << 24) | ((u32)vifRegs.cycle.cl << 16) | ((u32)(vif.start_aligned & 0xFF) << 8) | ((u32)vifRegs.mode & 0xFF);
if ((upkType & 0xf) != 9)
key1 &= 0xFFFF01FF;
// Zero out the mask parameter if it's unused -- games leave random junk
// values here which cause false recblock cache misses.
u32 key0 = doMask ? vifRegs.mask : 0;
block.hash_key = hash_key;
block.key0 = key0;
block.key1 = key1;
//DevCon.WriteLn("nVif%d: Recompiled Block!", idx);
//DevCon.WriteLn(L"[num=% 3d][upkType=0x%02x][scl=%d][cl=%d][wl=%d][mode=%d][m=%d][mask=%s]",
// block.num, block.upkType, block.scl, block.cl, block.wl, block.mode,
// doMask >> 4, doMask ? wxsFormat( L"0x%08x", block.mask ).c_str() : L"ignored"
//);
// Search in cache before trying to compile the block
nVifBlock* b = v.vifBlocks.find(block);
if (!b) [[unlikely]]
{
b = dVifCompile<idx>(block, isFill);
}
{ // Execute the block
const VURegs& VU = vuRegs[idx];
const uint vuMemLimit = idx ? 0x4000 : 0x1000;
u8* startmem = VU.Mem + (vif.tag.addr & (vuMemLimit - 0x10));
u8* endmem = VU.Mem + vuMemLimit;
if ((startmem + b->length) <= endmem) [[likely]]
{
#if 1
// No wrapping, you can run the fast dynarec
((nVifrecCall)b->startPtr)((uptr)startmem, (uptr)data);
#else
// comparison mode
static u8 tmpbuf[512 * 1024];
((nVifrecCall)b->startPtr)((uptr)tmpbuf, (uptr)data);
_nVifUnpack(idx, data, vifRegs.mode, isFill);
const u32 words = b->length / 4;
for (u32 i = 0; i < words; i++)
{
if (*((u32*)tmpbuf + i) != *((u32*)startmem + i))
{
// fprintf(stderr, "%08X %08X @ %u\n", *((u32*)tmpbuf + i), *((u32*)startmem + i), i);
pauseCCC(*((u32*)tmpbuf + i), *((u32*)startmem + i), i);
((nVifrecCall)b->startPtr)((uptr)tmpbuf, (uptr)data);
break;
}
}
#endif
}
else
{
VIF_LOG("Running Interpreter Block: nVif%x - VU Mem Ptr Overflow; falling back to interpreter. Start = %x End = %x num = %x, wl = %x, cl = %x",
v.idx, vif.tag.addr, vif.tag.addr + (block.num * 16), block.num, block.wl, block.cl);
_nVifUnpack(idx, data, vifRegs.mode, isFill);
}
}
}
template void dVifUnpack<0>(const u8* data, bool isFill);
template void dVifUnpack<1>(const u8* data, bool isFill);

pcsx2/arm64/newVif_UnpackNEON.cpp

@@ -0,0 +1,425 @@
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#include "newVif_UnpackNEON.h"
#include "common/Perf.h"
namespace a64 = vixl::aarch64;
// =====================================================================================================
// VifUnpackSSE_Base Section
// =====================================================================================================
VifUnpackNEON_Base::VifUnpackNEON_Base()
: usn(false)
, doMask(false)
, UnpkLoopIteration(0)
, UnpkNoOfIterations(0)
, IsAligned(0)
, dstIndirect(a64::MemOperand(RXARG1))
, srcIndirect(a64::MemOperand(RXARG2))
, workReg(a64::q1)
, destReg(a64::q0)
, workGprW(a64::w4)
{
}
void VifUnpackNEON_Base::xMovDest() const
{
if (!IsWriteProtectedOp())
{
if (IsUnmaskedOp())
armAsm->Str(destReg, dstIndirect);
else
doMaskWrite(destReg);
}
}
void VifUnpackNEON_Base::xShiftR(const vixl::aarch64::VRegister& regX, int n) const
{
if (usn)
armAsm->Ushr(regX.V4S(), regX.V4S(), n);
else
armAsm->Sshr(regX.V4S(), regX.V4S(), n);
}
void VifUnpackNEON_Base::xPMOVXX8(const vixl::aarch64::VRegister& regX) const
{
// TODO(Stenzek): Check this
armAsm->Ldr(regX.S(), srcIndirect);
if (usn)
{
armAsm->Ushll(regX.V8H(), regX.V8B(), 0);
armAsm->Ushll(regX.V4S(), regX.V4H(), 0);
}
else
{
armAsm->Sshll(regX.V8H(), regX.V8B(), 0);
armAsm->Sshll(regX.V4S(), regX.V4H(), 0);
}
}
void VifUnpackNEON_Base::xPMOVXX16(const vixl::aarch64::VRegister& regX) const
{
armAsm->Ldr(regX.D(), srcIndirect);
if (usn)
armAsm->Ushll(regX.V4S(), regX.V4H(), 0);
else
armAsm->Sshll(regX.V4S(), regX.V4H(), 0);
}
void VifUnpackNEON_Base::xUPK_S_32() const
{
if (UnpkLoopIteration == 0)
armAsm->Ldr(workReg, srcIndirect);
if (IsInputMasked())
return;
switch (UnpkLoopIteration)
{
case 0:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 0);
break;
case 1:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 1);
break;
case 2:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 2);
break;
case 3:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 3);
break;
}
}
void VifUnpackNEON_Base::xUPK_S_16() const
{
if (UnpkLoopIteration == 0)
xPMOVXX16(workReg);
if (IsInputMasked())
return;
switch (UnpkLoopIteration)
{
case 0:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 0);
break;
case 1:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 1);
break;
case 2:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 2);
break;
case 3:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 3);
break;
}
}
void VifUnpackNEON_Base::xUPK_S_8() const
{
if (UnpkLoopIteration == 0)
xPMOVXX8(workReg);
if (IsInputMasked())
return;
switch (UnpkLoopIteration)
{
case 0:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 0);
break;
case 1:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 1);
break;
case 2:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 2);
break;
case 3:
armAsm->Dup(destReg.V4S(), workReg.V4S(), 3);
break;
}
}
// The V2 + V3 unpacks have freaky behaviour, the manual claims "indeterminate".
// After testing on the PS2, it's very much determinate in 99% of cases
// and games like Lemmings, And1 Streetball rely on this data to be like this!
// I have commented after each shuffle to show what data is going where - Ref
void VifUnpackNEON_Base::xUPK_V2_32() const
{
if (UnpkLoopIteration == 0)
{
armAsm->Ldr(workReg, srcIndirect);
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0
if (IsAligned)
armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word - tested on ps2
}
else
{
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2
if (IsAligned)
armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word - tested on ps2
}
}
void VifUnpackNEON_Base::xUPK_V2_16() const
{
if (UnpkLoopIteration == 0)
{
xPMOVXX16(workReg);
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0
}
else
{
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2
}
}
void VifUnpackNEON_Base::xUPK_V2_8() const
{
if (UnpkLoopIteration == 0)
{
xPMOVXX8(workReg);
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 0); //v1v0v1v0
}
else
{
if (IsInputMasked())
return;
armAsm->Dup(destReg.V2D(), workReg.V2D(), 1); //v3v2v3v2
}
}
void VifUnpackNEON_Base::xUPK_V3_32() const
{
if (IsInputMasked())
return;
armAsm->Ldr(destReg, srcIndirect);
if (UnpkLoopIteration != IsAligned)
armAsm->Ins(destReg.V4S(), 3, a64::wzr);
}
void VifUnpackNEON_Base::xUPK_V3_16() const
{
if (IsInputMasked())
return;
xPMOVXX16(destReg);
//With V3-16, it takes the first vector from the next position as the W vector
//However - IF the end of this iteration of the unpack falls on a quadword boundary, W becomes 0
//IsAligned is the position through the current QW in the vif packet
//Iteration counts where we are in the packet.
int result = (((UnpkLoopIteration / 4) + 1 + (4 - IsAligned)) & 0x3);
if ((UnpkLoopIteration & 0x1) == 0 && result == 0)
armAsm->Ins(destReg.V4S(), 3, a64::wzr); //zero last word on QW boundary if whole 32bit word is used - tested on ps2
}
void VifUnpackNEON_Base::xUPK_V3_8() const
{
if (IsInputMasked())
return;
xPMOVXX8(destReg);
if (UnpkLoopIteration != IsAligned)
armAsm->Ins(destReg.V4S(), 3, a64::wzr);
}
void VifUnpackNEON_Base::xUPK_V4_32() const
{
if (IsInputMasked())
return;
armAsm->Ldr(destReg.Q(), a64::MemOperand(srcIndirect));
}
void VifUnpackNEON_Base::xUPK_V4_16() const
{
if (IsInputMasked())
return;
xPMOVXX16(destReg);
}
void VifUnpackNEON_Base::xUPK_V4_8() const
{
if (IsInputMasked())
return;
xPMOVXX8(destReg);
}
void VifUnpackNEON_Base::xUPK_V4_5() const
{
if (IsInputMasked())
return;
armAsm->Ldrh(workGprW, srcIndirect);
armAsm->Lsl(workGprW, workGprW, 3); // ABG|R5.000
armAsm->Dup(destReg.V4S(), workGprW); // x|x|x|R
armAsm->Lsr(workGprW, workGprW, 8); // ABG
armAsm->Lsl(workGprW, workGprW, 3); // AB|G5.000
armAsm->Ins(destReg.V4S(), 1, workGprW); // x|x|G|R
armAsm->Lsr(workGprW, workGprW, 8); // AB
armAsm->Lsl(workGprW, workGprW, 3); // A|B5.000
armAsm->Ins(destReg.V4S(), 2, workGprW); // x|B|G|R
armAsm->Lsr(workGprW, workGprW, 8); // A
armAsm->Lsl(workGprW, workGprW, 7); // A.0000000
armAsm->Ins(destReg.V4S(), 3, workGprW); // A|B|G|R
armAsm->Shl(destReg.V4S(), destReg.V4S(), 24); // can optimize to
armAsm->Ushr(destReg.V4S(), destReg.V4S(), 24); // single AND...
}
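V4-5 expands a 16-bit RGBA5551 value into four byte lanes; tracing the shift sequence above with the illustrative input 0x801F (A = 1, B = 0, G = 0, R = 31):
//   R lane: ( v        & 0x1F) << 3 = 0xF8
//   G lane: ((v >>  5) & 0x1F) << 3 = 0x00
//   B lane: ((v >> 10) & 0x1F) << 3 = 0x00
//   A lane: ( v >> 15        ) << 7 = 0x80
// The final Shl/Ushr by 24 truncates each 32-bit lane to that low byte.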
void VifUnpackNEON_Base::xUnpack(int upknum) const
{
switch (upknum)
{
case 0:
xUPK_S_32();
break;
case 1:
xUPK_S_16();
break;
case 2:
xUPK_S_8();
break;
case 4:
xUPK_V2_32();
break;
case 5:
xUPK_V2_16();
break;
case 6:
xUPK_V2_8();
break;
case 8:
xUPK_V3_32();
break;
case 9:
xUPK_V3_16();
break;
case 10:
xUPK_V3_8();
break;
case 12:
xUPK_V4_32();
break;
case 13:
xUPK_V4_16();
break;
case 14:
xUPK_V4_8();
break;
case 15:
xUPK_V4_5();
break;
case 3:
case 7:
case 11:
pxFailRel(fmt::format("Vpu/Vif - Invalid Unpack! [{}]", upknum).c_str());
break;
}
}
// =====================================================================================================
// VifUnpackSSE_Simple
// =====================================================================================================
VifUnpackNEON_Simple::VifUnpackNEON_Simple(bool usn_, bool domask_, int curCycle_)
{
curCycle = curCycle_;
usn = usn_;
doMask = domask_;
IsAligned = true;
}
void VifUnpackNEON_Simple::doMaskWrite(const vixl::aarch64::VRegister& regX) const
{
armAsm->Ldr(a64::q7, dstIndirect);
int offX = std::min(curCycle, 3);
armMoveAddressToReg(RXVIXLSCRATCH, nVifMask);
armAsm->Ldr(a64::q29, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast<const u8*>(nVifMask[0][offX]) - reinterpret_cast<const u8*>(nVifMask)));
armAsm->Ldr(a64::q30, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast<const u8*>(nVifMask[1][offX]) - reinterpret_cast<const u8*>(nVifMask)));
armAsm->Ldr(a64::q31, a64::MemOperand(RXVIXLSCRATCH, reinterpret_cast<const u8*>(nVifMask[2][offX]) - reinterpret_cast<const u8*>(nVifMask)));
armAsm->And(regX.V16B(), regX.V16B(), a64::q29.V16B());
armAsm->And(a64::q7.V16B(), a64::q7.V16B(), a64::q30.V16B());
armAsm->Orr(regX.V16B(), regX.V16B(), a64::q31.V16B());
armAsm->Orr(regX.V16B(), regX.V16B(), a64::q7.V16B());
armAsm->Str(regX, dstIndirect);
}
// x0 = dest, x1 = src
static void nVifGen(int usn, int mask, int curCycle)
{
int usnpart = usn * 2 * 16;
int maskpart = mask * 16;
VifUnpackNEON_Simple vpugen(!!usn, !!mask, curCycle);
for (int i = 0; i < 16; ++i)
{
nVifCall& ucall(nVifUpk[((usnpart + maskpart + i) * 4) + curCycle]);
ucall = NULL;
if (nVifT[i] == 0)
continue;
ucall = (nVifCall)armStartBlock();
vpugen.xUnpack(i);
vpugen.xMovDest();
armAsm->Ret();
armEndBlock();
}
}
void VifUnpackSSE_Init()
{
DevCon.WriteLn("Generating NEON-optimized unpacking functions for VIF interpreters...");
HostSys::BeginCodeWrite();
armSetAsmPtr(SysMemory::GetVIFUnpackRec(), SysMemory::GetVIFUnpackRecEnd() - SysMemory::GetVIFUnpackRec(), nullptr);
for (int a = 0; a < 2; a++)
{
for (int b = 0; b < 2; b++)
{
for (int c = 0; c < 4; c++)
{
nVifGen(a, b, c);
}
}
}
Perf::any.Register(SysMemory::GetVIFUnpackRec(), armGetAsmPtr() - SysMemory::GetVIFUnpackRec(), "VIF Unpack");
HostSys::EndCodeWrite();
}
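The generated stubs land in nVifUpk at index ((usn * 2 + mask) * 16 + upkType) * 4 + curCycle, matching the usnpart/maskpart arithmetic in nVifGen(); an illustrative case:
// usn = 1, mask = 0, upkType = 12 (V4-32), curCycle = 2:
//   usnpart = 1 * 2 * 16 = 32, maskpart = 0
//   index   = (32 + 0 + 12) * 4 + 2 = 178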

pcsx2/arm64/newVif_UnpackNEON.h

@@ -0,0 +1,146 @@
// SPDX-FileCopyrightText: 2021-2023 Connor McLaughlin <stenzek@gmail.com>, PCSX2 Team
// SPDX-License-Identifier: GPL-3.0
#pragma once
#include "Common.h"
#include "Vif_Dma.h"
#include "Vif_Dynarec.h"
#include "arm64/AsmHelpers.h"
#define xmmCol0 vixl::aarch64::q2
#define xmmCol1 vixl::aarch64::q3
#define xmmCol2 vixl::aarch64::q4
#define xmmCol3 vixl::aarch64::q5
#define xmmRow vixl::aarch64::q6
#define xmmTemp vixl::aarch64::q7
// --------------------------------------------------------------------------------------
// VifUnpackSSE_Base
// --------------------------------------------------------------------------------------
class VifUnpackNEON_Base
{
public:
bool usn; // unsigned flag
bool doMask; // masking write enable flag
int UnpkLoopIteration;
int UnpkNoOfIterations;
int IsAligned;
protected:
vixl::aarch64::MemOperand dstIndirect;
vixl::aarch64::MemOperand srcIndirect;
vixl::aarch64::VRegister workReg;
vixl::aarch64::VRegister destReg;
vixl::aarch64::WRegister workGprW;
public:
VifUnpackNEON_Base();
virtual ~VifUnpackNEON_Base() = default;
virtual void xUnpack(int upktype) const;
virtual bool IsWriteProtectedOp() const = 0;
virtual bool IsInputMasked() const = 0;
virtual bool IsUnmaskedOp() const = 0;
virtual void xMovDest() const;
protected:
virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const = 0;
virtual void xShiftR(const vixl::aarch64::VRegister& regX, int n) const;
virtual void xPMOVXX8(const vixl::aarch64::VRegister& regX) const;
virtual void xPMOVXX16(const vixl::aarch64::VRegister& regX) const;
virtual void xUPK_S_32() const;
virtual void xUPK_S_16() const;
virtual void xUPK_S_8() const;
virtual void xUPK_V2_32() const;
virtual void xUPK_V2_16() const;
virtual void xUPK_V2_8() const;
virtual void xUPK_V3_32() const;
virtual void xUPK_V3_16() const;
virtual void xUPK_V3_8() const;
virtual void xUPK_V4_32() const;
virtual void xUPK_V4_16() const;
virtual void xUPK_V4_8() const;
virtual void xUPK_V4_5() const;
};
// --------------------------------------------------------------------------------------
// VifUnpackSSE_Simple
// --------------------------------------------------------------------------------------
class VifUnpackNEON_Simple : public VifUnpackNEON_Base
{
typedef VifUnpackNEON_Base _parent;
public:
int curCycle;
public:
VifUnpackNEON_Simple(bool usn_, bool domask_, int curCycle_);
virtual ~VifUnpackNEON_Simple() = default;
virtual bool IsWriteProtectedOp() const { return false; }
virtual bool IsInputMasked() const { return false; }
virtual bool IsUnmaskedOp() const { return !doMask; }
protected:
virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const;
};
// --------------------------------------------------------------------------------------
// VifUnpackSSE_Dynarec
// --------------------------------------------------------------------------------------
class VifUnpackNEON_Dynarec : public VifUnpackNEON_Base
{
typedef VifUnpackNEON_Base _parent;
public:
bool isFill;
int doMode; // two bit value representing difference mode
bool skipProcessing;
bool inputMasked;
protected:
const nVifStruct& v; // vif0 or vif1
const nVifBlock& vB; // some pre-collected data from VifStruct
int vCL; // internal copy of vif->cl
public:
VifUnpackNEON_Dynarec(const nVifStruct& vif_, const nVifBlock& vifBlock_);
VifUnpackNEON_Dynarec(const VifUnpackNEON_Dynarec& src) // copy constructor
: _parent(src)
, v(src.v)
, vB(src.vB)
{
isFill = src.isFill;
vCL = src.vCL;
}
virtual ~VifUnpackNEON_Dynarec() = default;
virtual bool IsWriteProtectedOp() const { return skipProcessing; }
virtual bool IsInputMasked() const { return inputMasked; }
virtual bool IsUnmaskedOp() const { return !doMode && !doMask; }
void ModUnpack(int upknum, bool PostOp);
void ProcessMasks();
void CompileRoutine();
protected:
virtual void doMaskWrite(const vixl::aarch64::VRegister& regX) const;
void SetMasks(int cS) const;
void writeBackRow() const;
static VifUnpackNEON_Dynarec FillingWrite(const VifUnpackNEON_Dynarec& src)
{
VifUnpackNEON_Dynarec fillingWrite(src);
fillingWrite.doMask = true;
fillingWrite.doMode = 0;
return fillingWrite;
}
};

pcsx2/pcsx2.vcxproj

@@ -112,6 +112,15 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="Achievements.cpp" />
<ClCompile Include="arm64\AsmHelpers.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="arm64\newVif_Dynarec.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="arm64\newVif_UnpackNEON.cpp">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="CDVD\BlockdumpFileReader.cpp" />
<ClCompile Include="CDVD\CDVDdiscReader.cpp" />
<ClCompile Include="CDVD\CDVDdiscThread.cpp" />
@@ -554,6 +563,12 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="Achievements.h" />
<ClInclude Include="arm64\AsmHelpers.h">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="arm64\newVif_UnpackNEON.h">
<ExcludedFromBuild Condition="'$(Platform)'!='ARM64'">true</ExcludedFromBuild>
</ClInclude>
<ClInclude Include="CDVD\BlockdumpFileReader.h" />
<ClInclude Include="CDVD\CDVDdiscReader.h" />
<ClInclude Include="CDVD\CsoFileReader.h" />

pcsx2/pcsx2.vcxproj.filters

@@ -280,6 +280,12 @@
<Filter Include="Misc\Host">
<UniqueIdentifier>{9f0d3bda-76d4-42d3-87e9-ce65db9163ef}</UniqueIdentifier>
</Filter>
<Filter Include="System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64">
<UniqueIdentifier>{8aea3ae6-9722-463a-94ac-34f3738a3153}</UniqueIdentifier>
</Filter>
<Filter Include="Tools\arm64">
<UniqueIdentifier>{cf847f4e-744e-4c27-a7ac-8564726fb4e6}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="Docs\License.txt">
@@ -1398,6 +1404,15 @@
<ClCompile Include="GS\Renderers\SW\GSSetupPrimCodeGenerator.arm64.cpp">
<Filter>System\Ps2\GS\Renderers\Software</Filter>
</ClCompile>
<ClCompile Include="arm64\newVif_Dynarec.cpp">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64</Filter>
</ClCompile>
<ClCompile Include="arm64\newVif_UnpackNEON.cpp">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64</Filter>
</ClCompile>
<ClCompile Include="arm64\AsmHelpers.cpp">
<Filter>Tools\arm64</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="Patch.h">
@@ -2321,6 +2336,12 @@
<ClInclude Include="GS\GSVector4i_arm64.h">
<Filter>System\Ps2\GS</Filter>
</ClInclude>
<ClInclude Include="arm64\newVif_UnpackNEON.h">
<Filter>System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec\arm64</Filter>
</ClInclude>
<ClInclude Include="arm64\AsmHelpers.h">
<Filter>Tools\arm64</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CustomBuildStep Include="rdebug\deci2.h">