diff --git a/pcsx2/CMakeLists.txt b/pcsx2/CMakeLists.txt index 0adcf470b1..d5663e70ee 100644 --- a/pcsx2/CMakeLists.txt +++ b/pcsx2/CMakeLists.txt @@ -954,7 +954,6 @@ set(pcsx2x86Sources x86/ix86-32/iR5900Templates.cpp x86/ix86-32/recVTLB.cpp x86/newVif_Dynarec.cpp - x86/newVif_Unpack.cpp x86/newVif_UnpackSSE.cpp ) @@ -995,7 +994,6 @@ set(pcsx2x86Headers x86/microVU_Tables.inl x86/microVU_Upper.inl x86/newVif.h - x86/newVif_HashBucket.h x86/newVif_UnpackSSE.h x86/R5900_Profiler.h ) diff --git a/pcsx2/Gif.cpp b/pcsx2/Gif.cpp index f7fda88106..4e036de92f 100644 --- a/pcsx2/Gif.cpp +++ b/pcsx2/Gif.cpp @@ -5,7 +5,6 @@ #include "GS.h" #include "Gif_Unit.h" #include "Vif_Dma.h" -#include "x86/iR5900.h" // A three-way toggle used to determine if the GIF is stalling (transferring) or done (finished). // Should be a gifstate_t rather then int, but I don't feel like possibly interfering with savestates right now. diff --git a/pcsx2/Gif_Unit.h b/pcsx2/Gif_Unit.h index 4998ba88ba..d4997f115e 100644 --- a/pcsx2/Gif_Unit.h +++ b/pcsx2/Gif_Unit.h @@ -118,6 +118,22 @@ struct Gif_Tag // write out unpacked registers _mm_storeu_si128(reinterpret_cast<__m128i*>(regs), vregs); +#elif defined(_M_ARM64) + // zero out bits for registers which shouldn't be tested + u64 REGS64; + std::memcpy(&REGS64, tag.REGS, sizeof(u64)); + REGS64 &= (0xFFFFFFFFFFFFFFFFULL >> (64 - nRegs * 4)); + uint8x16_t vregs = vsetq_lane_u64(REGS64, vdupq_n_u64(0), 0); + + // get upper nibbles, interleave with lower nibbles, clear upper bits from low nibbles + vregs = vandq_u8(vzip1q_u8(vregs, vshrq_n_u8(vregs, 4)), vdupq_n_u8(0x0F)); + + // compare with GIF_REG_A_D, set hasAD if any lanes passed + const uint8x16_t comp = vceqq_u8(vregs, vdupq_n_u8(GIF_REG_A_D)); + hasAD = vmaxvq_u8(comp) & 1; + + // write out unpacked registers + vst1q_u8(regs, vregs); #else // Reference C implementation.
hasAD = false; diff --git a/pcsx2/IopBios.cpp b/pcsx2/IopBios.cpp index 331765e3d5..cfaa43c8c6 100644 --- a/pcsx2/IopBios.cpp +++ b/pcsx2/IopBios.cpp @@ -8,7 +8,6 @@ #include "R3000A.h" #include "R5900.h" #include "ps2/BiosTools.h" -#include "x86/iR3000A.h" #include "VMManager.h" #include diff --git a/pcsx2/IopHw.cpp b/pcsx2/IopHw.cpp index d850817373..518af4e342 100644 --- a/pcsx2/IopHw.cpp +++ b/pcsx2/IopHw.cpp @@ -11,7 +11,6 @@ #include "IopHw.h" #include "Mdec.h" #include "R3000A.h" -#include "x86/iR5900.h" // NOTE: Any modifications to read/write fns should also go into their const counterparts // found in iPsxHw.cpp. diff --git a/pcsx2/IopMem.cpp b/pcsx2/IopMem.cpp index a79b1c4a59..abd7259a25 100644 --- a/pcsx2/IopMem.cpp +++ b/pcsx2/IopMem.cpp @@ -14,7 +14,7 @@ const uptr *psxMemRLUT = nullptr; IopVM_MemoryAllocMess* iopMem = nullptr; -alignas(__pagesize) u8 iopHw[Ps2MemSize::IopHardware]; +alignas(__pagealignsize) u8 iopHw[Ps2MemSize::IopHardware]; void iopMemAlloc() { diff --git a/pcsx2/MTVU.cpp b/pcsx2/MTVU.cpp index aba037bfae..8609fb86e0 100644 --- a/pcsx2/MTVU.cpp +++ b/pcsx2/MTVU.cpp @@ -5,7 +5,7 @@ #include "Gif_Unit.h" #include "MTVU.h" #include "VMManager.h" -#include "x86/newVif.h" +#include "Vif_Dynarec.h" #include diff --git a/pcsx2/Memory.cpp b/pcsx2/Memory.cpp index 3644d4df87..d6100fed93 100644 --- a/pcsx2/Memory.cpp +++ b/pcsx2/Memory.cpp @@ -98,7 +98,7 @@ u8* SysMemory::TryAllocateVirtualMemory(const char* name, void* file_handle, upt if (!baseptr) return nullptr; - if ((uptr)baseptr != base) + if (base != 0 && (uptr)baseptr != base) { if (file_handle) { @@ -122,6 +122,8 @@ u8* SysMemory::TryAllocateVirtualMemory(const char* name, void* file_handle, upt u8* SysMemory::AllocateVirtualMemory(const char* name, void* file_handle, size_t size, size_t offset_from_base) { + // ARM64 does not need the rec areas to be in +/- 2GB. 
+#ifdef _M_X86 pxAssertRel(Common::IsAlignedPow2(size, __pagesize), "Virtual memory size is page aligned"); // Everything looks nicer when the start of all the sections is a nice round looking number. @@ -148,6 +150,9 @@ u8* SysMemory::AllocateVirtualMemory(const char* name, void* file_handle, size_t DevCon.Warning("%s: host memory @ 0x%016" PRIXPTR " -> 0x%016" PRIXPTR " is unavailable; attempting to map elsewhere...", name, base, base + size); } +#else + return TryAllocateVirtualMemory(name, file_handle, 0, size); +#endif return nullptr; } @@ -986,8 +991,8 @@ void memClearPageAddr(u32 vaddr) /////////////////////////////////////////////////////////////////////////// // PS2 Memory Init / Reset / Shutdown -EEVM_MemoryAllocMess* eeMem = NULL; -alignas(__pagesize) u8 eeHw[Ps2MemSize::Hardware]; +EEVM_MemoryAllocMess* eeMem = nullptr; +alignas(__pagealignsize) u8 eeHw[Ps2MemSize::Hardware]; void memBindConditionalHandlers() diff --git a/pcsx2/MemoryTypes.h b/pcsx2/MemoryTypes.h index c62492696b..e6fa985956 100644 --- a/pcsx2/MemoryTypes.h +++ b/pcsx2/MemoryTypes.h @@ -59,8 +59,8 @@ struct IopVM_MemoryAllocMess // order to allow for simpler macros and reference handles to be defined (we can safely use // compile-time references to registers instead of having to use instance variables). 
-alignas(__pagesize) extern u8 eeHw[Ps2MemSize::Hardware]; -alignas(__pagesize) extern u8 iopHw[Ps2MemSize::IopHardware]; +alignas(__pagealignsize) extern u8 eeHw[Ps2MemSize::Hardware]; +alignas(__pagealignsize) extern u8 iopHw[Ps2MemSize::IopHardware]; extern EEVM_MemoryAllocMess* eeMem; diff --git a/pcsx2/R5900.cpp b/pcsx2/R5900.cpp index cb86ac83c1..16214c65ac 100644 --- a/pcsx2/R5900.cpp +++ b/pcsx2/R5900.cpp @@ -34,8 +34,7 @@ using namespace R5900; // for R5900 disasm tools s32 EEsCycle; // used to sync the IOP to the EE u32 EEoCycle; -alignas(16) cpuRegisters cpuRegs; -alignas(16) fpuRegisters fpuRegs; +alignas(16) cpuRegistersPack _cpuRegistersPack; alignas(16) tlbs tlb[48]; R5900cpu *Cpu = NULL; diff --git a/pcsx2/R5900.h b/pcsx2/R5900.h index fd977f73b8..3c80e765f8 100644 --- a/pcsx2/R5900.h +++ b/pcsx2/R5900.h @@ -202,10 +202,18 @@ struct tlbs #endif -alignas(16) extern cpuRegisters cpuRegs; -alignas(16) extern fpuRegisters fpuRegs; +struct cpuRegistersPack +{ + alignas(16) cpuRegisters cpuRegs; + alignas(16) fpuRegisters fpuRegs; +}; + +alignas(16) extern cpuRegistersPack _cpuRegistersPack; alignas(16) extern tlbs tlb[48]; +static cpuRegisters& cpuRegs = _cpuRegistersPack.cpuRegs; +static fpuRegisters& fpuRegs = _cpuRegistersPack.fpuRegs; + extern bool eeEventTestIsActive; void intUpdateCPUCycles(); diff --git a/pcsx2/SourceLog.cpp b/pcsx2/SourceLog.cpp index 65b977d2d1..ce1f9aa2b7 100644 --- a/pcsx2/SourceLog.cpp +++ b/pcsx2/SourceLog.cpp @@ -11,7 +11,7 @@ #include "DebugTools/Debug.h" #include "R3000A.h" -#include "x86/iR5900.h" +#include "R5900.h" #include "fmt/core.h" diff --git a/pcsx2/VMManager.cpp b/pcsx2/VMManager.cpp index 487ec6ea0b..4450d6a3d6 100644 --- a/pcsx2/VMManager.cpp +++ b/pcsx2/VMManager.cpp @@ -38,6 +38,7 @@ #include "SIO/Sio2.h" #include "SPU2/spu2.h" #include "USB/USB.h" +#include "Vif_Dynarec.h" #include "VMManager.h" #include "ps2/BiosTools.h" #include "svnrev.h" @@ -76,10 +77,6 @@ #include "common/Darwin/DarwinMisc.h" #endif 
-#ifdef _M_X86 -#include "x86/newVif.h" -#endif - namespace VMManager { static void SetDefaultLoggingSettings(SettingsInterface& si); @@ -230,6 +227,14 @@ bool VMManager::PerformEarlyHardwareChecks(const char** error) return false; } #endif +#elif defined(_M_ARM64) + // Check page size. If it doesn't match, it is a fatal error. + const size_t runtime_host_page_size = HostSys::GetRuntimePageSize(); + if (__pagesize != runtime_host_page_size) + { + *error = "Page size mismatch. This build cannot run on your Mac.\n\n" COMMON_DOWNLOAD_MESSAGE; + return false; + } #endif #undef COMMON_DOWNLOAD_MESSAGE @@ -2502,6 +2507,7 @@ void VMManager::LogCPUCapabilities() LogUserPowerPlan(); #endif +#ifdef _M_X86 std::string features; if (cpuinfo_has_x86_avx()) features += "AVX "; @@ -2513,6 +2519,18 @@ void VMManager::LogCPUCapabilities() Console.WriteLn(Color_StrongBlack, "x86 Features Detected:"); Console.WriteLnFmt(" {}", features); Console.WriteLn(); +#endif + +#ifdef _M_ARM64 + const size_t runtime_cache_line_size = HostSys::GetRuntimeCacheLineSize(); + if (__cachelinesize != runtime_cache_line_size) + { + // Not fatal, but does have performance implications. + WARNING_LOG( + "Cache line size mismatch. 
This build was compiled with {} byte lines, but the system has {} byte lines.", + __cachelinesize, runtime_cache_line_size); + } +#endif #if 0 LogGPUCapabilities(); @@ -3197,6 +3215,8 @@ void VMManager::WarnAboutUnsafeSettings() append(ICON_FA_EXCLAMATION_CIRCLE, TRANSLATE_SV("VMManager", "INTC Spin Detection is not enabled, this may reduce performance.")); } + if (!EmuConfig.Cpu.Recompiler.EnableFastmem) + append(ICON_FA_EXCLAMATION_CIRCLE, TRANSLATE_SV("VMManager", "Fastmem is not enabled, this will reduce performance.")); if (!EmuConfig.Speedhacks.vu1Instant) { append(ICON_FA_EXCLAMATION_CIRCLE, @@ -3322,6 +3342,12 @@ static u32 GetProcessorIdForProcessor(const cpuinfo_processor* proc) static void InitializeProcessorList() { + if (!cpuinfo_initialize()) + { + Console.Error("cpuinfo_initialize() failed"); + return; + } + const u32 cluster_count = cpuinfo_get_clusters_count(); if (cluster_count == 0) { @@ -3448,6 +3474,10 @@ static void InitializeProcessorList() static void SetMTVUAndAffinityControlDefault(SettingsInterface& si) { +#ifdef __APPLE__ + // Everything we support Mac-wise has enough cores for MTVU. 
+ si.SetBoolValue("EmuCore/Speedhacks", "vuThread", true); +#endif } #endif diff --git a/pcsx2/VU0micro.cpp b/pcsx2/VU0micro.cpp index 83fbdb998e..7b6b5cab81 100644 --- a/pcsx2/VU0micro.cpp +++ b/pcsx2/VU0micro.cpp @@ -29,6 +29,8 @@ static __fi void vu0SetMicroFlags(u32* flags, u32 value) { #ifdef _M_X86 _mm_store_si128(reinterpret_cast<__m128i*>(flags), _mm_set1_epi32(value)); +#elif defined(_M_ARM64) + vst1q_u32(flags, vdupq_n_u32(value)); #else flags[0] = flags[1] = flags[2] = flags[3] = value; #endif diff --git a/pcsx2/Vif.cpp b/pcsx2/Vif.cpp index c067ca016f..68669941fc 100644 --- a/pcsx2/Vif.cpp +++ b/pcsx2/Vif.cpp @@ -8,7 +8,7 @@ #include "MTVU.h" #include "Vif.h" #include "Vif_Dma.h" -#include "x86/newVif.h" +#include "Vif_Dynarec.h" alignas(16) vifStruct vif0, vif1; diff --git a/pcsx2/Vif0_Dma.cpp b/pcsx2/Vif0_Dma.cpp index e0d774c0b5..90657badb2 100644 --- a/pcsx2/Vif0_Dma.cpp +++ b/pcsx2/Vif0_Dma.cpp @@ -3,8 +3,8 @@ #include "Common.h" #include "Vif_Dma.h" +#include "Vif_Dynarec.h" #include "VUmicro.h" -#include "x86/newVif.h" u32 g_vif0Cycles = 0; diff --git a/pcsx2/Vif1_Dma.cpp b/pcsx2/Vif1_Dma.cpp index dc4f853690..cb5ea39d08 100644 --- a/pcsx2/Vif1_Dma.cpp +++ b/pcsx2/Vif1_Dma.cpp @@ -7,7 +7,7 @@ #include "MTVU.h" #include "VUmicro.h" #include "Vif_Dma.h" -#include "x86/newVif.h" +#include "Vif_Dynarec.h" u32 g_vif1Cycles = 0; diff --git a/pcsx2/Vif_Codes.cpp b/pcsx2/Vif_Codes.cpp index 5c1aa50055..8fc3151d82 100644 --- a/pcsx2/Vif_Codes.cpp +++ b/pcsx2/Vif_Codes.cpp @@ -7,7 +7,7 @@ #include "MTVU.h" #include "VUmicro.h" #include "Vif_Dma.h" -#include "x86/newVif.h" +#include "Vif_Dynarec.h" #define vifOp(vifCodeName) _vifT int vifCodeName(int pass, const u32* data) #define pass1 if (pass == 0) diff --git a/pcsx2/Vif_Dynarec.h b/pcsx2/Vif_Dynarec.h new file mode 100644 index 0000000000..404fb88626 --- /dev/null +++ b/pcsx2/Vif_Dynarec.h @@ -0,0 +1,46 @@ +// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team +// SPDX-License-Identifier: LGPL-3.0+ + 
+#pragma once + +#include "Vif.h" +#include "Vif_HashBucket.h" +#include "VU.h" + +typedef u32 (*nVifCall)(void*, const void*); +typedef void (*nVifrecCall)(uptr dest, uptr src); + +extern void _nVifUnpack(int idx, const u8* data, uint mode, bool isFill); +extern void dVifReset(int idx); +extern void dVifRelease(int idx); +extern void VifUnpackSSE_Init(); + +_vifT extern void dVifUnpack(const u8* data, bool isFill); + +struct nVifStruct +{ + // Buffer for partial transfers (should always be first to ensure alignment) + // Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword) + alignas(16) u8 buffer[256*16]; + u32 bSize; // Size of 'buffer' + + // VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused. + // (templates are used for most or all VIF indexing) + u32 idx; + + u8* recWritePtr; // current write pos into the reserve + u8* recEndPtr; + + HashBucket vifBlocks; // Vif Blocks + + + nVifStruct() = default; +}; + +extern void resetNewVif(int idx); + +alignas(16) extern nVifStruct nVif[2]; +alignas(16) extern nVifCall nVifUpk[(2 * 2 * 16) * 4]; // ([USN][Masking][Unpack Type]) [curCycle] +alignas(16) extern u32 nVifMask[3][4][4]; // [MaskNumber][CycleNumber][Vector] + +static constexpr bool newVifDynaRec = 1; // Use code in newVif_Dynarec.inl diff --git a/pcsx2/x86/newVif_HashBucket.h b/pcsx2/Vif_HashBucket.h similarity index 100% rename from pcsx2/x86/newVif_HashBucket.h rename to pcsx2/Vif_HashBucket.h diff --git a/pcsx2/Vif_Transfer.cpp b/pcsx2/Vif_Transfer.cpp index 393c8a3993..1e9eba9f43 100644 --- a/pcsx2/Vif_Transfer.cpp +++ b/pcsx2/Vif_Transfer.cpp @@ -3,7 +3,7 @@ #include "Common.h" #include "Vif_Dma.h" -#include "x86/newVif.h" +#include "Vif_Dynarec.h" //------------------------------------------------------------------ // VifCode Transfer Interpreter (Vif0/Vif1) diff --git a/pcsx2/Vif_Unpack.cpp b/pcsx2/Vif_Unpack.cpp index bac6ceb20c..6d51af56a6 100644 --- a/pcsx2/Vif_Unpack.cpp +++ b/pcsx2/Vif_Unpack.cpp @@ -4,6 
+4,7 @@ #include "Common.h" #include "Vif.h" #include "Vif_Dma.h" +#include "Vif_Dynarec.h" #include "MTVU.h" enum UnpackOffset { @@ -244,3 +245,277 @@ _vifT void vifUnpackSetup(const u32 *data) { template void vifUnpackSetup<0>(const u32 *data); template void vifUnpackSetup<1>(const u32 *data); + +alignas(16) nVifStruct nVif[2]; + +// Interpreter-style SSE unpacks. Array layout matches the interpreter C unpacks. +// ([USN][Masking][Unpack Type]) [curCycle] +alignas(16) nVifCall nVifUpk[(2 * 2 * 16) * 4]; + +// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks +// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly. +// [MaskNumber][CycleNumber][Vector] +alignas(16) u32 nVifMask[3][4][4] = {}; + +// Number of bytes of data in the source stream needed for each vector. +// [equivalent to ((32 >> VL) * (VN+1)) / 8] +alignas(16) const u8 nVifT[16] = { + 4, // S-32 + 2, // S-16 + 1, // S-8 + 0, // ---- + 8, // V2-32 + 4, // V2-16 + 2, // V2-8 + 0, // ---- + 12,// V3-32 + 6, // V3-16 + 3, // V3-8 + 0, // ---- + 16,// V4-32 + 8, // V4-16 + 4, // V4-8 + 2, // V4-5 +}; + +// ---------------------------------------------------------------------------- +template <int idx, bool doMode, bool isFill> +__ri void _nVifUnpackLoop(const u8* data); + +typedef void FnType_VifUnpackLoop(const u8* data); +typedef FnType_VifUnpackLoop* Fnptr_VifUnpackLoop; + +// Unpacks Until 'Num' is 0 +alignas(16) static const Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = { + { + {_nVifUnpackLoop<0, 0, 0>, _nVifUnpackLoop<0, 0, 1>}, + {_nVifUnpackLoop<0, 1, 0>, _nVifUnpackLoop<0, 1, 1>}, + }, + { + {_nVifUnpackLoop<1, 0, 0>, _nVifUnpackLoop<1, 0, 1>}, + {_nVifUnpackLoop<1, 1, 0>, _nVifUnpackLoop<1, 1, 1>}, + }, +}; +// ---------------------------------------------------------------------------- + +void resetNewVif(int idx) +{ + // Safety Reset : Reassign all VIF structure info, just in case the VU1 pointers have + // changed for some reason.
+ + nVif[idx].idx = idx; + nVif[idx].bSize = 0; + std::memset(nVif[idx].buffer, 0, sizeof(nVif[idx].buffer)); + + if (newVifDynaRec) + dVifReset(idx); +} + +void releaseNewVif(int idx) +{ +} + +static __fi u8* getVUptr(uint idx, int offset) +{ + return (u8*)(vuRegs[idx].Mem + (offset & (idx ? 0x3ff0 : 0xff0))); +} + + +_vifT int nVifUnpack(const u8* data) +{ + nVifStruct& v = nVif[idx]; + vifStruct& vif = GetVifX; + VIFregisters& vifRegs = vifXRegs; + + const uint wl = vifRegs.cycle.wl ? vifRegs.cycle.wl : 256; + const uint ret = std::min(vif.vifpacketsize, vif.tag.size); + const bool isFill = (vifRegs.cycle.cl < wl); + s32 size = ret << 2; + + if (ret == vif.tag.size) // Full Transfer + { + if (v.bSize) // Last transfer was partial + { + memcpy(&v.buffer[v.bSize], data, size); + v.bSize += size; + size = v.bSize; + data = v.buffer; + + vif.cl = 0; + vifRegs.num = (vifXRegs.code >> 16) & 0xff; // grab NUM form the original VIFcode input. + if (!vifRegs.num) + vifRegs.num = 256; + } + + if (!idx || !THREAD_VU1) + { + if (newVifDynaRec) + dVifUnpack<idx>(data, isFill); + else + _nVifUnpack(idx, data, vifRegs.mode, isFill); + } + else + vu1Thread.VifUnpack(vif, vifRegs, (u8*)data, (size + 4) & ~0x3); + + vif.pass = 0; + vif.tag.size = 0; + vif.cmd = 0; + vifRegs.num = 0; + v.bSize = 0; + } + else // Partial Transfer + { + memcpy(&v.buffer[v.bSize], data, size); + v.bSize += size; + vif.tag.size -= ret; + + const u8& vSize = nVifT[vif.cmd & 0x0f]; + + // We need to provide accurate accounting of the NUM register, in case games decided + // to read back from it mid-transfer. Since so few games actually use partial transfers + // of VIF unpacks, this code should not be any bottleneck.
+ + if (!isFill) + { + vifRegs.num -= (size / vSize); + } + else + { + int dataSize = (size / vSize); + vifRegs.num = vifRegs.num - (((dataSize / vifRegs.cycle.cl) * (vifRegs.cycle.wl - vifRegs.cycle.cl)) + dataSize); + } + } + + return ret; +} + +template int nVifUnpack<0>(const u8* data); +template int nVifUnpack<1>(const u8* data); + +// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks +// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly. +static void setMasks(const vifStruct& vif, const VIFregisters& v) +{ + for (int i = 0; i < 16; i++) + { + int m = (v.mask >> (i * 2)) & 3; + switch (m) + { + case 0: // Data + nVifMask[0][i / 4][i % 4] = 0xffffffff; + nVifMask[1][i / 4][i % 4] = 0; + nVifMask[2][i / 4][i % 4] = 0; + break; + case 1: // MaskRow + nVifMask[0][i / 4][i % 4] = 0; + nVifMask[1][i / 4][i % 4] = 0; + nVifMask[2][i / 4][i % 4] = vif.MaskRow._u32[i % 4]; + break; + case 2: // MaskCol + nVifMask[0][i / 4][i % 4] = 0; + nVifMask[1][i / 4][i % 4] = 0; + nVifMask[2][i / 4][i % 4] = vif.MaskCol._u32[i / 4]; + break; + case 3: // Write Protect + nVifMask[0][i / 4][i % 4] = 0; + nVifMask[1][i / 4][i % 4] = 0xffffffff; + nVifMask[2][i / 4][i % 4] = 0; + break; + } + } +} + +// ---------------------------------------------------------------------------- +// Unpacking Optimization notes: +// ---------------------------------------------------------------------------- +// Some games send a LOT of single-cycle packets (God of War, SotC, TriAce games, etc), +// so we always need to be weary of keeping loop setup code optimized. It's not always +// a "win" to move code outside the loop, like normally in most other loop scenarios. +// +// The biggest bottleneck of the current code is the call/ret needed to invoke the SSE +// unpackers. A better option is to generate the entire vifRegs.num loop code as part +// of the SSE template, and inline the SSE code into the heart of it. 
This both avoids +// the call/ret and opens the door for resolving some register dependency chains in the +// current emitted functions. (this is what zero's SSE does to get it's final bit of +// speed advantage over the new vif). --air +// +// The BEST optimizatin strategy here is to use data available to us from the UNPACK dispatch +// -- namely the unpack type and mask flag -- in combination mode and usn values -- to +// generate ~600 special versions of this function. But since it's an interpreter, who gives +// a crap? Really? :p +// + +// size - size of the packet fragment incoming from DMAC. +template <int idx, bool doMode, bool isFill> +__ri void _nVifUnpackLoop(const u8* data) +{ + + vifStruct& vif = MTVU_VifX; + VIFregisters& vifRegs = MTVU_VifXRegs; + + // skipSize used for skipping writes only + const int skipSize = (vifRegs.cycle.cl - vifRegs.cycle.wl) * 16; + + //DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs.num, upkNum, vif.cl, blockSize, skipSize); + + if (!doMode && (vif.cmd & 0x10)) + setMasks(vif, vifRegs); + + const int usn = !!vif.usn; + const int upkNum = vif.cmd & 0x1f; + const u8& vSize = nVifT[upkNum & 0x0f]; + //uint vl = vif.cmd & 0x03; + //uint vn = (vif.cmd >> 2) & 0x3; + //uint vSize = ((32 >> vl) * (vn+1)) / 8; // size of data (in bytes) used for each write cycle + + const nVifCall* fnbase = &nVifUpk[((usn * 2 * 16) + upkNum) * (4 * 1)]; + const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ?
vifRegs.mode : 0][((usn * 2 * 16) + upkNum)]; + + pxAssume(vif.cl == 0); + //pxAssume (vifRegs.cycle.wl > 0); + + do + { + u8* dest = getVUptr(idx, vif.tag.addr); + + if (doMode) + { + //if (1) { + ft(dest, data); + } + else + { + //DevCon.WriteLn("SSE Unpack!"); + uint cl3 = std::min(vif.cl, 3); + fnbase[cl3](dest, data); + } + + vif.tag.addr += 16; + --vifRegs.num; + ++vif.cl; + + if (isFill) + { + //DevCon.WriteLn("isFill!"); + if (vif.cl <= vifRegs.cycle.cl) + data += vSize; + else if (vif.cl == vifRegs.cycle.wl) + vif.cl = 0; + } + else + { + data += vSize; + + if (vif.cl >= vifRegs.cycle.wl) + { + vif.tag.addr += skipSize; + vif.cl = 0; + } + } + } while (vifRegs.num); +} + +__fi void _nVifUnpack(int idx, const u8* data, uint mode, bool isFill) +{ + UnpackLoopTable[idx][!!mode][isFill](data); +} diff --git a/pcsx2/pcsx2.vcxproj b/pcsx2/pcsx2.vcxproj index 0aef347a79..be7f9860fa 100644 --- a/pcsx2/pcsx2.vcxproj +++ b/pcsx2/pcsx2.vcxproj @@ -425,9 +425,6 @@ - - true - true @@ -819,6 +816,8 @@ + + @@ -838,7 +837,6 @@ - diff --git a/pcsx2/pcsx2.vcxproj.filters b/pcsx2/pcsx2.vcxproj.filters index 3cb6590af1..c5f43e59f3 100644 --- a/pcsx2/pcsx2.vcxproj.filters +++ b/pcsx2/pcsx2.vcxproj.filters @@ -512,9 +512,6 @@ System\Ps2\EmotionEngine\DMAC\Vif\Unpack - - System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif - System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec @@ -1484,9 +1481,6 @@ System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif - - System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif - System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec @@ -2303,6 +2297,12 @@ System\ISO + + System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec + + + System\Ps2\EmotionEngine\DMAC\Vif\Unpack\newVif\Dynarec + diff --git a/pcsx2/x86/newVif.h b/pcsx2/x86/newVif.h index bc3fe225ec..3bdf43bef7 100644 --- a/pcsx2/x86/newVif.h +++ b/pcsx2/x86/newVif.h @@ -3,28 +3,14 @@ #pragma once -#include "Vif.h" -#include "VU.h" +#include "Vif_Dynarec.h" #include "common/emitter/x86emitter.h" using 
namespace x86Emitter; -// newVif_HashBucket.h uses this typedef, so it has to be declared first. -typedef u32 (*nVifCall)(void*, const void*); -typedef void (*nVifrecCall)(uptr dest, uptr src); - -#include "newVif_HashBucket.h" - extern void mVUmergeRegs(const xRegisterSSE& dest, const xRegisterSSE& src, int xyzw, bool modXYZW = 0); extern void mVUsaveReg(const xRegisterSSE& reg, xAddressVoid ptr, int xyzw, bool modXYZW); -extern void _nVifUnpack (int idx, const u8* data, uint mode, bool isFill); -extern void dVifReset (int idx); -extern void dVifClose (int idx); -extern void dVifRelease (int idx); -extern void VifUnpackSSE_Init(); - -_vifT extern void dVifUnpack(const u8* data, bool isFill); #define VUFT VIFUnpackFuncTable #define _v0 0 @@ -37,31 +23,3 @@ _vifT extern void dVifUnpack(const u8* data, bool isFill); #define xmmCol3 xmm5 #define xmmRow xmm6 #define xmmTemp xmm7 - -struct nVifStruct -{ - // Buffer for partial transfers (should always be first to ensure alignment) - // Maximum buffer size is 256 (vifRegs.Num max range) * 16 (quadword) - alignas(16) u8 buffer[256*16]; - u32 bSize; // Size of 'buffer' - - // VIF0 or VIF1 - provided for debugging helpfulness only, and is generally unused. 
- // (templates are used for most or all VIF indexing) - u32 idx; - - u8* recWritePtr; // current write pos into the reserve - u8* recEndPtr; - - HashBucket vifBlocks; // Vif Blocks - - - nVifStruct() = default; -}; - -extern void resetNewVif(int idx); - -alignas(16) extern nVifStruct nVif[2]; -alignas(16) extern nVifCall nVifUpk[(2 * 2 * 16) * 4]; // ([USN][Masking][Unpack Type]) [curCycle] -alignas(16) extern u32 nVifMask[3][4][4]; // [MaskNumber][CycleNumber][Vector] - -static constexpr bool newVifDynaRec = 1; // Use code in newVif_Dynarec.inl diff --git a/pcsx2/x86/newVif_Unpack.cpp b/pcsx2/x86/newVif_Unpack.cpp deleted file mode 100644 index eea6e3056f..0000000000 --- a/pcsx2/x86/newVif_Unpack.cpp +++ /dev/null @@ -1,282 +0,0 @@ -// SPDX-FileCopyrightText: 2002-2023 PCSX2 Dev Team -// SPDX-License-Identifier: LGPL-3.0+ - -#include "Common.h" -#include "Vif_Dma.h" -#include "newVif.h" -#include "MTVU.h" - -alignas(16) nVifStruct nVif[2]; - -// Interpreter-style SSE unpacks. Array layout matches the interpreter C unpacks. -// ([USN][Masking][Unpack Type]) [curCycle] -alignas(16) nVifCall nVifUpk[(2 * 2 * 16) * 4]; - -// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks -// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly. -// [MaskNumber][CycleNumber][Vector] -alignas(16) u32 nVifMask[3][4][4] = {}; - -// Number of bytes of data in the source stream needed for each vector. 
-// [equivalent to ((32 >> VL) * (VN+1)) / 8] -alignas(16) const u8 nVifT[16] = { - 4, // S-32 - 2, // S-16 - 1, // S-8 - 0, // ---- - 8, // V2-32 - 4, // V2-16 - 2, // V2-8 - 0, // ---- - 12,// V3-32 - 6, // V3-16 - 3, // V3-8 - 0, // ---- - 16,// V4-32 - 8, // V4-16 - 4, // V4-8 - 2, // V4-5 -}; - -// ---------------------------------------------------------------------------- -template <int idx, bool doMode, bool isFill> -__ri void _nVifUnpackLoop(const u8* data); - -typedef void FnType_VifUnpackLoop(const u8* data); -typedef FnType_VifUnpackLoop* Fnptr_VifUnpackLoop; - -// Unpacks Until 'Num' is 0 -alignas(16) static const Fnptr_VifUnpackLoop UnpackLoopTable[2][2][2] = { - { - {_nVifUnpackLoop<0, 0, 0>, _nVifUnpackLoop<0, 0, 1>}, - {_nVifUnpackLoop<0, 1, 0>, _nVifUnpackLoop<0, 1, 1>}, - }, - { - {_nVifUnpackLoop<1, 0, 0>, _nVifUnpackLoop<1, 0, 1>}, - {_nVifUnpackLoop<1, 1, 0>, _nVifUnpackLoop<1, 1, 1>}, - }, -}; -// ---------------------------------------------------------------------------- - -void resetNewVif(int idx) -{ - // Safety Reset : Reassign all VIF structure info, just in case the VU1 pointers have - // changed for some reason. - - nVif[idx].idx = idx; - nVif[idx].bSize = 0; - std::memset(nVif[idx].buffer, 0, sizeof(nVif[idx].buffer)); - - if (newVifDynaRec) - dVifReset(idx); -} - -void releaseNewVif(int idx) -{ -} - -static __fi u8* getVUptr(uint idx, int offset) -{ - return (u8*)(vuRegs[idx].Mem + (offset & (idx ? 0x3ff0 : 0xff0))); -} - - -_vifT int nVifUnpack(const u8* data) -{ - nVifStruct& v = nVif[idx]; - vifStruct& vif = GetVifX; - VIFregisters& vifRegs = vifXRegs; - - const uint wl = vifRegs.cycle.wl ?
vifRegs.cycle.wl : 256; - const uint ret = std::min(vif.vifpacketsize, vif.tag.size); - const bool isFill = (vifRegs.cycle.cl < wl); - s32 size = ret << 2; - - if (ret == vif.tag.size) // Full Transfer - { - if (v.bSize) // Last transfer was partial - { - memcpy(&v.buffer[v.bSize], data, size); - v.bSize += size; - size = v.bSize; - data = v.buffer; - - vif.cl = 0; - vifRegs.num = (vifXRegs.code >> 16) & 0xff; // grab NUM form the original VIFcode input. - if (!vifRegs.num) - vifRegs.num = 256; - } - - if (!idx || !THREAD_VU1) - { - if (newVifDynaRec) - dVifUnpack<idx>(data, isFill); - else - _nVifUnpack(idx, data, vifRegs.mode, isFill); - } - else - vu1Thread.VifUnpack(vif, vifRegs, (u8*)data, (size + 4) & ~0x3); - - vif.pass = 0; - vif.tag.size = 0; - vif.cmd = 0; - vifRegs.num = 0; - v.bSize = 0; - } - else // Partial Transfer - { - memcpy(&v.buffer[v.bSize], data, size); - v.bSize += size; - vif.tag.size -= ret; - - const u8& vSize = nVifT[vif.cmd & 0x0f]; - - // We need to provide accurate accounting of the NUM register, in case games decided - // to read back from it mid-transfer. Since so few games actually use partial transfers - // of VIF unpacks, this code should not be any bottleneck. - - if (!isFill) - { - vifRegs.num -= (size / vSize); - } - else - { - int dataSize = (size / vSize); - vifRegs.num = vifRegs.num - (((dataSize / vifRegs.cycle.cl) * (vifRegs.cycle.wl - vifRegs.cycle.cl)) + dataSize); - } - } - - return ret; -} - -template int nVifUnpack<0>(const u8* data); -template int nVifUnpack<1>(const u8* data); - -// This is used by the interpreted SSE unpacks only. Recompiled SSE unpacks -// and the interpreted C unpacks use the vif.MaskRow/MaskCol members directly.
-static void setMasks(const vifStruct& vif, const VIFregisters& v) -{ - for (int i = 0; i < 16; i++) - { - int m = (v.mask >> (i * 2)) & 3; - switch (m) - { - case 0: // Data - nVifMask[0][i / 4][i % 4] = 0xffffffff; - nVifMask[1][i / 4][i % 4] = 0; - nVifMask[2][i / 4][i % 4] = 0; - break; - case 1: // MaskRow - nVifMask[0][i / 4][i % 4] = 0; - nVifMask[1][i / 4][i % 4] = 0; - nVifMask[2][i / 4][i % 4] = vif.MaskRow._u32[i % 4]; - break; - case 2: // MaskCol - nVifMask[0][i / 4][i % 4] = 0; - nVifMask[1][i / 4][i % 4] = 0; - nVifMask[2][i / 4][i % 4] = vif.MaskCol._u32[i / 4]; - break; - case 3: // Write Protect - nVifMask[0][i / 4][i % 4] = 0; - nVifMask[1][i / 4][i % 4] = 0xffffffff; - nVifMask[2][i / 4][i % 4] = 0; - break; - } - } -} - -// ---------------------------------------------------------------------------- -// Unpacking Optimization notes: -// ---------------------------------------------------------------------------- -// Some games send a LOT of single-cycle packets (God of War, SotC, TriAce games, etc), -// so we always need to be weary of keeping loop setup code optimized. It's not always -// a "win" to move code outside the loop, like normally in most other loop scenarios. -// -// The biggest bottleneck of the current code is the call/ret needed to invoke the SSE -// unpackers. A better option is to generate the entire vifRegs.num loop code as part -// of the SSE template, and inline the SSE code into the heart of it. This both avoids -// the call/ret and opens the door for resolving some register dependency chains in the -// current emitted functions. (this is what zero's SSE does to get it's final bit of -// speed advantage over the new vif). --air -// -// The BEST optimizatin strategy here is to use data available to us from the UNPACK dispatch -// -- namely the unpack type and mask flag -- in combination mode and usn values -- to -// generate ~600 special versions of this function. But since it's an interpreter, who gives -// a crap? Really? 
:p -// - -// size - size of the packet fragment incoming from DMAC. -template <int idx, bool doMode, bool isFill> -__ri void _nVifUnpackLoop(const u8* data) -{ - - vifStruct& vif = MTVU_VifX; - VIFregisters& vifRegs = MTVU_VifXRegs; - - // skipSize used for skipping writes only - const int skipSize = (vifRegs.cycle.cl - vifRegs.cycle.wl) * 16; - - //DevCon.WriteLn("[%d][%d][%d][num=%d][upk=%d][cl=%d][bl=%d][skip=%d]", isFill, doMask, doMode, vifRegs.num, upkNum, vif.cl, blockSize, skipSize); - - if (!doMode && (vif.cmd & 0x10)) - setMasks(vif, vifRegs); - - const int usn = !!vif.usn; - const int upkNum = vif.cmd & 0x1f; - const u8& vSize = nVifT[upkNum & 0x0f]; - //uint vl = vif.cmd & 0x03; - //uint vn = (vif.cmd >> 2) & 0x3; - //uint vSize = ((32 >> vl) * (vn+1)) / 8; // size of data (in bytes) used for each write cycle - - const nVifCall* fnbase = &nVifUpk[((usn * 2 * 16) + upkNum) * (4 * 1)]; - const UNPACKFUNCTYPE ft = VIFfuncTable[idx][doMode ? vifRegs.mode : 0][((usn * 2 * 16) + upkNum)]; - - pxAssume(vif.cl == 0); - //pxAssume (vifRegs.cycle.wl > 0); - - do - { - u8* dest = getVUptr(idx, vif.tag.addr); - - if (doMode) - { - //if (1) { - ft(dest, data); - } - else - { - //DevCon.WriteLn("SSE Unpack!"); - uint cl3 = std::min(vif.cl, 3); - fnbase[cl3](dest, data); - } - - vif.tag.addr += 16; - --vifRegs.num; - ++vif.cl; - - if (isFill) - { - //DevCon.WriteLn("isFill!"); - if (vif.cl <= vifRegs.cycle.cl) - data += vSize; - else if (vif.cl == vifRegs.cycle.wl) - vif.cl = 0; - } - else - { - data += vSize; - - if (vif.cl >= vifRegs.cycle.wl) - { - vif.tag.addr += skipSize; - vif.cl = 0; - } - } - } while (vifRegs.num); -} - -__fi void _nVifUnpack(int idx, const u8* data, uint mode, bool isFill) -{ - - UnpackLoopTable[idx][!!mode][isFill](data); -}