From 00d768a6bfaa3e45f8e4cb484585f402e222edac Mon Sep 17 00:00:00 2001 From: Stenzek Date: Sun, 25 Dec 2022 22:14:15 +1000 Subject: [PATCH] x86/microVU: Add VI caching --- common/emitter/x86types.h | 38 +++ pcsx2/x86/iCore.cpp | 12 +- pcsx2/x86/iCore.h | 13 +- pcsx2/x86/ix86-32/iCore-32.cpp | 16 +- pcsx2/x86/microVU.cpp | 1 + pcsx2/x86/microVU.h | 1 + pcsx2/x86/microVU_Alloc.inl | 28 +-- pcsx2/x86/microVU_Execute.inl | 75 ++++++ pcsx2/x86/microVU_Flags.inl | 6 +- pcsx2/x86/microVU_IR.h | 418 +++++++++++++++++++++++++++++- pcsx2/x86/microVU_Lower.inl | 448 ++++++++++++++++++--------------- pcsx2/x86/microVU_Macro.inl | 98 ++++++-- pcsx2/x86/microVU_Misc.h | 3 - pcsx2/x86/microVU_Misc.inl | 147 +++++++---- 14 files changed, 997 insertions(+), 307 deletions(-) diff --git a/common/emitter/x86types.h b/common/emitter/x86types.h index 75fee148eb..50d53650c6 100644 --- a/common/emitter/x86types.h +++ b/common/emitter/x86types.h @@ -34,6 +34,12 @@ extern thread_local XMMSSEType g_xmmtypes[iREGCNT_XMM]; namespace x86Emitter { + // Win32 requires 32 bytes of shadow stack in the caller's frame. +#ifdef _WIN32 + static constexpr int SHADOW_STACK_SIZE = 32; +#else + static constexpr int SHADOW_STACK_SIZE = 0; +#endif extern void xWrite8(u8 val); extern void xWrite16(u16 val); @@ -401,6 +407,8 @@ namespace x86Emitter pxAssertDev(other.canMapIDTo(4), "Mapping h registers to higher registers can produce unexpected values"); } + static const inline xRegister32& GetInstance(uint id); + bool operator==(const xRegister32& src) const { return this->Id == src.Id; } bool operator!=(const xRegister32& src) const { return this->Id != src.Id; } }; @@ -421,6 +429,8 @@ namespace x86Emitter pxAssertDev(other.canMapIDTo(8), "Mapping h registers to higher registers can produce unexpected values"); } + static const inline xRegister64& GetInstance(uint id); + bool operator==(const xRegister64& src) const { return this->Id == src.Id; } bool operator!=(const xRegister64& src) const { return this->Id != src.Id; } }; @@ -664,6 +674,34 @@ extern const xRegister32 #endif } + const xRegister32& xRegister32::GetInstance(uint id) + { + static const xRegister32* const m_tbl_x86Regs[] = + { + &eax, &ecx, &edx, &ebx, + &esp, &ebp, &esi, &edi, + &r8d, &r9d, &r10d, &r11d, + &r12d, &r13d, &r14d, &r15d, + }; + + pxAssert(id < iREGCNT_GPR); + return *m_tbl_x86Regs[id]; + } + + const xRegister64& xRegister64::GetInstance(uint id) + { + static const xRegister64* const m_tbl_x86Regs[] = + { + &rax, &rcx, &rdx, &rbx, + &rsp, &rbp, &rsi, &rdi, + &r8, &r9, &r10, &r11, + &r12, &r13, &r14, &r15 + }; + + pxAssert(id < iREGCNT_GPR); + return *m_tbl_x86Regs[id]; + } + bool xRegisterSSE::IsCallerSaved(uint id) { #ifdef _WIN32 diff --git a/pcsx2/x86/iCore.cpp b/pcsx2/x86/iCore.cpp index b0f3f0cae3..9569c55c52 100644 --- a/pcsx2/x86/iCore.cpp +++ b/pcsx2/x86/iCore.cpp @@ -144,7 +144,7 @@ int _getFreeXMMreg(u32 maxreg) case XMMTYPE_VFREG: { - if (COP2INST_USEDTEST(xmmregs[i].reg)) + if (EEINST_VFUSEDTEST(xmmregs[i].reg)) continue; } break; @@ -875,6 +875,16 @@ int _allocIfUsedGPRtoX86(int gprreg, int mode) return EEINST_USEDTEST(gprreg) ? _allocX86reg(X86TYPE_GPR, gprreg, mode) : -1; } +int _allocIfUsedVItoX86(int vireg, int mode) +{ + const int x86reg = _checkX86reg(X86TYPE_VIREG, vireg, mode); + if (x86reg >= 0) + return x86reg; + + // Prefer not to stop on COP2 reserved registers here. + return EEINST_VIUSEDTEST(vireg) ? 
_allocX86reg(X86TYPE_VIREG, vireg, mode | MODE_COP2) : -1; +} + int _allocIfUsedGPRtoXMM(int gprreg, int mode) { const int mmreg = _checkXMMreg(XMMTYPE_GPRREG, gprreg, mode); diff --git a/pcsx2/x86/iCore.h b/pcsx2/x86/iCore.h index d2208a8eec..7b336bc24b 100644 --- a/pcsx2/x86/iCore.h +++ b/pcsx2/x86/iCore.h @@ -30,6 +30,7 @@ #define MODE_READ 1 #define MODE_WRITE 2 #define MODE_CALLEESAVED 0x20 // can't flush reg to mem +#define MODE_COP2 0x40 // don't allow using reserved VU registers #define PROCESS_EE_XMM 0x02 @@ -119,6 +120,9 @@ void _flushConstReg(int reg); void _validateRegs(); void _writebackX86Reg(int x86reg); +void mVUFreeCOP2GPR(int hostreg); +bool mVUIsReservedCOP2(int hostreg); + //////////////////////////////////////////////////////////////////////////////// // XMM (128-bit) Register Allocation Tools @@ -247,11 +251,17 @@ static __fi bool EEINST_XMMUSEDTEST(u32 reg) } /// Returns true if the specified VF register is used later in the block. -static __fi bool COP2INST_USEDTEST(u32 reg) +static __fi bool EEINST_VFUSEDTEST(u32 reg) { return (g_pCurInstInfo->vfregs[reg] & (EEINST_USED | EEINST_LASTUSE)) == EEINST_USED; } +/// Returns true if the specified VI register is used later in the block. +static __fi bool EEINST_VIUSEDTEST(u32 reg) +{ + return (g_pCurInstInfo->viregs[reg] & (EEINST_USED | EEINST_LASTUSE)) == EEINST_USED; +} + /// Returns true if the value should be computed/written back. /// Basically, this means it's either used before it's overwritten, or not overwritten by the end of the block. static __fi bool EEINST_LIVETEST(u32 reg) @@ -297,6 +307,7 @@ extern u16 g_xmmAllocCounter; // allocates only if later insts use this register int _allocIfUsedGPRtoX86(int gprreg, int mode); +int _allocIfUsedVItoX86(int vireg, int mode); int _allocIfUsedGPRtoXMM(int gprreg, int mode); int _allocIfUsedFPUtoXMM(int fpureg, int mode); diff --git a/pcsx2/x86/ix86-32/iCore-32.cpp b/pcsx2/x86/ix86-32/iCore-32.cpp index 4d956dec7b..e5fc702292 100644 --- a/pcsx2/x86/ix86-32/iCore-32.cpp +++ b/pcsx2/x86/ix86-32/iCore-32.cpp @@ -55,6 +55,9 @@ int _getFreeX86reg(int mode) if ((mode & MODE_CALLEESAVED) && xRegister32::IsCallerSaved(reg)) continue; + if ((mode & MODE_COP2) && mVUIsReservedCOP2(reg)) + continue; + if (x86regs[reg].inuse == 0) { g_x86checknext = (reg + 1) % iREGCNT_GPR; @@ -70,6 +73,9 @@ int _getFreeX86reg(int mode) if ((mode & MODE_CALLEESAVED) && xRegister32::IsCallerSaved(i)) continue; + if ((mode & MODE_COP2) && mVUIsReservedCOP2(i)) + continue; + // should have checked inuse in the previous loop. 
pxAssert(x86regs[i].inuse); @@ -373,6 +379,13 @@ int _allocX86reg(int type, int reg, int mode) } break; + case X86TYPE_VIREG: + { + RALOG("Loading guest VI reg %d to GPR %d", reg, regnum); + xMOVZX(xRegister32(regnum), ptr16[&VU0.VI[reg].US[0]]); + } + break; + default: abort(); break; @@ -536,8 +549,7 @@ void _freeX86regWithoutWriteback(int x86reg) if (x86regs[x86reg].type == X86TYPE_VIREG) { RALOG("Freeing VI reg %d in host GPR %d\n", x86regs[x86reg].reg, x86reg); - //mVUFreeCOP2GPR(x86reg); - abort(); + mVUFreeCOP2GPR(x86reg); } else if (x86regs[x86reg].inuse && x86regs[x86reg].type == X86TYPE_GPR) { diff --git a/pcsx2/x86/microVU.cpp b/pcsx2/x86/microVU.cpp index 4439936218..09d93905ad 100644 --- a/pcsx2/x86/microVU.cpp +++ b/pcsx2/x86/microVU.cpp @@ -89,6 +89,7 @@ void mVUreset(microVU& mVU, bool resetReserve) x86SetPtr(mVU.dispCache); mVUdispatcherAB(mVU); mVUdispatcherCD(mVU); + mvuGenerateWaitMTVU(mVU); mVUemitSearch(); mVU.regs().nextBlockCycles = 0; diff --git a/pcsx2/x86/microVU.h b/pcsx2/x86/microVU.h index 918bf01bb7..fa859d03de 100644 --- a/pcsx2/x86/microVU.h +++ b/pcsx2/x86/microVU.h @@ -251,6 +251,7 @@ struct microVU u8* exitFunct; // Function Ptr to the recompiler dispatcher (exit) u8* startFunctXG; // Function Ptr to the recompiler dispatcher (xgkick resume) u8* exitFunctXG; // Function Ptr to the recompiler dispatcher (xgkick exit) + u8* waitMTVU; // Ptr to function to save registers/sync VU1 thread u8* resumePtrXG; // Ptr to recompiled code position to resume xgkick u32 code; // Contains the current Instruction u32 divFlag; // 1 instance of I/D flags diff --git a/pcsx2/x86/microVU_Alloc.inl b/pcsx2/x86/microVU_Alloc.inl index 606ff1a582..56d109c498 100644 --- a/pcsx2/x86/microVU_Alloc.inl +++ b/pcsx2/x86/microVU_Alloc.inl @@ -116,32 +116,10 @@ __fi void mVUallocCFLAGb(mV, const x32& reg, int fInstance) // VI Reg Allocators //------------------------------------------------------------------ -__ri void mVUallocVIa(mV, const x32& GPRreg, int _reg_, bool signext = false) +void microRegAlloc::writeVIBackup(const xRegisterInt& reg) { - if (!_reg_) - xXOR(GPRreg, GPRreg); - else if (signext) - xMOVSX(GPRreg, ptr16[&mVU.regs().VI[_reg_].SL]); - else - xMOVZX(GPRreg, ptr16[&mVU.regs().VI[_reg_].UL]); -} - -__ri void mVUallocVIb(mV, const x32& GPRreg, int _reg_) -{ - if (mVUlow.backupVI) // Backs up reg to memory (used when VI is modified b4 a branch) - { - xMOVZX(gprT3, ptr16[&mVU.regs().VI[_reg_].UL]); - xMOV (ptr32[&mVU.VIbackup], gprT3); - } - - if (_reg_ == 0) - { - return; - } - else if (_reg_ < 16) - { - xMOV(ptr16[&mVU.regs().VI[_reg_].UL], xRegister16(GPRreg.Id)); - } + microVU& mVU = index ? 
microVU1 : microVU0;
+	xMOV(ptr32[&mVU.VIbackup], xRegister32(reg));
 }
 
 //------------------------------------------------------------------
diff --git a/pcsx2/x86/microVU_Execute.inl b/pcsx2/x86/microVU_Execute.inl
index 309539e526..87261c50f2 100644
--- a/pcsx2/x86/microVU_Execute.inl
+++ b/pcsx2/x86/microVU_Execute.inl
@@ -123,6 +123,81 @@ void mVUdispatcherCD(mV)
 		"microVU: Dispatcher generation exceeded reserved cache area!");
 }
 
+void mvuGenerateWaitMTVU(mV)
+{
+	mVU.waitMTVU = x86Ptr;
+
+	int num_xmms = 0, num_gprs = 0;
+
+	for (int i = 0; i < static_cast<int>(iREGCNT_GPR); i++)
+	{
+		if (!xRegister32::IsCallerSaved(i) || i == rsp.GetId())
+			continue;
+
+		// no need to save temps
+		if (i == gprT1.GetId() || i == gprT2.GetId())
+			continue;
+
+		xPUSH(xRegister64(i));
+		num_gprs++;
+	}
+
+	for (int i = 0; i < static_cast<int>(iREGCNT_XMM); i++)
+	{
+		if (!xRegisterSSE::IsCallerSaved(i))
+			continue;
+
+		num_xmms++;
+	}
+
+	// We need 16 byte alignment on the stack.
+	// Since the stack is unaligned at entry to this function, we add 8 when it's even, not odd.
+	const int stack_size = (num_xmms * sizeof(u128)) + ((~num_gprs & 1) * sizeof(u64)) + SHADOW_STACK_SIZE;
+	int stack_offset = SHADOW_STACK_SIZE;
+
+	if (stack_size > 0)
+	{
+		xSUB(rsp, stack_size);
+		for (int i = 0; i < static_cast<int>(iREGCNT_XMM); i++)
+		{
+			if (!xRegisterSSE::IsCallerSaved(i))
+				continue;
+
+			xMOVAPS(ptr128[rsp + stack_offset], xRegisterSSE(i));
+			stack_offset += sizeof(u128);
+		}
+	}
+
+	xFastCall((void*)mVUwaitMTVU);
+
+	stack_offset = (num_xmms - 1) * sizeof(u128) + SHADOW_STACK_SIZE;
+	for (int i = static_cast<int>(iREGCNT_XMM - 1); i >= 0; i--)
+	{
+		if (!xRegisterSSE::IsCallerSaved(i))
+			continue;
+
+		xMOVAPS(xRegisterSSE(i), ptr128[rsp + stack_offset]);
+		stack_offset -= sizeof(u128);
+	}
+	xADD(rsp, stack_size);
+
+	for (int i = static_cast<int>(iREGCNT_GPR - 1); i >= 0; i--)
+	{
+		if (!xRegister32::IsCallerSaved(i) || i == rsp.GetId())
+			continue;
+
+		if (i == gprT1.GetId() || i == gprT2.GetId())
+			continue;
+
+		xPOP(xRegister64(i));
+	}
+
+	xRET();
+
+	pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
+		"microVU: Dispatcher generation exceeded reserved cache area!");
+}
+
 //------------------------------------------------------------------
 // Execution Functions
 //------------------------------------------------------------------
diff --git a/pcsx2/x86/microVU_Flags.inl b/pcsx2/x86/microVU_Flags.inl
index aa195cb822..3331a28b7a 100644
--- a/pcsx2/x86/microVU_Flags.inl
+++ b/pcsx2/x86/microVU_Flags.inl
@@ -313,13 +313,15 @@ __fi void mVUsetupFlags(mV, microFlagCycles& mFC)
 	}
 	else
 	{
+		const xRegister32& temp3 = mVU.regAlloc->allocGPR();
 		xMOV(gprT1, getFlagReg(bStatus[0]));
 		xMOV(gprT2, getFlagReg(bStatus[1]));
-		xMOV(gprT3, getFlagReg(bStatus[2]));
+		xMOV(temp3, getFlagReg(bStatus[2]));
 		xMOV(gprF3, getFlagReg(bStatus[3]));
 		xMOV(gprF0, gprT1);
 		xMOV(gprF1, gprT2);
-		xMOV(gprF2, gprT3);
+		xMOV(gprF2, temp3);
+		mVU.regAlloc->clearNeeded(temp3);
 	}
 }
 
diff --git a/pcsx2/x86/microVU_IR.h b/pcsx2/x86/microVU_IR.h
index b3c3d4eef6..a83ec58424 100644
--- a/pcsx2/x86/microVU_IR.h
+++ b/pcsx2/x86/microVU_IR.h
@@ -228,11 +228,25 @@ struct microMapXMM
 	bool isZero; // Register was loaded from VF00 and doesn't need clamping
 };
 
+struct microMapGPR
+{
+	int VIreg;
+	int count;
+	bool isNeeded;
+	bool dirty;
+	bool isZeroExtended;
+	bool usable;
+};
+
 class microRegAlloc
 {
 protected:
-	static const int xmmTotal = 15; // PQ register is reserved
+	static const int xmmTotal = iREGCNT_XMM - 1; // PQ register is reserved
+	static const int 
gprTotal = iREGCNT_GPR; + microMapXMM xmmMap[xmmTotal]; + microMapGPR gprMap[gprTotal]; + int counter; // Current allocation count int index; // VU0 or VU1 @@ -251,6 +265,18 @@ protected: __ri void loadIreg(const xmm& reg, int xyzw) { + for (int i = 0; i < gprTotal; i++) + { + if (gprMap[i].VIreg == REG_I) + { + xMOVDZX(reg, xRegister32(i)); + if (!_XYZWss(xyzw)) + xSHUF.PS(reg, reg, 0); + + return; + } + } + xMOVSSZX(reg, ptr32[&getVI(REG_I)]); if (!_XYZWss(xyzw)) xSHUF.PS(reg, reg, 0); @@ -290,10 +316,59 @@ protected: return x; } + int findFreeGPRRec(int startIdx) + { + for (int i = startIdx; i < gprTotal; i++) + { + if (gprMap[i].usable && !gprMap[i].isNeeded) + { + int x = findFreeGPRRec(i + 1); + if (x == -1) + return i; + return ((gprMap[i].count < gprMap[x].count) ? i : x); + } + } + return -1; + } + + int findFreeGPR(int vireg) + { + if (regAllocCOP2) + return _allocX86reg(X86TYPE_VIREG, vireg, MODE_COP2); + + for (int i = 0; i < gprTotal; i++) + { + if (gprMap[i].usable && !gprMap[i].isNeeded && (gprMap[i].VIreg < 0)) + { + return i; // Reg is not needed and was a temp reg + } + } + int x = findFreeGPRRec(0); + pxAssertDev(x >= 0, "microVU register allocation failure!"); + return x; + } + + void writeVIBackup(const xRegisterInt& reg); + public: microRegAlloc(int _index) { index = _index; + + // mark gpr registers as usable + std::memset(gprMap, 0, sizeof(gprMap)); + for (int i = 0; i < gprTotal; i++) + { + if (i == gprT1.GetId() || i == gprT2.GetId() || + i == gprF0.GetId() || i == gprF1.GetId() || i == gprF2.GetId() || i == gprF3.GetId() || + i == rsp.GetId()) + { + continue; + } + + gprMap[i].usable = true; + } + reset(false); } @@ -304,9 +379,10 @@ public: regAllocCOP2 = false; for (int i = 0; i < xmmTotal; i++) - { clearReg(i); - } + for (int i = 0; i < gprTotal; i++) + clearGPR(i); + counter = 0; regAllocCOP2 = cop2mode; pxmmregs = cop2mode ? xmmregs : nullptr; @@ -331,13 +407,37 @@ public: xmmMap[i].xyzw = ((pxmmregs[i].mode & MODE_WRITE) != 0) ? 0xf : 0x0; } } + + for (int i = 0; i < gprTotal; i++) + { + if (!x86regs[i].inuse || x86regs[i].type != X86TYPE_VIREG) + continue; + + // pxAssertRel(armregs[i].reg >= 0, "Valid full register preserved"); + if (x86regs[i].reg >= 0) + { + MVURALOG("Preserving VI reg %d in host reg %d across instruction\n", x86regs[i].reg, i); + x86regs[i].needed = false; + gprMap[i].isNeeded = false; + gprMap[i].isZeroExtended = false; + gprMap[i].VIreg = x86regs[i].reg; + gprMap[i].dirty = ((x86regs[i].mode & MODE_WRITE) != 0); + } + } } + + gprMap[RFASTMEMBASE.GetId()].usable = !cop2mode || !CHECK_FASTMEM; } int getXmmCount() { return xmmTotal + 1; } + int getGPRCount() + { + return gprTotal; + } + // Flushes all allocated registers (i.e. writes-back to memory all modified registers). 
// If clearState is 0, then it keeps cached reg data valid
 	// If clearState is 1, then it invalidates all cached reg data after write-back
@@ -349,6 +449,36 @@ public:
 			if (clearState)
 				clearReg(i);
 		}
+
+		for (int i = 0; i < gprTotal; i++)
+		{
+			writeBackReg(xRegister32(i), true);
+			if (clearState)
+				clearGPR(i);
+		}
+	}
+
+	void flushCallerSavedRegisters(bool clearNeeded = false)
+	{
+		for (int i = 0; i < xmmTotal; i++)
+		{
+			if (!xRegisterSSE::IsCallerSaved(i))
+				continue;
+
+			writeBackReg(xmm(i));
+			if (clearNeeded || !xmmMap[i].isNeeded)
+				clearReg(i);
+		}
+
+		for (int i = 0; i < gprTotal; i++)
+		{
+			if (!xRegister32::IsCallerSaved(i))
+				continue;
+
+			writeBackReg(xRegister32(i), true);
+			if (clearNeeded || !gprMap[i].isNeeded)
+				clearGPR(i);
+		}
 	}
 
 	void flushPartialForCOP2()
@@ -378,10 +508,19 @@ public:
 			clear.isNeeded = 0;
 			clear.isZero = 0;
 		}
+
+		for (int i = 0; i < gprTotal; i++)
+		{
+			microMapGPR& clear = gprMap[i];
+			if (clear.VIreg < 0)
+				clearGPR(i);
+		}
 	}
 
-	void TDwritebackAll(bool clearState = false)
+	void TDwritebackAll()
 	{
+		// NOTE: We don't clear state here, this happens in an optional branch
+
 		for (int i = 0; i < xmmTotal; i++)
 		{
 			microMapXMM& mapX = xmmMap[xmm(i).Id];
@@ -396,6 +535,9 @@ public:
 				mVUsaveReg(xmm(i), ptr[&getVF(mapX.VFreg)], mapX.xyzw, 1);
 			}
 		}
+
+		for (int i = 0; i < gprTotal; i++)
+			writeBackReg(xRegister32(i), false);
 	}
 
 	bool checkVFClamp(int regId)
@@ -414,11 +556,19 @@ public:
 			return false;
 	}
 
+	bool checkCachedGPR(int regId)
+	{
+		if (regId < gprTotal)
+			return gprMap[regId].VIreg >= 0 || gprMap[regId].isNeeded;
+		else
+			return false;
+	}
+
 	void clearReg(const xmm& reg) { clearReg(reg.Id); }
 	void clearReg(int regId)
 	{
 		microMapXMM& clear = xmmMap[regId];
-		if (regAllocCOP2)
+		if (regAllocCOP2 && (clear.isNeeded || clear.VFreg >= 0))
 		{
 			pxAssert(pxmmregs[regId].type == XMMTYPE_VFREG);
 			pxmmregs[regId].inuse = false;
@@ -668,4 +818,262 @@ public:
 		updateCOP2AllocState(x);
 		return xmmX;
 	}
+
+	void clearGPR(const xRegisterInt& reg) { clearGPR(reg.GetId()); }
+
+	void clearGPR(int regId)
+	{
+		microMapGPR& clear = gprMap[regId];
+
+		if (regAllocCOP2)
+		{
+			if (x86regs[regId].inuse && x86regs[regId].type == X86TYPE_VIREG)
+			{
+				pxAssert(x86regs[regId].reg == static_cast<int>(clear.VIreg));
+				_freeX86regWithoutWriteback(regId);
+			}
+		}
+
+		clear.VIreg = -1;
+		clear.count = 0;
+		clear.isNeeded = 0;
+		clear.dirty = false;
+		clear.isZeroExtended = false;
+	}
+
+	void clearGPRCOP2(int regId)
+	{
+		if (regAllocCOP2)
+			clearGPR(regId);
+	}
+
+	void updateCOP2AllocState(const xRegisterInt& reg)
+	{
+		if (!regAllocCOP2)
+			return;
+
+		const u32 rn = reg.GetId();
+		const bool dirty = (gprMap[rn].VIreg >= 0 && gprMap[rn].dirty);
+		pxAssert(x86regs[rn].type == X86TYPE_VIREG);
+		x86regs[rn].reg = gprMap[rn].VIreg;
+		x86regs[rn].counter = gprMap[rn].count;
+		x86regs[rn].mode = dirty ? 
(MODE_READ | MODE_WRITE) : MODE_READ;
+		x86regs[rn].needed = gprMap[rn].isNeeded;
+	}
+
+	void writeBackReg(const xRegisterInt& reg, bool clearDirty)
+	{
+		microMapGPR& mapX = gprMap[reg.GetId()];
+		pxAssert(mapX.usable || !mapX.dirty);
+		if (mapX.dirty)
+		{
+			pxAssert(mapX.VIreg > 0);
+			if (mapX.VIreg < 16)
+				xMOV(ptr16[&getVI(mapX.VIreg)], xRegister16(reg));
+			if (clearDirty)
+			{
+				mapX.dirty = false;
+				updateCOP2AllocState(reg);
+			}
+		}
+	}
+
+	void clearNeeded(const xRegisterInt& reg)
+	{
+		pxAssert(reg.GetId() < gprTotal);
+		microMapGPR& clear = gprMap[reg.GetId()];
+		clear.isNeeded = false;
+		if (regAllocCOP2)
+			x86regs[reg.GetId()].needed = false;
+	}
+
+	void unbindAnyVIAllocations(int reg, bool& backup)
+	{
+		for (int i = 0; i < gprTotal; i++)
+		{
+			microMapGPR& mapI = gprMap[i];
+			if (mapI.VIreg == reg)
+			{
+				if (backup)
+				{
+					writeVIBackup(xRegister32(i));
+					backup = false;
+				}
+
+				// if it's needed, we just unbind the allocation and preserve it, otherwise clear
+				if (mapI.isNeeded)
+				{
+					MVURALOG(" unbind %d to %d for write\n", i, reg);
+					if (regAllocCOP2)
+					{
+						pxAssert(x86regs[i].type == X86TYPE_VIREG && x86regs[i].reg == static_cast<int>(mapI.VIreg));
+						x86regs[i].reg = -1;
+					}
+
+					mapI.VIreg = -1;
+					mapI.dirty = false;
+					mapI.isZeroExtended = false;
+				}
+				else
+				{
+					MVURALOG(" clear %d to %d for write\n", i, reg);
+					clearGPR(i);
+				}
+
+				// shouldn't be any others...
+				for (int j = i + 1; j < gprTotal; j++)
+				{
+					pxAssert(gprMap[j].VIreg != reg);
+				}
+
+				break;
+			}
+		}
+	}
+
+	const xRegister32& allocGPR(int viLoadReg = -1, int viWriteReg = -1, bool backup = false, bool zext_if_dirty = false)
+	{
+		// TODO: When load != write, we should check whether load is used later, and if so, copy it.
+
+		//DevCon.WriteLn("viLoadReg = %02d, viWriteReg = %02d, backup = %d",viLoadReg,viWriteReg,(int)backup);
+		const int this_counter = regAllocCOP2 ? 
(g_x86AllocCounter++) : (counter++);
+		if (viLoadReg == 0 || viWriteReg == 0)
+		{
+			// write zero register as temp and discard later
+			if (viWriteReg == 0)
+			{
+				int x = findFreeGPR(-1);
+				const xRegister32& gprX = xRegister32::GetInstance(x);
+				writeBackReg(gprX, true);
+				xXOR(gprX, gprX);
+				gprMap[x].VIreg = -1;
+				gprMap[x].dirty = false;
+				gprMap[x].count = this_counter;
+				gprMap[x].isNeeded = true;
+				gprMap[x].isZeroExtended = true;
+				MVURALOG(" alloc zero to scratch %d\n", x);
+				return gprX;
+			}
+		}
+
+		if (viLoadReg >= 0) // Search For Cached Regs
+		{
+			for (int i = 0; i < gprTotal; i++)
+			{
+				microMapGPR& mapI = gprMap[i];
+				if (mapI.VIreg == viLoadReg)
+				{
+					if (viWriteReg >= 0) // Reg will be modified
+					{
+						if (viLoadReg != viWriteReg)
+						{
+							// kill any allocations of viWriteReg
+							unbindAnyVIAllocations(viWriteReg, backup);
+
+							// allocate a new register for writing to
+							int x = findFreeGPR(viWriteReg);
+							const xRegister32& gprX = xRegister32::GetInstance(x);
+							writeBackReg(gprX, true);
+							if (zext_if_dirty)
+								xMOVZX(gprX, xRegister16(i));
+							else
+								xMOV(gprX, xRegister32(i));
+							gprMap[x].isZeroExtended = zext_if_dirty;
+							MVURALOG(" clone write %d in %d to %d for %d\n", viLoadReg, i, x, viWriteReg);
+							std::swap(x, i);
+						}
+						else
+						{
+							// writing to it, no longer zero extended
+							gprMap[i].isZeroExtended = false;
+						}
+
+						gprMap[i].VIreg = viWriteReg;
+						gprMap[i].dirty = true;
+					}
+					else if (zext_if_dirty && !gprMap[i].isZeroExtended)
+					{
+						xMOVZX(xRegister32(i), xRegister16(i));
+						gprMap[i].isZeroExtended = true;
+					}
+					gprMap[i].count = this_counter;
+					gprMap[i].isNeeded = true;
+
+					if (backup)
+						writeVIBackup(xRegister32(i));
+
+					if (regAllocCOP2)
+					{
+						pxAssert(x86regs[i].inuse && x86regs[i].type == X86TYPE_VIREG);
+						x86regs[i].reg = gprMap[i].VIreg;
+						x86regs[i].mode = gprMap[i].dirty ? (MODE_WRITE | MODE_READ) : (MODE_READ);
+					}
+
+					MVURALOG(" returning cached in %d\n", i);
+					return xRegister32::GetInstance(i);
+				}
+			}
+		}
+
+		if (viWriteReg >= 0) // Writing a new value, make sure this register isn't cached already
+			unbindAnyVIAllocations(viWriteReg, backup);
+
+		int x = findFreeGPR(viLoadReg);
+		const xRegister32& gprX = xRegister32::GetInstance(x);
+		writeBackReg(gprX, true);
+
+		if (viLoadReg > 0)
+			xMOVZX(gprX, ptr16[&getVI(viLoadReg)]);
+		else if (viLoadReg == 0)
+			xXOR(gprX, gprX);
+
+		gprMap[x].VIreg = viLoadReg;
+		gprMap[x].isZeroExtended = true;
+		if (viWriteReg >= 0)
+		{
+			gprMap[x].VIreg = viWriteReg;
+			gprMap[x].dirty = true;
+			gprMap[x].isZeroExtended = false;
+
+			if (backup)
+			{
+				if (viLoadReg < 0 && viWriteReg > 0)
+					xMOVZX(gprX, ptr16[&getVI(viWriteReg)]);
+
+				writeVIBackup(gprX);
+			}
+		}
+
+		gprMap[x].count = this_counter;
+		gprMap[x].isNeeded = true;
+
+		if (regAllocCOP2)
+		{
+			pxAssert(x86regs[x].inuse && x86regs[x].type == X86TYPE_VIREG);
+			x86regs[x].reg = gprMap[x].VIreg;
+			x86regs[x].mode = gprMap[x].dirty ? (MODE_WRITE | MODE_READ) : (MODE_READ);
+		}
+
+		MVURALOG(" returning new %d\n", x);
+		return gprX;
+	}
+
+	void moveVIToGPR(const xRegisterInt& reg, int vi, bool signext = false)
+	{
+		pxAssert(vi >= 0);
+		if (vi == 0)
+		{
+			xXOR(xRegister32(reg), xRegister32(reg));
+			return;
+		}
+
+		// TODO: Check liveness/usedness before allocating.
+		// TODO: Check whether zero-extend is needed everywhere here. Loadstores are.
+ const xRegister32& srcreg = allocGPR(vi); + if (signext) + xMOVSX(xRegister32(reg), xRegister16(srcreg)); + else + xMOVZX(xRegister32(reg), xRegister16(srcreg)); + clearNeeded(srcreg); + } }; diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index a46ec59dd9..dc6db0323b 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -611,11 +611,12 @@ mVUop(mVU_FCAND) pass1 { mVUanalyzeCflag(mVU, 1); } pass2 { - mVUallocCFLAGa(mVU, gprT1, cFLAG.read); - xAND(gprT1, _Imm24_); - xADD(gprT1, 0xffffff); - xSHR(gprT1, 24); - mVUallocVIb(mVU, gprT1, 1); + const xRegister32& dst = mVU.regAlloc->allocGPR(-1, 1, mVUlow.backupVI); + mVUallocCFLAGa(mVU, dst, cFLAG.read); + xAND(dst, _Imm24_); + xADD(dst, 0xffffff); + xSHR(dst, 24); + mVU.regAlloc->clearNeeded(dst); mVU.profiler.EmitOp(opFCAND); } pass3 { mVUlog("FCAND vi01, $%x", _Imm24_); } @@ -627,11 +628,12 @@ mVUop(mVU_FCEQ) pass1 { mVUanalyzeCflag(mVU, 1); } pass2 { - mVUallocCFLAGa(mVU, gprT1, cFLAG.read); - xXOR(gprT1, _Imm24_); - xSUB(gprT1, 1); - xSHR(gprT1, 31); - mVUallocVIb(mVU, gprT1, 1); + const xRegister32& dst = mVU.regAlloc->allocGPR(-1, 1, mVUlow.backupVI); + mVUallocCFLAGa(mVU, dst, cFLAG.read); + xXOR(dst, _Imm24_); + xSUB(dst, 1); + xSHR(dst, 31); + mVU.regAlloc->clearNeeded(dst); mVU.profiler.EmitOp(opFCEQ); } pass3 { mVUlog("FCEQ vi01, $%x", _Imm24_); } @@ -643,9 +645,10 @@ mVUop(mVU_FCGET) pass1 { mVUanalyzeCflag(mVU, _It_); } pass2 { - mVUallocCFLAGa(mVU, gprT1, cFLAG.read); - xAND(gprT1, 0xfff); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + mVUallocCFLAGa(mVU, regT, cFLAG.read); + xAND(regT, 0xfff); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opFCGET); } pass3 { mVUlog("FCGET vi%02d", _Ft_); } @@ -657,11 +660,12 @@ mVUop(mVU_FCOR) pass1 { mVUanalyzeCflag(mVU, 1); } pass2 { - mVUallocCFLAGa(mVU, gprT1, cFLAG.read); - xOR(gprT1, _Imm24_); - xADD(gprT1, 1); // If 24 1's will make 25th bit 1, else 0 - xSHR(gprT1, 24); // Get the 25th bit (also clears the rest of the garbage in the reg) - mVUallocVIb(mVU, gprT1, 1); + const xRegister32& dst = mVU.regAlloc->allocGPR(-1, 1, mVUlow.backupVI); + mVUallocCFLAGa(mVU, dst, cFLAG.read); + xOR(dst, _Imm24_); + xADD(dst, 1); // If 24 1's will make 25th bit 1, else 0 + xSHR(dst, 24); // Get the 25th bit (also clears the rest of the garbage in the reg) + mVU.regAlloc->clearNeeded(dst); mVU.profiler.EmitOp(opFCOR); } pass3 { mVUlog("FCOR vi01, $%x", _Imm24_); } @@ -690,9 +694,9 @@ mVUop(mVU_FMAND) pass2 { mVUallocMFLAGa(mVU, gprT1, mFLAG.read); - mVUallocVIa(mVU, gprT2, _Is_); - xAND(gprT1b, gprT2b); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& regT = mVU.regAlloc->allocGPR(_Is_, _It_, mVUlow.backupVI); + xAND(regT, gprT1); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opFMAND); } pass3 { mVUlog("FMAND vi%02d, vi%02d", _Ft_, _Fs_); } @@ -705,11 +709,11 @@ mVUop(mVU_FMEQ) pass2 { mVUallocMFLAGa(mVU, gprT1, mFLAG.read); - mVUallocVIa(mVU, gprT2, _Is_); - xXOR(gprT1, gprT2); - xSUB(gprT1, 1); - xSHR(gprT1, 31); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& regT = mVU.regAlloc->allocGPR(_Is_, _It_, mVUlow.backupVI); + xXOR(regT, gprT1); + xSUB(regT, 1); + xSHR(regT, 31); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opFMEQ); } pass3 { mVUlog("FMEQ vi%02d, vi%02d", _Ft_, _Fs_); } @@ -722,9 +726,9 @@ mVUop(mVU_FMOR) pass2 { mVUallocMFLAGa(mVU, gprT1, mFLAG.read); - mVUallocVIa(mVU, gprT2, _Is_); - xOR(gprT1b, gprT2b); - mVUallocVIb(mVU, gprT1, _It_); 
+ const xRegister32& regT = mVU.regAlloc->allocGPR(_Is_, _It_, mVUlow.backupVI); + xOR(regT, gprT1); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opFMOR); } pass3 { mVUlog("FMOR vi%02d, vi%02d", _Ft_, _Fs_); } @@ -742,9 +746,10 @@ mVUop(mVU_FSAND) { if (_Imm12_ & 0x0c30) DevCon.WriteLn(Color_Green, "mVU_FSAND: Checking I/D/IS/DS Flags"); if (_Imm12_ & 0x030c) DevCon.WriteLn(Color_Green, "mVU_FSAND: Checking U/O/US/OS Flags"); - mVUallocSFLAGc(gprT1, gprT2, sFLAG.read); - xAND(gprT1, _Imm12_); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& reg = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + mVUallocSFLAGc(reg, gprT1, sFLAG.read); + xAND(reg, _Imm12_); + mVU.regAlloc->clearNeeded(reg); mVU.profiler.EmitOp(opFSAND); } pass3 { mVUlog("FSAND vi%02d, $%x", _Ft_, _Imm12_); } @@ -756,9 +761,10 @@ mVUop(mVU_FSOR) pass1 { mVUanalyzeSflag(mVU, _It_); } pass2 { - mVUallocSFLAGc(gprT1, gprT2, sFLAG.read); - xOR(gprT1, _Imm12_); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& reg = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + mVUallocSFLAGc(reg, gprT2, sFLAG.read); + xOR(reg, _Imm12_); + mVU.regAlloc->clearNeeded(reg); mVU.profiler.EmitOp(opFSOR); } pass3 { mVUlog("FSOR vi%02d, $%x", _Ft_, _Imm12_); } @@ -786,15 +792,16 @@ mVUop(mVU_FSEQ) if (_Imm12_ & 0x0400) imm |= 0x1000000; // IS if (_Imm12_ & 0x0800) imm |= 0x2000000; // DS - mVUallocSFLAGa(gprT1, sFLAG.read); - setBitFSEQ(gprT1, 0x0f00); // Z bit - setBitFSEQ(gprT1, 0xf000); // S bit - setBitFSEQ(gprT1, 0x000f); // ZS bit - setBitFSEQ(gprT1, 0x00f0); // SS bit - xXOR(gprT1, imm); - xSUB(gprT1, 1); - xSHR(gprT1, 31); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& reg = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + mVUallocSFLAGa(reg, sFLAG.read); + setBitFSEQ(reg, 0x0f00); // Z bit + setBitFSEQ(reg, 0xf000); // S bit + setBitFSEQ(reg, 0x000f); // ZS bit + setBitFSEQ(reg, 0x00f0); // SS bit + xXOR(reg, imm); + xSUB(reg, 1); + xSHR(reg, 31); + mVU.regAlloc->clearNeeded(reg); mVU.profiler.EmitOp(opFSEQ); } pass3 { mVUlog("FSEQ vi%02d, $%x", _Ft_, _Imm12_); } @@ -834,15 +841,11 @@ mVUop(mVU_IADD) pass1 { mVUanalyzeIALU1(mVU, _Id_, _Is_, _It_); } pass2 { - mVUallocVIa(mVU, gprT1, _Is_); - if (_It_ != _Is_) - { - mVUallocVIa(mVU, gprT2, _It_); - xADD(gprT1b, gprT2b); - } - else - xADD(gprT1b, gprT1b); - mVUallocVIb(mVU, gprT1, _Id_); + const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1); + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _Id_, mVUlow.backupVI); + xADD(regS, regT); + mVU.regAlloc->clearNeeded(regS); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opIADD); } pass3 { mVUlog("IADD vi%02d, vi%02d, vi%02d", _Fd_, _Fs_, _Ft_); } @@ -853,10 +856,10 @@ mVUop(mVU_IADDI) pass1 { mVUanalyzeIADDI(mVU, _Is_, _It_, _Imm5_); } pass2 { - mVUallocVIa(mVU, gprT1, _Is_); + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _It_, mVUlow.backupVI); if (_Imm5_ != 0) - xADD(gprT1b, _Imm5_); - mVUallocVIb(mVU, gprT1, _It_); + xADD(regS, _Imm5_); + mVU.regAlloc->clearNeeded(regS); mVU.profiler.EmitOp(opIADDI); } pass3 { mVUlog("IADDI vi%02d, vi%02d, %d", _Ft_, _Fs_, _Imm5_); } @@ -867,10 +870,10 @@ mVUop(mVU_IADDIU) pass1 { mVUanalyzeIADDI(mVU, _Is_, _It_, _Imm15_); } pass2 { - mVUallocVIa(mVU, gprT1, _Is_); + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _It_, mVUlow.backupVI); if (_Imm15_ != 0) - xADD(gprT1b, _Imm15_); - mVUallocVIb(mVU, gprT1, _It_); + xADD(regS, _Imm15_); + mVU.regAlloc->clearNeeded(regS); mVU.profiler.EmitOp(opIADDIU); } pass3 { mVUlog("IADDIU vi%02d, 
vi%02d, %d", _Ft_, _Fs_, _Imm15_); } @@ -881,13 +884,12 @@ mVUop(mVU_IAND) pass1 { mVUanalyzeIALU1(mVU, _Id_, _Is_, _It_); } pass2 { - mVUallocVIa(mVU, gprT1, _Is_); + const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1); + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _Id_, mVUlow.backupVI); if (_It_ != _Is_) - { - mVUallocVIa(mVU, gprT2, _It_); - xAND(gprT1, gprT2); - } - mVUallocVIb(mVU, gprT1, _Id_); + xAND(regS, regT); + mVU.regAlloc->clearNeeded(regS); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opIAND); } pass3 { mVUlog("IAND vi%02d, vi%02d, vi%02d", _Fd_, _Fs_, _Ft_); } @@ -898,13 +900,12 @@ mVUop(mVU_IOR) pass1 { mVUanalyzeIALU1(mVU, _Id_, _Is_, _It_); } pass2 { - mVUallocVIa(mVU, gprT1, _Is_); + const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1); + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _Id_, mVUlow.backupVI); if (_It_ != _Is_) - { - mVUallocVIa(mVU, gprT2, _It_); - xOR(gprT1, gprT2); - } - mVUallocVIb(mVU, gprT1, _Id_); + xOR(regS, regT); + mVU.regAlloc->clearNeeded(regS); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opIOR); } pass3 { mVUlog("IOR vi%02d, vi%02d, vi%02d", _Fd_, _Fs_, _Ft_); } @@ -917,15 +918,17 @@ mVUop(mVU_ISUB) { if (_It_ != _Is_) { - mVUallocVIa(mVU, gprT1, _Is_); - mVUallocVIa(mVU, gprT2, _It_); - xSUB(gprT1b, gprT2b); - mVUallocVIb(mVU, gprT1, _Id_); + const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1); + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _Id_, mVUlow.backupVI); + xSUB(regS, regT); + mVU.regAlloc->clearNeeded(regS); + mVU.regAlloc->clearNeeded(regT); } else { - xXOR(gprT1, gprT1); - mVUallocVIb(mVU, gprT1, _Id_); + const xRegister32& regD = mVU.regAlloc->allocGPR(-1, _Id_, mVUlow.backupVI); + xXOR(regD, regD); + mVU.regAlloc->clearNeeded(regD); } mVU.profiler.EmitOp(opISUB); } @@ -937,10 +940,10 @@ mVUop(mVU_ISUBIU) pass1 { mVUanalyzeIALU2(mVU, _Is_, _It_); } pass2 { - mVUallocVIa(mVU, gprT1, _Is_); + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _It_, mVUlow.backupVI); if (_Imm15_ != 0) - xSUB(gprT1b, _Imm15_); - mVUallocVIb(mVU, gprT1, _It_); + xSUB(regS, _Imm15_); + mVU.regAlloc->clearNeeded(regS); mVU.profiler.EmitOp(opISUBIU); } pass3 { mVUlog("ISUBIU vi%02d, vi%02d, %d", _Ft_, _Fs_, _Imm15_); } @@ -964,10 +967,20 @@ mVUop(mVU_MFIR) pass2 { const xmm& Ft = mVU.regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W); - mVUallocVIa(mVU, gprT1, _Is_, true); - xMOVDZX(Ft, gprT1); - if (!_XYZW_SS) - mVUunpack_xyzw(Ft, Ft, 0); + if (_Is_ != 0) + { + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, -1); + xMOVSX(xRegister32(regS), xRegister16(regS)); + // TODO: Broadcast instead + xMOVDZX(Ft, regS); + if (!_XYZW_SS) + mVUunpack_xyzw(Ft, Ft, 0); + mVU.regAlloc->clearNeeded(regS); + } + else + { + xPXOR(Ft, Ft); + } mVU.regAlloc->clearNeeded(Ft); mVU.profiler.EmitOp(opMFIR); } @@ -1038,8 +1051,9 @@ mVUop(mVU_MTIR) pass2 { const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, (1 << (3 - _Fsf_))); - xMOVD(gprT1, Fs); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + xMOVD(regT, Fs); + mVU.regAlloc->clearNeeded(regT); mVU.regAlloc->clearNeeded(Fs); mVU.profiler.EmitOp(opMTIR); } @@ -1064,14 +1078,14 @@ mVUop(mVU_ILW) { void* ptr = mVU.regs().Mem + offsetSS; - mVUallocVIa(mVU, gprT2, _Is_); - if (!_Is_) - xXOR(gprT2, gprT2); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (_Imm11_ != 0) - xADD(gprT2, _Imm11_); - mVUaddrFix(mVU, gprT2q); - xMOVZX(gprT1, ptr16[xComplexAddress(gprT3q, ptr, gprT2q)]); - mVUallocVIb(mVU, gprT1, 
_It_); + xADD(gprT1, _Imm11_); + mVUaddrFix(mVU, gprT1q); + + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + xMOVZX(regT, ptr16[xComplexAddress(gprT2q, ptr, gprT1q)]); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opILW); } pass3 { mVUlog("ILW.%s vi%02d, vi%02d + %d", _XYZW_String, _Ft_, _Fs_, _Imm11_); } @@ -1092,15 +1106,19 @@ mVUop(mVU_ILWR) void* ptr = mVU.regs().Mem + offsetSS; if (_Is_) { - mVUallocVIa(mVU, gprT2, _Is_); - mVUaddrFix (mVU, gprT2q); - xMOVZX(gprT1, ptr16[xComplexAddress(gprT3q, ptr, gprT2q)]); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); + mVUaddrFix (mVU, gprT1q); + + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + xMOVZX(regT, ptr16[xComplexAddress(gprT2q, ptr, gprT1q)]); + mVU.regAlloc->clearNeeded(regT); } else { - xMOVZX(gprT1, ptr16[ptr]); + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + xMOVZX(regT, ptr16[ptr]); + mVU.regAlloc->clearNeeded(regT); } - mVUallocVIb(mVU, gprT1, _It_); mVU.profiler.EmitOp(opILWR); } pass3 { mVUlog("ILWR.%s vi%02d, vi%02d", _XYZW_String, _Ft_, _Fs_); } @@ -1110,7 +1128,7 @@ mVUop(mVU_ILWR) // ISW/ISWR //------------------------------------------------------------------ -static void writeBackISW(microVU& mVU, void* base_ptr, xAddressReg reg) +static void writeBackISW(microVU& mVU, void* base_ptr, xAddressReg reg, const xRegister32& val) { if (!reg.IsEmpty() && (sptr)base_ptr != (s32)(sptr)base_ptr) { @@ -1118,10 +1136,10 @@ static void writeBackISW(microVU& mVU, void* base_ptr, xAddressReg reg) auto writeBackAt = [&](int offset) { if (register_offset == -1) { - xLEA(gprT3q, ptr[(void*)((sptr)base_ptr + offset)]); + xLEA(gprT2q, ptr[(void*)((sptr)base_ptr + offset)]); register_offset = offset; } - xMOV(ptr32[gprT3q + reg + (offset - register_offset)], gprT1); + xMOV(ptr32[gprT2q + reg + (offset - register_offset)], val); }; if (_X) writeBackAt(0); if (_Y) writeBackAt(4); @@ -1130,17 +1148,17 @@ static void writeBackISW(microVU& mVU, void* base_ptr, xAddressReg reg) } else if (reg.IsEmpty()) { - if (_X) xMOV(ptr32[(void*)((uptr)base_ptr )], gprT1); - if (_Y) xMOV(ptr32[(void*)((uptr)base_ptr + 4)], gprT1); - if (_Z) xMOV(ptr32[(void*)((uptr)base_ptr + 8)], gprT1); - if (_W) xMOV(ptr32[(void*)((uptr)base_ptr + 12)], gprT1); + if (_X) xMOV(ptr32[(void*)((uptr)base_ptr )], val); + if (_Y) xMOV(ptr32[(void*)((uptr)base_ptr + 4)], val); + if (_Z) xMOV(ptr32[(void*)((uptr)base_ptr + 8)], val); + if (_W) xMOV(ptr32[(void*)((uptr)base_ptr + 12)], val); } else { - if (_X) xMOV(ptr32[base_ptr+reg ], gprT1); - if (_Y) xMOV(ptr32[base_ptr+reg + 4], gprT1); - if (_Z) xMOV(ptr32[base_ptr+reg + 8], gprT1); - if (_W) xMOV(ptr32[base_ptr+reg + 12], gprT1); + if (_X) xMOV(ptr32[base_ptr+reg ], val); + if (_Y) xMOV(ptr32[base_ptr+reg + 4], val); + if (_Z) xMOV(ptr32[base_ptr+reg + 8], val); + if (_W) xMOV(ptr32[base_ptr+reg + 12], val); } } @@ -1156,15 +1174,15 @@ mVUop(mVU_ISW) { void* ptr = mVU.regs().Mem; - mVUallocVIa(mVU, gprT2, _Is_); - if (!_Is_) - xXOR(gprT2, gprT2); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (_Imm11_ != 0) - xADD(gprT2, _Imm11_); - mVUaddrFix(mVU, gprT2q); + xADD(gprT1, _Imm11_); + mVUaddrFix(mVU, gprT1q); - mVUallocVIa(mVU, gprT1, _It_); - writeBackISW(mVU, ptr, gprT2q); + // If regT is dirty, the high bits might not be zero. 
+ const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1, false, true); + writeBackISW(mVU, ptr, gprT1q, regT); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opISW); } pass3 { mVUlog("ISW.%s vi%02d, vi%02d + %d", _XYZW_String, _Ft_, _Fs_, _Imm11_); } @@ -1184,12 +1202,13 @@ mVUop(mVU_ISWR) xAddressReg is = xEmptyReg; if (_Is_) { - mVUallocVIa(mVU, gprT2, _Is_); - mVUaddrFix(mVU, gprT2q); - is = gprT2q; + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); + mVUaddrFix(mVU, gprT1q); + is = gprT1q; } - mVUallocVIa(mVU, gprT1, _It_); - writeBackISW(mVU, ptr, is); + const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, -1, false, true); + writeBackISW(mVU, ptr, is, regT); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opISWR); } @@ -1206,15 +1225,13 @@ mVUop(mVU_LQ) pass2 { void* ptr = mVU.regs().Mem; - mVUallocVIa(mVU, gprT2, _Is_); - if (!_Is_) - xXOR(gprT2, gprT2); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (_Imm11_ != 0) - xADD(gprT2, _Imm11_); - mVUaddrFix(mVU, gprT2q); + xADD(gprT1, _Imm11_); + mVUaddrFix(mVU, gprT1q); const xmm& Ft = mVU.regAlloc->allocReg(-1, _Ft_, _X_Y_Z_W); - mVUloadReg(Ft, xComplexAddress(gprT3q, ptr, gprT2q), _X_Y_Z_W); + mVUloadReg(Ft, xComplexAddress(gprT2q, ptr, gprT1q), _X_Y_Z_W); mVU.regAlloc->clearNeeded(Ft); mVU.profiler.EmitOp(opLQ); } @@ -1230,12 +1247,12 @@ mVUop(mVU_LQD) xAddressReg is = xEmptyReg; if (_Is_ || isVU0) // Access VU1 regs mem-map in !_Is_ case { - mVUallocVIa(mVU, gprT2, _Is_); - xSUB(gprT2b, 1); - if (_Is_) - mVUallocVIb(mVU, gprT2, _Is_); - mVUaddrFix(mVU, gprT2q); - is = gprT2q; + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _Is_, mVUlow.backupVI); + xDEC(regS); + xMOVSX(gprT1, xRegister16(regS)); // TODO: Confirm + mVU.regAlloc->clearNeeded(regS); + mVUaddrFix(mVU, gprT1q); + is = gprT1q; } else { @@ -1250,7 +1267,7 @@ mVUop(mVU_LQD) } else { - mVUloadReg(Ft, xComplexAddress(gprT3q, ptr, is), _X_Y_Z_W); + mVUloadReg(Ft, xComplexAddress(gprT2q, ptr, is), _X_Y_Z_W); } mVU.regAlloc->clearNeeded(Ft); } @@ -1268,12 +1285,12 @@ mVUop(mVU_LQI) xAddressReg is = xEmptyReg; if (_Is_) { - mVUallocVIa(mVU, gprT1, _Is_); - xMOV(gprT2, gprT1); - xADD(gprT1b, 1); - mVUallocVIb(mVU, gprT1, _Is_); - mVUaddrFix (mVU, gprT2q); - is = gprT2q; + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, _Is_, mVUlow.backupVI); + xMOVSX(gprT1, xRegister16(regS)); // TODO: Confirm + xINC(regS); + mVU.regAlloc->clearNeeded(regS); + mVUaddrFix(mVU, gprT1q); + is = gprT1q; } if (!mVUlow.noWriteVF) { @@ -1281,7 +1298,7 @@ mVUop(mVU_LQI) if (is.IsEmpty()) mVUloadReg(Ft, xAddressVoid(ptr), _X_Y_Z_W); else - mVUloadReg(Ft, xComplexAddress(gprT3q, ptr, is), _X_Y_Z_W); + mVUloadReg(Ft, xComplexAddress(gprT2q, ptr, is), _X_Y_Z_W); mVU.regAlloc->clearNeeded(Ft); } mVU.profiler.EmitOp(opLQI); @@ -1300,15 +1317,13 @@ mVUop(mVU_SQ) { void* ptr = mVU.regs().Mem; - mVUallocVIa(mVU, gprT2, _It_); - if (!_It_) - xXOR(gprT2, gprT2); + mVU.regAlloc->moveVIToGPR(gprT1, _It_); if (_Imm11_ != 0) - xADD(gprT2, _Imm11_); - mVUaddrFix(mVU, gprT2q); + xADD(gprT1, _Imm11_); + mVUaddrFix(mVU, gprT1q); const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); - mVUsaveReg(Fs, xComplexAddress(gprT3q, ptr, gprT2q), _X_Y_Z_W, 1); + mVUsaveReg(Fs, xComplexAddress(gprT2q, ptr, gprT1q), _X_Y_Z_W, 1); mVU.regAlloc->clearNeeded(Fs); mVU.profiler.EmitOp(opSQ); } @@ -1324,12 +1339,12 @@ mVUop(mVU_SQD) xAddressReg it = xEmptyReg; if (_It_ || isVU0) // Access VU1 regs mem-map in !_It_ case { - mVUallocVIa(mVU, gprT2, _It_); - xSUB(gprT2b, 1); - if (_It_) - mVUallocVIb(mVU, gprT2, 
_It_); - mVUaddrFix(mVU, gprT2q); - it = gprT2q; + const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, _It_, mVUlow.backupVI); + xDEC(regT); + xMOVSX(gprT1, xRegister16(regT)); // TODO: Confirm + mVU.regAlloc->clearNeeded(regT); + mVUaddrFix(mVU, gprT1q); + it = gprT1q; } else { @@ -1339,7 +1354,7 @@ mVUop(mVU_SQD) if (it.IsEmpty()) mVUsaveReg(Fs, xAddressVoid(ptr), _X_Y_Z_W, 1); else - mVUsaveReg(Fs, xComplexAddress(gprT3q, ptr, it), _X_Y_Z_W, 1); + mVUsaveReg(Fs, xComplexAddress(gprT2q, ptr, it), _X_Y_Z_W, 1); mVU.regAlloc->clearNeeded(Fs); mVU.profiler.EmitOp(opSQD); } @@ -1354,15 +1369,15 @@ mVUop(mVU_SQI) void* ptr = mVU.regs().Mem; if (_It_) { - mVUallocVIa(mVU, gprT1, _It_); - xMOV(gprT2, gprT1); - xADD(gprT1b, 1); - mVUallocVIb(mVU, gprT1, _It_); - mVUaddrFix(mVU, gprT2q); + const xRegister32& regT = mVU.regAlloc->allocGPR(_It_, _It_, mVUlow.backupVI); + xMOVSX(gprT1, xRegister16(regT)); // TODO: Confirm + xINC(regT); + mVU.regAlloc->clearNeeded(regT); + mVUaddrFix(mVU, gprT1q); } const xmm& Fs = mVU.regAlloc->allocReg(_Fs_, 0, _X_Y_Z_W); if (_It_) - mVUsaveReg(Fs, xComplexAddress(gprT3q, ptr, gprT2q), _X_Y_Z_W, 1); + mVUsaveReg(Fs, xComplexAddress(gprT2q, ptr, gprT1q), _X_Y_Z_W, 1); else mVUsaveReg(Fs, xAddressVoid(ptr), _X_Y_Z_W, 1); mVU.regAlloc->clearNeeded(Fs); @@ -1426,22 +1441,24 @@ mVUop(mVU_RNEXT) pass2 { // algorithm from www.project-fao.org - xMOV(gprT3, ptr32[Rmem]); - xMOV(gprT1, gprT3); + const xRegister32& temp3 = mVU.regAlloc->allocGPR(); + xMOV(temp3, ptr32[Rmem]); + xMOV(gprT1, temp3); xSHR(gprT1, 4); xAND(gprT1, 1); - xMOV(gprT2, gprT3); + xMOV(gprT2, temp3); xSHR(gprT2, 22); xAND(gprT2, 1); - xSHL(gprT3, 1); + xSHL(temp3, 1); xXOR(gprT1, gprT2); - xXOR(gprT3, gprT1); - xAND(gprT3, 0x007fffff); - xOR (gprT3, 0x3f800000); - xMOV(ptr32[Rmem], gprT3); - mVU_RGET_(mVU, gprT3); + xXOR(temp3, gprT1); + xAND(temp3, 0x007fffff); + xOR (temp3, 0x3f800000); + xMOV(ptr32[Rmem], temp3); + mVU_RGET_(mVU, temp3); + mVU.regAlloc->clearNeeded(temp3); mVU.profiler.EmitOp(opRNEXT); } pass3 { mVUlog("RNEXT.%s vf%02d, R", _XYZW_String, _Ft_); } @@ -1512,8 +1529,9 @@ mVUop(mVU_XTOP) } pass2 { - xMOVZX(gprT1, ptr16[&mVU.getVifRegs().top]); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + xMOVZX(regT, ptr16[&mVU.getVifRegs().top]); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opXTOP); } pass3 { mVUlog("XTOP vi%02d", _Ft_); } @@ -1530,9 +1548,10 @@ mVUop(mVU_XITOP) } pass2 { - xMOVZX(gprT1, ptr16[&mVU.getVifRegs().itop]); - xAND(gprT1, isVU1 ? 0x3ff : 0xff); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + xMOVZX(regT, ptr16[&mVU.getVifRegs().itop]); + xAND(regT, isVU1 ? 0x3ff : 0xff); + mVU.regAlloc->clearNeeded(regT); mVU.profiler.EmitOp(opXITOP); } pass3 { mVUlog("XITOP vi%02d", _Ft_); } @@ -1634,6 +1653,8 @@ void _vuXGKICKTransfermVU(bool flush) static __fi void mVU_XGKICK_SYNC(mV, bool flush) { + mVU.regAlloc->flushCallerSavedRegisters(); + // Add the single cycle remainder after this instruction, some games do the store // on the second instruction after the kick and that needs to go through first // but that's VERY close.. 
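A note on the mVU_RNEXT hunk above: the emitted sequence is the VU R-register generator, feeding bit 4 XOR bit 22 back into a left shift and pinning the result to a float in [1.0, 2.0). A minimal standalone C++ sketch of the same update, for reference only (vuRNext is an illustrative name, not part of this patch):

    #include <cstdint>

    // Mirrors the emitted code: feedback = bit4 ^ bit22, shift left, fold the
    // feedback in, keep the 23 mantissa bits, and force the exponent of 1.0f.
    static uint32_t vuRNext(uint32_t r)
    {
        const uint32_t feedback = ((r >> 4) & 1) ^ ((r >> 22) & 1);
        r = (r << 1) ^ feedback;
        return (r & 0x007fffff) | 0x3f800000;
    }
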
@@ -1652,14 +1673,16 @@ static __fi void mVU_XGKICK_SYNC(mV, bool flush) static __fi void mVU_XGKICK_DELAY(mV) { - mVUbackupRegs(mVU); + mVU.regAlloc->flushCallerSavedRegisters(); + + mVUbackupRegs(mVU, true, true); #if 0 // XGkick Break - ToDo: Change "SomeGifPathValue" to w/e needs to be tested xTEST (ptr32[&SomeGifPathValue], 1); // If '1', breaks execution xMOV (ptr32[&mVU.resumePtrXG], (uptr)xGetPtr() + 10 + 6); xJcc32(Jcc_NotZero, (uptr)mVU.exitFunctXG - ((uptr)xGetPtr()+6)); #endif xFastCall(mVU_XGKICK_, ptr32[&mVU.VIxgkick]); - mVUrestoreRegs(mVU); + mVUrestoreRegs(mVU, true, true); } mVUop(mVU_XGKICK) @@ -1687,10 +1710,10 @@ mVUop(mVU_XGKICK) mVUinfo.doXGKICK = false; } + const xRegister32& regS = mVU.regAlloc->allocGPR(_Is_, -1); if (!CHECK_XGKICKHACK) { - mVUallocVIa(mVU, gprT1, _Is_); - xMOV(ptr32[&mVU.VIxgkick], gprT1); + xMOV(ptr32[&mVU.VIxgkick], regS); } else { @@ -1702,11 +1725,12 @@ mVUop(mVU_XGKICK) xSUB(gprT2, ptr32[&mVU.cycles]); xADD(gprT2, ptr32[&VU1.cycle]); xMOV(ptr32[&VU1.xgkicklastcycle], gprT2); - mVUallocVIa(mVU, gprT1, _Is_); + xMOV(gprT1, regS); xAND(gprT1, 0x3FF); xSHL(gprT1, 4); xMOV(ptr32[&VU1.xgkickaddr], gprT1); } + mVU.regAlloc->clearNeeded(regS); mVU.profiler.EmitOp(opXGKICK); } pass3 { mVUlog("XGKICK vi%02d", _Fs_); } @@ -1803,22 +1827,25 @@ mVUop(mVU_BAL) { if (!mVUlow.evilBranch) { - xMOV(gprT1, bSaveAddr); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + xMOV(regT, bSaveAddr); + mVU.regAlloc->clearNeeded(regT); } else { incPC(-2); DevCon.Warning("Linking BAL from %s branch taken/not taken target! - If game broken report to PCSX2 Team", branchSTR[mVUlow.branch & 0xf]); incPC(2); - if (isEvilBlock) - xMOV(gprT1, ptr32[&mVU.evilBranch]); - else - xMOV(gprT1, ptr32[&mVU.badBranch]); - xADD(gprT1, 8); - xSHR(gprT1, 3); - mVUallocVIb(mVU, gprT1, _It_); + const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI); + if (isEvilBlock) + xMOV(regT, ptr32[&mVU.evilBranch]); + else + xMOV(regT, ptr32[&mVU.badBranch]); + + xADD(regT, 8); + xSHR(regT, 3); + mVU.regAlloc->clearNeeded(regT); } if (mVUlow.badBranch) { xMOV(ptr32[&mVU.badBranch], branchAddr(mVU)); } @@ -1837,14 +1864,15 @@ mVUop(mVU_IBEQ) if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU.VIbackup]); else - mVUallocVIa(mVU, gprT1, _Is_); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (mVUlow.memReadIt) xXOR(gprT1, ptr32[&mVU.VIbackup]); else { - mVUallocVIa(mVU, gprT2, _It_); - xXOR(gprT1, gprT2); + const xRegister32& regT = mVU.regAlloc->allocGPR(_It_); + xXOR(gprT1, regT); + mVU.regAlloc->clearNeeded(regT); } if (!(isBadOrEvil)) @@ -1865,7 +1893,7 @@ mVUop(mVU_IBGEZ) if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU.VIbackup]); else - mVUallocVIa(mVU, gprT1, _Is_); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (!(isBadOrEvil)) xMOV(ptr32[&mVU.branch], gprT1); else @@ -1884,7 +1912,7 @@ mVUop(mVU_IBGTZ) if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU.VIbackup]); else - mVUallocVIa(mVU, gprT1, _Is_); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (!(isBadOrEvil)) xMOV(ptr32[&mVU.branch], gprT1); else @@ -1903,7 +1931,7 @@ mVUop(mVU_IBLEZ) if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU.VIbackup]); else - mVUallocVIa(mVU, gprT1, _Is_); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (!(isBadOrEvil)) xMOV(ptr32[&mVU.branch], gprT1); else @@ -1922,7 +1950,7 @@ mVUop(mVU_IBLTZ) if (mVUlow.memReadIs) xMOV(gprT1, ptr32[&mVU.VIbackup]); else - mVUallocVIa(mVU, gprT1, _Is_); + mVU.regAlloc->moveVIToGPR(gprT1, _Is_); if (!(isBadOrEvil)) 
xMOV(ptr32[&mVU.branch], gprT1);
 		else
@@ -1941,14 +1969,15 @@ mVUop(mVU_IBNE)
 		if (mVUlow.memReadIs)
 			xMOV(gprT1, ptr32[&mVU.VIbackup]);
 		else
-			mVUallocVIa(mVU, gprT1, _Is_);
+			mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
 
 		if (mVUlow.memReadIt)
 			xXOR(gprT1, ptr32[&mVU.VIbackup]);
 		else
 		{
-			mVUallocVIa(mVU, gprT2, _It_);
-			xXOR(gprT1, gprT2);
+			const xRegister32& regT = mVU.regAlloc->allocGPR(_It_);
+			xXOR(gprT1, regT);
+			mVU.regAlloc->clearNeeded(regT);
 		}
 
 		if (!(isBadOrEvil))
@@ -1964,7 +1993,7 @@ void normJumpPass2(mV)
 {
 	if (!mVUlow.constJump.isValid || mVUlow.evilBranch)
 	{
-		mVUallocVIa(mVU, gprT1, _Is_);
+		mVU.regAlloc->moveVIToGPR(gprT1, _Is_);
 
 		xSHL(gprT1, 3);
 		xAND(gprT1, mVU.microMemSize - 8);
@@ -2008,17 +2037,18 @@ mVUop(mVU_JALR)
 		normJumpPass2(mVU);
 		if (!mVUlow.evilBranch)
 		{
-			xMOV(gprT1, bSaveAddr);
-			mVUallocVIb(mVU, gprT1, _It_);
+			const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI);
+			xMOV(regT, bSaveAddr);
+			mVU.regAlloc->clearNeeded(regT);
 		}
 		if (mVUlow.evilBranch)
 		{
+			const xRegister32& regT = mVU.regAlloc->allocGPR(-1, _It_, mVUlow.backupVI);
 			if (isEvilBlock)
 			{
-				xMOV(gprT1, ptr32[&mVU.evilBranch]);
-				xADD(gprT1, 8);
-				xSHR(gprT1, 3);
-				mVUallocVIb(mVU, gprT1, _It_);
+				xMOV(regT, ptr32[&mVU.evilBranch]);
+				xADD(regT, 8);
+				xSHR(regT, 3);
 			}
 			else
 			{
@@ -2026,11 +2056,11 @@ mVUop(mVU_JALR)
 				DevCon.Warning("Linking JALR from %s branch taken/not taken target! - If game broken report to PCSX2 Team", branchSTR[mVUlow.branch & 0xf]);
 				incPC(2);
 
-				xMOV(gprT1, ptr32[&mVU.badBranch]);
-				xADD(gprT1, 8);
-				xSHR(gprT1, 3);
-				mVUallocVIb(mVU, gprT1, _It_);
+				xMOV(regT, ptr32[&mVU.badBranch]);
+				xADD(regT, 8);
+				xSHR(regT, 3);
 			}
+			mVU.regAlloc->clearNeeded(regT);
 		}
 
 	mVU.profiler.EmitOp(opJALR);
diff --git a/pcsx2/x86/microVU_Macro.inl b/pcsx2/x86/microVU_Macro.inl
index a10ef6c848..5b5c5fc8ac 100644
--- a/pcsx2/x86/microVU_Macro.inl
+++ b/pcsx2/x86/microVU_Macro.inl
@@ -37,13 +37,6 @@ void setupMacroOp(int mode, const char* opName)
 	// Set up reg allocation
 	microVU0.regAlloc->reset(true);
 
-	if (mode & 0x110) // X86 regs are modified, or flags modified
-	{
-		_freeX86reg(eax);
-		_freeX86reg(ecx);
-		_freeX86reg(edx);
-	}
-
 	if (mode & 0x03) // Q will be read/written
 		_freeXMMreg(xmmPQ.Id);
 
@@ -127,6 +120,17 @@ void mVUFreeCOP2XMMreg(int hostreg)
 	microVU0.regAlloc->clearRegCOP2(hostreg);
 }
 
+void mVUFreeCOP2GPR(int hostreg)
+{
+	microVU0.regAlloc->clearGPRCOP2(hostreg);
+}
+
+bool mVUIsReservedCOP2(int hostreg)
+{
+	// gprF1 through gprF3 are not correctly used in COP2 mode.
+	return (hostreg == gprT1.GetId() || hostreg == gprT2.GetId() || hostreg == gprF0.GetId());
+}
+
 #define REC_COP2_mVU0(f, opName, mode) \
 	void recV##f() \
 	{ \
@@ -429,11 +433,22 @@ static void recCFC2()
 	const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
 	pxAssert(!GPR_IS_CONST1(_Rt_));
 
-	// FixMe: Should R-Reg have upper 9 bits 0?
-	if (_Rd_ >= REG_STATUS_FLAG)
+	if (_Rd_ == 0) // why would you read vi00?
+	{
+		xXOR(xRegister32(regt), xRegister32(regt));
+	}
+	else if (_Rd_ >= REG_STATUS_FLAG) // FixMe: Should R-Reg have upper 9 bits 0?
+	{
 		xMOVSX(xRegister64(regt), ptr32[&vu0Regs.VI[_Rd_].UL]);
+	}
 	else
-		xMOV(xRegister64(regt), ptr32[&vu0Regs.VI[_Rd_].UL]);
+	{
+		const int vireg = _allocIfUsedVItoX86(_Rd_, MODE_READ);
+		if (vireg >= 0)
+			xMOVZX(xRegister32(regt), xRegister16(vireg));
+		else
+			xMOVZX(xRegister32(regt), ptr16[&vu0Regs.VI[_Rd_].UL]);
+	}
 }
 
 static void recCTC2()
@@ -532,9 +547,62 @@ static void recCTC2()
 				_freeXMMregWithoutWriteback(xmmreg);
 			}
 
-			// Need to expand this out, because we want to write as 16 bits.
-			_eeMoveGPRtoR(eax, _Rt_);
-			xMOV(ptr16[&vu0Regs.VI[_Rd_].US[0]], ax);
+			// Little bit nasty, but optimal codegen.
+			const int gprreg = _allocIfUsedGPRtoX86(_Rt_, MODE_READ);
+			const int vireg = _allocIfUsedVItoX86(_Rd_, MODE_WRITE);
+			if (vireg >= 0)
+			{
+				if (gprreg >= 0)
+				{
+					xMOVZX(xRegister32(vireg), xRegister16(gprreg));
+				}
+				else
+				{
+					// it could be in an xmm..
+					const int gprxmmreg = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_READ);
+					if (gprxmmreg >= 0)
+					{
+						xMOVD(xRegister32(vireg), xRegisterSSE(gprxmmreg));
+						xMOVZX(xRegister32(vireg), xRegister16(vireg));
+					}
+					else if (GPR_IS_CONST1(_Rt_))
+					{
+						if (_Rt_ != 0)
+							xMOV(xRegister32(vireg), (g_cpuConstRegs[_Rt_].UL[0] & 0xFFFFu));
+						else
+							xXOR(xRegister32(vireg), xRegister32(vireg));
+					}
+					else
+					{
+						xMOVZX(xRegister32(vireg), ptr16[&cpuRegs.GPR.r[_Rt_].US[0]]);
+					}
+				}
+			}
+			else
+			{
+				if (gprreg >= 0)
+				{
+					xMOV(ptr16[&vu0Regs.VI[_Rd_].US[0]], xRegister16(gprreg));
+				}
+				else
+				{
+					const int gprxmmreg = _checkXMMreg(XMMTYPE_GPRREG, _Rt_, MODE_READ);
+					if (gprxmmreg >= 0)
+					{
+						xMOVD(eax, xRegisterSSE(gprxmmreg));
+						xMOV(ptr16[&vu0Regs.VI[_Rd_].US[0]], ax);
+					}
+					else if (GPR_IS_CONST1(_Rt_))
+					{
+						xMOV(ptr16[&vu0Regs.VI[_Rd_].US[0]], (g_cpuConstRegs[_Rt_].UL[0] & 0xFFFFu));
+					}
+					else
+					{
+						_eeMoveGPRtoR(eax, _Rt_);
+						xMOV(ptr16[&vu0Regs.VI[_Rd_].US[0]], ax);
+					}
+				}
+			}
 		}
 		else
 		{
@@ -562,7 +630,7 @@ static void recQMFC2()
 		mVUFinishVU0();
 	}
 
-	const bool vf_used = COP2INST_USEDTEST(_Rd_);
+	const bool vf_used = EEINST_VFUSEDTEST(_Rd_);
 	const int ftreg = _allocVFtoXMMreg(_Rd_, MODE_READ);
 	_deleteEEreg128(_Rt_);
 
@@ -607,7 +675,7 @@ static void recQMTC2()
 	if (_Rt_)
 	{
 		// if we have to flush to memory anyway (has a constant or is x86), force load.
-		[[maybe_unused]] const bool vf_used = COP2INST_USEDTEST(_Rd_);
+		[[maybe_unused]] const bool vf_used = EEINST_VFUSEDTEST(_Rd_);
 		const bool can_rename = EEINST_RENAMETEST(_Rt_);
 		const int rtreg = (GPR_IS_DIRTY_CONST(_Rt_) || _hasX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE)) ?
 			_allocGPRtoXMMreg(_Rt_, MODE_READ) :
diff --git a/pcsx2/x86/microVU_Misc.h b/pcsx2/x86/microVU_Misc.h
index 1149216300..46c71045fe 100644
--- a/pcsx2/x86/microVU_Misc.h
+++ b/pcsx2/x86/microVU_Misc.h
@@ -154,13 +154,10 @@ static const char branchSTR[16][8] = {
 
 #define gprT1 eax // eax - Temp Reg
 #define gprT2 ecx // ecx - Temp Reg
-#define gprT3 edx // edx - Temp Reg
 #define gprT1q rax // eax - Temp Reg
 #define gprT2q rcx // ecx - Temp Reg
-#define gprT3q rdx // edx - Temp Reg
 #define gprT1b ax // Low 16-bit of gprT1 (eax)
 #define gprT2b cx // Low 16-bit of gprT2 (ecx)
-#define gprT3b dx // Low 16-bit of gprT3 (edx)
 
 #define gprF0 ebx // Status Flag 0
 #define gprF1 r12d // Status Flag 1
diff --git a/pcsx2/x86/microVU_Misc.inl b/pcsx2/x86/microVU_Misc.inl
index b37d50fb6b..cd8e57c2f6 100644
--- a/pcsx2/x86/microVU_Misc.inl
+++ b/pcsx2/x86/microVU_Misc.inl
@@ -14,6 +14,7 @@
  */
 
 #pragma once
+#include <bitset>
 
 //------------------------------------------------------------------
 // Micro VU - Reg Loading/Saving/Shuffling/Unpacking/Merging...
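A note on the recCTC2 hunk above: the VI write now picks the cheapest available source for the EE GPR value instead of always spilling through memory. A hedged sketch of that priority chain (the types and names below are illustrative, not the PCSX2 API):

    #include <cstdint>
    #include <optional>

    // Where the guest GPR value may currently live, cheapest first.
    enum class Source { HostGpr, HostXmm, Constant, Memory };

    struct CacheInfo
    {
        std::optional<int> hostGpr;       // cached in a host GPR (_allocIfUsedGPRtoX86 hit)
        std::optional<int> hostXmm;       // cached in an XMM (_checkXMMreg hit)
        std::optional<uint32_t> constant; // constant-propagated (GPR_IS_CONST1 hit)
    };

    // recCTC2 effectively walks this chain: reg-to-reg MOVZX, then a MOVD out
    // of the XMM, then an immediate store, and only then a memory round-trip.
    Source pickSource(const CacheInfo& c)
    {
        if (c.hostGpr)
            return Source::HostGpr;
        if (c.hostXmm)
            return Source::HostXmm;
        if (c.constant)
            return Source::Constant;
        return Source::Memory;
    }
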
@@ -149,14 +150,57 @@ __fi void mVUbackupRegs(microVU& mVU, bool toMemory = false, bool onlyNeeded = f
 {
 	if (toMemory)
 	{
-		for (int i = 0; i < mVU.regAlloc->getXmmCount(); i++)
+		int num_xmms = 0, num_gprs = 0;
+
+		for (int i = 0; i < static_cast<int>(iREGCNT_GPR); i++)
 		{
+			if (!xRegister32::IsCallerSaved(i) || i == rsp.GetId())
+				continue;
+
+			if (!onlyNeeded || mVU.regAlloc->checkCachedGPR(i))
+			{
+				num_gprs++;
+				xPUSH(xRegister64(i));
+			}
+		}
+
+		std::bitset<iREGCNT_XMM> save_xmms;
+		for (int i = 0; i < static_cast<int>(iREGCNT_XMM); i++)
+		{
+			if (!xRegisterSSE::IsCallerSaved(i))
+				continue;
+
 			if (!onlyNeeded || mVU.regAlloc->checkCachedReg(i) || xmmPQ.Id == i)
-				xMOVAPS(ptr128[&mVU.xmmBackup[i][0]], xmm(i));
+			{
+				save_xmms[i] = true;
+				num_xmms++;
+			}
+		}
+
+		// we need 16 byte alignment on the stack
+#ifdef _WIN32
+		const int stack_size = (num_xmms * sizeof(u128)) + ((num_gprs & 1) * sizeof(u64)) + 32;
+		int stack_offset = 32;
+#else
+		const int stack_size = (num_xmms * sizeof(u128)) + ((num_gprs & 1) * sizeof(u64));
+		int stack_offset = 0;
+#endif
+		if (stack_size > 0)
+		{
+			xSUB(rsp, stack_size);
+			for (int i = 0; i < static_cast<int>(iREGCNT_XMM); i++)
+			{
+				if (save_xmms[i])
+				{
+					xMOVAPS(ptr128[rsp + stack_offset], xRegisterSSE(i));
+					stack_offset += sizeof(u128);
+				}
+			}
 		}
 	}
 	else
 	{
+		// TODO(Stenzek): get rid of xmmbackup
 		mVU.regAlloc->flushAll(); // Flush Regalloc
 		xMOVAPS(ptr128[&mVU.xmmBackup[xmmPQ.Id][0]], xmmPQ);
 	}
@@ -167,47 +211,64 @@ __fi void mVUrestoreRegs(microVU& mVU, bool fromMemory = false, bool onlyNeeded
 {
 	if (fromMemory)
 	{
-		for (int i = 0; i < mVU.regAlloc->getXmmCount(); i++)
+		int num_xmms = 0, num_gprs = 0;
+
+		std::bitset<iREGCNT_GPR> save_gprs;
+		for (int i = 0; i < static_cast<int>(iREGCNT_GPR); i++)
 		{
+			if (!xRegister32::IsCallerSaved(i) || i == rsp.GetId())
+				continue;
+
+			if (!onlyNeeded || mVU.regAlloc->checkCachedGPR(i))
+			{
+				save_gprs[i] = true;
+				num_gprs++;
+			}
+		}
+
+		std::bitset<iREGCNT_XMM> save_xmms;
+		for (int i = 0; i < static_cast<int>(iREGCNT_XMM); i++)
+		{
+			if (!xRegisterSSE::IsCallerSaved(i))
+				continue;
+
 			if (!onlyNeeded || mVU.regAlloc->checkCachedReg(i) || xmmPQ.Id == i)
-				xMOVAPS(xmm(i), ptr128[&mVU.xmmBackup[i][0]]);
+			{
+				save_xmms[i] = true;
+				num_xmms++;
+			}
+		}
+
+#ifdef _WIN32
+		const int stack_extra = 32;
+#else
+		const int stack_extra = 0;
+#endif
+		const int stack_size = (num_xmms * sizeof(u128)) + ((num_gprs & 1) * sizeof(u64)) + stack_extra;
+		if (num_xmms > 0)
+		{
+			int stack_offset = (num_xmms - 1) * sizeof(u128) + stack_extra;
+			for (int i = static_cast<int>(iREGCNT_XMM - 1); i >= 0; i--)
+			{
+				if (!save_xmms[i])
+					continue;
+
+				xMOVAPS(xRegisterSSE(i), ptr128[rsp + stack_offset]);
+				stack_offset -= sizeof(u128);
+			}
+		}
+		if (stack_size > 0)
+			xADD(rsp, stack_size);
+
+		for (int i = static_cast<int>(iREGCNT_GPR - 1); i >= 0; i--)
+		{
+			if (save_gprs[i])
+				xPOP(xRegister64(i));
 		}
 	}
 	else
+	{
 		xMOVAPS(xmmPQ, ptr128[&mVU.xmmBackup[xmmPQ.Id][0]]);
-}
-
-class mVUScopedXMMBackup
-{
-	microVU& mVU;
-	bool fromMemory;
-
-public:
-	mVUScopedXMMBackup(microVU& mVU, bool fromMemory)
-		: mVU(mVU) , fromMemory(fromMemory)
-	{
-		mVUbackupRegs(mVU, fromMemory);
-	}
-	~mVUScopedXMMBackup()
-	{
-		mVUrestoreRegs(mVU, fromMemory);
-	}
-};
-
-_mVUt void mVUprintRegs()
-{
-	microVU& mVU = mVUx;
-	for (int i = 0; i < mVU.regAlloc->getXmmCount(); i++)
-	{
-		Console.WriteLn("xmm%d = [0x%08x,0x%08x,0x%08x,0x%08x]", i,
-			mVU.xmmBackup[i][0], mVU.xmmBackup[i][1],
-			mVU.xmmBackup[i][2], mVU.xmmBackup[i][3]);
-	}
-	for (int i = 0; i < mVU.regAlloc->getXmmCount(); i++)
-	{
-		Console.WriteLn("xmm%d = [%f,%f,%f,%f]", i, 
- (float&)mVU.xmmBackup[i][0], (float&)mVU.xmmBackup[i][1], - (float&)mVU.xmmBackup[i][2], (float&)mVU.xmmBackup[i][3]); } } @@ -259,17 +320,15 @@ __fi void mVUaddrFix(mV, const xAddressReg& gprReg) jmpA.SetTarget(); if (THREAD_VU1) { - { - mVUScopedXMMBackup mVUSave(mVU, true); - xScopedSavedRegisters save{gprT1q, gprT2q, gprT3q}; +#if 0 if (IsDevBuild && !isCOP2) // Lets see which games do this! { - xMOV(arg1regd, mVU.prog.cur->idx); // Note: Kernel does it via COP2 to initialize VU1! - xMOV(arg2regd, xPC); // So we don't spam console, we'll only check micro-mode... + xMOV(gprT1, mVU.prog.cur->idx); // Note: Kernel does it via COP2 to initialize VU1! + xMOV(gprT2, xPC); // So we don't spam console, we'll only check micro-mode... xFastCall((void*)mVUwarningRegAccess, arg1regd, arg2regd); } - xFastCall((void*)mVUwaitMTVU); - } +#endif + xFastCall((void*)mVU.waitMTVU); } xAND(xRegister32(gprReg.Id), 0x3f); // ToDo: theres a potential problem if VU0 overrides VU1's VF0/VI0 regs! xADD(gprReg, (u128*)VU1.VF - (u128*)VU0.Mem);
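A closing note on the stack arithmetic in mvuGenerateWaitMTVU and mVUbackupRegs: both must leave RSP 16-byte aligned around the xFastCall, but they start from opposite parities, hence (~num_gprs & 1) in one and (num_gprs & 1) in the other. A minimal compile-time sketch, assuming mvuGenerateWaitMTVU is entered via CALL (so RSP % 16 == 8 on entry) while mVUbackupRegs emits into code that keeps RSP 16-byte aligned; the helper names are illustrative only:

    #include <cstddef>

    // Entered via CALL: the return address leaves RSP % 16 == 8, and every
    // xPUSH flips the parity, so pad 8 bytes when num_gprs is even.
    constexpr size_t waitMTVUStackSize(size_t num_xmms, size_t num_gprs, size_t shadow)
    {
        return num_xmms * 16 + (~num_gprs & 1) * 8 + shadow;
    }

    // Emitted inline on an aligned stack: pad 8 bytes when num_gprs is odd.
    constexpr size_t backupStackSize(size_t num_xmms, size_t num_gprs, size_t shadow)
    {
        return num_xmms * 16 + (num_gprs & 1) * 8 + shadow;
    }

    // After a CALL plus two pushes, RSP % 16 == 8 again, so the SUB amount must
    // itself be 8 mod 16 to land on a boundary; on an aligned frame the pushes
    // plus the SUB must total a multiple of 16.
    static_assert(waitMTVUStackSize(2, 2, 0) % 16 == 8, "re-aligns a CALL-entered frame");
    static_assert((2 * 8 + backupStackSize(2, 2, 0)) % 16 == 0, "keeps an aligned frame aligned");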