mirror of https://github.com/PCSX2/pcsx2.git
iR5900: Elide VU0 micro finish calls when safe
This makes a difference in COP2-heavy games, where a chain of instructions would otherwise test the VU0 idle bit over and over unnecessarily, since it is impossible for a micro to be started in between the instructions in the chain. It also saves a bit of code size (register backup/restore) and eliminates branches. Seems to make a 1-2% performance difference in Ratchet on a 3900X; with luck, the gain is larger on slower chips.
parent 99487d0e2b
commit cdd9b1fa3b
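For context on where the savings come from: previously, every CFC2/CTC2/COP2 instruction in a recompiled block emitted a test of the VU0 idle bit plus a conditional call to finish the running micro. The following is a small standalone sketch of that cost model, not PCSX2 code; `Op`, `can_kick_off_vu0` and `CountFinishChecks` are made-up names for illustration. It counts how many idle-bit checks a COP2 chain pays before and after the elision.

```cpp
#include <cstdio>
#include <vector>

// Hypothetical per-instruction record: is it a COP2 op, and could it
// (like a store feeding DMA->VIF0->VU0) start a new VU0 program?
struct Op
{
    bool is_cop2;
    bool can_kick_off_vu0;
};

// Count the idle-bit checks emitted for a block of instructions.
// elide == false models the old behaviour (check before every COP2 op),
// elide == true models this commit (check only when a micro could have
// started since the last check).
static int CountFinishChecks(const std::vector<Op>& block, bool elide)
{
    int checks = 0;
    bool needs_finish = true;
    for (const Op& op : block)
    {
        if (op.can_kick_off_vu0)
            needs_finish = true;
        if (!op.is_cop2)
            continue;
        if (!elide || needs_finish)
            checks++;
        if (elide)
            needs_finish = false;
    }
    return checks;
}

int main()
{
    // A chain of five COP2 ops with one store in the middle.
    const std::vector<Op> block = {
        {true, false}, {true, false}, {false, true}, {true, false}, {true, false}, {true, false}};
    std::printf("old: %d checks, new: %d checks\n",
        CountFinishChecks(block, false), CountFinishChecks(block, true));
    return 0;
}
```

With this input the old scheme pays five checks, the new one pays two (the first op of the chain and the first op after the store).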
@@ -220,6 +220,7 @@ int _signExtendXMMtoM(uptr to, x86SSERegType from, int candestroy); // returns t
 #define EEINST_COP2_STATUS_FLAG 0x400
 #define EEINST_COP2_MAC_FLAG 0x800
 #define EEINST_COP2_CLIP_FLAG 0x1000
+#define EEINST_COP2_FINISH_VU0_MICRO 0x2000
 
 struct EEINST
 {
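The new EEINST_COP2_FINISH_VU0_MICRO bit sits alongside the existing COP2 flag bits in EEINST::info, so it has to be a distinct single bit. A tiny standalone sanity-check sketch that compiles as its own translation unit; the constants are restated locally here rather than taken from the PCSX2 header:

```cpp
// Standalone sketch: restate the flag values and check they are distinct,
// single-bit masks that can be OR'd into EEINST::info without colliding.
constexpr unsigned EEINST_COP2_STATUS_FLAG = 0x400;
constexpr unsigned EEINST_COP2_MAC_FLAG = 0x800;
constexpr unsigned EEINST_COP2_CLIP_FLAG = 0x1000;
constexpr unsigned EEINST_COP2_FINISH_VU0_MICRO = 0x2000;

constexpr bool IsSingleBit(unsigned v) { return v != 0 && (v & (v - 1)) == 0; }

static_assert(IsSingleBit(EEINST_COP2_FINISH_VU0_MICRO), "flag must be a single bit");
static_assert((EEINST_COP2_FINISH_VU0_MICRO &
               (EEINST_COP2_STATUS_FLAG | EEINST_COP2_MAC_FLAG | EEINST_COP2_CLIP_FLAG)) == 0,
              "flag must not overlap the existing COP2 flag bits");
```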
@@ -227,3 +227,39 @@ void COP2FlagHackPass::CommitAllFlags()
     CommitMACFlag();
     CommitClipFlag();
 }
+
+COP2MicroFinishPass::COP2MicroFinishPass() = default;
+
+COP2MicroFinishPass::~COP2MicroFinishPass() = default;
+
+void COP2MicroFinishPass::Run(u32 start, u32 end, EEINST* inst_cache)
+{
+    bool needs_vu0_finish = true;
+
+    ForEachInstruction(start, end, inst_cache, [&needs_vu0_finish](u32 apc, EEINST* inst) {
+        // Catch SQ/SB/SH/SW/SD to potential DMA->VIF0->VU0 exec.
+        // This is very unlikely in a cop2 chain.
+        if (_Opcode_ == 037 || _Opcode_ == 050 || _Opcode_ == 051 || _Opcode_ == 053 || _Opcode_ == 077)
+        {
+            needs_vu0_finish = true;
+            return true;
+        }
+
+        // Look for COP2 instructions.
+        if (_Opcode_ != 022)
+            return true;
+
+        // Set the flag on the current instruction, and clear it for the next.
+        if (needs_vu0_finish)
+        {
+            inst->info |= EEINST_COP2_FINISH_VU0_MICRO;
+            needs_vu0_finish = false;
+        }
+
+        // Except for VCALLMS/VCALLMSR, that can start a micro, so the next instruction needs to finish it.
+        if (_Funct_ == 070 || _Funct_ == 071)
+            needs_vu0_finish = true;
+
+        return true;
+    });
+}
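The numeric literals in the pass are octal: 022 is the COP2 primary opcode (0x12), 037/050/051/053/077 are SQ/SB/SH/SW/SD, and functs 070/071 are VCALLMS/VCALLMSR. The sketch below replays the same decision logic over a made-up instruction sequence to show where the flag lands; `MockInst` and `RunMicroFinishPass` are illustrative stand-ins (store handling is simplified to SW only), not the PCSX2 types.

```cpp
#include <cstdio>
#include <vector>

// Octal constants as used in the pass, spelled out in hex for clarity.
constexpr unsigned OP_COP2 = 022;     // 0x12, COP2 primary opcode
constexpr unsigned OP_SW = 053;       // 0x2B, one of the stores that can feed DMA->VIF0->VU0
constexpr unsigned FN_VCALLMS = 070;  // 0x38, starts a VU0 micro from macro mode

constexpr unsigned FLAG_FINISH_VU0_MICRO = 0x2000;

struct MockInst
{
    unsigned opcode; // primary opcode field
    unsigned funct;  // funct field (only meaningful for COP2 special1 here)
    unsigned info;   // analysis flags
};

// Same state machine as COP2MicroFinishPass::Run, applied to mock data.
static void RunMicroFinishPass(std::vector<MockInst>& block)
{
    bool needs_vu0_finish = true;
    for (MockInst& inst : block)
    {
        // Stores reset the elision (the real pass also checks SQ/SB/SH/SD).
        if (inst.opcode == OP_SW)
        {
            needs_vu0_finish = true;
            continue;
        }
        if (inst.opcode != OP_COP2)
            continue;
        if (needs_vu0_finish)
        {
            inst.info |= FLAG_FINISH_VU0_MICRO;
            needs_vu0_finish = false;
        }
        if (inst.funct == FN_VCALLMS) // VCALLMS(R) can start a micro
            needs_vu0_finish = true;
    }
}

int main()
{
    // COP2, COP2, SW, COP2, VCALLMS, COP2 - expect the flag on ops 0, 3 and 5.
    std::vector<MockInst> block = {
        {OP_COP2, 0, 0}, {OP_COP2, 0, 0}, {OP_SW, 0, 0},
        {OP_COP2, 0, 0}, {OP_COP2, FN_VCALLMS, 0}, {OP_COP2, 0, 0}};
    RunMicroFinishPass(block);
    int i = 0;
    for (const MockInst& inst : block)
        std::printf("inst %d: %s\n", i++, (inst.info & FLAG_FINISH_VU0_MICRO) ? "finish" : "elided");
    return 0;
}
```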
@@ -62,4 +62,13 @@ namespace R5900
         u32 m_cfc2_pc = 0;
     };
 
+    class COP2MicroFinishPass final : public AnalysisPass
+    {
+    public:
+        COP2MicroFinishPass();
+        ~COP2MicroFinishPass();
+
+        void Run(u32 start, u32 end, EEINST* inst_cache) override;
+    };
+
 } // namespace R5900
@@ -2190,10 +2190,12 @@ StartRecomp:
     }
 
     // eventually we'll want to have a vector of passes or something.
-    if (has_cop2_instructions && EmuConfig.Speedhacks.vuFlagHack)
+    if (has_cop2_instructions)
     {
-        COP2FlagHackPass fhpass;
-        fhpass.Run(startpc, s_nEndBlock, s_pInstCache + 1);
+        COP2MicroFinishPass().Run(startpc, s_nEndBlock, s_pInstCache + 1);
+
+        if (EmuConfig.Speedhacks.vuFlagHack)
+            COP2FlagHackPass().Run(startpc, s_nEndBlock, s_pInstCache + 1);
     }
 
     // analyze instructions //
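The comment kept above notes that a vector of passes is the eventual goal. One plausible shape for that, sketched here with a simplified pass interface and stub passes rather than the real R5900 types — an illustration, not the project's plan:

```cpp
#include <memory>
#include <vector>

// Simplified stand-ins for the real types; EEINST is only passed through here.
struct EEINST;
using u32 = unsigned int;

struct AnalysisPass
{
    virtual ~AnalysisPass() = default;
    virtual void Run(u32 start, u32 end, EEINST* inst_cache) = 0;
};

// Stub passes standing in for COP2MicroFinishPass / COP2FlagHackPass.
struct MicroFinishPassStub final : AnalysisPass
{
    void Run(u32, u32, EEINST*) override {}
};
struct FlagHackPassStub final : AnalysisPass
{
    void Run(u32, u32, EEINST*) override {}
};

// Build the pass list once per block, based on what the block contains and
// which speedhacks are enabled, then run the passes in order.
void RunAnalysisPasses(bool has_cop2_instructions, bool vu_flag_hack,
    u32 startpc, u32 endpc, EEINST* inst_cache)
{
    std::vector<std::unique_ptr<AnalysisPass>> passes;
    if (has_cop2_instructions)
    {
        passes.push_back(std::make_unique<MicroFinishPassStub>());
        if (vu_flag_hack)
            passes.push_back(std::make_unique<FlagHackPassStub>());
    }
    for (auto& pass : passes)
        pass->Run(startpc, endpc, inst_cache);
}
```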
@@ -323,35 +323,40 @@ void recBC2TL() { _setupBranchTest(JZ32, true); }
 void COP2_Interlock(bool mBitSync)
 {
     if (cpuRegs.code & 1)
     {
         s_nBlockInterlocked = true;
-        _freeX86reg(eax);
-        xMOV(eax, ptr32[&cpuRegs.cycle]);
-        xADD(eax, scaleblockcycles_clear());
-        xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
-
-        xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
-        xForwardJZ32 skipvuidle;
-        _cop2BackupRegs();
-        if (mBitSync)
+
+        // We can safely skip the _vu0FinishMicro() call, when there's nothing
+        // that can trigger a VU0 program between CFC2/CTC2/COP2 instructions.
+        if ((g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0_MICRO) || mBitSync)
         {
-            xSUB(eax, ptr32[&VU0.cycle]);
-            xSUB(eax, ptr32[&VU0.nextBlockCycles]);
-            xCMP(eax, 4);
-            xForwardJL32 skip;
-            xLoadFarAddr(arg1reg, CpuVU0);
-            xMOV(arg2reg, s_nBlockInterlocked);
-            xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
-            skip.SetTarget();
+            _freeX86reg(eax);
+            xMOV(eax, ptr32[&cpuRegs.cycle]);
+            xADD(eax, scaleblockcycles_clear());
+            xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 
-            xFastCall((void*)_vu0WaitMicro);
+            xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
+            xForwardJZ32 skipvuidle;
+            _cop2BackupRegs();
+            if (mBitSync)
+            {
+                xSUB(eax, ptr32[&VU0.cycle]);
+                xSUB(eax, ptr32[&VU0.nextBlockCycles]);
+                xCMP(eax, 4);
+                xForwardJL32 skip;
+                xLoadFarAddr(arg1reg, CpuVU0);
+                xMOV(arg2reg, s_nBlockInterlocked);
+                xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
+                skip.SetTarget();
+
+                xFastCall((void*)_vu0WaitMicro);
+            }
+            else
+                xFastCall((void*)_vu0FinishMicro);
+            _cop2RestoreRegs();
+            skipvuidle.SetTarget();
         }
-        else
-            xFastCall((void*)_vu0FinishMicro);
-        _cop2RestoreRegs();
-        skipvuidle.SetTarget();
     }
 }
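Restated in isolation, the new guard means the cycle update, VPU_STAT test, register backup/restore and finish/wait call are only emitted when the analysis pass flagged this instruction, or when the caller passed mBitSync (that path waits on the running micro and is never elided). A minimal hedged restatement with a few compile-time checks, not PCSX2 code:

```cpp
constexpr unsigned EEINST_COP2_FINISH_VU0_MICRO = 0x2000; // restated locally

// Mirror of the guard around the emitted interlock code.
constexpr bool EmitsVU0FinishCheck(unsigned inst_info, bool m_bit_sync)
{
    return (inst_info & EEINST_COP2_FINISH_VU0_MICRO) != 0 || m_bit_sync;
}

static_assert(EmitsVU0FinishCheck(EEINST_COP2_FINISH_VU0_MICRO, false), "first COP2 op in a chain keeps the check");
static_assert(EmitsVU0FinishCheck(0, true), "mBitSync ops always keep it");
static_assert(!EmitsVU0FinishCheck(0, false), "later ops in an unbroken chain elide it");
```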
@@ -665,12 +670,15 @@ namespace OpcodeImpl {
 void recCOP2_BC2() { recCOP2_BC2t[_Rt_](); }
 void recCOP2_SPEC1()
 {
-    xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
-    xForwardJZ32 skipvuidle;
-    _cop2BackupRegs();
-    xFastCall((void*)_vu0FinishMicro);
-    _cop2RestoreRegs();
-    skipvuidle.SetTarget();
+    if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0_MICRO)
+    {
+        xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
+        xForwardJZ32 skipvuidle;
+        _cop2BackupRegs();
+        xFastCall((void*)_vu0FinishMicro);
+        _cop2RestoreRegs();
+        skipvuidle.SetTarget();
+    }
 
     recCOP2SPECIAL1t[_Funct_]();
 
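recCOP2SPECIAL1t[_Funct_]() dispatches on the instruction's 6-bit funct field, which is why the analysis pass singles out functs 070/071 (0x38/0x39, VCALLMS/VCALLMSR). A self-contained sketch of that style of table; the real table and its handlers live in PCSX2, everything here is illustrative:

```cpp
#include <cstdio>

using RecFn = void (*)();

static void recUnhandled() { std::printf("unhandled funct\n"); }
static void recVCALLMS_stub() { std::printf("VCALLMS: kick off a VU0 micro\n"); }
static void recVCALLMSR_stub() { std::printf("VCALLMSR: kick off a VU0 micro at a register-specified address\n"); }

// 64-entry table indexed by the instruction's funct field (bits 5..0).
static RecFn dispatch[64] = {};

int main()
{
    for (RecFn& fn : dispatch)
        fn = recUnhandled;
    dispatch[070] = recVCALLMS_stub;  // 0x38
    dispatch[071] = recVCALLMSR_stub; // 0x39

    const unsigned code = 0x4A000038; // a made-up COP2 word with funct == 0x38
    dispatch[code & 0x3F]();          // equivalent of recCOP2SPECIAL1t[_Funct_]()
    return 0;
}
```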