iR5900: Elide VU0 micro finish calls when safe

This makes a difference in COP2-heavy games, where a chain of
instructions will repeatedly test the VU0 idle bit unnecessarily, as it
is impossible for a micro to be started in between the instructions of the chain.

Saves a bit of code size (for register backup/restore), as well as
getting rid of branches. Seems to make a 1-2% difference in performance
in Ratchet on a 3900X, but if we're lucky, more on slower chips.
This commit is contained in:
Connor McLaughlin 2022-06-26 15:02:03 +10:00 committed by refractionpcsx2
parent 99487d0e2b
commit cdd9b1fa3b
5 changed files with 87 additions and 31 deletions

View File

@ -220,6 +220,7 @@ int _signExtendXMMtoM(uptr to, x86SSERegType from, int candestroy); // returns t
// COP2 analysis flag bits OR'd into EEINST::info by the block-analysis passes.
#define EEINST_COP2_STATUS_FLAG 0x400
#define EEINST_COP2_MAC_FLAG 0x800
#define EEINST_COP2_CLIP_FLAG 0x1000
// Set by COP2MicroFinishPass on a COP2 instruction that must still finish a
// potentially-running VU0 micro program (call _vu0FinishMicro()) before it
// executes; later COP2 instructions in the same chain can elide the check.
#define EEINST_COP2_FINISH_VU0_MICRO 0x2000
struct EEINST
{

View File

@ -227,3 +227,39 @@ void COP2FlagHackPass::CommitAllFlags()
CommitMACFlag();
CommitClipFlag();
}
// Defaulted out of line; the pass carries no state of its own.
COP2MicroFinishPass::COP2MicroFinishPass() = default;
COP2MicroFinishPass::~COP2MicroFinishPass() = default;
void COP2MicroFinishPass::Run(u32 start, u32 end, EEINST* inst_cache)
{
	// Tracks whether the next COP2 instruction encountered still has to wait
	// for any in-flight VU0 micro program before executing.
	bool pending_finish = true;
	ForEachInstruction(start, end, inst_cache, [&pending_finish](u32 apc, EEINST* inst) {
		switch (_Opcode_)
		{
			// Catch SQ/SB/SH/SW/SD stores: they can potentially trigger a
			// DMA->VIF0->VU0 exec, so anything after them must test the VU0
			// idle bit again. This is very unlikely in a cop2 chain.
			case 037: // SQ
			case 050: // SB
			case 051: // SH
			case 053: // SW
			case 077: // SD
				pending_finish = true;
				return true;

			case 022: // COP2 — fall through to the flag handling below.
				break;

			// Everything else is irrelevant to VU0 micro state.
			default:
				return true;
		}

		// The first COP2 instruction after a potential micro start gets the
		// flag; the ones that follow can safely skip the finish call.
		if (pending_finish)
		{
			inst->info |= EEINST_COP2_FINISH_VU0_MICRO;
			pending_finish = false;
		}

		// Except for VCALLMS/VCALLMSR, which can start a new micro, so the
		// next COP2 instruction needs to finish it again.
		if (_Funct_ == 070 || _Funct_ == 071)
			pending_finish = true;

		return true;
	});
}

View File

@ -62,4 +62,13 @@ namespace R5900
u32 m_cfc2_pc = 0;
};
// Analysis pass that marks each COP2 instruction needing a _vu0FinishMicro()
// call with EEINST_COP2_FINISH_VU0_MICRO, so the recompiler can elide the
// VU0-idle test for the rest of a COP2 chain.
class COP2MicroFinishPass final : public AnalysisPass
{
public:
	COP2MicroFinishPass();
	~COP2MicroFinishPass();
	// Scans the block's instructions and sets the finish flag in the
	// per-instruction cache entries where required.
	void Run(u32 start, u32 end, EEINST* inst_cache) override;
};
} // namespace R5900

View File

@ -2190,10 +2190,12 @@ StartRecomp:
}
// eventually we'll want to have a vector of passes or something.
if (has_cop2_instructions && EmuConfig.Speedhacks.vuFlagHack)
if (has_cop2_instructions)
{
COP2FlagHackPass fhpass;
fhpass.Run(startpc, s_nEndBlock, s_pInstCache + 1);
COP2MicroFinishPass().Run(startpc, s_nEndBlock, s_pInstCache + 1);
if (EmuConfig.Speedhacks.vuFlagHack)
COP2FlagHackPass().Run(startpc, s_nEndBlock, s_pInstCache + 1);
}
// analyze instructions //

View File

@ -323,35 +323,40 @@ void recBC2TL() { _setupBranchTest(JZ32, true); }
void COP2_Interlock(bool mBitSync)
{
if (cpuRegs.code & 1)
{
s_nBlockInterlocked = true;
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
if (mBitSync)
// We can safely skip the _vu0FinishMicro() call, when there's nothing
// that can trigger a VU0 program between CFC2/CTC2/COP2 instructions.
if ((g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0_MICRO) || mBitSync)
{
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
skip.SetTarget();
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
xFastCall((void*)_vu0WaitMicro);
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
if (mBitSync)
{
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
skip.SetTarget();
xFastCall((void*)_vu0WaitMicro);
}
else
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
}
else
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
}
}
@ -665,12 +670,15 @@ namespace OpcodeImpl {
// Dispatch to the BC2 branch recompiler selected by the instruction's rt field.
void recCOP2_BC2()
{
	recCOP2_BC2t[_Rt_]();
}
void recCOP2_SPEC1()
{
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0_MICRO)
{
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
}
recCOP2SPECIAL1t[_Funct_]();