iR5900: Elide VU0 micro finish calls when safe

This makes a difference in COP2-heavy games, where a chain of
instructions will repeatedly test the VU0 idle bit unnecessarily, as it
is impossible for a micro to be started in between the instructions of the chain.

Saves a bit of code size (for register backup/restore), as well as
getting rid of branches. Seems to make a 1-2% difference in performance
in Ratchet on a 3900X, but if we're lucky, more on slower chips.
This commit is contained in:
Connor McLaughlin 2022-06-26 15:02:03 +10:00 committed by refractionpcsx2
parent 99487d0e2b
commit cdd9b1fa3b
5 changed files with 87 additions and 31 deletions

View File

@ -220,6 +220,7 @@ int _signExtendXMMtoM(uptr to, x86SSERegType from, int candestroy); // returns t
// COP2 analysis flag bits OR'd into EEINST::info by the block-analysis passes.
#define EEINST_COP2_STATUS_FLAG 0x400
#define EEINST_COP2_MAC_FLAG 0x800
#define EEINST_COP2_CLIP_FLAG 0x1000
// Set by COP2MicroFinishPass on a COP2 instruction that must still finish a
// potentially-running VU0 micro program (call _vu0FinishMicro()) before it
// executes; later COP2 instructions in the same chain can elide the check.
#define EEINST_COP2_FINISH_VU0_MICRO 0x2000
struct EEINST
{

View File

@ -227,3 +227,39 @@ void COP2FlagHackPass::CommitAllFlags()
CommitMACFlag();
CommitClipFlag();
}
// Defaulted out of line; the pass carries no state of its own.
COP2MicroFinishPass::COP2MicroFinishPass() = default;
COP2MicroFinishPass::~COP2MicroFinishPass() = default;
void COP2MicroFinishPass::Run(u32 start, u32 end, EEINST* inst_cache)
{
	// Tracks whether the next COP2 instruction encountered still has to wait
	// for any in-flight VU0 micro program before executing.
	bool pending_finish = true;
	ForEachInstruction(start, end, inst_cache, [&pending_finish](u32 apc, EEINST* inst) {
		switch (_Opcode_)
		{
			// Catch SQ/SB/SH/SW/SD stores: they can potentially trigger a
			// DMA->VIF0->VU0 exec, so anything after them must test the VU0
			// idle bit again. This is very unlikely in a cop2 chain.
			case 037: // SQ
			case 050: // SB
			case 051: // SH
			case 053: // SW
			case 077: // SD
				pending_finish = true;
				return true;

			case 022: // COP2 — fall through to the flag handling below.
				break;

			// Everything else is irrelevant to VU0 micro state.
			default:
				return true;
		}

		// The first COP2 instruction after a potential micro start gets the
		// flag; the ones that follow can safely skip the finish call.
		if (pending_finish)
		{
			inst->info |= EEINST_COP2_FINISH_VU0_MICRO;
			pending_finish = false;
		}

		// Except for VCALLMS/VCALLMSR, which can start a new micro, so the
		// next COP2 instruction needs to finish it again.
		if (_Funct_ == 070 || _Funct_ == 071)
			pending_finish = true;

		return true;
	});
}

View File

@ -62,4 +62,13 @@ namespace R5900
u32 m_cfc2_pc = 0;
};
// Analysis pass that marks each COP2 instruction needing a _vu0FinishMicro()
// call with EEINST_COP2_FINISH_VU0_MICRO, so the recompiler can elide the
// VU0-idle test for the rest of a COP2 chain.
class COP2MicroFinishPass final : public AnalysisPass
{
public:
	COP2MicroFinishPass();
	~COP2MicroFinishPass();
	// Scans the block's instructions and sets the finish flag in the
	// per-instruction cache entries where required.
	void Run(u32 start, u32 end, EEINST* inst_cache) override;
};
} // namespace R5900

View File

@ -2190,10 +2190,12 @@ StartRecomp:
}
// eventually we'll want to have a vector of passes or something.
if (has_cop2_instructions && EmuConfig.Speedhacks.vuFlagHack)
if (has_cop2_instructions)
{
COP2FlagHackPass fhpass;
fhpass.Run(startpc, s_nEndBlock, s_pInstCache + 1);
COP2MicroFinishPass().Run(startpc, s_nEndBlock, s_pInstCache + 1);
if (EmuConfig.Speedhacks.vuFlagHack)
COP2FlagHackPass().Run(startpc, s_nEndBlock, s_pInstCache + 1);
}
// analyze instructions //

View File

@ -323,35 +323,40 @@ void recBC2TL() { _setupBranchTest(JZ32, true); }
void COP2_Interlock(bool mBitSync)
{
if (cpuRegs.code & 1)
{
s_nBlockInterlocked = true;
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
if (mBitSync)
// We can safely skip the _vu0FinishMicro() call, when there's nothing
// that can trigger a VU0 program between CFC2/CTC2/COP2 instructions.
if ((g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0_MICRO) || mBitSync)
{
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
skip.SetTarget();
_freeX86reg(eax);
xMOV(eax, ptr32[&cpuRegs.cycle]);
xADD(eax, scaleblockcycles_clear());
xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
xFastCall((void*)_vu0WaitMicro);
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
if (mBitSync)
{
xSUB(eax, ptr32[&VU0.cycle]);
xSUB(eax, ptr32[&VU0.nextBlockCycles]);
xCMP(eax, 4);
xForwardJL32 skip;
xLoadFarAddr(arg1reg, CpuVU0);
xMOV(arg2reg, s_nBlockInterlocked);
xFastCall((void*)BaseVUmicroCPU::ExecuteBlockJIT, arg1reg, arg2reg);
skip.SetTarget();
xFastCall((void*)_vu0WaitMicro);
}
else
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
}
else
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
}
}
@ -665,12 +670,15 @@ namespace OpcodeImpl {
// Dispatch to the BC2 branch recompiler selected by the instruction's rt field.
void recCOP2_BC2()
{
	recCOP2_BC2t[_Rt_]();
}
void recCOP2_SPEC1()
{
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
if (g_pCurInstInfo->info & EEINST_COP2_FINISH_VU0_MICRO)
{
xTEST(ptr32[&VU0.VI[REG_VPU_STAT].UL], 0x1);
xForwardJZ32 skipvuidle;
_cop2BackupRegs();
xFastCall((void*)_vu0FinishMicro);
_cop2RestoreRegs();
skipvuidle.SetTarget();
}
recCOP2SPECIAL1t[_Funct_]();