From a3eb1655c660b36a7ce328813789cc6687040b35 Mon Sep 17 00:00:00 2001
From: refractionpcsx2 <refraction@gmail.com>
Date: Wed, 8 Nov 2023 22:34:15 +0000
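Subject: [PATCH] COP0: Correctly update performance counter cycles

Bring the performance counters up to date with COP0_UpdatePCCR() before
a write to the COP0 Status register takes effect, and refresh both
lastPERFCycle entries at the end of COP0_UpdatePCCR() rather than only
inside the branch of a counter that was enabled and counting.

In the recompiler, add scaleblockcycles_clear() to cpuRegs.cycle and
store the result back before MFC0 of Count/PCR0/PCR1 and before MTC0 of
Count/Status/PCCR/PCR0/PCR1 use the cycle count.

The remaining COP0.cpp changes are reformatting.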
---
 pcsx2/COP0.cpp      | 614 ++++++++++++++++++++++++--------------------
 pcsx2/x86/iCOP0.cpp |  46 +++-
 2 files changed, 369 insertions(+), 291 deletions(-)

diff --git a/pcsx2/COP0.cpp b/pcsx2/COP0.cpp
index 1b7a8492f8..d9407cc89b 100644
--- a/pcsx2/COP0.cpp
+++ b/pcsx2/COP0.cpp
@@ -21,7 +21,8 @@
 // Updates the CPU's mode of operation (either, Kernel, Supervisor, or User modes).
 // Currently the different modes are not implemented.
 // Given this function is called so much, it's commented out for now. (rama)
-__ri void cpuUpdateOperationMode() {
+__ri void cpuUpdateOperationMode()
+{
 
 	//u32 value = cpuRegs.CP0.n.Status.val;
 
@@ -33,15 +34,15 @@ __ri void cpuUpdateOperationMode() {
 	//}
 }
 
-void WriteCP0Status(u32 value) {
-
-	//DMA_LOG("COP0 Status write = 0x%08x", value);
-
+void WriteCP0Status(u32 value)
+{
+	COP0_UpdatePCCR();
 	cpuRegs.CP0.n.Status.val = value;
-    cpuSetNextEventDelta(4);
+	cpuSetNextEventDelta(4);
 }
 
-void WriteCP0Config(u32 value) {
+void WriteCP0Config(u32 value)
+{
 	// Protect the read-only ICacheSize (IC) and DataCacheSize (DC) bits
 	cpuRegs.CP0.n.Config = value & ~0xFC0;
 	cpuRegs.CP0.n.Config |= 0x440;
@@ -67,39 +68,39 @@ void WriteCP0Config(u32 value) {
 // count.  But only mode 1 (instruction counter) has been found to be used by games thus far.
 //
 
-static __fi bool PERF_ShouldCountEvent( uint evt )
+static __fi bool PERF_ShouldCountEvent(uint evt)
 {
-	switch( evt )
+	switch (evt)
 	{
-		// This is a rough table of actions for various PCR modes.  Some of these
-		// can be implemented more accurately later.  Others (WBBs in particular)
-		// probably cannot without some severe complications.
+			// This is a rough table of actions for various PCR modes.  Some of these
+			// can be implemented more accurately later.  Others (WBBs in particular)
+			// probably cannot without some severe complications.
 
-		// left sides are PCR0 / right sides are PCR1
+			// left sides are PCR0 / right sides are PCR1
 
-		case 1:		// cpu cycle counter.
-		case 2:		// single/dual instruction issued
-		case 3:		// Branch issued / Branch mispredicated
+		case 1: // cpu cycle counter.
+		case 2: // single/dual instruction issued
+		case 3: // Branch issued / Branch mispredicted
 			return true;
 
-		case 4:		// BTAC/TLB miss
-		case 5:		// ITLB/DTLB miss
-		case 6:		// Data/Instruction cache miss
+		case 4: // BTAC/TLB miss
+		case 5: // ITLB/DTLB miss
+		case 6: // Data/Instruction cache miss
 			return false;
 
-		case 7:		// Access to DTLB / WBB single request fail
-		case 8:		// Non-blocking load / WBB burst request fail
+		case 7: // Access to DTLB / WBB single request fail
+		case 8: // Non-blocking load / WBB burst request fail
 		case 9:
 		case 10:
 			return false;
 
-		case 11:	// CPU address bus busy / CPU data bus busy
+		case 11: // CPU address bus busy / CPU data bus busy
 			return false;
 
-		case 12:	// Instruction completed
-		case 13:	// non-delayslot instruction completed
-		case 14:	// COP2/COP1 instruction complete
-		case 15:	// Load/Store completed
+		case 12: // Instruction completed
+		case 13: // non-delayslot instruction completed
+		case 14: // COP2/COP1 instruction complete
+		case 15: // Load/Store completed
 			return true;
 	}
 
@@ -111,11 +112,11 @@ static __fi bool PERF_ShouldCountEvent( uint evt )
 // might save some debugging effort. :)
 void COP0_DiagnosticPCCR()
 {
-	if( cpuRegs.PERF.n.pccr.b.Event0 >= 7 && cpuRegs.PERF.n.pccr.b.Event0 <= 10 )
-		Console.Warning( "PERF/PCR0 Unsupported Update Event Mode = 0x%x", cpuRegs.PERF.n.pccr.b.Event0 );
+	if (cpuRegs.PERF.n.pccr.b.Event0 >= 7 && cpuRegs.PERF.n.pccr.b.Event0 <= 10)
+		Console.Warning("PERF/PCR0 Unsupported Update Event Mode = 0x%x", cpuRegs.PERF.n.pccr.b.Event0);
 
-	if( cpuRegs.PERF.n.pccr.b.Event1 >= 7 && cpuRegs.PERF.n.pccr.b.Event1 <= 10 )
-		Console.Warning( "PERF/PCR1 Unsupported Update Event Mode = 0x%x", cpuRegs.PERF.n.pccr.b.Event1 );
+	if (cpuRegs.PERF.n.pccr.b.Event1 >= 7 && cpuRegs.PERF.n.pccr.b.Event1 <= 10)
+		Console.Warning("PERF/PCR1 Unsupported Update Event Mode = 0x%x", cpuRegs.PERF.n.pccr.b.Event1);
 }
 extern int branch;
 __fi void COP0_UpdatePCCR()
@@ -131,32 +132,32 @@ __fi void COP0_UpdatePCCR()
 
 	// Implemented memory mode check (kernel/super/user)
 
-	if( cpuRegs.PERF.n.pccr.val & ((1 << (cpuRegs.CP0.n.Status.b.KSU + 2)) | (cpuRegs.CP0.n.Status.b.EXL << 1)))
+	if (cpuRegs.PERF.n.pccr.val & ((1 << (cpuRegs.CP0.n.Status.b.KSU + 2)) | (cpuRegs.CP0.n.Status.b.EXL << 1)))
 	{
 		// ----------------------------------
 		//    Update Performance Counter 0
 		// ----------------------------------
 
-		if( PERF_ShouldCountEvent( cpuRegs.PERF.n.pccr.b.Event0 ) )
+		if (PERF_ShouldCountEvent(cpuRegs.PERF.n.pccr.b.Event0))
 		{
 			u32 incr = cpuRegs.cycle - cpuRegs.lastPERFCycle[0];
-			if( incr == 0 ) incr++;
+			if (incr == 0)
+				incr++;
 
 			// use prev/XOR method for one-time exceptions (but likely less correct)
 			//u32 prev = cpuRegs.PERF.n.pcr0;
 			cpuRegs.PERF.n.pcr0 += incr;
-			cpuRegs.lastPERFCycle[0] = cpuRegs.cycle;
-
+			//DevCon.Warning("PCR VAL %x", cpuRegs.PERF.n.pccr.val);
 			//prev ^= (1UL<<31);		// XOR is fun!
 			//if( (prev & cpuRegs.PERF.n.pcr0) & (1UL<<31) )
-			if((cpuRegs.PERF.n.pcr0 & 0x80000000))
+			if ((cpuRegs.PERF.n.pcr0 & 0x80000000))
 			{
 				// TODO: Vector to the appropriate exception here.
 				// This code *should* be correct, but is untested (and other parts of the emu are
 				// not prepared to handle proper Level 2 exception vectors yet)
 
 				//branch == 1 is probably not the best way to check for the delay slot, but it beats nothing! (Refraction)
-			/*	if( branch == 1 )
+				/*	if( branch == 1 )
 				{
 					cpuRegs.CP0.n.ErrorEPC = cpuRegs.pc - 4;
 					cpuRegs.CP0.n.Cause |= 0x40000000;
@@ -182,21 +183,21 @@ __fi void COP0_UpdatePCCR()
 		}
 	}
 
-	if( cpuRegs.PERF.n.pccr.val & ((1 << (cpuRegs.CP0.n.Status.b.KSU + 12)) | (cpuRegs.CP0.n.Status.b.EXL << 11)))
+	if (cpuRegs.PERF.n.pccr.val & ((1 << (cpuRegs.CP0.n.Status.b.KSU + 12)) | (cpuRegs.CP0.n.Status.b.EXL << 11)))
 	{
 		// ----------------------------------
 		//    Update Performance Counter 1
 		// ----------------------------------
 
-		if( PERF_ShouldCountEvent( cpuRegs.PERF.n.pccr.b.Event1 ) )
+		if (PERF_ShouldCountEvent(cpuRegs.PERF.n.pccr.b.Event1))
 		{
 			u32 incr = cpuRegs.cycle - cpuRegs.lastPERFCycle[1];
-			if( incr == 0 ) incr++;
+			if (incr == 0)
+				incr++;
 
 			cpuRegs.PERF.n.pcr1 += incr;
-			cpuRegs.lastPERFCycle[1] = cpuRegs.cycle;
 
-			if( (cpuRegs.PERF.n.pcr1 & 0x80000000))
+			if ((cpuRegs.PERF.n.pcr1 & 0x80000000))
 			{
 				// TODO: Vector to the appropriate exception here.
 				// This code *should* be correct, but is untested (and other parts of the emu are
@@ -229,6 +230,8 @@ __fi void COP0_UpdatePCCR()
 			}
 		}
 	}
+	cpuRegs.lastPERFCycle[0] = cpuRegs.cycle;
+	cpuRegs.lastPERFCycle[1] = cpuRegs.cycle;
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
@@ -249,27 +252,34 @@ void MapTLB(const tlbs& t, int i)
 		vtlb_VMapBuffer(t.VPN2, eeMem->Scratch, Ps2MemSize::Scratch);
 	}
 
-	if (t.VPN2 == 0x70000000) return; //uh uhh right ...
-	if (t.EntryLo0 & 0x2) {
-		mask  = ((~t.Mask) << 1) & 0xfffff;
+	if (t.VPN2 == 0x70000000)
+		return; //uh uhh right ...
+	if (t.EntryLo0 & 0x2)
+	{
+		mask = ((~t.Mask) << 1) & 0xfffff;
 		saddr = t.VPN2 >> 12;
 		eaddr = saddr + t.Mask + 1;
 
-		for (addr=saddr; addr<eaddr; addr++) {
-			if ((addr & mask) == ((t.VPN2 >> 12) & mask)) { //match
+		for (addr = saddr; addr < eaddr; addr++)
+		{
+			if ((addr & mask) == ((t.VPN2 >> 12) & mask))
+			{ //match
 				memSetPageAddr(addr << 12, t.PFN0 + ((addr - saddr) << 12));
 				Cpu->Clear(addr << 12, 0x400);
 			}
 		}
 	}
 
-	if (t.EntryLo1 & 0x2) {
-		mask  = ((~t.Mask) << 1) & 0xfffff;
+	if (t.EntryLo1 & 0x2)
+	{
+		mask = ((~t.Mask) << 1) & 0xfffff;
 		saddr = (t.VPN2 >> 12) + t.Mask + 1;
 		eaddr = saddr + t.Mask + 1;
 
-		for (addr=saddr; addr<eaddr; addr++) {
-			if ((addr & mask) == ((t.VPN2 >> 12) & mask)) { //match
+		for (addr = saddr; addr < eaddr; addr++)
+		{
+			if ((addr & mask) == ((t.VPN2 >> 12) & mask))
+			{ //match
 				memSetPageAddr(addr << 12, t.PFN1 + ((addr - saddr) << 12));
 				Cpu->Clear(addr << 12, 0x400);
 			}
@@ -285,31 +295,36 @@ void UnmapTLB(const tlbs& t, int i)
 
 	if (t.S)
 	{
-		vtlb_VMapUnmap(t.VPN2,0x4000);
+		vtlb_VMapUnmap(t.VPN2, 0x4000);
 		return;
 	}
 
 	if (t.EntryLo0 & 0x2)
 	{
-		mask  = ((~t.Mask) << 1) & 0xfffff;
+		mask = ((~t.Mask) << 1) & 0xfffff;
 		saddr = t.VPN2 >> 12;
 		eaddr = saddr + t.Mask + 1;
-	//	Console.WriteLn("Clear TLB: %08x ~ %08x",saddr,eaddr-1);
-		for (addr=saddr; addr<eaddr; addr++) {
-			if ((addr & mask) == ((t.VPN2 >> 12) & mask)) { //match
+		//	Console.WriteLn("Clear TLB: %08x ~ %08x",saddr,eaddr-1);
+		for (addr = saddr; addr < eaddr; addr++)
+		{
+			if ((addr & mask) == ((t.VPN2 >> 12) & mask))
+			{ //match
 				memClearPageAddr(addr << 12);
 				Cpu->Clear(addr << 12, 0x400);
 			}
 		}
 	}
 
-	if (t.EntryLo1 & 0x2) {
-		mask  = ((~t.Mask) << 1) & 0xfffff;
+	if (t.EntryLo1 & 0x2)
+	{
+		mask = ((~t.Mask) << 1) & 0xfffff;
 		saddr = (t.VPN2 >> 12) + t.Mask + 1;
 		eaddr = saddr + t.Mask + 1;
-	//	Console.WriteLn("Clear TLB: %08x ~ %08x",saddr,eaddr-1);
-		for (addr=saddr; addr<eaddr; addr++) {
-			if ((addr & mask) == ((t.VPN2 >> 12) & mask)) { //match
+		//	Console.WriteLn("Clear TLB: %08x ~ %08x",saddr,eaddr-1);
+		for (addr = saddr; addr < eaddr; addr++)
+		{
+			if ((addr & mask) == ((t.VPN2 >> 12) & mask))
+			{ //match
 				memClearPageAddr(addr << 12);
 				Cpu->Clear(addr << 12, 0x400);
 			}
@@ -331,7 +346,7 @@ void WriteTLB(int i)
 	tlb[i].G = cpuRegs.CP0.n.EntryLo0 & cpuRegs.CP0.n.EntryLo1 & 0x1;
 	tlb[i].PFN0 = (((cpuRegs.CP0.n.EntryLo0 >> 6) & 0xFFFFF) & (~tlb[i].Mask)) << 12;
 	tlb[i].PFN1 = (((cpuRegs.CP0.n.EntryLo1 >> 6) & 0xFFFFF) & (~tlb[i].Mask)) << 12;
-	tlb[i].S = cpuRegs.CP0.n.EntryLo0&0x80000000;
+	tlb[i].S = cpuRegs.CP0.n.EntryLo0 & 0x80000000;
 
 	MapTLB(tlb[i], i);
 }
@@ -341,259 +356,290 @@ namespace Interpreter {
 namespace OpcodeImpl {
 namespace COP0 {
 
-void TLBR() {
-	COP0_LOG("COP0_TLBR %d:%x,%x,%x,%x",
-			cpuRegs.CP0.n.Index,   cpuRegs.CP0.n.PageMask, cpuRegs.CP0.n.EntryHi,
-			cpuRegs.CP0.n.EntryLo0, cpuRegs.CP0.n.EntryLo1);
-
-	int i = cpuRegs.CP0.n.Index & 0x3f;
-
-	cpuRegs.CP0.n.PageMask = tlb[i].PageMask;
-	cpuRegs.CP0.n.EntryHi = tlb[i].EntryHi&~(tlb[i].PageMask|0x1f00);
-	cpuRegs.CP0.n.EntryLo0 = (tlb[i].EntryLo0&~1)|((tlb[i].EntryHi>>12)&1);
-	cpuRegs.CP0.n.EntryLo1 =(tlb[i].EntryLo1&~1)|((tlb[i].EntryHi>>12)&1);
-}
-
-void TLBWI() {
-	int j = cpuRegs.CP0.n.Index & 0x3f;
-
-	//if (j > 48) return;
-
-	COP0_LOG("COP0_TLBWI %d:%x,%x,%x,%x",
-			cpuRegs.CP0.n.Index,    cpuRegs.CP0.n.PageMask, cpuRegs.CP0.n.EntryHi,
-			cpuRegs.CP0.n.EntryLo0, cpuRegs.CP0.n.EntryLo1);
-
-	UnmapTLB(tlb[j], j);
-	tlb[j].PageMask = cpuRegs.CP0.n.PageMask;
-	tlb[j].EntryHi = cpuRegs.CP0.n.EntryHi;
-	tlb[j].EntryLo0 = cpuRegs.CP0.n.EntryLo0;
-	tlb[j].EntryLo1 = cpuRegs.CP0.n.EntryLo1;
-	WriteTLB(j);
-}
-
-void TLBWR() {
-	int j = cpuRegs.CP0.n.Random & 0x3f;
-
-	//if (j > 48) return;
-
-DevCon.Warning("COP0_TLBWR %d:%x,%x,%x,%x\n",
-			cpuRegs.CP0.n.Random,   cpuRegs.CP0.n.PageMask, cpuRegs.CP0.n.EntryHi,
-			cpuRegs.CP0.n.EntryLo0, cpuRegs.CP0.n.EntryLo1);
-
-	//if (j > 48) return;
-
-	UnmapTLB(tlb[j], j);
-	tlb[j].PageMask = cpuRegs.CP0.n.PageMask;
-	tlb[j].EntryHi = cpuRegs.CP0.n.EntryHi;
-	tlb[j].EntryLo0 = cpuRegs.CP0.n.EntryLo0;
-	tlb[j].EntryLo1 = cpuRegs.CP0.n.EntryLo1;
-	WriteTLB(j);
-}
-
-void TLBP() {
-	int i;
-
-	union {
-		struct {
-			u32 VPN2:19;
-			u32 VPN2X:2;
-			u32 G:3;
-			u32 ASID:8;
-		} s;
-		u32 u;
-	} EntryHi32;
-
-	EntryHi32.u = cpuRegs.CP0.n.EntryHi;
-
-	cpuRegs.CP0.n.Index=0xFFFFFFFF;
-	for(i=0;i<48;i++){
-		if (tlb[i].VPN2 == ((~tlb[i].Mask) & (EntryHi32.s.VPN2))
-		&& ((tlb[i].G&1) || ((tlb[i].ASID & 0xff) == EntryHi32.s.ASID))) {
-			cpuRegs.CP0.n.Index = i;
-			break;
-		}
-	}
-	 if(cpuRegs.CP0.n.Index == 0xFFFFFFFF) cpuRegs.CP0.n.Index = 0x80000000;
-}
-
-void MFC0()
-{
-	// Note on _Rd_ Condition 9: CP0.Count should be updated even if _Rt_ is 0.
-	if ((_Rd_ != 9) && !_Rt_ ) return;
-
-	//if(bExecBIOS == FALSE && _Rd_ == 25) Console.WriteLn("MFC0 _Rd_ %x = %x", _Rd_, cpuRegs.CP0.r[_Rd_]);
-	switch (_Rd_)
+	void TLBR()
 	{
-		case 12:
-			cpuRegs.GPR.r[_Rt_].SD[0] = (s32)(cpuRegs.CP0.r[_Rd_] & 0xf0c79c1f);
-		break;
+		COP0_LOG("COP0_TLBR %d:%x,%x,%x,%x",
+			cpuRegs.CP0.n.Index, cpuRegs.CP0.n.PageMask, cpuRegs.CP0.n.EntryHi,
+			cpuRegs.CP0.n.EntryLo0, cpuRegs.CP0.n.EntryLo1);
 
-		case 25:
-			if (0 == (_Imm_ & 1)) // MFPS, register value ignored
-			{
-				cpuRegs.GPR.r[_Rt_].SD[0] = (s32)cpuRegs.PERF.n.pccr.val;
-			}
-			else if (0 == (_Imm_ & 2)) // MFPC 0, only LSB of register matters
-			{
-				COP0_UpdatePCCR();
-				cpuRegs.GPR.r[_Rt_].SD[0] = (s32)cpuRegs.PERF.n.pcr0;
-			}
-			else // MFPC 1
-			{
-				COP0_UpdatePCCR();
-				cpuRegs.GPR.r[_Rt_].SD[0] = (s32)cpuRegs.PERF.n.pcr1;
-			}
-		    /*Console.WriteLn("MFC0 PCCR = %x PCR0 = %x PCR1 = %x IMM= %x",  params
-		    cpuRegs.PERF.n.pccr, cpuRegs.PERF.n.pcr0, cpuRegs.PERF.n.pcr1, _Imm_ & 0x3F);*/
-		break;
+		int i = cpuRegs.CP0.n.Index & 0x3f;
 
-		case 24:
-			COP0_LOG("MFC0 Breakpoint debug Registers code = %x", cpuRegs.code & 0x3FF);
-		break;
+		cpuRegs.CP0.n.PageMask = tlb[i].PageMask;
+		cpuRegs.CP0.n.EntryHi = tlb[i].EntryHi & ~(tlb[i].PageMask | 0x1f00);
+		cpuRegs.CP0.n.EntryLo0 = (tlb[i].EntryLo0 & ~1) | ((tlb[i].EntryHi >> 12) & 1);
+		cpuRegs.CP0.n.EntryLo1 = (tlb[i].EntryLo1 & ~1) | ((tlb[i].EntryHi >> 12) & 1);
+	}
 
-		case 9:
+	void TLBWI()
+	{
+		int j = cpuRegs.CP0.n.Index & 0x3f;
+
+		//if (j > 48) return;
+
+		COP0_LOG("COP0_TLBWI %d:%x,%x,%x,%x",
+			cpuRegs.CP0.n.Index, cpuRegs.CP0.n.PageMask, cpuRegs.CP0.n.EntryHi,
+			cpuRegs.CP0.n.EntryLo0, cpuRegs.CP0.n.EntryLo1);
+
+		UnmapTLB(tlb[j], j);
+		tlb[j].PageMask = cpuRegs.CP0.n.PageMask;
+		tlb[j].EntryHi = cpuRegs.CP0.n.EntryHi;
+		tlb[j].EntryLo0 = cpuRegs.CP0.n.EntryLo0;
+		tlb[j].EntryLo1 = cpuRegs.CP0.n.EntryLo1;
+		WriteTLB(j);
+	}
+
+	void TLBWR()
+	{
+		int j = cpuRegs.CP0.n.Random & 0x3f;
+
+		//if (j > 48) return;
+
+		DevCon.Warning("COP0_TLBWR %d:%x,%x,%x,%x\n",
+			cpuRegs.CP0.n.Random, cpuRegs.CP0.n.PageMask, cpuRegs.CP0.n.EntryHi,
+			cpuRegs.CP0.n.EntryLo0, cpuRegs.CP0.n.EntryLo1);
+
+		//if (j > 48) return;
+
+		UnmapTLB(tlb[j], j);
+		tlb[j].PageMask = cpuRegs.CP0.n.PageMask;
+		tlb[j].EntryHi = cpuRegs.CP0.n.EntryHi;
+		tlb[j].EntryLo0 = cpuRegs.CP0.n.EntryLo0;
+		tlb[j].EntryLo1 = cpuRegs.CP0.n.EntryLo1;
+		WriteTLB(j);
+	}
+
+	void TLBP()
+	{
+		int i;
+
+		union
 		{
-			u32 incr = cpuRegs.cycle - cpuRegs.lastCOP0Cycle;
-			if( incr == 0 ) incr++;
-			cpuRegs.CP0.n.Count += incr;
-			cpuRegs.lastCOP0Cycle = cpuRegs.cycle;
-			if( !_Rt_ ) break;
+			struct
+			{
+				u32 VPN2 : 19;
+				u32 VPN2X : 2;
+				u32 G : 3;
+				u32 ASID : 8;
+			} s;
+			u32 u;
+		} EntryHi32;
+
+		EntryHi32.u = cpuRegs.CP0.n.EntryHi;
+
+		cpuRegs.CP0.n.Index = 0xFFFFFFFF;
+		for (i = 0; i < 48; i++)
+		{
+			if (tlb[i].VPN2 == ((~tlb[i].Mask) & (EntryHi32.s.VPN2)) && ((tlb[i].G & 1) || ((tlb[i].ASID & 0xff) == EntryHi32.s.ASID)))
+			{
+				cpuRegs.CP0.n.Index = i;
+				break;
+			}
 		}
-			[[fallthrough]];
-
-		default:
-			cpuRegs.GPR.r[_Rt_].SD[0] = (s32)cpuRegs.CP0.r[_Rd_];
+		if (cpuRegs.CP0.n.Index == 0xFFFFFFFF)
+			cpuRegs.CP0.n.Index = 0x80000000;
 	}
-}
 
-void MTC0()
-{
-	//if(bExecBIOS == FALSE && _Rd_ == 25) Console.WriteLn("MTC0 _Rd_ %x = %x", _Rd_, cpuRegs.CP0.r[_Rd_]);
-	switch (_Rd_)
+	void MFC0()
 	{
-		case 9:
-			cpuRegs.lastCOP0Cycle = cpuRegs.cycle;
-			cpuRegs.CP0.r[9] = cpuRegs.GPR.r[_Rt_].UL[0];
-		break;
+		// Note on _Rd_ Condition 9: CP0.Count should be updated even if _Rt_ is 0.
+		if ((_Rd_ != 9) && !_Rt_)
+			return;
 
-		case 12:
-			WriteCP0Status(cpuRegs.GPR.r[_Rt_].UL[0]);
-		break;
+		//if(bExecBIOS == FALSE && _Rd_ == 25) Console.WriteLn("MFC0 _Rd_ %x = %x", _Rd_, cpuRegs.CP0.r[_Rd_]);
+		switch (_Rd_)
+		{
+			case 12:
+				cpuRegs.GPR.r[_Rt_].SD[0] = (s32)(cpuRegs.CP0.r[_Rd_] & 0xf0c79c1f);
+				break;
 
-		case 16:
-			WriteCP0Config(cpuRegs.GPR.r[_Rt_].UL[0]);
-		break;
+			case 25:
+				if (0 == (_Imm_ & 1)) // MFPS, register value ignored
+				{
+					cpuRegs.GPR.r[_Rt_].SD[0] = (s32)cpuRegs.PERF.n.pccr.val;
+				}
+				else if (0 == (_Imm_ & 2)) // MFPC 0, only LSB of register matters
+				{
+					COP0_UpdatePCCR();
+					cpuRegs.GPR.r[_Rt_].SD[0] = (s32)cpuRegs.PERF.n.pcr0;
+				}
+				else // MFPC 1
+				{
+					COP0_UpdatePCCR();
+					cpuRegs.GPR.r[_Rt_].SD[0] = (s32)cpuRegs.PERF.n.pcr1;
+				}
+				/*Console.WriteLn("MFC0 PCCR = %x PCR0 = %x PCR1 = %x IMM= %x",  params
+cpuRegs.PERF.n.pccr, cpuRegs.PERF.n.pcr0, cpuRegs.PERF.n.pcr1, _Imm_ & 0x3F);*/
+				break;
 
-		case 24:
-			COP0_LOG("MTC0 Breakpoint debug Registers code = %x", cpuRegs.code & 0x3FF);
-		break;
+			case 24:
+				COP0_LOG("MFC0 Breakpoint debug Registers code = %x", cpuRegs.code & 0x3FF);
+				break;
 
-		case 25:
-			/*if(bExecBIOS == FALSE && _Rd_ == 25) Console.WriteLn("MTC0 PCCR = %x PCR0 = %x PCR1 = %x IMM= %x", params
-				cpuRegs.PERF.n.pccr, cpuRegs.PERF.n.pcr0, cpuRegs.PERF.n.pcr1, _Imm_ & 0x3F);*/
-			if (0 == (_Imm_ & 1)) // MTPS
+			case 9:
 			{
-				if (0 != (_Imm_ & 0x3E)) // only effective when the register is 0
+				u32 incr = cpuRegs.cycle - cpuRegs.lastCOP0Cycle;
+				if (incr == 0)
+					incr++;
+				cpuRegs.CP0.n.Count += incr;
+				cpuRegs.lastCOP0Cycle = cpuRegs.cycle;
+				if (!_Rt_)
 					break;
-				// Updates PCRs and sets the PCCR.
-				COP0_UpdatePCCR();
-				cpuRegs.PERF.n.pccr.val = cpuRegs.GPR.r[_Rt_].UL[0];
-				COP0_DiagnosticPCCR();
 			}
-			else if (0 == (_Imm_ & 2)) // MTPC 0, only LSB of register matters
-			{
-				cpuRegs.PERF.n.pcr0 = cpuRegs.GPR.r[_Rt_].UL[0];
-				cpuRegs.lastPERFCycle[0] = cpuRegs.cycle;
-			}
-			else // MTPC 1
-			{
-				cpuRegs.PERF.n.pcr1 = cpuRegs.GPR.r[_Rt_].UL[0];
-				cpuRegs.lastPERFCycle[1] = cpuRegs.cycle;
-			}
-		break;
+				[[fallthrough]];
 
-		default:
-			cpuRegs.CP0.r[_Rd_] = cpuRegs.GPR.r[_Rt_].UL[0];
-		break;
+			default:
+				cpuRegs.GPR.r[_Rt_].SD[0] = (s32)cpuRegs.CP0.r[_Rd_];
+		}
 	}
-}
 
-int CPCOND0() {
-	return (((dmacRegs.stat.CIS | ~dmacRegs.pcr.CPC) & 0x3FF) == 0x3ff);
-}
+	void MTC0()
+	{
+		//if(bExecBIOS == FALSE && _Rd_ == 25) Console.WriteLn("MTC0 _Rd_ %x = %x", _Rd_, cpuRegs.CP0.r[_Rd_]);
+		switch (_Rd_)
+		{
+			case 9:
+				cpuRegs.lastCOP0Cycle = cpuRegs.cycle;
+				cpuRegs.CP0.r[9] = cpuRegs.GPR.r[_Rt_].UL[0];
+				break;
 
-//#define CPCOND0	1
+			case 12:
+				WriteCP0Status(cpuRegs.GPR.r[_Rt_].UL[0]);
+				break;
 
-void BC0F() {
-	if (CPCOND0() == 0) intDoBranch(_BranchTarget_);
-}
+			case 16:
+				WriteCP0Config(cpuRegs.GPR.r[_Rt_].UL[0]);
+				break;
 
-void BC0T() {
-	if (CPCOND0() == 1) intDoBranch(_BranchTarget_);
-}
+			case 24:
+				COP0_LOG("MTC0 Breakpoint debug Registers code = %x", cpuRegs.code & 0x3FF);
+				break;
 
-void BC0FL() {
-	if (CPCOND0() == 0)
-		intDoBranch(_BranchTarget_);
-	else
-		cpuRegs.pc+= 4;
+			case 25:
+				/*if(bExecBIOS == FALSE && _Rd_ == 25) Console.WriteLn("MTC0 PCCR = %x PCR0 = %x PCR1 = %x IMM= %x", params
+	cpuRegs.PERF.n.pccr, cpuRegs.PERF.n.pcr0, cpuRegs.PERF.n.pcr1, _Imm_ & 0x3F);*/
+				if (0 == (_Imm_ & 1)) // MTPS
+				{
+					if (0 != (_Imm_ & 0x3E)) // only effective when the register is 0
+						break;
+					// Updates PCRs and sets the PCCR.
+					COP0_UpdatePCCR();
+					cpuRegs.PERF.n.pccr.val = cpuRegs.GPR.r[_Rt_].UL[0];
+					COP0_DiagnosticPCCR();
+				}
+				else if (0 == (_Imm_ & 2)) // MTPC 0, only LSB of register matters
+				{
+					cpuRegs.PERF.n.pcr0 = cpuRegs.GPR.r[_Rt_].UL[0];
+					cpuRegs.lastPERFCycle[0] = cpuRegs.cycle;
+				}
+				else // MTPC 1
+				{
+					cpuRegs.PERF.n.pcr1 = cpuRegs.GPR.r[_Rt_].UL[0];
+					cpuRegs.lastPERFCycle[1] = cpuRegs.cycle;
+				}
+				break;
 
-}
+			default:
+				cpuRegs.CP0.r[_Rd_] = cpuRegs.GPR.r[_Rt_].UL[0];
+				break;
+		}
+	}
 
-void BC0TL() {
-	if (CPCOND0() == 1)
-		intDoBranch(_BranchTarget_);
-	else
-		cpuRegs.pc+= 4;
-}
+	int CPCOND0()
+	{
+		return (((dmacRegs.stat.CIS | ~dmacRegs.pcr.CPC) & 0x3FF) == 0x3ff);
+	}
 
-void ERET() {
+	//#define CPCOND0	1
+
+	void BC0F()
+	{
+		if (CPCOND0() == 0)
+			intDoBranch(_BranchTarget_);
+	}
+
+	void BC0T()
+	{
+		if (CPCOND0() == 1)
+			intDoBranch(_BranchTarget_);
+	}
+
+	void BC0FL()
+	{
+		if (CPCOND0() == 0)
+			intDoBranch(_BranchTarget_);
+		else
+			cpuRegs.pc += 4;
+	}
+
+	void BC0TL()
+	{
+		if (CPCOND0() == 1)
+			intDoBranch(_BranchTarget_);
+		else
+			cpuRegs.pc += 4;
+	}
+
+	void ERET()
+	{
 #ifdef ENABLE_VTUNE
-	// Allow to stop vtune in a predictable way to compare runs
-	// Of course, the limit will depend on the game.
-	const u32 million = 1000 * 1000;
-	static u32 vtune = 0;
-	vtune++;
+		// Allows stopping a VTune run at a predictable point so that runs can be compared.
+		// Of course, the right limit will depend on the game.
+		const u32 million = 1000 * 1000;
+		static u32 vtune = 0;
+		vtune++;
 
-	// quick_exit vs exit: quick_exit won't call static storage destructor (OS will manage). It helps
-	// avoiding the race condition between threads destruction.
-	if (vtune > 30 * million) {
-		Console.WriteLn("VTUNE: quick_exit");
-		std::quick_exit(EXIT_SUCCESS);
-	} else if (!(vtune % million)) {
-		Console.WriteLn("VTUNE: ERET was called %uM times", vtune/million);
-	}
+		// quick_exit vs exit: quick_exit won't call static storage destructors (the OS will clean up). This helps
+		// avoid the race condition during thread destruction.
+		if (vtune > 30 * million)
+		{
+			Console.WriteLn("VTUNE: quick_exit");
+			std::quick_exit(EXIT_SUCCESS);
+		}
+		else if (!(vtune % million))
+		{
+			Console.WriteLn("VTUNE: ERET was called %uM times", vtune / million);
+		}
 
 #endif
 
-	if (cpuRegs.CP0.n.Status.b.ERL) {
-		cpuRegs.pc = cpuRegs.CP0.n.ErrorEPC;
-		cpuRegs.CP0.n.Status.b.ERL = 0;
-	} else {
-		cpuRegs.pc = cpuRegs.CP0.n.EPC;
-		cpuRegs.CP0.n.Status.b.EXL = 0;
-	}
-	cpuUpdateOperationMode();
-	cpuSetNextEventDelta(4);
-	intSetBranch();
-}
-
-void DI() {
-	if (cpuRegs.CP0.n.Status.b._EDI || cpuRegs.CP0.n.Status.b.EXL ||
-		cpuRegs.CP0.n.Status.b.ERL || (cpuRegs.CP0.n.Status.b.KSU == 0)) {
-		cpuRegs.CP0.n.Status.b.EIE = 0;
-		// IRQs are disabled so no need to do a cpu exception/event test...
-		//cpuSetNextEventDelta();
-	}
-}
-
-void EI() {
-	if (cpuRegs.CP0.n.Status.b._EDI || cpuRegs.CP0.n.Status.b.EXL ||
-		cpuRegs.CP0.n.Status.b.ERL || (cpuRegs.CP0.n.Status.b.KSU == 0)) {
-		cpuRegs.CP0.n.Status.b.EIE = 1;
-		// schedule an event test, which will check for and raise pending IRQs.
+		if (cpuRegs.CP0.n.Status.b.ERL)
+		{
+			cpuRegs.pc = cpuRegs.CP0.n.ErrorEPC;
+			cpuRegs.CP0.n.Status.b.ERL = 0;
+		}
+		else
+		{
+			cpuRegs.pc = cpuRegs.CP0.n.EPC;
+			cpuRegs.CP0.n.Status.b.EXL = 0;
+		}
+		cpuUpdateOperationMode();
 		cpuSetNextEventDelta(4);
+		intSetBranch();
 	}
-}
 
-} } } }	// end namespace R5900::Interpreter::OpcodeImpl
+	void DI()
+	{
+		if (cpuRegs.CP0.n.Status.b._EDI || cpuRegs.CP0.n.Status.b.EXL ||
+			cpuRegs.CP0.n.Status.b.ERL || (cpuRegs.CP0.n.Status.b.KSU == 0))
+		{
+			cpuRegs.CP0.n.Status.b.EIE = 0;
+			// IRQs are disabled so no need to do a cpu exception/event test...
+			//cpuSetNextEventDelta();
+		}
+	}
+
+	void EI()
+	{
+		if (cpuRegs.CP0.n.Status.b._EDI || cpuRegs.CP0.n.Status.b.EXL ||
+			cpuRegs.CP0.n.Status.b.ERL || (cpuRegs.CP0.n.Status.b.KSU == 0))
+		{
+			cpuRegs.CP0.n.Status.b.EIE = 1;
+			// schedule an event test, which will check for and raise pending IRQs.
+			cpuSetNextEventDelta(4);
+		}
+	}
+
+} // namespace COP0
+} // namespace OpcodeImpl
+} // namespace Interpreter
+} // namespace R5900
diff --git a/pcsx2/x86/iCOP0.cpp b/pcsx2/x86/iCOP0.cpp
index 51707e4e5a..1e6a9e0bd6 100644
--- a/pcsx2/x86/iCOP0.cpp
+++ b/pcsx2/x86/iCOP0.cpp
@@ -150,7 +150,9 @@ void recMFC0()
 	if (_Rd_ == 9)
 	{
 		// This case needs to be handled even if the write-back is ignored (_Rt_ == 0 )
-		xMOV(ecx, ptr[&cpuRegs.cycle]);
+		xMOV(ecx, ptr32[&cpuRegs.cycle]);
+		xADD(ecx, scaleblockcycles_clear());
+		xMOV(ptr32[&cpuRegs.cycle], ecx); // update cycles
 		xMOV(eax, ecx);
 		xSUB(eax, ptr[&cpuRegs.lastCOP0Cycle]);
 		u8* skipInc = JNZ8(0);
@@ -180,6 +182,9 @@ void recMFC0()
 		else if (0 == (_Imm_ & 2)) // MFPC 0, only LSB of register matters
 		{
 			iFlushCall(FLUSH_INTERPRETER);
+			xMOV(eax, ptr32[&cpuRegs.cycle]);
+			xADD(eax, scaleblockcycles_clear());
+			xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 			xFastCall((void*)COP0_UpdatePCCR);
 
 			const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
@@ -188,6 +193,9 @@ void recMFC0()
 		else // MFPC 1
 		{
 			iFlushCall(FLUSH_INTERPRETER);
+			xMOV(eax, ptr32[&cpuRegs.cycle]);
+			xADD(eax, scaleblockcycles_clear());
+			xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 			xFastCall((void*)COP0_UpdatePCCR);
 
 			const int regt = _allocX86reg(X86TYPE_GPR, _Rt_, MODE_WRITE);
@@ -214,6 +222,9 @@ void recMTC0()
 		{
 			case 12:
 				iFlushCall(FLUSH_INTERPRETER);
+				xMOV(eax, ptr32[&cpuRegs.cycle]);
+				xADD(eax, scaleblockcycles_clear());
+				xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 				xFastCall((void*)WriteCP0Status, g_cpuConstRegs[_Rt_].UL[0]);
 				break;
 
@@ -223,7 +234,9 @@ void recMTC0()
 				break;
 
 			case 9:
-				xMOV(ecx, ptr[&cpuRegs.cycle]);
+				xMOV(ecx, ptr32[&cpuRegs.cycle]);
+				xADD(ecx, scaleblockcycles_clear());
+				xMOV(ptr32[&cpuRegs.cycle], ecx); // update cycles
 				xMOV(ptr[&cpuRegs.lastCOP0Cycle], ecx);
 				xMOV(ptr32[&cpuRegs.CP0.r[9]], g_cpuConstRegs[_Rt_].UL[0]);
 				break;
@@ -235,19 +248,26 @@ void recMTC0()
 						break;
 					// Updates PCRs and sets the PCCR.
 					iFlushCall(FLUSH_INTERPRETER);
+					xMOV(eax, ptr32[&cpuRegs.cycle]);
+					xADD(eax, scaleblockcycles_clear());
+					xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 					xFastCall((void*)COP0_UpdatePCCR);
 					xMOV(ptr32[&cpuRegs.PERF.n.pccr], g_cpuConstRegs[_Rt_].UL[0]);
 					xFastCall((void*)COP0_DiagnosticPCCR);
 				}
 				else if (0 == (_Imm_ & 2)) // MTPC 0, only LSB of register matters
 				{
-					xMOV(eax, ptr[&cpuRegs.cycle]);
+					xMOV(eax, ptr32[&cpuRegs.cycle]);
+					xADD(eax, scaleblockcycles_clear());
+					xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 					xMOV(ptr32[&cpuRegs.PERF.n.pcr0], g_cpuConstRegs[_Rt_].UL[0]);
 					xMOV(ptr[&cpuRegs.lastPERFCycle[0]], eax);
 				}
 				else // MTPC 1
 				{
-					xMOV(eax, ptr[&cpuRegs.cycle]);
+					xMOV(eax, ptr32[&cpuRegs.cycle]);
+					xADD(eax, scaleblockcycles_clear());
+					xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 					xMOV(ptr32[&cpuRegs.PERF.n.pcr1], g_cpuConstRegs[_Rt_].UL[0]);
 					xMOV(ptr[&cpuRegs.lastPERFCycle[1]], eax);
 				}
@@ -269,6 +289,9 @@ void recMTC0()
 			case 12:
 				_eeMoveGPRtoR(arg1reg, _Rt_);
 				iFlushCall(FLUSH_INTERPRETER);
+				xMOV(eax, ptr32[&cpuRegs.cycle]);
+				xADD(eax, scaleblockcycles_clear());
+				xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 				xFastCall((void*)WriteCP0Status);
 				break;
 
@@ -279,7 +302,9 @@ void recMTC0()
 				break;
 
 			case 9:
-				xMOV(ecx, ptr[&cpuRegs.cycle]);
+				xMOV(ecx, ptr32[&cpuRegs.cycle]);
+				xADD(ecx, scaleblockcycles_clear());
+				xMOV(ptr32[&cpuRegs.cycle], ecx); // update cycles
 				_eeMoveGPRtoM((uptr)&cpuRegs.CP0.r[9], _Rt_);
 				xMOV(ptr[&cpuRegs.lastCOP0Cycle], ecx);
 				break;
@@ -290,19 +315,26 @@ void recMTC0()
 					if (0 != (_Imm_ & 0x3E)) // only effective when the register is 0
 						break;
 					iFlushCall(FLUSH_INTERPRETER);
+					xMOV(eax, ptr32[&cpuRegs.cycle]);
+					xADD(eax, scaleblockcycles_clear());
+					xMOV(ptr32[&cpuRegs.cycle], eax); // update cycles
 					xFastCall((void*)COP0_UpdatePCCR);
 					_eeMoveGPRtoM((uptr)&cpuRegs.PERF.n.pccr, _Rt_);
 					xFastCall((void*)COP0_DiagnosticPCCR);
 				}
 				else if (0 == (_Imm_ & 2)) // MTPC 0, only LSB of register matters
 				{
-					xMOV(ecx, ptr[&cpuRegs.cycle]);
+					xMOV(ecx, ptr32[&cpuRegs.cycle]);
+					xADD(ecx, scaleblockcycles_clear());
+					xMOV(ptr32[&cpuRegs.cycle], ecx); // update cycles
 					_eeMoveGPRtoM((uptr)&cpuRegs.PERF.n.pcr0, _Rt_);
 					xMOV(ptr[&cpuRegs.lastPERFCycle[0]], ecx);
 				}
 				else // MTPC 1
 				{
-					xMOV(ecx, ptr[&cpuRegs.cycle]);
+					xMOV(ecx, ptr32[&cpuRegs.cycle]);
+					xADD(ecx, scaleblockcycles_clear());
+					xMOV(ptr32[&cpuRegs.cycle], ecx); // update cycles
 					_eeMoveGPRtoM((uptr)&cpuRegs.PERF.n.pcr1, _Rt_);
 					xMOV(ptr[&cpuRegs.lastPERFCycle[1]], ecx);
 				}