From 8da2dc7df9e4ee7c28bbd2546cdbf1707a03ec2f Mon Sep 17 00:00:00 2001 From: sudonim1 Date: Mon, 5 Apr 2010 22:24:25 +0000 Subject: [PATCH] R5900: Replaced 0x81FC0 address check with constant loop detection logic and renamed the hack appropriately. This is what I originally intended back before INTC_STAT and 81FC0 hacks, but it seems to have pretty minimal gains over them. I don't think I've broken anything though, it might help some game and could perhaps be extended to handle more complicated loops later, with inlining or multiple branch logic. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2814 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Config.h | 2 +- pcsx2/Pcsx2Config.cpp | 2 +- pcsx2/gui/Panels/ConfigurationPanels.h | 2 +- pcsx2/gui/Panels/SpeedhacksPanel.cpp | 19 ++-- pcsx2/x86/iR3000A.cpp | 17 ++-- pcsx2/x86/ix86-32/iR5900-32.cpp | 115 ++++++++++++++++++++----- 6 files changed, 115 insertions(+), 42 deletions(-) diff --git a/pcsx2/Config.h b/pcsx2/Config.h index ff8501807c..aef4ef37a9 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -456,7 +456,7 @@ struct Pcsx2Config bool IopCycleRate_X2 :1, // enables the x2 multiplier of the IOP cyclerate IntcStat :1, // tells Pcsx2 to fast-forward through intc_stat waits. - BIFC0 :1, // enables BIFC0 detection and fast-forwarding + WaitLoop :1, // enables constant loop detection and fast-forwarding vuFlagHack :1, // microVU specific flag hack; Can cause Infinite loops, SPS, etc... vuMinMax :1; // microVU specific MinMax hack; Can cause SPS, Black Screens, etc... 
BITFIELD_END diff --git a/pcsx2/Pcsx2Config.cpp b/pcsx2/Pcsx2Config.cpp index 774d5919a3..f79eba2df5 100644 --- a/pcsx2/Pcsx2Config.cpp +++ b/pcsx2/Pcsx2Config.cpp @@ -72,7 +72,7 @@ void Pcsx2Config::SpeedhackOptions::LoadSave( IniInterface& ini ) IniBitfield( VUCycleSteal ); IniBitBool( IopCycleRate_X2 ); IniBitBool( IntcStat ); - IniBitBool( BIFC0 ); + IniBitBool( WaitLoop ); IniBitBool( vuFlagHack ); IniBitBool( vuMinMax ); } diff --git a/pcsx2/gui/Panels/ConfigurationPanels.h b/pcsx2/gui/Panels/ConfigurationPanels.h index b33760569d..fd6972863d 100644 --- a/pcsx2/gui/Panels/ConfigurationPanels.h +++ b/pcsx2/gui/Panels/ConfigurationPanels.h @@ -293,7 +293,7 @@ namespace Panels pxStaticText* m_msg_vustealer; pxCheckBox* m_check_intc; - pxCheckBox* m_check_b1fc0; + pxCheckBox* m_check_waitloop; pxCheckBox* m_check_IOPx2; pxCheckBox* m_check_vuFlagHack; pxCheckBox* m_check_vuMinMax; diff --git a/pcsx2/gui/Panels/SpeedhacksPanel.cpp b/pcsx2/gui/Panels/SpeedhacksPanel.cpp index eed493494a..192d0ad7ed 100644 --- a/pcsx2/gui/Panels/SpeedhacksPanel.cpp +++ b/pcsx2/gui/Panels/SpeedhacksPanel.cpp @@ -192,8 +192,8 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) m_check_intc = new pxCheckBox( miscHacksPanel, _("Enable INTC Spin Detection"), _("Huge speedup for some games, with almost no compatibility side effects. [Recommended]") ); - m_check_b1fc0 = new pxCheckBox( miscHacksPanel, _("Enable BIFC0 Spin Detection"), - _("Moderate speedup for some games, with no known side effects. [Recommended]" ) ); + m_check_waitloop = new pxCheckBox( miscHacksPanel, _("Enable Wait Loop Detection"), + _("Moderate speedup for some games, with no known side effects. [Recommended???]" ) ); m_check_IOPx2 = new pxCheckBox( miscHacksPanel, _("IOP x2 cycle rate hack"), _("Small Speedup and works well with most games; may cause some games to hang during startup.") ); @@ -204,10 +204,11 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) L"RPG titles. 
Games that do not use this method of vsync will see little or no speedup from this hack." ) ); - m_check_b1fc0->SetToolTip( pxE( ".Tooltips:Speedhacks:BIFC0", - L"This hack works especially well for Final Fantasy X and Kingdom Hearts. BIFC0 is the address of a specific block of " - L"code in the EE kernel that's run repeatedly when the EE is waiting for the IOP to complete a task. This hack detects " - L"that and responds by fast-forwarding the EE until the IOP signals that the task is complete." + m_check_waitloop->SetToolTip( pxE( ".Tooltips:Speedhacks:BIFC0", + L"Primarily targeting the EE idle loop at address 0x81FC0 in the kernel, this hack attempts to " + L"detect loops whose bodies are guaranteed to result in the same machine state for every iteration " + L"until a scheduled event triggers emulation of another unit. After a single iteration of such loops, " + L"we advance to the time of the next event or the end of the processor's timeslice, whichever comes first." ) ); m_check_IOPx2->SetToolTip( pxE( ".Tooltips:Speedhacks:IOPx2", @@ -233,7 +234,7 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) *vuHacksPanel += m_check_vuMinMax; *miscHacksPanel += m_check_intc; - *miscHacksPanel += m_check_b1fc0; + *miscHacksPanel += m_check_waitloop; *miscHacksPanel += m_check_IOPx2; *left += eeSliderPanel | StdExpand(); @@ -303,7 +304,7 @@ void Panels::SpeedHacksPanel::AppStatusEvent_OnSettingsApplied( const Pcsx2Confi m_check_vuFlagHack ->SetValue(opts.vuFlagHack); m_check_vuMinMax ->SetValue(opts.vuMinMax); m_check_intc ->SetValue(opts.IntcStat); - m_check_b1fc0 ->SetValue(opts.BIFC0); + m_check_waitloop ->SetValue(opts.WaitLoop); m_check_IOPx2 ->SetValue(opts.IopCycleRate_X2); EnableStuff(); @@ -321,7 +322,7 @@ void Panels::SpeedHacksPanel::Apply() opts.EECycleRate = m_slider_eecycle->GetValue()-1; opts.VUCycleSteal = m_slider_vustealer->GetValue(); - opts.BIFC0 = m_check_b1fc0->GetValue(); + opts.WaitLoop = m_check_waitloop->GetValue();
opts.IopCycleRate_X2 = m_check_IOPx2->GetValue(); opts.IntcStat = m_check_intc->GetValue(); opts.vuFlagHack = m_check_vuFlagHack->GetValue(); diff --git a/pcsx2/x86/iR3000A.cpp b/pcsx2/x86/iR3000A.cpp index 442f2004f3..91adba36e6 100644 --- a/pcsx2/x86/iR3000A.cpp +++ b/pcsx2/x86/iR3000A.cpp @@ -75,6 +75,7 @@ static BASEBLOCK* s_pCurBlock = NULL; static BASEBLOCKEX* s_pCurBlockEx = NULL; static u32 s_nEndBlock = 0; // what psxpc the current block ends +static u32 s_branchTo; static bool s_nBlockFF; static u32 s_saveConstRegs[32]; @@ -1007,7 +1008,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch) { u32 blockCycles = psxScaleBlockCycles(); - if (EmuConfig.Speedhacks.BIFC0 && s_nBlockFF) + if (EmuConfig.Speedhacks.WaitLoop && s_nBlockFF && newpc == s_branchTo) { xMOV(eax, ptr32[&psxRegs.cycle]); xMOV(ecx, eax); @@ -1176,7 +1177,6 @@ static void printfn() static void __fastcall iopRecRecompile( const u32 startpc ) { u32 i; - u32 branchTo = -1; u32 willbranch3 = 0; if( IsDebugBuild && (psxdump & 4) ) @@ -1224,6 +1224,7 @@ static void __fastcall iopRecRecompile( const u32 startpc ) // go until the next branch i = startpc; s_nEndBlock = 0xffffffff; + s_branchTo = -1; while(1) { BASEBLOCK* pblock = PSX_GETBLOCK(i); @@ -1251,8 +1252,8 @@ static void __fastcall iopRecRecompile( const u32 startpc ) if( _Rt_ == 0 || _Rt_ == 1 || _Rt_ == 16 || _Rt_ == 17 ) { - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1262,15 +1263,15 @@ static void __fastcall iopRecRecompile( const u32 startpc ) case 2: // J case 3: // JAL - branchTo = _Target_ << 2 | (i + 4) & 0xf0000000; + s_branchTo = _Target_ << 2 | (i + 4) & 0xf0000000; s_nEndBlock = i + 8; goto StartRecomp; // branches case 4: case 5: case 6: case 7: - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > startpc && branchTo < i ) 
s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1282,7 +1283,7 @@ static void __fastcall iopRecRecompile( const u32 startpc ) StartRecomp: s_nBlockFF = false; - if (branchTo == startpc) { + if (s_branchTo == startpc) { s_nBlockFF = true; for (i = startpc; i < s_nEndBlock; i += 4) { if (i != s_nEndBlock - 8) { diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index 0ba8a1b6fd..6d8ded2b0f 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -73,6 +73,7 @@ static u32 s_nInstCacheSize = 0; static BASEBLOCK* s_pCurBlock = NULL; static BASEBLOCKEX* s_pCurBlockEx = NULL; u32 s_nEndBlock = 0; // what pc the current block ends +u32 s_branchTo; static bool s_nBlockFF; // save states for branches @@ -973,6 +974,7 @@ void SetBranchImm( u32 imm ) // end the current block iFlushCall(FLUSH_EVERYTHING); + xMOV(ptr32[&cpuRegs.pc], imm); iBranchTest(imm); } @@ -1132,26 +1134,18 @@ static void iBranchTest(u32 newpc) // cpuRegs.cycle += blockcycles; // if( cpuRegs.cycle > g_nextBranchCycle ) { DoEvents(); } - if (EmuConfig.Speedhacks.BIFC0 && s_nBlockFF) + if (EmuConfig.Speedhacks.WaitLoop && s_nBlockFF && newpc == s_branchTo) { xMOV(eax, ptr32[&g_nextBranchCycle]); xADD(ptr32[&cpuRegs.cycle], eeScaleBlockCycles()); xCMP(eax, ptr32[&cpuRegs.cycle]); - xCMOVL(eax, ptr32[&cpuRegs.cycle]); + xCMOVS(eax, ptr32[&cpuRegs.cycle]); xMOV(ptr32[&cpuRegs.cycle], eax); xJMP( DispatcherEvent ); } else { - // Optimization -- we need to load cpuRegs.pc on static block links, but doing it inside - // the if() block below (it would be paired with recBlocks.Link) breaks the sub/jcc - // pairing that modern CPUs optimize (applies to all P4+ and AMD X2+ CPUs). So let's do - // it up here instead. 
:D - - if( newpc != 0xffffffff ) - xMOV( ptr32[&cpuRegs.pc], newpc ); - xMOV(eax, &cpuRegs.cycle); xADD(eax, eeScaleBlockCycles()); xMOV(&cpuRegs.cycle, eax); // update cycles @@ -1367,7 +1361,6 @@ void __fastcall dyna_page_reset(u32 start,u32 sz) static void __fastcall recRecompile( const u32 startpc ) { u32 i = 0; - u32 branchTo; u32 willbranch3 = 0; u32 usecop2; @@ -1389,10 +1382,6 @@ static void __fastcall recRecompile( const u32 startpc ) xSetPtr( recPtr ); recPtr = xGetAlignedCallTarget(); - s_nBlockFF = false; - if (HWADDR(startpc) == 0x81fc0) - s_nBlockFF = true; - s_pCurBlock = PC_GETBLOCK(startpc); pxAssert(s_pCurBlock->GetFnptr() == (uptr)JITCompile @@ -1432,6 +1421,7 @@ static void __fastcall recRecompile( const u32 startpc ) // go until the next branch i = startpc; s_nEndBlock = 0xffffffff; + s_branchTo = -1; while(1) { BASEBLOCK* pblock = PC_GETBLOCK(i); @@ -1470,8 +1460,8 @@ static void __fastcall recRecompile( const u32 startpc ) if( _Rt_ < 4 || (_Rt_ >= 16 && _Rt_ < 20) ) { // branches - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1480,14 +1470,15 @@ static void __fastcall recRecompile( const u32 startpc ) case 2: // J case 3: // JAL + s_branchTo = _Target_ << 2 | (i + 4) & 0xf0000000; s_nEndBlock = i + 8; goto StartRecomp; // branches case 4: case 5: case 6: case 7: case 20: case 21: case 22: case 23: - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1507,8 +1498,8 @@ static void __fastcall recRecompile( const u32 startpc ) if( _Rs_ == 8 ) { // BC1F, BC1T, BC1FL, BC1TL // BC2F, BC2T, BC2FL, BC2TL - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > 
startpc && branchTo < i ) s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1521,6 +1512,86 @@ static void __fastcall recRecompile( const u32 startpc ) StartRecomp: + // The idea here is that as long as a loop doesn't write to a register it's already read + // (excepting registers initialised with constants or memory loads) or use any instructions + // which alter the machine state apart from registers, it will do the same thing on every + // iteration. + // TODO: special handling for counting loops. God of war wastes time in a loop which just + // counts to some large number and does nothing else, many other games use a counter as a + // timeout on a register read. AFAICS the only way to optimise this for non-const cases + // without a significant loss in cycle accuracy is with a division, but games would probably + // be happy with time wasting loops completing in 0 cycles and timeouts waiting forever. 
+ s_nBlockFF = false; + if (s_branchTo == startpc) { + s_nBlockFF = true; + + u32 reads = 0, loads = 1; + + for (i = startpc; i < s_nEndBlock; i += 4) { + if (i == s_nEndBlock - 8) + continue; + cpuRegs.code = *(u32*)PSM(i); + // nop + if (cpuRegs.code == 0) + continue; + // cache, sync + else if (_Opcode_ == 057 || _Opcode_ == 0 && _Funct_ == 013) + continue; + // imm arithmetic + else if ((_Opcode_ & 070) == 010 || (_Opcode_ & 076) == 030) + { + if (loads & 1 << _Rs_) { + loads |= 1 << _Rt_; + continue; + } + else + reads |= 1 << _Rs_; + if (reads & 1 << _Rt_) { + s_nBlockFF = false; + break; + } + } + // common register arithmetic instructions + else if (_Opcode_ == 0 && (_Funct_ & 060) == 040 && (_Funct_ & 076) != 050) + { + if (loads & 1 << _Rs_ && loads & 1 << _Rt_) { + loads |= 1 << _Rd_; + continue; + } + else + reads |= 1 << _Rs_ | 1 << _Rt_; + if (reads & 1 << _Rd_) { + s_nBlockFF = false; + break; + } + } + // loads + else if ((_Opcode_ & 070) == 040 || (_Opcode_ & 076) == 032 || _Opcode_ == 067) + { + if (loads & 1 << _Rs_) { + loads |= 1 << _Rt_; + continue; + } + else + reads |= 1 << _Rs_; + if (reads & 1 << _Rt_) { + s_nBlockFF = false; + break; + } + } + // mfc*, cfc* + else if ((_Opcode_ & 074) == 020 && _Rs_ < 4) + { + loads |= 1 << _Rt_; + } + else + { + s_nBlockFF = false; + break; + } + } + } + // rec info // { EEINST* pcur; @@ -1753,7 +1824,7 @@ StartRecomp: int numinsts = (pc - startpc) / 4; if( numinsts > 6 ) - iBranchTest(pc); + SetBranchImm(pc); else { xMOV( ptr32[&cpuRegs.pc], pc );