From 1098253df7077d71c559b0d4dc11f89a1169e2e0 Mon Sep 17 00:00:00 2001
From: "Jake.Stine"
Date: Sun, 18 Jan 2009 21:10:48 +0000
Subject: [PATCH] Major bugfix/oopsie from r595 - I forgot to set the
 freezeregs flag, so XMM regs were getting corrupted liberally. Added proper
 implementations for COP0's branching functions (BC0F, BC0T, etc).

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@606 a6443dda-0b58-4228-96e9-037be469359c
---
 pcsx2/x86/iCP0.cpp              | 96 ++++++++++++++++++++++++++++++---
 pcsx2/x86/ix86-32/iR5900-32.cpp | 75 +++++++++++++++++++-------
 2 files changed, 144 insertions(+), 27 deletions(-)

diff --git a/pcsx2/x86/iCP0.cpp b/pcsx2/x86/iCP0.cpp
index 6483dac7f2..763a0f9849 100644
--- a/pcsx2/x86/iCP0.cpp
+++ b/pcsx2/x86/iCP0.cpp
@@ -34,6 +34,45 @@ namespace Interp = R5900::Interpreter::OpcodeImpl::COP0;
 namespace Dynarec {
 namespace R5900 {
+
+// R5900 branch helper!
+// Recompiles code for a branch test and/or skip, complete with delay slot
+// handling. Note: for "likely" branches use recDoBranchImm_Likely instead,
+// which handles delay slots differently.
+// Parameters:
+//   jmpSkip - This parameter is the result of the appropriate J32 instruction
+//   (usually JZ32 or JNZ32).
+static void recDoBranchImm( u32* jmpSkip, bool isLikely = false )
+{
+	// All R5900 branches use this format:
+	const u32 branchTo = (s32)_Imm_ * 4 + pc;
+
+	// First up is the Branch Taken Path: save the recompiler's state, compile the
+	// delay slot, and issue a BranchTest insertion. The state is reloaded below
+	// for the "did not branch" path (maintains consts, register allocations, and
+	// other optimizations).
+	SaveBranchState();
+	recompileNextInstruction(1);
+	SetBranchImm(branchTo);
+
+	// Jump target when the branch is *not* taken; skips the branchtest code
+	// inserted above.
+	x86SetJ32(jmpSkip);
+
+	// Non-likely branches execute the delay slot on both paths, so rewind and
+	// recompile it here. Likely branches skip it, since MIPS nullifies the
+	// delay slot instruction when a "likely" branch is not taken.
+	if( !isLikely ) pc -= 4;	// rewind for the delay slot (non-likely only)
+	LoadBranchState();
+	recompileNextInstruction(1);
+
+	SetBranchImm(pc);
+}
+
+static void recDoBranchImm_Likely( u32* jmpSkip )
+{
+	recDoBranchImm( jmpSkip, true );
+}
+
 namespace OpcodeImpl {
 namespace COP0 {
 
@@ -42,14 +81,55 @@ namespace COP0 {
 *                                                         *
 *********************************************************/
 
-void recBC0F() { recBranchCall( Interp::BC0F ); }
-void recBC0T() { recBranchCall( Interp::BC0T ); }
-void recBC0FL() { recBranchCall( Interp::BC0FL ); }
-void recBC0TL() { recBranchCall( Interp::BC0TL ); }
-void recTLBR() { recBranchCall( Interp::TLBR ); }
-void recTLBWI() { recBranchCall( Interp::TLBWI ); }
-void recTLBWR() { recBranchCall( Interp::TLBWR ); }
-void recTLBP() { recBranchCall( Interp::TLBP ); }
+// Emits "setup" code for a COP0 branch test. The instruction immediately
+// following this should be a conditional jump -- JZ or JNZ normally.
+static void _setupBranchTest()
+{
+	_eeFlushAllUnused();
+
+	// COP0 branch conditionals are based on the following equation:
+	//   (((psHu16(DMAC_STAT) & psHu16(DMAC_PCR)) & 0x3ff) == (psHu16(DMAC_PCR) & 0x3ff))
+	// BC0F checks if the statement is false; BC0T checks if the statement is true.
+
+	// Note: we only want to compare the 16-bit values of DMAC_STAT and PCR,
+	// but using 32-bit loads here is fine (and faster) since we mask off
+	// everything except the lower 10 bits anyway.
+
+	MOV32MtoR( EAX, (uptr)&psHu32(DMAC_STAT) );
+	MOV32MtoR( ECX, (uptr)&psHu32(DMAC_PCR) );
+	AND32ItoR( EAX, 0x3ff );	// masks off all but the lower 10 bits
+	AND32ItoR( ECX, 0x3ff );
+	CMP32RtoR( EAX, ECX );
+}
+
+void recBC0F()
+{
+	_setupBranchTest();
+	recDoBranchImm(JNZ32(0));
+}
+
+void recBC0T()
+{
+	_setupBranchTest();
+	recDoBranchImm(JZ32(0));
+}
+
+void recBC0FL()
+{
+	_setupBranchTest();
+	recDoBranchImm_Likely(JNZ32(0));
+}
+
+void recBC0TL()
+{
+	_setupBranchTest();
+	recDoBranchImm_Likely(JZ32(0));
+}
+
+void recTLBR() { recCall( Interp::TLBR, -1 ); }
+void recTLBP() { recCall( Interp::TLBP, -1 ); }
+void recTLBWI() { recCall( Interp::TLBWI, -1 ); }
+void recTLBWR() { recCall( Interp::TLBWR, -1 ); }
 
 void recERET()
 {
diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp
index 4207eda454..170e6e076f 100644
--- a/pcsx2/x86/ix86-32/iR5900-32.cpp
+++ b/pcsx2/x86/ix86-32/iR5900-32.cpp
@@ -117,7 +117,7 @@ static const char *txt1 = "REG[%d] = %x_%x\n";
 static const char *txt2 = "M32 = %x\n";
 #endif
 
-static void iBranchTest(u32 newpc, u32 cpuBranch);
+static void iBranchTest(u32 newpc, bool noDispatch=false);
 
 BASEBLOCKEX* PC_GETBLOCKEX(BASEBLOCK* p)
 {
@@ -686,7 +686,7 @@ static __declspec(naked,noreturn) void DispatcherClear()
 
 	// calc PC_GETBLOCK
 	s_pDispatchBlock = PC_GETBLOCK(cpuRegs.pc);
 
-	if( s_pDispatchBlock->startpc == cpuRegs.pc )
+	if( s_pDispatchBlock != NULL && s_pDispatchBlock->startpc == cpuRegs.pc )
 	{
 		assert( s_pDispatchBlock->pFnptr != 0 );
 
@@ -725,7 +725,7 @@ static __declspec(naked,noreturn) void DispatcherReg()
 {
 	s_pDispatchBlock = PC_GETBLOCK(cpuRegs.pc);
 
-	if( s_pDispatchBlock->startpc != cpuRegs.pc )
+	if( s_pDispatchBlock == NULL || s_pDispatchBlock->startpc != cpuRegs.pc )
 		recRecompile(cpuRegs.pc);
 
 	__asm
@@ -750,8 +750,9 @@ __forceinline void recExecute()
 	// Optimization note: Compared pushad against manually pushing the regs one-by-one.
 	// Manually pushing is faster, especially on Core2's and such. :)
 	do {
-		__asm {
-
+		g_EEFreezeRegs = true;
+		__asm
+		{
 			push ebx
 			push esi
 			push edi
@@ -764,12 +765,14 @@
 			pop esi
 			pop ebx
 		}
+		g_EEFreezeRegs = false;
 	}
 	while( !recEventTest() );
 }
 
 static void recExecuteBlock()
 {
+	g_EEFreezeRegs = true;
 	__asm
 	{
 		push ebx
@@ -784,6 +787,7 @@
 		pop esi
 		pop ebx
 	}
+	g_EEFreezeRegs = false;
 
 	recEventTest();
 }
@@ -958,7 +962,7 @@ void SetBranchReg( u32 reg )
 
 	iFlushCall(FLUSH_EVERYTHING);
 
-	iBranchTest(0xffffffff, 1);
+	iBranchTest(0xffffffff);
 }
 
 void SetBranchImm( u32 imm )
@@ -971,7 +975,7 @@
 	MOV32ItoM( (uptr)&cpuRegs.pc, imm );
 
 	iFlushCall(FLUSH_EVERYTHING);
-	iBranchTest(imm, imm <= pc);
+	iBranchTest(imm);
 }
 
 void SaveBranchState()
@@ -1111,7 +1115,17 @@ static u32 eeScaleBlockCycles()
 	return s_nBlockCycles >> (3+2);
 }
 
-static void iBranchTest(u32 newpc, u32 cpuBranch)
+// Generates dynarec code for Event tests followed by a block dispatch (branch).
+// Parameters:
+//   newpc - Address to jump to at the end of the block. If newpc == 0xffffffff
+//   then the jump is assumed to be to a register (dynamic). For any other value
+//   the jump is assumed to be static, in which case the block will be
+//   "hardlinked" after the first time it's dispatched.
+//
+//   noDispatch - When set true, the jump to the Dispatcher is not emitted.
+//   Used by the recs for blocks which perform exception checks without
+//   branching (it's enabled by setting "branch = 2").
+static void iBranchTest(u32 newpc, bool noDispatch)
 {
 #ifdef _DEBUG
 	//CALLFunc((uptr)testfpu);
@@ -1121,21 +1135,35 @@ static void iBranchTest(u32 newpc, u32 cpuBranch)
 
 	if( bExecBIOS ) CheckForBIOSEnd();
 
 	MOV32MtoR(EAX, (uptr)&cpuRegs.cycle);
-	ADD32ItoR(EAX, eeScaleBlockCycles());
 
-	if( newpc != 0xffffffff )
+	if( !noDispatch && newpc != 0xffffffff )
 	{
+		// Optimization note: these instructions are ordered so that EDX's load
+		// pairs with EAX's load above.
+
+		// Load EDX with the address of the JS32 jump below. The Dispatcher
+		// uses this info to modify the JS instruction later on with the
+		// address of the block it's jumping to, creating a static link of
+		// blocks that doesn't require the overhead of a dispatcher.
 		MOV32ItoR(EDX, 0);
 		ptr = (u32*)(x86Ptr-4);
 	}
+
+	// Check the Event scheduler to see if our "cycle target" has been reached.
+	// Equiv code to:
+	//   cpuRegs.cycle += blockcycles;
+	//   if( cpuRegs.cycle > g_nextBranchCycle ) { DoEvents(); }
+	ADD32ItoR(EAX, eeScaleBlockCycles());
 	MOV32RtoM((uptr)&cpuRegs.cycle, EAX); // update cycles
 	SUB32MtoR(EAX, (uptr)&g_nextBranchCycle);
 
 	if( newpc != 0xffffffff )
 	{
+		// This is the jump instruction which gets modified by the Dispatcher.
 		*ptr = (u32)JS32((u32)Dispatcher - ( (u32)x86Ptr + 6 ));
 	}
-	else
+	else if( !noDispatch )
 	{
+		// This instruction is a dynamic link, so it's never modified.
 		JS32((uptr)DispatcherReg - ( (uptr)x86Ptr + 6 ));
 	}
 
@@ -1728,8 +1756,9 @@ void recRecompile( const u32 startpc )
 			goto StartRecomp;
 		}
 	}
+	// Fall through!
+	// COP0's branch opcodes line up with COP1's and COP2's.
 
-	break;
 	case 17: // cp1
 	case 18: // cp2
 		if( _Rs_ == 8 ) {
@@ -2023,15 +2052,24 @@ StartRecomp:
 	if( !(pc&0x10000000) )
 		maxrecmem = std::max( (pc&~0xa0000000), maxrecmem );
 
-	if( branch == 2 ) {
-		iFlushCall(FLUSH_EVERYTHING);
+	if( branch == 2 )
+	{
+		// Branch type 2 - This is how I "think" this works (air):
+		// Performs a branch/event test but does not actually "break" the block.
+		// This allows exceptions to be raised, and is thus sufficient for
+		// certain instructions like SYSCALL and EI, but it is not sufficient
+		// for actual branching instructions.
 
-		iBranchTest(0xffffffff, 1);
+		iFlushCall(FLUSH_EVERYTHING);
+		iBranchTest(0xffffffff, true);
 	}
-	else {
+	else
+	{
 		assert( branch != 3 );
-		if( branch ) assert( !willbranch3 );
-		else ADD32ItoM((int)&cpuRegs.cycle, eeScaleBlockCycles() );
+		if( branch )
+			assert( !willbranch3 );
+		else
+			ADD32ItoM((int)&cpuRegs.cycle, eeScaleBlockCycles() );
 
 		if( willbranch3 ) {
 			BASEBLOCK* pblock = PC_GETBLOCK(s_nEndBlock);
@@ -2088,7 +2126,6 @@ using namespace Dynarec::R5900;
 
 namespace R5900
 {
-
	R5900cpu recCpu = {
		recAlloc,
		recReset,
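
For reference, the test staged by _setupBranchTest() reduces to the C++ sketch below. This is an illustration only, not part of the patch: the DMAC_STAT/DMAC_PCR values are stubbed with sample data so the sketch stands alone, and cop0ConditionEqual is a name invented here.

#include <cstdint>
typedef uint32_t u32;

// Stand-ins for PCSX2's hardware registers (sample values only, normally
// read through the psHu32() accessor).
static u32 dmacStat = 0x2aa;
static u32 dmacPcr  = 0x2aa;

// What the emitted MOV/AND/CMP sequence computes at runtime: compare the
// low 10 bits of DMAC_STAT against the low 10 bits of DMAC_PCR. The J32
// passed to recDoBranchImm() (JZ32 or JNZ32) then selects which outcome
// skips the branch-taken path.
static bool cop0ConditionEqual()
{
	const u32 stat = dmacStat & 0x3ff;	// AND32ItoR( EAX, 0x3ff )
	const u32 pcr  = dmacPcr  & 0x3ff;	// AND32ItoR( ECX, 0x3ff )
	return stat == pcr;			// CMP32RtoR( EAX, ECX ): ZF set when equal
}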
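
Likewise, a rough outline of the block layout recDoBranchImm() produces for the BC0x opcodes; the labels and annotations below are paraphrased from the function above, not literal emitter output.

// Sketch of the generated code for a conditional COP0 branch:
//
//       <condition test from _setupBranchTest()>    ; MOV/AND/CMP
//       jcc  skip                 ; the jmpSkip J32, taken when NOT branching
//       <delay slot>              ; recompiled for the branch-taken path
//       <branch test, dispatch to branchTo>         ; SetBranchImm(branchTo)
//   skip:
//       <delay slot>              ; recompiled again -- non-likely only
//       <branch test, dispatch to fall-through pc>  ; SetBranchImm(pc)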