From 1098253df7077d71c559b0d4dc11f89a1169e2e0 Mon Sep 17 00:00:00 2001
From: "Jake.Stine"
Date: Sun, 18 Jan 2009 21:10:48 +0000
Subject: [PATCH] Major bugfix/oopsie from r595 - I forgot to set the
 freezeregs flag, so XMM regs were getting corrupted liberally. Added proper
 implementations for COP0's branching functions (BC0F, BC0T, etc).

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@606 a6443dda-0b58-4228-96e9-037be469359c
---
 pcsx2/x86/iCP0.cpp              | 96 ++++++++++++++++++++++++++++++---
 pcsx2/x86/ix86-32/iR5900-32.cpp | 75 +++++++++++++++++++-------
 2 files changed, 144 insertions(+), 27 deletions(-)

diff --git a/pcsx2/x86/iCP0.cpp b/pcsx2/x86/iCP0.cpp
index 6483dac7f2..763a0f9849 100644
--- a/pcsx2/x86/iCP0.cpp
+++ b/pcsx2/x86/iCP0.cpp
@@ -34,6 +34,45 @@ namespace Interp = R5900::Interpreter::OpcodeImpl::COP0;
 namespace Dynarec {
 namespace R5900 {
+
+// R5900 branch helper!
+// Recompiles code for a branch test and/or skip, complete with delay slot
+// handling. Note: for "likely" branches use recDoBranchImm_Likely instead,
+// which handles delay slots differently.
+// Parameters:
+//   jmpSkip - This parameter is the result of the appropriate J32 instruction
+//   (usually JZ32 or JNZ32).
+static void recDoBranchImm( u32* jmpSkip, bool isLikely = false )
+{
+	// All R5900 branches use this format:
+	const u32 branchTo = (s32)_Imm_ * 4 + pc;
+
+	// First up is the Branch Taken Path: save the recompiler's state, compile the
+	// delay slot, and issue a BranchTest insertion. The state is reloaded below
+	// for the "did not branch" path (maintains consts, register allocations, and
+	// other optimizations).
+	SaveBranchState();
+	recompileNextInstruction(1);
+	SetBranchImm(branchTo);
+
+	// Jump target when the branch is *not* taken; skips the branchtest code
+	// inserted above.
+	x86SetJ32(jmpSkip);
+
+	// Non-likely branches execute the delay slot on both paths, so rewind and
+	// recompile it here. Likely branches skip it, since MIPS nullifies the
+	// delay slot instruction when a "likely" branch is not taken.
+	if( !isLikely ) pc -= 4;	// rewind for the delay slot (non-likely only)
+	LoadBranchState();
+	recompileNextInstruction(1);
+
+	SetBranchImm(pc);
+}
+
+static void recDoBranchImm_Likely( u32* jmpSkip )
+{
+	recDoBranchImm( jmpSkip, true );
+}
+
 namespace OpcodeImpl {
 namespace COP0 {
 
@@ -42,14 +81,55 @@ namespace COP0 {
 *                                                         *
 *********************************************************/
 
-void recBC0F() { recBranchCall( Interp::BC0F ); }
-void recBC0T() { recBranchCall( Interp::BC0T ); }
-void recBC0FL() { recBranchCall( Interp::BC0FL ); }
-void recBC0TL() { recBranchCall( Interp::BC0TL ); }
-void recTLBR() { recBranchCall( Interp::TLBR ); }
-void recTLBWI() { recBranchCall( Interp::TLBWI ); }
-void recTLBWR() { recBranchCall( Interp::TLBWR ); }
-void recTLBP() { recBranchCall( Interp::TLBP ); }
+// Emits "setup" code for a COP0 branch test. The instruction immediately
+// following this should be a conditional jump -- JZ or JNZ normally.
+static void _setupBranchTest()
+{
+	_eeFlushAllUnused();
+
+	// COP0 branch conditionals are based on the following equation:
+	//   (((psHu16(DMAC_STAT) & psHu16(DMAC_PCR)) & 0x3ff) == (psHu16(DMAC_PCR) & 0x3ff))
+	// BC0F checks if the statement is false; BC0T checks if the statement is true.
+
+	// Note: we only want to compare the 16-bit values of DMAC_STAT and PCR,
+	// but using 32-bit loads here is fine (and faster) since we mask off
+	// everything except the lower 10 bits anyway.
+
+	MOV32MtoR( EAX, (uptr)&psHu32(DMAC_STAT) );
+	MOV32MtoR( ECX, (uptr)&psHu32(DMAC_PCR) );
+	AND32ItoR( EAX, 0x3ff );	// masks off all but the lower 10 bits
+	AND32ItoR( ECX, 0x3ff );
+	CMP32RtoR( EAX, ECX );
+}
+
+void recBC0F()
+{
+	_setupBranchTest();
+	recDoBranchImm(JNZ32(0));
+}
+
+void recBC0T()
+{
+	_setupBranchTest();
+	recDoBranchImm(JZ32(0));
+}
+
+void recBC0FL()
+{
+	_setupBranchTest();
+	recDoBranchImm_Likely(JNZ32(0));
+}
+
+void recBC0TL()
+{
+	_setupBranchTest();
+	recDoBranchImm_Likely(JZ32(0));
+}
+
+void recTLBR() { recCall( Interp::TLBR, -1 ); }
+void recTLBP() { recCall( Interp::TLBP, -1 ); }
+void recTLBWI() { recCall( Interp::TLBWI, -1 ); }
+void recTLBWR() { recCall( Interp::TLBWR, -1 ); }
 
 void recERET()
 {
diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp
index 4207eda454..170e6e076f 100644
--- a/pcsx2/x86/ix86-32/iR5900-32.cpp
+++ b/pcsx2/x86/ix86-32/iR5900-32.cpp
@@ -117,7 +117,7 @@ static const char *txt1 = "REG[%d] = %x_%x\n";
 static const char *txt2 = "M32 = %x\n";
 #endif
 
-static void iBranchTest(u32 newpc, u32 cpuBranch);
+static void iBranchTest(u32 newpc, bool noDispatch=false);
 
 BASEBLOCKEX* PC_GETBLOCKEX(BASEBLOCK* p)
 {
@@ -686,7 +686,7 @@ static __declspec(naked,noreturn) void DispatcherClear()
 
 	// calc PC_GETBLOCK
 	s_pDispatchBlock = PC_GETBLOCK(cpuRegs.pc);
 
-	if( s_pDispatchBlock->startpc == cpuRegs.pc )
+	if( s_pDispatchBlock != NULL && s_pDispatchBlock->startpc == cpuRegs.pc )
 	{
 		assert( s_pDispatchBlock->pFnptr != 0 );
 
@@ -725,7 +725,7 @@ static __declspec(naked,noreturn) void DispatcherReg()
 {
 	s_pDispatchBlock = PC_GETBLOCK(cpuRegs.pc);
 
-	if( s_pDispatchBlock->startpc != cpuRegs.pc )
+	if( s_pDispatchBlock == NULL || s_pDispatchBlock->startpc != cpuRegs.pc )
 		recRecompile(cpuRegs.pc);
 
 	__asm
@@ -750,8 +750,9 @@ __forceinline void recExecute()
 	// Optimization note: Compared pushad against manually pushing the regs one-by-one.
 	// Manually pushing is faster, especially on Core2's and such. :)
 	do {
-		__asm {
-
+		g_EEFreezeRegs = true;
+		__asm
+		{
 			push ebx
 			push esi
 			push edi
@@ -764,12 +765,14 @@
 			pop esi
 			pop ebx
 		}
+		g_EEFreezeRegs = false;
 	}
 	while( !recEventTest() );
 }
 
 static void recExecuteBlock()
 {
+	g_EEFreezeRegs = true;
 	__asm
 	{
 		push ebx
@@ -784,6 +787,7 @@
 		pop esi
 		pop ebx
 	}
+	g_EEFreezeRegs = false;
 
 	recEventTest();
 }
@@ -958,7 +962,7 @@ void SetBranchReg( u32 reg )
 
 	iFlushCall(FLUSH_EVERYTHING);
 
-	iBranchTest(0xffffffff, 1);
+	iBranchTest(0xffffffff);
 }
 
 void SetBranchImm( u32 imm )
@@ -971,7 +975,7 @@
 	MOV32ItoM( (uptr)&cpuRegs.pc, imm );
 
 	iFlushCall(FLUSH_EVERYTHING);
-	iBranchTest(imm, imm <= pc);
+	iBranchTest(imm);
 }
 
 void SaveBranchState()
@@ -1111,7 +1115,17 @@ static u32 eeScaleBlockCycles()
 	return s_nBlockCycles >> (3+2);
 }
 
-static void iBranchTest(u32 newpc, u32 cpuBranch)
+// Generates dynarec code for Event tests followed by a block dispatch (branch).
+// Parameters:
+//   newpc - Address to jump to at the end of the block. If newpc == 0xffffffff
+//   then the jump is assumed to be to a register (dynamic). For any other value
+//   the jump is assumed to be static, in which case the block will be
+//   "hardlinked" after the first time it's dispatched.
+//
+//   noDispatch - When set true, the jump to the Dispatcher is not emitted.
+//   Used by the recs for blocks which perform exception checks without
+//   branching (it's enabled by setting "branch = 2").
+static void iBranchTest(u32 newpc, bool noDispatch)
 {
 #ifdef _DEBUG
 	//CALLFunc((uptr)testfpu);
@@ -1121,21 +1135,35 @@ static void iBranchTest(u32 newpc, u32 cpuBranch)
 
 	if( bExecBIOS ) CheckForBIOSEnd();
 
 	MOV32MtoR(EAX, (uptr)&cpuRegs.cycle);
-	ADD32ItoR(EAX, eeScaleBlockCycles());
 
-	if( newpc != 0xffffffff )
+	if( !noDispatch && newpc != 0xffffffff )
 	{
+		// Optimization note: these instructions are ordered so that EDX's load
+		// pairs with EAX's load above.
+
+		// Load EDX with the address of the JS32 jump below. The Dispatcher
+		// uses this info to modify the JS instruction later on with the
+		// address of the block it's jumping to, creating a static link of
+		// blocks that doesn't require the overhead of a dispatcher.
 		MOV32ItoR(EDX, 0);
 		ptr = (u32*)(x86Ptr-4);
 	}
+
+	// Check the Event scheduler to see if our "cycle target" has been reached.
+	// Equiv code to:
+	//   cpuRegs.cycle += blockcycles;
+	//   if( cpuRegs.cycle > g_nextBranchCycle ) { DoEvents(); }
+	ADD32ItoR(EAX, eeScaleBlockCycles());
 	MOV32RtoM((uptr)&cpuRegs.cycle, EAX); // update cycles
 	SUB32MtoR(EAX, (uptr)&g_nextBranchCycle);
 
 	if( newpc != 0xffffffff )
 	{
+		// This is the jump instruction which gets modified by the Dispatcher.
 		*ptr = (u32)JS32((u32)Dispatcher - ( (u32)x86Ptr + 6 ));
 	}
-	else
+	else if( !noDispatch )
 	{
+		// This instruction is a dynamic link, so it's never modified.
 		JS32((uptr)DispatcherReg - ( (uptr)x86Ptr + 6 ));
 	}
 
@@ -1728,8 +1756,9 @@ void recRecompile( const u32 startpc )
 			goto StartRecomp;
 		}
 	}
+	// Fall through!
+	// COP0's branch opcodes line up with COP1's and COP2's.
 
-	break;
 	case 17: // cp1
 	case 18: // cp2
 		if( _Rs_ == 8 ) {
@@ -2023,15 +2052,24 @@ StartRecomp:
 	if( !(pc&0x10000000) )
 		maxrecmem = std::max( (pc&~0xa0000000), maxrecmem );
 
-	if( branch == 2 ) {
-		iFlushCall(FLUSH_EVERYTHING);
+	if( branch == 2 )
+	{
+		// Branch type 2 - This is how I "think" this works (air):
+		// Performs a branch/event test but does not actually "break" the block.
+		// This allows exceptions to be raised, and is thus sufficient for
+		// certain instructions like SYSCALL and EI, but it is not sufficient
+		// for actual branching instructions.
 
-		iBranchTest(0xffffffff, 1);
+		iFlushCall(FLUSH_EVERYTHING);
+		iBranchTest(0xffffffff, true);
 	}
-	else {
+	else
+	{
 		assert( branch != 3 );
-		if( branch ) assert( !willbranch3 );
-		else ADD32ItoM((int)&cpuRegs.cycle, eeScaleBlockCycles() );
+		if( branch )
+			assert( !willbranch3 );
+		else
+			ADD32ItoM((int)&cpuRegs.cycle, eeScaleBlockCycles() );
 
 		if( willbranch3 ) {
 			BASEBLOCK* pblock = PC_GETBLOCK(s_nEndBlock);
@@ -2088,7 +2126,6 @@ using namespace Dynarec::R5900;
 
 namespace R5900
 {
-
	R5900cpu recCpu = {
		recAlloc,
		recReset,
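
For reference, the test staged by _setupBranchTest() reduces to the C++ sketch below. This is an illustration only, not part of the patch: the DMAC_STAT/DMAC_PCR values are stubbed with sample data so the sketch stands alone, and cop0ConditionEqual is a name invented here.

#include <cstdint>
typedef uint32_t u32;

// Stand-ins for PCSX2's hardware registers (sample values only, normally
// read through the psHu32() accessor).
static u32 dmacStat = 0x2aa;
static u32 dmacPcr  = 0x2aa;

// What the emitted MOV/AND/CMP sequence computes at runtime: compare the
// low 10 bits of DMAC_STAT against the low 10 bits of DMAC_PCR. The J32
// passed to recDoBranchImm() (JZ32 or JNZ32) then selects which outcome
// skips the branch-taken path.
static bool cop0ConditionEqual()
{
	const u32 stat = dmacStat & 0x3ff;	// AND32ItoR( EAX, 0x3ff )
	const u32 pcr  = dmacPcr  & 0x3ff;	// AND32ItoR( ECX, 0x3ff )
	return stat == pcr;			// CMP32RtoR( EAX, ECX ): ZF set when equal
}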
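
Likewise, a rough outline of the block layout recDoBranchImm() produces for the BC0x opcodes; the labels and annotations below are paraphrased from the function above, not literal emitter output.

// Sketch of the generated code for a conditional COP0 branch:
//
//       <condition test from _setupBranchTest()>    ; MOV/AND/CMP
//       jcc  skip                 ; the jmpSkip J32, taken when NOT branching
//       <delay slot>              ; recompiled for the branch-taken path
//       <branch test, dispatch to branchTo>         ; SetBranchImm(branchTo)
//   skip:
//       <delay slot>              ; recompiled again -- non-likely only
//       <branch test, dispatch to fall-through pc>  ; SetBranchImm(pc)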