From 8da2dc7df9e4ee7c28bbd2546cdbf1707a03ec2f Mon Sep 17 00:00:00 2001 From: sudonim1 Date: Mon, 5 Apr 2010 22:24:25 +0000 Subject: [PATCH] R5900: Replaced 0x81FC0 address check with constant loop detection logic and renamed the hack appropriately. This is what I originally intended back before INTC_STAT and 81FC0 hacks, but it seems to have pretty minimal gains over them. I don't think I've broken anything though, it might help some game and could perhaps be extended to handle more complicated loops later, with inlining or multiple branch logic. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2814 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Config.h | 2 +- pcsx2/Pcsx2Config.cpp | 2 +- pcsx2/gui/Panels/ConfigurationPanels.h | 2 +- pcsx2/gui/Panels/SpeedhacksPanel.cpp | 19 ++-- pcsx2/x86/iR3000A.cpp | 17 ++-- pcsx2/x86/ix86-32/iR5900-32.cpp | 115 ++++++++++++++++++++----- 6 files changed, 115 insertions(+), 42 deletions(-) diff --git a/pcsx2/Config.h b/pcsx2/Config.h index ff8501807c..aef4ef37a9 100644 --- a/pcsx2/Config.h +++ b/pcsx2/Config.h @@ -456,7 +456,7 @@ struct Pcsx2Config bool IopCycleRate_X2 :1, // enables the x2 multiplier of the IOP cyclerate IntcStat :1, // tells Pcsx2 to fast-forward through intc_stat waits. - BIFC0 :1, // enables BIFC0 detection and fast-forwarding + WaitLoop :1, // enables constant loop detection and fast-forwarding vuFlagHack :1, // microVU specific flag hack; Can cause Infinite loops, SPS, etc... vuMinMax :1; // microVU specific MinMax hack; Can cause SPS, Black Screens, etc... 
BITFIELD_END diff --git a/pcsx2/Pcsx2Config.cpp b/pcsx2/Pcsx2Config.cpp index 774d5919a3..f79eba2df5 100644 --- a/pcsx2/Pcsx2Config.cpp +++ b/pcsx2/Pcsx2Config.cpp @@ -72,7 +72,7 @@ void Pcsx2Config::SpeedhackOptions::LoadSave( IniInterface& ini ) IniBitfield( VUCycleSteal ); IniBitBool( IopCycleRate_X2 ); IniBitBool( IntcStat ); - IniBitBool( BIFC0 ); + IniBitBool( WaitLoop ); IniBitBool( vuFlagHack ); IniBitBool( vuMinMax ); } diff --git a/pcsx2/gui/Panels/ConfigurationPanels.h b/pcsx2/gui/Panels/ConfigurationPanels.h index b33760569d..fd6972863d 100644 --- a/pcsx2/gui/Panels/ConfigurationPanels.h +++ b/pcsx2/gui/Panels/ConfigurationPanels.h @@ -293,7 +293,7 @@ namespace Panels pxStaticText* m_msg_vustealer; pxCheckBox* m_check_intc; - pxCheckBox* m_check_b1fc0; + pxCheckBox* m_check_waitloop; pxCheckBox* m_check_IOPx2; pxCheckBox* m_check_vuFlagHack; pxCheckBox* m_check_vuMinMax; diff --git a/pcsx2/gui/Panels/SpeedhacksPanel.cpp b/pcsx2/gui/Panels/SpeedhacksPanel.cpp index eed493494a..192d0ad7ed 100644 --- a/pcsx2/gui/Panels/SpeedhacksPanel.cpp +++ b/pcsx2/gui/Panels/SpeedhacksPanel.cpp @@ -192,8 +192,8 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) m_check_intc = new pxCheckBox( miscHacksPanel, _("Enable INTC Spin Detection"), _("Huge speedup for some games, with almost no compatibility side effects. [Recommended]") ); - m_check_b1fc0 = new pxCheckBox( miscHacksPanel, _("Enable BIFC0 Spin Detection"), - _("Moderate speedup for some games, with no known side effects. [Recommended]" ) ); + m_check_waitloop = new pxCheckBox( miscHacksPanel, _("Enable Wait Loop Detection"), + _("Moderate speedup for some games, with no known side effects. [Recommended???]" ) ); m_check_IOPx2 = new pxCheckBox( miscHacksPanel, _("IOP x2 cycle rate hack"), _("Small Speedup and works well with most games; may cause some games to hang during startup.") ); @@ -204,10 +204,11 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) L"RPG titles. 
Games that do not use this method of vsync will see little or no speedup from this hack." ) ); - m_check_b1fc0->SetToolTip( pxE( ".Tooltips:Speedhacks:BIFC0", - L"This hack works especially well for Final Fantasy X and Kingdom Hearts. BIFC0 is the address of a specific block of " - L"code in the EE kernel that's run repeatedly when the EE is waiting for the IOP to complete a task. This hack detects " - L"that and responds by fast-forwarding the EE until the IOP signals that the task is complete." + m_check_waitloop->SetToolTip( pxE( ".Tooltips:Speedhacks:BIFC0", + L"Primarily targeting the EE idle loop at address 0x81FC0 in the kernel, this hack attempts to " + L"detect loops whose bodies are guaranteed to result in the same machine state for every iteration " + L"until a scheduled event triggers emulation of another unit. After a single iteration of such loops, " + L"we advance to the time of the next event or the end of the processor's timeslice, whichever comes first." ) ); m_check_IOPx2->SetToolTip( pxE( ".Tooltips:Speedhacks:IOPx2", @@ -233,7 +234,7 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent ) *vuHacksPanel += m_check_vuMinMax; *miscHacksPanel += m_check_intc; - *miscHacksPanel += m_check_b1fc0; + *miscHacksPanel += m_check_waitloop; *miscHacksPanel += m_check_IOPx2; *left += eeSliderPanel | StdExpand(); @@ -303,7 +304,7 @@ void Panels::SpeedHacksPanel::AppStatusEvent_OnSettingsApplied( const Pcsx2Confi m_check_vuFlagHack ->SetValue(opts.vuFlagHack); m_check_vuMinMax ->SetValue(opts.vuMinMax); m_check_intc ->SetValue(opts.IntcStat); - m_check_b1fc0 ->SetValue(opts.BIFC0); + m_check_waitloop ->SetValue(opts.WaitLoop); m_check_IOPx2 ->SetValue(opts.IopCycleRate_X2); EnableStuff(); @@ -321,7 +322,7 @@ void Panels::SpeedHacksPanel::Apply() opts.EECycleRate = m_slider_eecycle->GetValue()-1; opts.VUCycleSteal = m_slider_vustealer->GetValue(); - opts.BIFC0 = m_check_b1fc0->GetValue(); + opts.WaitLoop = m_check_waitloop->GetValue();
opts.IopCycleRate_X2 = m_check_IOPx2->GetValue(); opts.IntcStat = m_check_intc->GetValue(); opts.vuFlagHack = m_check_vuFlagHack->GetValue(); diff --git a/pcsx2/x86/iR3000A.cpp b/pcsx2/x86/iR3000A.cpp index 442f2004f3..91adba36e6 100644 --- a/pcsx2/x86/iR3000A.cpp +++ b/pcsx2/x86/iR3000A.cpp @@ -75,6 +75,7 @@ static BASEBLOCK* s_pCurBlock = NULL; static BASEBLOCKEX* s_pCurBlockEx = NULL; static u32 s_nEndBlock = 0; // what psxpc the current block ends +static u32 s_branchTo; static bool s_nBlockFF; static u32 s_saveConstRegs[32]; @@ -1007,7 +1008,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch) { u32 blockCycles = psxScaleBlockCycles(); - if (EmuConfig.Speedhacks.BIFC0 && s_nBlockFF) + if (EmuConfig.Speedhacks.WaitLoop && s_nBlockFF && newpc == s_branchTo) { xMOV(eax, ptr32[&psxRegs.cycle]); xMOV(ecx, eax); @@ -1176,7 +1177,6 @@ static void printfn() static void __fastcall iopRecRecompile( const u32 startpc ) { u32 i; - u32 branchTo = -1; u32 willbranch3 = 0; if( IsDebugBuild && (psxdump & 4) ) @@ -1224,6 +1224,7 @@ static void __fastcall iopRecRecompile( const u32 startpc ) // go until the next branch i = startpc; s_nEndBlock = 0xffffffff; + s_branchTo = -1; while(1) { BASEBLOCK* pblock = PSX_GETBLOCK(i); @@ -1251,8 +1252,8 @@ static void __fastcall iopRecRecompile( const u32 startpc ) if( _Rt_ == 0 || _Rt_ == 1 || _Rt_ == 16 || _Rt_ == 17 ) { - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1262,15 +1263,15 @@ static void __fastcall iopRecRecompile( const u32 startpc ) case 2: // J case 3: // JAL - branchTo = _Target_ << 2 | (i + 4) & 0xf0000000; + s_branchTo = _Target_ << 2 | (i + 4) & 0xf0000000; s_nEndBlock = i + 8; goto StartRecomp; // branches case 4: case 5: case 6: case 7: - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > startpc && branchTo < i ) 
s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1282,7 +1283,7 @@ static void __fastcall iopRecRecompile( const u32 startpc ) StartRecomp: s_nBlockFF = false; - if (branchTo == startpc) { + if (s_branchTo == startpc) { s_nBlockFF = true; for (i = startpc; i < s_nEndBlock; i += 4) { if (i != s_nEndBlock - 8) { diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index 0ba8a1b6fd..6d8ded2b0f 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -73,6 +73,7 @@ static u32 s_nInstCacheSize = 0; static BASEBLOCK* s_pCurBlock = NULL; static BASEBLOCKEX* s_pCurBlockEx = NULL; u32 s_nEndBlock = 0; // what pc the current block ends +u32 s_branchTo; static bool s_nBlockFF; // save states for branches @@ -973,6 +974,7 @@ void SetBranchImm( u32 imm ) // end the current block iFlushCall(FLUSH_EVERYTHING); + xMOV(ptr32[&cpuRegs.pc], imm); iBranchTest(imm); } @@ -1132,26 +1134,18 @@ static void iBranchTest(u32 newpc) // cpuRegs.cycle += blockcycles; // if( cpuRegs.cycle > g_nextBranchCycle ) { DoEvents(); } - if (EmuConfig.Speedhacks.BIFC0 && s_nBlockFF) + if (EmuConfig.Speedhacks.WaitLoop && s_nBlockFF && newpc == s_branchTo) { xMOV(eax, ptr32[&g_nextBranchCycle]); xADD(ptr32[&cpuRegs.cycle], eeScaleBlockCycles()); xCMP(eax, ptr32[&cpuRegs.cycle]); - xCMOVL(eax, ptr32[&cpuRegs.cycle]); + xCMOVS(eax, ptr32[&cpuRegs.cycle]); xMOV(ptr32[&cpuRegs.cycle], eax); xJMP( DispatcherEvent ); } else { - // Optimization -- we need to load cpuRegs.pc on static block links, but doing it inside - // the if() block below (it would be paired with recBlocks.Link) breaks the sub/jcc - // pairing that modern CPUs optimize (applies to all P4+ and AMD X2+ CPUs). So let's do - // it up here instead. 
:D - - if( newpc != 0xffffffff ) - xMOV( ptr32[&cpuRegs.pc], newpc ); - xMOV(eax, &cpuRegs.cycle); xADD(eax, eeScaleBlockCycles()); xMOV(&cpuRegs.cycle, eax); // update cycles @@ -1367,7 +1361,6 @@ void __fastcall dyna_page_reset(u32 start,u32 sz) static void __fastcall recRecompile( const u32 startpc ) { u32 i = 0; - u32 branchTo; u32 willbranch3 = 0; u32 usecop2; @@ -1389,10 +1382,6 @@ static void __fastcall recRecompile( const u32 startpc ) xSetPtr( recPtr ); recPtr = xGetAlignedCallTarget(); - s_nBlockFF = false; - if (HWADDR(startpc) == 0x81fc0) - s_nBlockFF = true; - s_pCurBlock = PC_GETBLOCK(startpc); pxAssert(s_pCurBlock->GetFnptr() == (uptr)JITCompile @@ -1432,6 +1421,7 @@ static void __fastcall recRecompile( const u32 startpc ) // go until the next branch i = startpc; s_nEndBlock = 0xffffffff; + s_branchTo = -1; while(1) { BASEBLOCK* pblock = PC_GETBLOCK(i); @@ -1470,8 +1460,8 @@ static void __fastcall recRecompile( const u32 startpc ) if( _Rt_ < 4 || (_Rt_ >= 16 && _Rt_ < 20) ) { // branches - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1480,14 +1470,15 @@ static void __fastcall recRecompile( const u32 startpc ) case 2: // J case 3: // JAL + s_branchTo = _Target_ << 2 | (i + 4) & 0xf0000000; s_nEndBlock = i + 8; goto StartRecomp; // branches case 4: case 5: case 6: case 7: case 20: case 21: case 22: case 23: - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1507,8 +1498,8 @@ static void __fastcall recRecompile( const u32 startpc ) if( _Rs_ == 8 ) { // BC1F, BC1T, BC1FL, BC1TL // BC2F, BC2T, BC2FL, BC2TL - branchTo = _Imm_ * 4 + i + 4; - if( branchTo > 
startpc && branchTo < i ) s_nEndBlock = branchTo; + s_branchTo = _Imm_ * 4 + i + 4; + if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo; else s_nEndBlock = i+8; goto StartRecomp; @@ -1521,6 +1512,86 @@ static void __fastcall recRecompile( const u32 startpc ) StartRecomp: + // The idea here is that as long as a loop doesn't write to a register it's already read + // (excepting registers initialised with constants or memory loads) or use any instructions + // which alter the machine state apart from registers, it will do the same thing on every + // iteration. + // TODO: special handling for counting loops. God of war wastes time in a loop which just + // counts to some large number and does nothing else, many other games use a counter as a + // timeout on a register read. AFAICS the only way to optimise this for non-const cases + // without a significant loss in cycle accuracy is with a division, but games would probably + // be happy with time wasting loops completing in 0 cycles and timeouts waiting forever. 
+ s_nBlockFF = false; + if (s_branchTo == startpc) { + s_nBlockFF = true; + + u32 reads = 0, loads = 1; + + for (i = startpc; i < s_nEndBlock; i += 4) { + if (i == s_nEndBlock - 8) + continue; + cpuRegs.code = *(u32*)PSM(i); + // nop + if (cpuRegs.code == 0) + continue; + // cache, sync + else if (_Opcode_ == 057 || _Opcode_ == 0 && _Funct_ == 013) + continue; + // imm arithmetic + else if ((_Opcode_ & 070) == 010 || (_Opcode_ & 076) == 030) + { + if (loads & 1 << _Rs_) { + loads |= 1 << _Rt_; + continue; + } + else + reads |= 1 << _Rs_; + if (reads & 1 << _Rt_) { + s_nBlockFF = false; + break; + } + } + // common register arithmetic instructions + else if (_Opcode_ == 0 && (_Funct_ & 060) == 040 && (_Funct_ & 076) != 050) + { + if (loads & 1 << _Rs_ && loads & 1 << _Rt_) { + loads |= 1 << _Rd_; + continue; + } + else + reads |= 1 << _Rs_ | 1 << _Rt_; + if (reads & 1 << _Rd_) { + s_nBlockFF = false; + break; + } + } + // loads + else if ((_Opcode_ & 070) == 040 || (_Opcode_ & 076) == 032 || _Opcode_ == 067) + { + if (loads & 1 << _Rs_) { + loads |= 1 << _Rt_; + continue; + } + else + reads |= 1 << _Rs_; + if (reads & 1 << _Rt_) { + s_nBlockFF = false; + break; + } + } + // mfc*, cfc* + else if ((_Opcode_ & 074) == 020 && _Rs_ < 4) + { + loads |= 1 << _Rt_; + } + else + { + s_nBlockFF = false; + break; + } + } + } + // rec info // { EEINST* pcur; @@ -1753,7 +1824,7 @@ StartRecomp: int numinsts = (pc - startpc) / 4; if( numinsts > 6 ) - iBranchTest(pc); + SetBranchImm(pc); else { xMOV( ptr32[&cpuRegs.pc], pc );