R5900: Replaced 0x81FC0 address check with constant loop detection logic and renamed the hack appropriately. This is what I originally intended back before the INTC_STAT and 81FC0 hacks, but it seems to have pretty minimal gains over them. I don't think I've broken anything though; it might help some games and could perhaps be extended to handle more complicated loops later, with inlining or multiple-branch logic.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2814 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
sudonim1 2010-04-05 22:24:25 +00:00
parent ea04f34136
commit 8da2dc7df9
6 changed files with 115 additions and 42 deletions

View File

@ -456,7 +456,7 @@ struct Pcsx2Config
bool bool
IopCycleRate_X2 :1, // enables the x2 multiplier of the IOP cyclerate IopCycleRate_X2 :1, // enables the x2 multiplier of the IOP cyclerate
IntcStat :1, // tells Pcsx2 to fast-forward through intc_stat waits. IntcStat :1, // tells Pcsx2 to fast-forward through intc_stat waits.
BIFC0 :1, // enables BIFC0 detection and fast-forwarding WaitLoop :1, // enables constant loop detection and fast-forwarding
vuFlagHack :1, // microVU specific flag hack; Can cause Infinite loops, SPS, etc... vuFlagHack :1, // microVU specific flag hack; Can cause Infinite loops, SPS, etc...
vuMinMax :1; // microVU specific MinMax hack; Can cause SPS, Black Screens, etc... vuMinMax :1; // microVU specific MinMax hack; Can cause SPS, Black Screens, etc...
BITFIELD_END BITFIELD_END

View File

@ -72,7 +72,7 @@ void Pcsx2Config::SpeedhackOptions::LoadSave( IniInterface& ini )
IniBitfield( VUCycleSteal ); IniBitfield( VUCycleSteal );
IniBitBool( IopCycleRate_X2 ); IniBitBool( IopCycleRate_X2 );
IniBitBool( IntcStat ); IniBitBool( IntcStat );
IniBitBool( BIFC0 ); IniBitBool( WaitLoop );
IniBitBool( vuFlagHack ); IniBitBool( vuFlagHack );
IniBitBool( vuMinMax ); IniBitBool( vuMinMax );
} }

View File

@ -293,7 +293,7 @@ namespace Panels
pxStaticText* m_msg_vustealer; pxStaticText* m_msg_vustealer;
pxCheckBox* m_check_intc; pxCheckBox* m_check_intc;
pxCheckBox* m_check_b1fc0; pxCheckBox* m_check_waitloop;
pxCheckBox* m_check_IOPx2; pxCheckBox* m_check_IOPx2;
pxCheckBox* m_check_vuFlagHack; pxCheckBox* m_check_vuFlagHack;
pxCheckBox* m_check_vuMinMax; pxCheckBox* m_check_vuMinMax;

View File

@ -192,8 +192,8 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
m_check_intc = new pxCheckBox( miscHacksPanel, _("Enable INTC Spin Detection"), m_check_intc = new pxCheckBox( miscHacksPanel, _("Enable INTC Spin Detection"),
_("Huge speedup for some games, with almost no compatibility side effects. [Recommended]") ); _("Huge speedup for some games, with almost no compatibility side effects. [Recommended]") );
m_check_b1fc0 = new pxCheckBox( miscHacksPanel, _("Enable BIFC0 Spin Detection"), m_check_waitloop = new pxCheckBox( miscHacksPanel, _("Enable Wait Loop Detection"),
_("Moderate speedup for some games, with no known side effects. [Recommended]" ) ); _("Moderate speedup for some games, with no known side effects. [Recommended???]" ) );
m_check_IOPx2 = new pxCheckBox( miscHacksPanel, _("IOP x2 cycle rate hack"), m_check_IOPx2 = new pxCheckBox( miscHacksPanel, _("IOP x2 cycle rate hack"),
_("Small Speedup and works well with most games; may cause some games to hang during startup.") ); _("Small Speedup and works well with most games; may cause some games to hang during startup.") );
@ -204,10 +204,11 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
L"RPG titles. Games that do not use this method of vsync will see little or no speedup from this hack." L"RPG titles. Games that do not use this method of vsync will see little or no speedup from this hack."
) ); ) );
m_check_b1fc0->SetToolTip( pxE( ".Tooltips:Speedhacks:BIFC0", m_check_waitloop->SetToolTip( pxE( ".Tooltips:Speedhacks:BIFC0",
L"This hack works especially well for Final Fantasy X and Kingdom Hearts. BIFC0 is the address of a specific block of " L"Primarily targetting the EE idle loop at address 0x81FC0 in the kernel, this hack attempts to "
L"code in the EE kernel that's run repeatedly when the EE is waiting for the IOP to complete a task. This hack detects " L"detect loops whose bodies are guaranteed to result in the same machine state for every iteration "
L"that and responds by fast-forwarding the EE until the IOP signals that the task is complete." L"until a scheduled event triggers emulation of another unit. After a single iteration of such loops, "
L"we advance to the time of the next event or the end of the processor's timeslice, whichever comes first."
) ); ) );
m_check_IOPx2->SetToolTip( pxE( ".Tooltips:Speedhacks:IOPx2", m_check_IOPx2->SetToolTip( pxE( ".Tooltips:Speedhacks:IOPx2",
@ -233,7 +234,7 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
*vuHacksPanel += m_check_vuMinMax; *vuHacksPanel += m_check_vuMinMax;
*miscHacksPanel += m_check_intc; *miscHacksPanel += m_check_intc;
*miscHacksPanel += m_check_b1fc0; *miscHacksPanel += m_check_waitloop;
*miscHacksPanel += m_check_IOPx2; *miscHacksPanel += m_check_IOPx2;
*left += eeSliderPanel | StdExpand(); *left += eeSliderPanel | StdExpand();
@ -303,7 +304,7 @@ void Panels::SpeedHacksPanel::AppStatusEvent_OnSettingsApplied( const Pcsx2Confi
m_check_vuFlagHack ->SetValue(opts.vuFlagHack); m_check_vuFlagHack ->SetValue(opts.vuFlagHack);
m_check_vuMinMax ->SetValue(opts.vuMinMax); m_check_vuMinMax ->SetValue(opts.vuMinMax);
m_check_intc ->SetValue(opts.IntcStat); m_check_intc ->SetValue(opts.IntcStat);
m_check_b1fc0 ->SetValue(opts.BIFC0); m_check_waitloop ->SetValue(opts.WaitLoop);
m_check_IOPx2 ->SetValue(opts.IopCycleRate_X2); m_check_IOPx2 ->SetValue(opts.IopCycleRate_X2);
EnableStuff(); EnableStuff();
@ -321,7 +322,7 @@ void Panels::SpeedHacksPanel::Apply()
opts.EECycleRate = m_slider_eecycle->GetValue()-1; opts.EECycleRate = m_slider_eecycle->GetValue()-1;
opts.VUCycleSteal = m_slider_vustealer->GetValue(); opts.VUCycleSteal = m_slider_vustealer->GetValue();
opts.BIFC0 = m_check_b1fc0->GetValue(); opts.WaitLoop = m_check_waitloop->GetValue();
opts.IopCycleRate_X2 = m_check_IOPx2->GetValue(); opts.IopCycleRate_X2 = m_check_IOPx2->GetValue();
opts.IntcStat = m_check_intc->GetValue(); opts.IntcStat = m_check_intc->GetValue();
opts.vuFlagHack = m_check_vuFlagHack->GetValue(); opts.vuFlagHack = m_check_vuFlagHack->GetValue();

View File

@ -75,6 +75,7 @@ static BASEBLOCK* s_pCurBlock = NULL;
static BASEBLOCKEX* s_pCurBlockEx = NULL; static BASEBLOCKEX* s_pCurBlockEx = NULL;
static u32 s_nEndBlock = 0; // what psxpc the current block ends static u32 s_nEndBlock = 0; // what psxpc the current block ends
static u32 s_branchTo;
static bool s_nBlockFF; static bool s_nBlockFF;
static u32 s_saveConstRegs[32]; static u32 s_saveConstRegs[32];
@ -1007,7 +1008,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch)
{ {
u32 blockCycles = psxScaleBlockCycles(); u32 blockCycles = psxScaleBlockCycles();
if (EmuConfig.Speedhacks.BIFC0 && s_nBlockFF) if (EmuConfig.Speedhacks.WaitLoop && s_nBlockFF && newpc == s_branchTo)
{ {
xMOV(eax, ptr32[&psxRegs.cycle]); xMOV(eax, ptr32[&psxRegs.cycle]);
xMOV(ecx, eax); xMOV(ecx, eax);
@ -1176,7 +1177,6 @@ static void printfn()
static void __fastcall iopRecRecompile( const u32 startpc ) static void __fastcall iopRecRecompile( const u32 startpc )
{ {
u32 i; u32 i;
u32 branchTo = -1;
u32 willbranch3 = 0; u32 willbranch3 = 0;
if( IsDebugBuild && (psxdump & 4) ) if( IsDebugBuild && (psxdump & 4) )
@ -1224,6 +1224,7 @@ static void __fastcall iopRecRecompile( const u32 startpc )
// go until the next branch // go until the next branch
i = startpc; i = startpc;
s_nEndBlock = 0xffffffff; s_nEndBlock = 0xffffffff;
s_branchTo = -1;
while(1) { while(1) {
BASEBLOCK* pblock = PSX_GETBLOCK(i); BASEBLOCK* pblock = PSX_GETBLOCK(i);
@ -1251,8 +1252,8 @@ static void __fastcall iopRecRecompile( const u32 startpc )
if( _Rt_ == 0 || _Rt_ == 1 || _Rt_ == 16 || _Rt_ == 17 ) { if( _Rt_ == 0 || _Rt_ == 1 || _Rt_ == 16 || _Rt_ == 17 ) {
branchTo = _Imm_ * 4 + i + 4; s_branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8; else s_nEndBlock = i+8;
goto StartRecomp; goto StartRecomp;
@ -1262,15 +1263,15 @@ static void __fastcall iopRecRecompile( const u32 startpc )
case 2: // J case 2: // J
case 3: // JAL case 3: // JAL
branchTo = _Target_ << 2 | (i + 4) & 0xf0000000; s_branchTo = _Target_ << 2 | (i + 4) & 0xf0000000;
s_nEndBlock = i + 8; s_nEndBlock = i + 8;
goto StartRecomp; goto StartRecomp;
// branches // branches
case 4: case 5: case 6: case 7: case 4: case 5: case 6: case 7:
branchTo = _Imm_ * 4 + i + 4; s_branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8; else s_nEndBlock = i+8;
goto StartRecomp; goto StartRecomp;
@ -1282,7 +1283,7 @@ static void __fastcall iopRecRecompile( const u32 startpc )
StartRecomp: StartRecomp:
s_nBlockFF = false; s_nBlockFF = false;
if (branchTo == startpc) { if (s_branchTo == startpc) {
s_nBlockFF = true; s_nBlockFF = true;
for (i = startpc; i < s_nEndBlock; i += 4) { for (i = startpc; i < s_nEndBlock; i += 4) {
if (i != s_nEndBlock - 8) { if (i != s_nEndBlock - 8) {

View File

@ -73,6 +73,7 @@ static u32 s_nInstCacheSize = 0;
static BASEBLOCK* s_pCurBlock = NULL; static BASEBLOCK* s_pCurBlock = NULL;
static BASEBLOCKEX* s_pCurBlockEx = NULL; static BASEBLOCKEX* s_pCurBlockEx = NULL;
u32 s_nEndBlock = 0; // what pc the current block ends u32 s_nEndBlock = 0; // what pc the current block ends
u32 s_branchTo;
static bool s_nBlockFF; static bool s_nBlockFF;
// save states for branches // save states for branches
@ -973,6 +974,7 @@ void SetBranchImm( u32 imm )
// end the current block // end the current block
iFlushCall(FLUSH_EVERYTHING); iFlushCall(FLUSH_EVERYTHING);
xMOV(ptr32[&cpuRegs.pc], imm);
iBranchTest(imm); iBranchTest(imm);
} }
@ -1132,26 +1134,18 @@ static void iBranchTest(u32 newpc)
// cpuRegs.cycle += blockcycles; // cpuRegs.cycle += blockcycles;
// if( cpuRegs.cycle > g_nextBranchCycle ) { DoEvents(); } // if( cpuRegs.cycle > g_nextBranchCycle ) { DoEvents(); }
if (EmuConfig.Speedhacks.BIFC0 && s_nBlockFF) if (EmuConfig.Speedhacks.WaitLoop && s_nBlockFF && newpc == s_branchTo)
{ {
xMOV(eax, ptr32[&g_nextBranchCycle]); xMOV(eax, ptr32[&g_nextBranchCycle]);
xADD(ptr32[&cpuRegs.cycle], eeScaleBlockCycles()); xADD(ptr32[&cpuRegs.cycle], eeScaleBlockCycles());
xCMP(eax, ptr32[&cpuRegs.cycle]); xCMP(eax, ptr32[&cpuRegs.cycle]);
xCMOVL(eax, ptr32[&cpuRegs.cycle]); xCMOVS(eax, ptr32[&cpuRegs.cycle]);
xMOV(ptr32[&cpuRegs.cycle], eax); xMOV(ptr32[&cpuRegs.cycle], eax);
xJMP( DispatcherEvent ); xJMP( DispatcherEvent );
} }
else else
{ {
// Optimization -- we need to load cpuRegs.pc on static block links, but doing it inside
// the if() block below (it would be paired with recBlocks.Link) breaks the sub/jcc
// pairing that modern CPUs optimize (applies to all P4+ and AMD X2+ CPUs). So let's do
// it up here instead. :D
if( newpc != 0xffffffff )
xMOV( ptr32[&cpuRegs.pc], newpc );
xMOV(eax, &cpuRegs.cycle); xMOV(eax, &cpuRegs.cycle);
xADD(eax, eeScaleBlockCycles()); xADD(eax, eeScaleBlockCycles());
xMOV(&cpuRegs.cycle, eax); // update cycles xMOV(&cpuRegs.cycle, eax); // update cycles
@ -1367,7 +1361,6 @@ void __fastcall dyna_page_reset(u32 start,u32 sz)
static void __fastcall recRecompile( const u32 startpc ) static void __fastcall recRecompile( const u32 startpc )
{ {
u32 i = 0; u32 i = 0;
u32 branchTo;
u32 willbranch3 = 0; u32 willbranch3 = 0;
u32 usecop2; u32 usecop2;
@ -1389,10 +1382,6 @@ static void __fastcall recRecompile( const u32 startpc )
xSetPtr( recPtr ); xSetPtr( recPtr );
recPtr = xGetAlignedCallTarget(); recPtr = xGetAlignedCallTarget();
s_nBlockFF = false;
if (HWADDR(startpc) == 0x81fc0)
s_nBlockFF = true;
s_pCurBlock = PC_GETBLOCK(startpc); s_pCurBlock = PC_GETBLOCK(startpc);
pxAssert(s_pCurBlock->GetFnptr() == (uptr)JITCompile pxAssert(s_pCurBlock->GetFnptr() == (uptr)JITCompile
@ -1432,6 +1421,7 @@ static void __fastcall recRecompile( const u32 startpc )
// go until the next branch // go until the next branch
i = startpc; i = startpc;
s_nEndBlock = 0xffffffff; s_nEndBlock = 0xffffffff;
s_branchTo = -1;
while(1) { while(1) {
BASEBLOCK* pblock = PC_GETBLOCK(i); BASEBLOCK* pblock = PC_GETBLOCK(i);
@ -1470,8 +1460,8 @@ static void __fastcall recRecompile( const u32 startpc )
if( _Rt_ < 4 || (_Rt_ >= 16 && _Rt_ < 20) ) { if( _Rt_ < 4 || (_Rt_ >= 16 && _Rt_ < 20) ) {
// branches // branches
branchTo = _Imm_ * 4 + i + 4; s_branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8; else s_nEndBlock = i+8;
goto StartRecomp; goto StartRecomp;
@ -1480,14 +1470,15 @@ static void __fastcall recRecompile( const u32 startpc )
case 2: // J case 2: // J
case 3: // JAL case 3: // JAL
s_branchTo = _Target_ << 2 | (i + 4) & 0xf0000000;
s_nEndBlock = i + 8; s_nEndBlock = i + 8;
goto StartRecomp; goto StartRecomp;
// branches // branches
case 4: case 5: case 6: case 7: case 4: case 5: case 6: case 7:
case 20: case 21: case 22: case 23: case 20: case 21: case 22: case 23:
branchTo = _Imm_ * 4 + i + 4; s_branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8; else s_nEndBlock = i+8;
goto StartRecomp; goto StartRecomp;
@ -1507,8 +1498,8 @@ static void __fastcall recRecompile( const u32 startpc )
if( _Rs_ == 8 ) { if( _Rs_ == 8 ) {
// BC1F, BC1T, BC1FL, BC1TL // BC1F, BC1T, BC1FL, BC1TL
// BC2F, BC2T, BC2FL, BC2TL // BC2F, BC2T, BC2FL, BC2TL
branchTo = _Imm_ * 4 + i + 4; s_branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo; if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8; else s_nEndBlock = i+8;
goto StartRecomp; goto StartRecomp;
@ -1521,6 +1512,86 @@ static void __fastcall recRecompile( const u32 startpc )
StartRecomp: StartRecomp:
// The idea here is that as long as a loop doesn't write to a register it's already read
// (excepting registers initialised with constants or memory loads) or use any instructions
// which alter the machine state apart from registers, it will do the same thing on every
// iteration.
// TODO: special handling for counting loops. God of war wastes time in a loop which just
// counts to some large number and does nothing else, many other games use a counter as a
// timeout on a register read. AFAICS the only way to optimise this for non-const cases
// without a significant loss in cycle accuracy is with a division, but games would probably
// be happy with time wasting loops completing in 0 cycles and timeouts waiting forever.
s_nBlockFF = false;
if (s_branchTo == startpc) {
s_nBlockFF = true;
u32 reads = 0, loads = 1;
for (i = startpc; i < s_nEndBlock; i += 4) {
if (i == s_nEndBlock - 8)
continue;
cpuRegs.code = *(u32*)PSM(i);
// nop
if (cpuRegs.code == 0)
continue;
// cache, sync
else if (_Opcode_ == 057 || _Opcode_ == 0 && _Funct_ == 013)
continue;
// imm arithmetic
else if ((_Opcode_ & 070) == 010 || (_Opcode_ & 076) == 030)
{
if (loads & 1 << _Rs_) {
loads |= 1 << _Rt_;
continue;
}
else
reads |= 1 << _Rs_;
if (reads & 1 << _Rt_) {
s_nBlockFF = false;
break;
}
}
// common register arithmetic instructions
else if (_Opcode_ == 0 && (_Funct_ & 060) == 040 && (_Funct_ & 076) != 050)
{
if (loads & 1 << _Rs_ && loads & 1 << _Rt_) {
loads |= 1 << _Rd_;
continue;
}
else
reads |= 1 << _Rs_ | 1 << _Rt_;
if (reads & 1 << _Rd_) {
s_nBlockFF = false;
break;
}
}
// loads
else if ((_Opcode_ & 070) == 040 || (_Opcode_ & 076) == 032 || _Opcode_ == 067)
{
if (loads & 1 << _Rs_) {
loads |= 1 << _Rt_;
continue;
}
else
reads |= 1 << _Rs_;
if (reads & 1 << _Rt_) {
s_nBlockFF = false;
break;
}
}
// mfc*, cfc*
else if ((_Opcode_ & 074) == 020 && _Rs_ < 4)
{
loads |= 1 << _Rt_;
}
else
{
s_nBlockFF = false;
break;
}
}
}
// rec info // // rec info //
{ {
EEINST* pcur; EEINST* pcur;
@ -1753,7 +1824,7 @@ StartRecomp:
int numinsts = (pc - startpc) / 4; int numinsts = (pc - startpc) / 4;
if( numinsts > 6 ) if( numinsts > 6 )
iBranchTest(pc); SetBranchImm(pc);
else else
{ {
xMOV( ptr32[&cpuRegs.pc], pc ); xMOV( ptr32[&cpuRegs.pc], pc );