R5900: Replaced the 0x81FC0 address check with constant loop detection logic and renamed the hack appropriately. This is what I originally intended back before the INTC_STAT and 81FC0 hacks, but it seems to have pretty minimal gains over them. I don't think I've broken anything, though; it might help some games, and it could perhaps be extended to handle more complicated loops later, with inlining or multiple-branch logic.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@2814 96395faa-99c1-11dd-bbfe-3dabce05a288
sudonim1 2010-04-05 22:24:25 +00:00
parent ea04f34136
commit 8da2dc7df9
6 changed files with 115 additions and 42 deletions
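
For illustration, a minimal sketch of the kind of loop the new detection targets (hypothetical MIPS; the actual kernel idle loop at 0x81FC0 may differ):

    0x81FC0:  lw    v0, 0(a0)          # poll a status word another unit will update
    0x81FC4:  beq   v0, zero, 0x81FC0  # branch back to the block's own start
    0x81FC8:  nop                      # branch delay slot

The body only reads registers it never overwrites, so every iteration leaves the machine in the same state until a scheduled event changes the polled word.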


@@ -456,7 +456,7 @@ struct Pcsx2Config
bool
IopCycleRate_X2 :1, // enables the x2 multiplier of the IOP cyclerate
IntcStat :1, // tells Pcsx2 to fast-forward through intc_stat waits.
BIFC0 :1, // enables BIFC0 detection and fast-forwarding
WaitLoop :1, // enables constant loop detection and fast-forwarding
vuFlagHack :1, // microVU specific flag hack; Can cause Infinite loops, SPS, etc...
vuMinMax :1; // microVU specific MinMax hack; Can cause SPS, Black Screens, etc...
BITFIELD_END


@@ -72,7 +72,7 @@ void Pcsx2Config::SpeedhackOptions::LoadSave( IniInterface& ini )
IniBitfield( VUCycleSteal );
IniBitBool( IopCycleRate_X2 );
IniBitBool( IntcStat );
IniBitBool( BIFC0 );
IniBitBool( WaitLoop );
IniBitBool( vuFlagHack );
IniBitBool( vuMinMax );
}


@@ -293,7 +293,7 @@ namespace Panels
pxStaticText* m_msg_vustealer;
pxCheckBox* m_check_intc;
pxCheckBox* m_check_b1fc0;
pxCheckBox* m_check_waitloop;
pxCheckBox* m_check_IOPx2;
pxCheckBox* m_check_vuFlagHack;
pxCheckBox* m_check_vuMinMax;


@@ -192,8 +192,8 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
m_check_intc = new pxCheckBox( miscHacksPanel, _("Enable INTC Spin Detection"),
_("Huge speedup for some games, with almost no compatibility side effects. [Recommended]") );
m_check_b1fc0 = new pxCheckBox( miscHacksPanel, _("Enable BIFC0 Spin Detection"),
_("Moderate speedup for some games, with no known side effects. [Recommended]" ) );
m_check_waitloop = new pxCheckBox( miscHacksPanel, _("Enable Wait Loop Detection"),
_("Moderate speedup for some games, with no known side effects. [Recommended???]" ) );
m_check_IOPx2 = new pxCheckBox( miscHacksPanel, _("IOP x2 cycle rate hack"),
_("Small Speedup and works well with most games; may cause some games to hang during startup.") );
@@ -204,10 +204,11 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
L"RPG titles. Games that do not use this method of vsync will see little or no speedup from this hack."
) );
m_check_b1fc0->SetToolTip( pxE( ".Tooltips:Speedhacks:BIFC0",
L"This hack works especially well for Final Fantasy X and Kingdom Hearts. BIFC0 is the address of a specific block of "
L"code in the EE kernel that's run repeatedly when the EE is waiting for the IOP to complete a task. This hack detects "
L"that and responds by fast-forwarding the EE until the IOP signals that the task is complete."
m_check_waitloop->SetToolTip( pxE( ".Tooltips:Speedhacks:BIFC0",
L"Primarily targetting the EE idle loop at address 0x81FC0 in the kernel, this hack attempts to "
L"detect loops whose bodies are guaranteed to result in the same machine state for every iteration "
L"until a scheduled event triggers emulation of another unit. After a single iteration of such loops, "
L"we advance to the time of the next event or the end of the processor's timeslice, whichever comes first."
) );
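// In pseudo-C, the fast-forward described in the tooltip above amounts to
// roughly the following (a sketch of the effect, not the emitted code):
//
//   cpuRegs.cycle += eeScaleBlockCycles();             // charge one iteration
//   if ((s32)(g_nextBranchCycle - cpuRegs.cycle) > 0)  // next event still ahead?
//       cpuRegs.cycle = g_nextBranchCycle;             // skip straight to it
//   // ...then enter the event dispatcher, which may wake whatever is polled.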
m_check_IOPx2->SetToolTip( pxE( ".Tooltips:Speedhacks:IOPx2",
@@ -233,7 +234,7 @@ Panels::SpeedHacksPanel::SpeedHacksPanel( wxWindow* parent )
*vuHacksPanel += m_check_vuMinMax;
*miscHacksPanel += m_check_intc;
*miscHacksPanel += m_check_b1fc0;
*miscHacksPanel += m_check_waitloop;
*miscHacksPanel += m_check_IOPx2;
*left += eeSliderPanel | StdExpand();
@@ -303,7 +304,7 @@ void Panels::SpeedHacksPanel::AppStatusEvent_OnSettingsApplied( const Pcsx2Confi
m_check_vuFlagHack ->SetValue(opts.vuFlagHack);
m_check_vuMinMax ->SetValue(opts.vuMinMax);
m_check_intc ->SetValue(opts.IntcStat);
m_check_b1fc0 ->SetValue(opts.BIFC0);
m_check_waitloop ->SetValue(opts.WaitLoop);
m_check_IOPx2 ->SetValue(opts.IopCycleRate_X2);
EnableStuff();
@@ -321,7 +322,7 @@ void Panels::SpeedHacksPanel::Apply()
opts.EECycleRate = m_slider_eecycle->GetValue()-1;
opts.VUCycleSteal = m_slider_vustealer->GetValue();
opts.BIFC0 = m_check_b1fc0->GetValue();
opts.WaitLoop = m_check_waitloop->GetValue();
opts.IopCycleRate_X2 = m_check_IOPx2->GetValue();
opts.IntcStat = m_check_intc->GetValue();
opts.vuFlagHack = m_check_vuFlagHack->GetValue();


@@ -75,6 +75,7 @@ static BASEBLOCK* s_pCurBlock = NULL;
static BASEBLOCKEX* s_pCurBlockEx = NULL;
static u32 s_nEndBlock = 0; // what psxpc the current block ends
static u32 s_branchTo;
static bool s_nBlockFF;
static u32 s_saveConstRegs[32];
@@ -1007,7 +1008,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch)
{
u32 blockCycles = psxScaleBlockCycles();
if (EmuConfig.Speedhacks.BIFC0 && s_nBlockFF)
if (EmuConfig.Speedhacks.WaitLoop && s_nBlockFF && newpc == s_branchTo)
{
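// The block is a constant loop branching back to its own start; rather than
// re-running iterations whose outcome cannot change, the emitted code below
// advances psxRegs.cycle toward the next scheduled event (mirroring the
// EE-side handling in iBranchTest).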
xMOV(eax, ptr32[&psxRegs.cycle]);
xMOV(ecx, eax);
@@ -1176,7 +1177,6 @@ static void printfn()
static void __fastcall iopRecRecompile( const u32 startpc )
{
u32 i;
u32 branchTo = -1;
u32 willbranch3 = 0;
if( IsDebugBuild && (psxdump & 4) )
@@ -1224,6 +1224,7 @@ static void __fastcall iopRecRecompile( const u32 startpc )
// go until the next branch
i = startpc;
s_nEndBlock = 0xffffffff;
s_branchTo = -1;
while(1) {
BASEBLOCK* pblock = PSX_GETBLOCK(i);
@@ -1251,8 +1252,8 @@ static void __fastcall iopRecRecompile( const u32 startpc )
if( _Rt_ == 0 || _Rt_ == 1 || _Rt_ == 16 || _Rt_ == 17 ) {
branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo;
s_branchTo = _Imm_ * 4 + i + 4;
if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8;
goto StartRecomp;
@@ -1262,15 +1263,15 @@ static void __fastcall iopRecRecompile( const u32 startpc )
case 2: // J
case 3: // JAL
branchTo = _Target_ << 2 | (i + 4) & 0xf0000000;
s_branchTo = _Target_ << 2 | (i + 4) & 0xf0000000;
s_nEndBlock = i + 8;
goto StartRecomp;
// branches
case 4: case 5: case 6: case 7:
branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo;
s_branchTo = _Imm_ * 4 + i + 4;
if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8;
goto StartRecomp;
@@ -1282,7 +1283,7 @@ static void __fastcall iopRecRecompile( const u32 startpc )
StartRecomp:
s_nBlockFF = false;
if (branchTo == startpc) {
if (s_branchTo == startpc) {
s_nBlockFF = true;
for (i = startpc; i < s_nEndBlock; i += 4) {
if (i != s_nEndBlock - 8) {


@@ -73,6 +73,7 @@ static u32 s_nInstCacheSize = 0;
static BASEBLOCK* s_pCurBlock = NULL;
static BASEBLOCKEX* s_pCurBlockEx = NULL;
u32 s_nEndBlock = 0; // what pc the current block ends
u32 s_branchTo;
static bool s_nBlockFF;
// save states for branches
@@ -973,6 +974,7 @@ void SetBranchImm( u32 imm )
// end the current block
iFlushCall(FLUSH_EVERYTHING);
xMOV(ptr32[&cpuRegs.pc], imm);
iBranchTest(imm);
}
@@ -1132,26 +1134,18 @@ static void iBranchTest(u32 newpc)
// cpuRegs.cycle += blockcycles;
// if( cpuRegs.cycle > g_nextBranchCycle ) { DoEvents(); }
if (EmuConfig.Speedhacks.BIFC0 && s_nBlockFF)
if (EmuConfig.Speedhacks.WaitLoop && s_nBlockFF && newpc == s_branchTo)
{
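// After charging this iteration's cycles, clamp cpuRegs.cycle up to
// g_nextBranchCycle when the next event is still in the future, then jump
// straight into the event dispatcher.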
xMOV(eax, ptr32[&g_nextBranchCycle]);
xADD(ptr32[&cpuRegs.cycle], eeScaleBlockCycles());
xCMP(eax, ptr32[&cpuRegs.cycle]);
xCMOVL(eax, ptr32[&cpuRegs.cycle]);
xCMOVS(eax, ptr32[&cpuRegs.cycle]);
xMOV(ptr32[&cpuRegs.cycle], eax);
xJMP( DispatcherEvent );
}
else
{
// Optimization -- we need to load cpuRegs.pc on static block links, but doing it inside
// the if() block below (it would be paired with recBlocks.Link) breaks the sub/jcc
// pairing that modern CPUs optimize (applies to all P4+ and AMD X2+ CPUs). So let's do
// it up here instead. :D
if( newpc != 0xffffffff )
xMOV( ptr32[&cpuRegs.pc], newpc );
xMOV(eax, &cpuRegs.cycle);
xADD(eax, eeScaleBlockCycles());
xMOV(&cpuRegs.cycle, eax); // update cycles
@@ -1367,7 +1361,6 @@ void __fastcall dyna_page_reset(u32 start,u32 sz)
static void __fastcall recRecompile( const u32 startpc )
{
u32 i = 0;
u32 branchTo;
u32 willbranch3 = 0;
u32 usecop2;
@@ -1389,10 +1382,6 @@ static void __fastcall recRecompile( const u32 startpc )
xSetPtr( recPtr );
recPtr = xGetAlignedCallTarget();
s_nBlockFF = false;
if (HWADDR(startpc) == 0x81fc0)
s_nBlockFF = true;
s_pCurBlock = PC_GETBLOCK(startpc);
pxAssert(s_pCurBlock->GetFnptr() == (uptr)JITCompile
@@ -1432,6 +1421,7 @@ static void __fastcall recRecompile( const u32 startpc )
// go until the next branch
i = startpc;
s_nEndBlock = 0xffffffff;
s_branchTo = -1;
while(1) {
BASEBLOCK* pblock = PC_GETBLOCK(i);
@@ -1470,8 +1460,8 @@ static void __fastcall recRecompile( const u32 startpc )
if( _Rt_ < 4 || (_Rt_ >= 16 && _Rt_ < 20) ) {
// branches
branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo;
s_branchTo = _Imm_ * 4 + i + 4;
if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8;
goto StartRecomp;
@@ -1480,14 +1470,15 @@ static void __fastcall recRecompile( const u32 startpc )
case 2: // J
case 3: // JAL
s_branchTo = _Target_ << 2 | (i + 4) & 0xf0000000;
s_nEndBlock = i + 8;
goto StartRecomp;
// branches
case 4: case 5: case 6: case 7:
case 20: case 21: case 22: case 23:
branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo;
s_branchTo = _Imm_ * 4 + i + 4;
if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8;
goto StartRecomp;
@@ -1507,8 +1498,8 @@ static void __fastcall recRecompile( const u32 startpc )
if( _Rs_ == 8 ) {
// BC1F, BC1T, BC1FL, BC1TL
// BC2F, BC2T, BC2FL, BC2TL
branchTo = _Imm_ * 4 + i + 4;
if( branchTo > startpc && branchTo < i ) s_nEndBlock = branchTo;
s_branchTo = _Imm_ * 4 + i + 4;
if( s_branchTo > startpc && s_branchTo < i ) s_nEndBlock = s_branchTo;
else s_nEndBlock = i+8;
goto StartRecomp;
@@ -1521,6 +1512,86 @@ static void __fastcall recRecompile( const u32 startpc )
StartRecomp:
// The idea here is that as long as a loop doesn't write to a register it's already read
// (excepting registers initialised with constants or memory loads) or use any instructions
// which alter the machine state apart from registers, it will do the same thing on every
// iteration.
// TODO: special handling for counting loops. God of War wastes time in a loop which just
// counts to some large number and does nothing else; many other games use a counter as a
// timeout on a register read. AFAICS the only way to optimise this for non-const cases
// without a significant loss in cycle accuracy is with a division, but games would probably
// be happy with time-wasting loops completing in 0 cycles and timeouts waiting forever.
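// Worked example (hypothetical): the scan below accepts
//
//   loop: lw   t0, 0(s0)       // s0 is unknown: reads |= 1 << s0; t0 unread, ok
//         beq  t0, zero, loop  // the branch itself (at s_nEndBlock - 8) is skipped
//         nop                  // delay slot; cpuRegs.code == 0
//
// but rejects a counting loop such as "addiu s1, s1, -1; bne s1, zero, loop",
// because s1 is read and then rewritten, so each iteration's state differs.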
s_nBlockFF = false;
if (s_branchTo == startpc) {
s_nBlockFF = true;
u32 reads = 0, loads = 1;
for (i = startpc; i < s_nEndBlock; i += 4) {
if (i == s_nEndBlock - 8)
continue;
cpuRegs.code = *(u32*)PSM(i);
// nop
if (cpuRegs.code == 0)
continue;
// cache, sync
else if (_Opcode_ == 057 || _Opcode_ == 0 && _Funct_ == 017)
continue;
// imm arithmetic
else if ((_Opcode_ & 070) == 010 || (_Opcode_ & 076) == 030)
{
if (loads & 1 << _Rs_) {
loads |= 1 << _Rt_;
continue;
}
else
reads |= 1 << _Rs_;
if (reads & 1 << _Rt_) {
s_nBlockFF = false;
break;
}
}
// common register arithmetic instructions
else if (_Opcode_ == 0 && (_Funct_ & 060) == 040 && (_Funct_ & 076) != 050)
{
if (loads & 1 << _Rs_ && loads & 1 << _Rt_) {
loads |= 1 << _Rd_;
continue;
}
else
reads |= 1 << _Rs_ | 1 << _Rt_;
if (reads & 1 << _Rd_) {
s_nBlockFF = false;
break;
}
}
// loads
else if ((_Opcode_ & 070) == 040 || (_Opcode_ & 076) == 032 || _Opcode_ == 067)
{
if (loads & 1 << _Rs_) {
loads |= 1 << _Rt_;
continue;
}
else
reads |= 1 << _Rs_;
if (reads & 1 << _Rt_) {
s_nBlockFF = false;
break;
}
}
// mfc*, cfc*
else if ((_Opcode_ & 074) == 020 && _Rs_ < 4)
{
loads |= 1 << _Rt_;
}
else
{
s_nBlockFF = false;
break;
}
}
}
// rec info //
{
EEINST* pcur;
@@ -1753,7 +1824,7 @@ StartRecomp:
int numinsts = (pc - startpc) / 4;
if( numinsts > 6 )
iBranchTest(pc);
SetBranchImm(pc);
else
{
xMOV( ptr32[&cpuRegs.pc], pc );