EE: Changed some load/store flushing behaviour for better efficiency (and hopefully no bugs). Changed my mind about the internals for future exception handling so scrapped what we have for now. Also restored correct rt=r0 behaviour on all loads.

Should be a small boost in fps everywhere and otherwise likely not change anything.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@4803 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
sudonim1@gmail.com 2011-07-10 03:38:21 +00:00
parent cad9249b79
commit fe66636625
4 changed files with 114 additions and 173 deletions

View File

@ -576,7 +576,6 @@ void _deleteVFtoXMMreg(int reg, int vu, int flush)
_freeXMMreg(i);
break;
case 1:
case 2:
if( xmmregs[i].mode & MODE_WRITE )
{
pxAssert( reg != 0 );
@ -621,8 +620,10 @@ void _deleteVFtoXMMreg(int reg, int vu, int flush)
xmmregs[i].mode &= ~MODE_WRITE;
xmmregs[i].mode |= MODE_READ;
}
break;
if (flush == 2) xmmregs[i].inuse = 0;
case 2:
xmmregs[i].inuse = 0;
break;
}

View File

@ -356,12 +356,13 @@ extern u16 x86FpuState;
#define FLUSH_FREE_ALLX86 0x080 // free all x86 regs
#define FLUSH_FREE_VU0 0x100 // free all vu0 related regs
#define FLUSH_PC 0x200 // program counter
#define FLUSH_CAUSE 0x400 // cause register, only the branch delay bit
#define FLUSH_CAUSE 0x000 // disabled for now: cause register, only the branch delay bit
#define FLUSH_CODE 0x800 // opcode for interpreter
#define FLUSH_EVERYTHING 0x1ff
#define FLUSH_EXCEPTION 0x1ff // not supported yet, so disabled for a small speedup (set back to 0x7ff when needed)
//#define FLUSH_EXCEPTION 0x1ff // will probably do this totally differently actually
#define FLUSH_INTERPRETER 0xfff
#define FLUSH_FULLVTLB FLUSH_NOCONST
// no freeing, used when callee won't destroy mmx/xmm regs
#define FLUSH_NODESTROY (FLUSH_CACHED_REGS|FLUSH_FLUSH_XMM|FLUSH_FLUSH_MMX|FLUSH_FLUSH_ALLX86)

View File

@ -25,10 +25,6 @@ using namespace x86Emitter;
#define REC_STORES
#define REC_LOADS
#define NEWLWC1
#define NEWSWC
#define NEWLQC
#define NEWSQC
// Implemented at the bottom of the module:
void SetFastMemory(int bSetFast);
@ -111,38 +107,37 @@ void recLoad64( u32 bits, bool sign )
{
jASSUME( bits == 64 || bits == 128 );
//no int 3? i love to get my hands dirty ;p - Raz
//write8(0xCC);
// Load EDX with the destination.
// 64/128 bit modes load the result directly into the cpuRegs.GPR struct.
if( _Rt_ )
MOV32ItoR(EDX, (uptr)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ] );
if (_Rt_)
xMOV(edx, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
else
MOV32ItoR(EDX, (uptr)&dummyValue[0] );
xMOV(edx, (uptr)&dummyValue[0]);
if( GPR_IS_CONST1( _Rs_ ) )
if (GPR_IS_CONST1(_Rs_))
{
iFlushCall(FLUSH_EXCEPTION);
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
if (bits == 128)
srcadr &= ~0x0f;
_eeOnLoadWrite(_Rt_);
_deleteEEreg(_Rt_, 0);
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
if( bits == 128 ) srcadr &= ~0x0f;
vtlb_DynGenRead64_Const( bits, srcadr );
vtlb_DynGenRead64_Const(bits, srcadr);
}
else
{
iFlushCall(FLUSH_EXCEPTION);
// Load ECX with the source memory address that we're reading from.
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_ );
if( bits == 128 ) // force 16 byte alignment on 128 bit reads
AND32ItoR(ECX,~0x0F); // emitter automatically encodes this as an 8-bit sign-extended imm8
if (_Imm_ != 0)
xADD(ecx, _Imm_);
if (bits == 128) // force 16 byte alignment on 128 bit reads
xAND(ecx, ~0x0F);
_eeOnLoadWrite(_Rt_);
_deleteEEreg(_Rt_, 0);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenRead64(bits);
}
@ -154,108 +149,103 @@ void recLoad32( u32 bits, bool sign )
{
jASSUME( bits <= 32 );
//no int 3? i love to get my hands dirty ;p - Raz
//write8(0xCC);
// 8/16/32 bit modes return the loaded value in EAX.
if( GPR_IS_CONST1( _Rs_ ) )
if (GPR_IS_CONST1(_Rs_))
{
iFlushCall(FLUSH_EXCEPTION);
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
_eeOnLoadWrite(_Rt_);
_deleteEEreg(_Rt_, 0);
u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
vtlb_DynGenRead32_Const( bits, sign, srcadr );
vtlb_DynGenRead32_Const(bits, sign, srcadr);
}
else
{
iFlushCall(FLUSH_EXCEPTION);
// Load ECX with the source memory address that we're reading from.
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
if (_Imm_ != 0)
ADD32ItoR( ECX, _Imm_ );
_eeOnLoadWrite(_Rt_);
_deleteEEreg(_Rt_, 0);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenRead32(bits, sign);
}
if( _Rt_ )
if (_Rt_)
{
// EAX holds the loaded value, so sign extend as needed:
if (sign)
CDQ();
else
XOR32RtoR(EDX,EDX);
xCDQ();
MOV32RtoM( (int)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ], EAX );
MOV32RtoM( (int)&cpuRegs.GPR.r[ _Rt_ ].UL[ 1 ], EDX );
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]], eax);
if (sign)
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], edx);
else
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], 0);
}
}
//////////////////////////////////////////////////////////////////////////////////////////
//
// edxAlreadyAssigned - set to true if edx already holds the value being written (used by SWL/SWR)
void recStore(u32 sz, bool edxAlreadyAssigned=false)
void recStore(u32 bits)
{
// Performance note: Const prop for the store address is good, always.
// Constprop for the value being stored is not really worthwhile (better to use register
// allocation -- simpler code and just as fast)
// Load EDX first with the value being written, or the address of the value
// being written (64/128 bit modes). TODO: use register allocation, if the
// value is allocated to a register.
// being written (64/128 bit modes).
if( !edxAlreadyAssigned )
if (bits < 64)
{
if( sz < 64 )
{
_eeMoveGPRtoR(EDX, _Rt_);
}
else if (sz==128 || sz==64)
{
_flushEEreg(_Rt_); // flush register to mem
MOV32ItoR(EDX,(int)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ]);
}
_eeMoveGPRtoR(EDX, _Rt_);
}
else if (bits == 128 || bits == 64)
{
_flushEEreg(_Rt_); // flush register to mem
xMOV(edx, (uptr)&cpuRegs.GPR.r[_Rt_].UL[0]);
}
// Load ECX with the destination address, or issue a direct optimized write
// if the address is a constant propagation.
if( GPR_IS_CONST1( _Rs_ ) )
if (GPR_IS_CONST1(_Rs_))
{
iFlushCall(FLUSH_EXCEPTION);
u32 dstadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
if( sz == 128 ) dstadr &= ~0x0f;
vtlb_DynGenWrite_Const( sz, dstadr );
if (bits == 128)
dstadr &= ~0x0f;
vtlb_DynGenWrite_Const( bits, dstadr );
}
else
{
iFlushCall(FLUSH_EXCEPTION);
_eeMoveGPRtoR(ECX, _Rs_);
if (_Imm_ != 0)
xADD(ecx, _Imm_);
if (bits == 128)
xAND(ecx, ~0x0F);
if ( _Imm_ != 0 )
ADD32ItoR(ECX, _Imm_);
if (sz==128)
AND32ItoR(ECX,~0x0F);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenWrite(sz);
vtlb_DynGenWrite(bits);
}
}
//////////////////////////////////////////////////////////////////////////////////////////
//
void recLB( void ) { if(_Rt_) recLoad32(8,true); }
void recLBU( void ) { if(_Rt_) recLoad32(8,false); }
void recLH( void ) { if(_Rt_) recLoad32(16,true); }
void recLHU( void ) { if(_Rt_) recLoad32(16,false); }
void recLW( void ) { if(_Rt_) recLoad32(32,true); }
void recLWU( void ) { if(_Rt_) recLoad32(32,false); }
void recLD( void ) { if(_Rt_) recLoad64(64,false); }
void recLQ( void ) { if(_Rt_) recLoad64(128,false); }
void recLB( void ) { recLoad32(8,true); }
void recLBU( void ) { recLoad32(8,false); }
void recLH( void ) { recLoad32(16,true); }
void recLHU( void ) { recLoad32(16,false); }
void recLW( void ) { recLoad32(32,true); }
void recLWU( void ) { recLoad32(32,false); }
void recLD( void ) { recLoad64(64,false); }
void recLQ( void ) { recLoad64(128,false); }
void recSB( void ) { recStore(8); }
void recSH( void ) { recStore(16); }
@ -263,18 +253,12 @@ void recSW( void ) { recStore(32); }
void recSQ( void ) { recStore(128); }
void recSD( void ) { recStore(64); }
//////////////////////////////////////////////////////////////////////////////////////////
// Non-recompiled Implementations Start Here -->
// (LWL/SWL, LWR/SWR, etc)
////////////////////////////////////////////////////
void recLWL( void )
{
if (!_Rt_)
return;
#ifdef REC_LOADS
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_FULLVTLB);
_deleteEEreg(_Rt_, 1);
_eeMoveGPRtoR(ECX, _Rs_);
@ -287,7 +271,10 @@ void recLWL( void )
xSHL(edi, 3);
xAND(ecx, ~3);
vtlb_DynGenRead32(32, true);
vtlb_DynGenRead32(32, false);
if (!_Rt_)
return;
// mask off bytes loaded
xMOV(ecx, edi);
@ -305,7 +292,7 @@ void recLWL( void )
xCDQ();
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], edx);
#else
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
@ -316,10 +303,8 @@ void recLWL( void )
////////////////////////////////////////////////////
void recLWR(void)
{
if (!_Rt_)
return;
#ifdef REC_LOADS
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_FULLVTLB);
_deleteEEreg(_Rt_, 1);
_eeMoveGPRtoR(ECX, _Rs_);
@ -332,7 +317,10 @@ void recLWR(void)
xSHL(edi, 3);
xAND(ecx, ~3);
vtlb_DynGenRead32(32, true);
vtlb_DynGenRead32(32, false);
if (!_Rt_)
return;
// mask off bytes loaded
xMOV(ecx, 24);
@ -353,7 +341,7 @@ void recLWR(void)
xMOV(ptr32[&cpuRegs.GPR.r[_Rt_].UL[1]], edx);
nosignextend.SetTarget();
#else
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
@ -365,7 +353,7 @@ void recLWR(void)
void recSWL(void)
{
#ifdef REC_STORES
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_FULLVTLB);
_eeMoveGPRtoR(ECX, _Rs_);
if (_Imm_ != 0)
@ -402,7 +390,7 @@ void recSWL(void)
vtlb_DynGenWrite(32);
#else
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
recCall(SWL);
@ -413,7 +401,7 @@ void recSWL(void)
void recSWR(void)
{
#ifdef REC_STORES
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_FULLVTLB);
_eeMoveGPRtoR(ECX, _Rs_);
if (_Imm_ != 0)
@ -434,7 +422,7 @@ void recSWR(void)
xSHR(edx, cl);
xAND(edx, eax);
if(_Rt_)
if (_Rt_)
{
// mask write and OR -> edx
xMOV(ecx, edi);
@ -450,7 +438,7 @@ void recSWR(void)
vtlb_DynGenWrite(32);
#else
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
recCall(SWR);
@ -460,9 +448,7 @@ void recSWR(void)
////////////////////////////////////////////////////
void recLDL( void )
{
if (!_Rt_)
return;
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
recCall(LDL);
@ -471,9 +457,7 @@ void recLDL( void )
////////////////////////////////////////////////////
void recLDR( void )
{
if (!_Rt_)
return;
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
recCall(LDR);
@ -483,7 +467,7 @@ void recLDR( void )
void recSDL( void )
{
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
recCall(SDL);
@ -492,7 +476,7 @@ void recSDL( void )
////////////////////////////////////////////////////
void recSDR( void )
{
iFlushCall(FLUSH_EXCEPTION);
iFlushCall(FLUSH_INTERPRETER);
_deleteEEreg(_Rs_, 1);
_deleteEEreg(_Rt_, 1);
recCall(SDR);
@ -508,11 +492,9 @@ void recSDR( void )
void recLWC1( void )
{
#ifdef NEWLWC1
iFlushCall(FLUSH_EXCEPTION);
_deleteFPtoXMMreg(_Rt_, 2);
if( GPR_IS_CONST1( _Rs_ ) )
if (GPR_IS_CONST1(_Rs_))
{
int addr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
vtlb_DynGenRead32_Const(32, false, addr);
@ -520,34 +502,24 @@ void recLWC1( void )
else
{
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_ );
if (_Imm_ != 0)
xADD(ecx, _Imm_);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenRead32(32, false);
}
MOV32RtoM( (int)&fpuRegs.fpr[ _Rt_ ].UL, EAX );
#else
iFlushCall(FLUSH_EXCEPTION);
_deleteFPtoXMMreg(_Rt_, 2);
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_ );
vtlb_DynGenRead32(32, false);
MOV32RtoM( (int)&fpuRegs.fpr[ _Rt_ ].UL, EAX );
#endif
xMOV(ptr32[&fpuRegs.fpr[_Rt_].UL], eax);
}
////////////////////////////////////////////////////
void recSWC1( void )
{
#ifdef NEWSWC
iFlushCall(FLUSH_EXCEPTION);
_deleteFPtoXMMreg(_Rt_, 1);
MOV32MtoR(EDX, (int)&fpuRegs.fpr[ _Rt_ ].UL );
xMOV(edx, ptr32[&fpuRegs.fpr[_Rt_].UL] );
if( GPR_IS_CONST1( _Rs_ ) )
{
@ -557,21 +529,13 @@ void recSWC1( void )
else
{
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_ );
if (_Imm_ != 0)
xADD(ecx, _Imm_);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenWrite(32);
}
#else
iFlushCall(FLUSH_EXCEPTION);
_deleteFPtoXMMreg(_Rt_, 0);
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_ );
MOV32MtoR(EDX, (int)&fpuRegs.fpr[ _Rt_ ].UL );
vtlb_DynGenWrite(32);
#endif
}
////////////////////////////////////////////////////
@ -589,16 +553,14 @@ void recSWC1( void )
void recLQC2( void )
{
#ifdef NEWLQC
iFlushCall(FLUSH_EXCEPTION);
_deleteVFtoXMMreg(_Ft_, 0, 2);
if ( _Rt_ )
MOV32ItoR(EDX, (int)&VU0.VF[_Ft_].UD[0] );
if (_Rt_)
xMOV(edx, (uptr)&VU0.VF[_Ft_].UD[0]);
else
MOV32ItoR(EDX, (int)&dummyValue[0] );
xMOV(edx, (uptr)&dummyValue[0]);
if( GPR_IS_CONST1( _Rs_ ) )
if (GPR_IS_CONST1(_Rs_))
{
int addr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
@ -607,41 +569,24 @@ void recLQC2( void )
else
{
_eeMoveGPRtoR(ECX, _Rs_);
if (_Imm_ != 0)
xADD(ecx, _Imm_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenRead64(128);
}
#else
iFlushCall(FLUSH_EXCEPTION);
_deleteVFtoXMMreg(_Ft_, 0, 2);
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_);
if ( _Rt_ )
MOV32ItoR(EDX, (int)&VU0.VF[_Ft_].UD[0] );
else
MOV32ItoR(EDX, (int)&dummyValue[0] );
vtlb_DynGenRead64(128);
#endif
}
////////////////////////////////////////////////////
void recSQC2( void )
{
#ifdef NEWSQC
iFlushCall(FLUSH_EXCEPTION);
_deleteVFtoXMMreg(_Ft_, 0, 1); //Want to flush it but not clear it
MOV32ItoR(EDX, (int)&VU0.VF[_Ft_].UD[0] );
if( GPR_IS_CONST1( _Rs_ ) )
xMOV(edx, (uptr)&VU0.VF[_Ft_].UD[0]);
if (GPR_IS_CONST1(_Rs_))
{
int addr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
vtlb_DynGenWrite_Const(128, addr);
@ -649,22 +594,13 @@ void recSQC2( void )
else
{
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_ );
if (_Imm_ != 0)
xADD(ecx, _Imm_);
iFlushCall(FLUSH_FULLVTLB);
vtlb_DynGenWrite(128);
}
#else
iFlushCall(FLUSH_EXCEPTION);
_deleteVFtoXMMreg(_Ft_, 0, 0);
_eeMoveGPRtoR(ECX, _Rs_);
if ( _Imm_ != 0 )
ADD32ItoR( ECX, _Imm_ );
MOV32ItoR(EDX, (int)&VU0.VF[_Ft_].UD[0] );
vtlb_DynGenWrite(128);
#endif
}
#endif

View File

@ -394,6 +394,7 @@ void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const )
case 128: szidx=4; break;
}
iFlushCall(FLUSH_FULLVTLB);
xMOV( ecx, paddr );
xCALL( vtlbdata.RWFT[szidx][0][handler] );
}
@ -455,6 +456,7 @@ void vtlb_DynGenRead32_Const( u32 bits, bool sign, u32 addr_const )
}
else
{
iFlushCall(FLUSH_FULLVTLB);
xMOV( ecx, paddr );
xCALL( vtlbdata.RWFT[szidx][0][handler] );
@ -543,6 +545,7 @@ void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const )
case 128: szidx=4; break;
}
iFlushCall(FLUSH_FULLVTLB);
xMOV( ecx, paddr );
xCALL( vtlbdata.RWFT[szidx][1][handler] );
}