Minor Optimization: Added Const support to Vtlb's Load and Store implementations.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@627 96395faa-99c1-11dd-bbfe-3dabce05a288
Author: Jake.Stine
Date:   2009-02-27 06:40:05 +00:00
Parent: a5ab9ad28f
Commit: e205999744

4 changed files with 276 additions and 164 deletions
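The heart of the change: when the EE recompiler has const-propagated the register that feeds an address, the effective address of the load or store is already known while the block is being recompiled, so the VTLB page lookup can be folded into the emitted code instead of being computed every time the block runs. A worked example with an illustrative address (the emitter calls are the ones the new generators below use; the address itself is made up):

	// Const-propagated access to guest address 0x00100020 (hypothetical value).
	// The page index, 0x00100020 >> 12 == 0x100, folds at recompile time:
	MOV32MtoR(EAX, (uptr)&vmap[0x100]);	// one absolute load of the mapping entry
	MOV32ItoR(ECX, 0x00100020);
	ADD32RtoR(ECX, EAX);				// ecx = ppf (host pointer, or handler cookie)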

==== File 1 of 4 ====

@@ -37,7 +37,6 @@ static std::string disOut;
 #ifdef PCSX2_DEVBUILD
 static void debugI()
 {
-	//CPU_LOG("%s\n", disR5900Current.getCString());
 	if (cpuRegs.GPR.n.r0.UD[0] || cpuRegs.GPR.n.r0.UD[1]) Console::Error("R0 is not zero!!!!");
 }
 #else
@@ -64,6 +63,17 @@ static void execI()
 //	}
 //}

+	// Another method of instruction dumping:
+	/*if( cpuRegs.cycle > 0x4f24d714 )
+	{
+		//CPU_LOG( "%s\n", disR5900Current.getCString());
+		disOut.clear();
+		opcode.disasm( disOut );
+		disOut += '\n';
+		CPU_LOG( disOut.c_str() );
+	}*/
+
 	cpuBlockCycles += opcode.cycles;
 	cpuRegs.pc += 4;

==== File 2 of 4 ====

@@ -61,6 +61,10 @@ extern void vtlb_DynGenWrite(u32 sz);
 extern void vtlb_DynGenRead32(u32 bits, bool sign);
 extern void vtlb_DynGenRead64(u32 sz);

+extern void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const );
+extern void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const );
+extern void vtlb_DynGenRead32_Const( u32 bits, bool sign, u32 addr_const );
+
 namespace vtlb_private
 {
 	static const uint VTLB_PAGE_BITS = 12;
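The three new _Const entry points differ from the existing generators only in how the guest address arrives: vtlb_DynGenRead32/Read64/Write expect it computed into ECX by previously emitted code, while the _Const forms take it as a recompile-time immediate. A minimal caller-side sketch of the intended split (condensed from the recLoad32 change further below; genRead32 is a hypothetical wrapper name, not part of this commit):

	static void genRead32( u32 bits, bool sign )
	{
		if( IS_EECONSTREG( _Rs_ ) )
		{
			// Address is known now -- bake the vmap lookup into the code stream.
			vtlb_DynGenRead32_Const( bits, sign, g_cpuConstRegs[_Rs_].UL[0] + _Imm_ );
		}
		else
		{
			// Address known only at runtime -- emit code that computes it into ECX.
			MOV32MtoR( ECX, (uptr)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] );
			if( _Imm_ != 0 )
				ADD32ItoR( ECX, _Imm_ );
			vtlb_DynGenRead32( bits, sign );
		}
	}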

==== File 3 of 4 ====

@@ -2081,27 +2081,36 @@ void recLoad64( u32 bits, bool sign )
 	_deleteEEreg(_Rs_, 1);
 	_eeOnLoadWrite(_Rt_);
-	EEINST_RESETSIGNEXT(_Rt_); // remove the sign extension -> what does this really do ?
+	EEINST_RESETSIGNEXT(_Rt_); // remove the sign extension
 	_deleteEEreg(_Rt_, 0);

-	// Load ECX with the source memory address that we're reading from.
-	MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] );
-	if ( _Imm_ != 0 )
-		ADD32ItoR( ECX, _Imm_ );
-
-	// Load EDX with the destination.
-	// 64/128 bit modes load the result directly into the cpuRegs.GPR struct.
-	if( bits == 128 )	// force 16 byte alignment on 128 bit reads
-		AND32I8toR(ECX,0xF0);
+	// Load EDX with the destination. 64/128 bit modes load the result directly into
+	// the cpuRegs.GPR struct.
-	if ( _Rt_ )
-		MOV32ItoR(EDX, (int)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ] );
+	if( _Rt_ )
+		MOV32ItoR(EDX, (uptr)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ] );
 	else
-		MOV32ItoR(EDX, (int)&dummyValue[0] );
+		MOV32ItoR(EDX, (uptr)&dummyValue[0] );

-	vtlb_DynGenRead64(bits);
+	if( IS_EECONSTREG( _Rs_ ) )
+	{
+		u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
+		if( bits == 128 ) srcadr &= ~0x0f;
+		vtlb_DynGenRead64_Const( bits, srcadr );
+	}
+	else
+	{
+		// Load ECX with the source memory address that we're reading from.
+		MOV32MtoR( ECX, (uptr)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] );
+		if ( _Imm_ != 0 )
+			ADD32ItoR( ECX, _Imm_ );
+
+		if( bits == 128 )	// force 16 byte alignment on 128 bit reads
+			AND32I8toR(ECX,0xF0);
+
+		vtlb_DynGenRead64(bits);
+	}
 }

 void recLoad32(u32 bits,bool sign)
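A note on the two 128-bit alignment idioms above (an equivalence check, not part of the commit): the imm8 form of AND is sign-extended by the CPU, so AND32I8toR(ECX,0xF0) masks with 0xFFFFFFF0 and clears only the low four bits, exactly like the const path's srcadr &= ~0x0f:

	u32 mask_dynamic = (u32)(s32)(s8)0xF0;	// 0xFFFFFFF0 -- what AND32I8toR(ECX,0xF0) applies
	u32 mask_const   = ~0x0fu;				// 0xFFFFFFF0 -- what the const path applies
	// both force 16-byte alignment without touching the upper address bits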
@@ -2115,40 +2124,26 @@ void recLoad32(u32 bits,bool sign)
 	_eeOnLoadWrite(_Rt_);
 	_deleteEEreg(_Rt_, 0);

-	// Load ECX with the source memory address that we're reading from.
-	MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] );
-	if ( _Imm_ != 0 )
-		ADD32ItoR( ECX, _Imm_ );
-
-	// 8/16/32 bit modes return the loaded value in EAX.
-	//MOV32ItoR(EDX, (int)&dummyValue[0] );
-	vtlb_DynGenRead32(bits, sign);
-
-	if ( _Rt_ )
+	if( IS_EECONSTREG( _Rs_ ) )
 	{
-		// Perform sign extension if needed
+		u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
+		vtlb_DynGenRead32_Const( bits, sign, srcadr );
+	}
+	else
+	{
+		// Load ECX with the source memory address that we're reading from.
+		MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] );
+		if ( _Imm_ != 0 )
+			ADD32ItoR( ECX, _Imm_ );
+
+		//MOV32MtoR( EAX, (int)&dummyValue[0] ); //ewww, lame ! movsx /zx has r/m forms too ...
+		/*if (bits==8)
+		{
+			if (sign)
+				//MOVSX32M8toR( EAX, (int)&dummyValue[0] );
+				MOVSX32R8toR( EAX, EAX );
+			//else
+				//MOVZX32M8toR( EAX, (int)&dummyValue[0] );
+				//MOVZX32R8toR( EAX, EAX );
+		}
+		else if (bits==16)
+		{
+			if (sign)
+				//MOVSX32M16toR( EAX, (int)&dummyValue[0] );
+				MOVSX32R16toR( EAX, EAX );
+			//else
+				//MOVZX32M16toR( EAX, (int)&dummyValue[0] );
+				//MOVZX32R16toR( EAX, EAX );
+		}*/
+		vtlb_DynGenRead32(bits, sign);
+	}

+	if( _Rt_ )
 	{
+		// EAX holds the loaded value, so sign extend as needed:
 		if (sign)
 			CDQ();
 		else
@@ -2463,6 +2458,7 @@ void recLQ( void )
 		ADD32ItoR( ESP, 8 );
 	*/
 }
+
 void recStore(u32 sz)
 {
 	//no int 3? i love to get my hands dirty ;p - Raz
@@ -2471,15 +2467,14 @@ void recStore(u32 sz)
 	_deleteEEreg(_Rs_, 1);
 	_deleteEEreg(_Rt_, 1);

-	MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] );
-	if ( _Imm_ != 0 )
-	{
-		ADD32ItoR( ECX, _Imm_);
-	}
-	if (sz==128)
-	{
-		AND32I8toR(ECX,0xF0);
-	}
+	// Performance note: Const prop for the store address is good, always.
+	// Constprop for the value being stored is not really worthwhile (better to use register
+	// allocation -- simpler code and just as fast)
+
+	// Load EDX first with the value being written, or the address of the value
+	// being written (64/128 bit modes). TODO: use register allocation, if the
+	// value is allocated to a register.
 	if (sz<64)
 	{
@@ -2493,22 +2488,28 @@ void recStore(u32 sz)
 		MOV32ItoR(EDX,(int)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ]);
 	}

+	// Load ECX with the destination address, or issue a direct optimized write
+	// if the address is a constant propagation.
-	vtlb_DynGenWrite(sz);
+	if( GPR_IS_CONST1( _Rs_ ) )
+	{
+		u32 dstadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_;
+		if( sz == 128 ) dstadr &= ~0x0f;
+		vtlb_DynGenWrite_Const( sz, dstadr );
+	}
+	else
+	{
+		MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] );
+		if ( _Imm_ != 0 )
+			ADD32ItoR(ECX, _Imm_);
+
+		/*
+		if (sz==8)
+			CALLFunc( (int)memWrite8 );
+		else if (sz==16)
+			CALLFunc( (int)memWrite16 );
+		else if (sz==32)
+			CALLFunc( (int)memWrite32 );
+		else if (sz==64)
+			CALLFunc( (int)memWrite64 );
+		else if (sz==128)
+			CALLFunc( (int)memWrite128 );
+		*/
+
+		if (sz==128)
+			AND32I8toR(ECX,0xF0);
+
+		vtlb_DynGenWrite(sz);
+	}
 }

 ////////////////////////////////////////////////////
 void recSB( void )
 {
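To make the payoff of const-propping the store address concrete, here are the two address-translation prologues side by side, collected from the vtlb generators in the next file (gathered for comparison; not new emitter code):

	// Dynamic address (vtlb_DynGenWrite): computed on every execution.
	MOV32RtoR(EAX,ECX);							// eax = guest address
	SHR32ItoR(EAX,VTLB_PAGE_BITS);				// eax = page index
	MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2);		// eax = vmap[page] (scale = 4)
	ADD32RtoR(ECX,EAX);							// ecx = ppf

	// Constant address (vtlb_DynGenWrite_Const): the page index is pre-folded.
	MOV32MtoR(EAX,(uptr)&vmap[addr_const>>VTLB_PAGE_BITS]);	// one absolute load
	MOV32ItoR(ECX,addr_const);
	ADD32RtoR(ECX,EAX);							// ecx = ppf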

==== File 4 of 4 ====

@@ -27,6 +27,21 @@
 using namespace vtlb_private;

+// NOTICE: This function *destroys* EAX!!
+// Moves 128 bits of memory from the source register ptr to the dest register ptr.
+// (used as an equivalent to movaps, when a free XMM register is unavailable for some reason)
+void MOV128_MtoM( x86IntRegType destRm, x86IntRegType srcRm )
+{
+	MOV32RmtoR(EAX,srcRm);
+	MOV32RtoRm(destRm,EAX);
+	MOV32RmtoROffset(EAX,srcRm,4);
+	MOV32RtoRmOffset(destRm,EAX,4);
+	MOV32RmtoROffset(EAX,srcRm,8);
+	MOV32RtoRmOffset(destRm,EAX,8);
+	MOV32RmtoROffset(EAX,srcRm,12);
+	MOV32RtoRmOffset(destRm,EAX,12);
+}
+
 /*
 // Pseudo-Code For the following Dynarec Implementations -->
@@ -78,18 +93,31 @@ using namespace vtlb_private;
 */

 //////////////////////////////////////////////////////////////////////////////////////////
 // Dynarec Load Implementations

-//ecx = addr
-//edx = ptr
-void vtlb_DynGenRead64(u32 bits)
+static void _vtlb_DynGen_DirectRead( u32 bits, bool sign )
 {
-	MOV32RtoR(EAX,ECX);
-	SHR32ItoR(EAX,VTLB_PAGE_BITS);
-	MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2);
-	ADD32RtoR(ECX,EAX);
-	u8* _fullread=JS8(0);
-
-	switch(bits)
+	switch( bits )
 	{
+		case 8:
+			if( sign )
+				MOVSX32Rm8toR(EAX,ECX);
+			else
+				MOVZX32Rm8toR(EAX,ECX);
+		break;
+
+		case 16:
+			if( sign )
+				MOVSX32Rm16toR(EAX,ECX);
+			else
+				MOVZX32Rm16toR(EAX,ECX);
+		break;
+
+		case 32:
+			MOV32RmtoR(EAX,ECX);
+		break;
+
 		case 64:
 			if( _hasFreeMMXreg() )
 			{
@@ -106,7 +134,7 @@ void vtlb_DynGenRead64(u32 bits)
 				MOV32RmtoROffset(EAX,ECX,4);
 				MOV32RtoRmOffset(EDX,EAX,4);
 			}
-			break;
+		break;

 		case 128:
 			if( _hasFreeXMMreg() )
@@ -121,31 +149,25 @@ void vtlb_DynGenRead64(u32 bits)
 				// Could put in an MMX optimization here as well, but no point really.
 				// It's almost never used since there's almost always a free XMM reg.
-				MOV32RmtoR(EAX,ECX);
-				MOV32RtoRm(EDX,EAX);
-				MOV32RmtoROffset(EAX,ECX,4);
-				MOV32RtoRmOffset(EDX,EAX,4);
-				MOV32RmtoROffset(EAX,ECX,8);
-				MOV32RtoRmOffset(EDX,EAX,8);
-				MOV32RmtoROffset(EAX,ECX,12);
-				MOV32RtoRmOffset(EDX,EAX,12);
+				MOV128_MtoM( EDX, ECX );	// dest <- src!
 			}
-			break;
+		break;

 		jNO_DEFAULT
 	}
+}

-	u8* cont=JMP8(0);
-	x86SetJ8(_fullread);
-
+static void _vtlb_DynGen_IndirectRead( u32 bits )
+{
 	int szidx;

-	switch(bits)
+	switch( bits )
 	{
-		case 64: szidx=3; break;
-		case 128: szidx=4; break;
+		case 8: szidx=0; break;
+		case 16: szidx=1; break;
+		case 32: szidx=2; break;
+		case 64: szidx=3; break;
+		case 128: szidx=4; break;
 		jNO_DEFAULT
 	}
@@ -155,12 +177,33 @@ void vtlb_DynGenRead64(u32 bits)
 	MOV32RmSOffsettoR(EAX,EAX,(int)RWFT[szidx][0],2);
 	SUB32ItoR(ECX,0x80000000);
 	CALL32R(EAX);
+}
+
+// Recompiled input registers:
+// ecx = source addr to read from
+// edx = ptr to dest to write to
+void vtlb_DynGenRead64(u32 bits)
+{
+	jASSUME( bits == 64 || bits == 128 );
+
+	MOV32RtoR(EAX,ECX);
+	SHR32ItoR(EAX,VTLB_PAGE_BITS);
+	MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2);
+	ADD32RtoR(ECX,EAX);
+
+	u8* _fullread = JS8(0);
+	_vtlb_DynGen_DirectRead( bits, false );
+
+	u8* cont = JMP8(0);
+	x86SetJ8(_fullread);
+	_vtlb_DynGen_IndirectRead( bits );
+
+	x86SetJ8(cont);
+}
+
-// ecx - source address to read from
-// Returns read value in eax.
+// Recompiled input registers:
+// ecx - source address to read from
+// Returns read value in eax.
 void vtlb_DynGenRead32(u32 bits, bool sign)
 {
 	jASSUME( bits <= 32 );
@@ -169,49 +212,13 @@ void vtlb_DynGenRead32(u32 bits, bool sign)
 	SHR32ItoR(EAX,VTLB_PAGE_BITS);
 	MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2);
 	ADD32RtoR(ECX,EAX);
-	u8* _fullread=JS8(0);
+	u8* _fullread = JS8(0);

-	switch(bits)
-	{
-		case 8:
-			if( sign )
-				MOVSX32Rm8toR(EAX,ECX);
-			else
-				MOVZX32Rm8toR(EAX,ECX);
-		break;
+	_vtlb_DynGen_DirectRead( bits, sign );
+	u8* cont = JMP8(0);

-		case 16:
-			if( sign )
-				MOVSX32Rm16toR(EAX,ECX);
-			else
-				MOVZX32Rm16toR(EAX,ECX);
-		break;
-
-		case 32:
-			MOV32RmtoR(EAX,ECX);
-		break;
-
-		jNO_DEFAULT
-	}
-
-	u8* cont=JMP8(0);
 	x86SetJ8(_fullread);
-
-	int szidx;
-	switch(bits)
-	{
-		case 8: szidx=0; break;
-		case 16: szidx=1; break;
-		case 32: szidx=2; break;
-		jNO_DEFAULT
-	}
-
-	MOVZX32R8toR(EAX,EAX);
-	SUB32RtoR(ECX,EAX);
-	//eax=[funct+eax]
-	MOV32RmSOffsettoR(EAX,EAX,(int)RWFT[szidx][0],2);
-	SUB32ItoR(ECX,0x80000000);
-	CALL32R(EAX);
+	_vtlb_DynGen_IndirectRead( bits );

 	// perform sign extension on the result:
@@ -233,25 +240,83 @@ void vtlb_DynGenRead32(u32 bits, bool sign)
 	x86SetJ8(cont);
 }

-void vtlb_DynGenWrite(u32 sz)
+void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const )
 {
-	MOV32RtoR(EAX,ECX);
-	SHR32ItoR(EAX,VTLB_PAGE_BITS);
-	MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2);
-	ADD32RtoR(ECX,EAX);
-	u8* _full=JS8(0);
-	switch(sz)
+	jASSUME( bits == 64 || bits == 128 );
+
+	void* vmv_ptr = &vmap[addr_const>>VTLB_PAGE_BITS];
+	MOV32MtoR(EAX,(uptr)vmv_ptr);
+	MOV32ItoR(ECX,addr_const);
+	ADD32RtoR(ECX,EAX);	// ecx=ppf
+
+	u8* _fullread = JS8(0);
+	_vtlb_DynGen_DirectRead( bits, false );
+
+	u8* cont = JMP8(0);
+	x86SetJ8(_fullread);
+	_vtlb_DynGen_IndirectRead( bits );
+
+	x86SetJ8(cont);
+}
+
+// Recompiled input registers:
+// ecx - source address to read from
+// Returns read value in eax.
+void vtlb_DynGenRead32_Const( u32 bits, bool sign, u32 addr_const )
+{
+	jASSUME( bits <= 32 );
+
+	void* vmv_ptr = &vmap[addr_const>>VTLB_PAGE_BITS];
+	MOV32MtoR(EAX,(uptr)vmv_ptr);
+	MOV32ItoR(ECX,addr_const);
+	ADD32RtoR(ECX,EAX);	// ecx=ppf
+
+	u8* _fullread = JS8(0);
+	_vtlb_DynGen_DirectRead( bits, sign );
+
+	u8* cont = JMP8(0);
+	x86SetJ8(_fullread);
+	_vtlb_DynGen_IndirectRead( bits );
+
+	// perform sign extension on the result:
+	if( bits==8 )
+	{
+		if( sign )
+			MOVSX32R8toR(EAX,EAX);
+		else
+			MOVZX32R8toR(EAX,EAX);
+	}
+	else if( bits==16 )
+	{
+		if( sign )
+			MOVSX32R16toR(EAX,EAX);
+		else
+			MOVZX32R16toR(EAX,EAX);
+	}
+
+	x86SetJ8(cont);
+}
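Why the trailing MOVSX/MOVZX block exists (a reading of the code above; the handler contract is an assumption): on the direct path _vtlb_DynGen_DirectRead already picks a sign- or zero-extending load, but on the indirect path the value comes back from a handler call in EAX with unspecified upper bits, so sub-32-bit reads are re-extended before the two paths join. In C terms:

	// Sketch of the fixup semantics (hypothetical helper, not emitter code):
	static s32 extendResult( u32 raw, u32 bits, bool sign )
	{
		if( bits == 8 )  return sign ? (s32)(s8)raw  : (s32)(u8)raw;
		if( bits == 16 ) return sign ? (s32)(s16)raw : (s32)(u16)raw;
		return (s32)raw;	// 32-bit reads need no extension
	}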
+//////////////////////////////////////////////////////////////////////////////////////////
+// Dynarec Store Implementations
+
+static void _vtlb_DynGen_DirectWrite( u32 bits )
 {
+	switch(bits)
 	{
 		//8 , 16, 32 : data on EDX
 		case 8:
 			MOV8RtoRm(ECX,EDX);
-			break;
+		break;

 		case 16:
 			MOV16RtoRm(ECX,EDX);
-			break;
+		break;

 		case 32:
 			MOV32RtoRm(ECX,EDX);
-			break;
+		break;

 		case 64:
 			if( _hasFreeMMXreg() )
@@ -269,7 +334,7 @@ void vtlb_DynGenWrite(u32 sz)
 				MOV32RmtoROffset(EAX,EDX,4);
 				MOV32RtoRmOffset(ECX,EAX,4);
 			}
-			break;
+		break;

 		case 128:
 			if( _hasFreeXMMreg() )
@@ -284,23 +349,16 @@ void vtlb_DynGenWrite(u32 sz)
 				// Could put in an MMX optimization here as well, but no point really.
 				// It's almost never used since there's almost always a free XMM reg.
-				MOV32RmtoR(EAX,EDX);
-				MOV32RtoRm(ECX,EAX);
-				MOV32RmtoROffset(EAX,EDX,4);
-				MOV32RtoRmOffset(ECX,EAX,4);
-				MOV32RmtoROffset(EAX,EDX,8);
-				MOV32RtoRmOffset(ECX,EAX,8);
-				MOV32RmtoROffset(EAX,EDX,12);
-				MOV32RtoRmOffset(ECX,EAX,12);
+				MOV128_MtoM( ECX, EDX );	// dest <- src!
 			}
-			break;
+		break;
 	}
-
-	u8* cont=JMP8(0);
-	x86SetJ8(_full);
+}

+static void _vtlb_DynGen_IndirectWrite( u32 bits )
+{
 	int szidx=0;
-	switch(sz)
+	switch( bits )
 	{
 		case 8: szidx=0; break;
 		case 16: szidx=1; break;
@@ -314,6 +372,45 @@ void vtlb_DynGenWrite(u32 sz)
 	MOV32RmSOffsettoR(EAX,EAX,(int)RWFT[szidx][1],2);
 	SUB32ItoR(ECX,0x80000000);
 	CALL32R(EAX);
+}
+
+void vtlb_DynGenWrite(u32 sz)
+{
+	MOV32RtoR(EAX,ECX);
+	SHR32ItoR(EAX,VTLB_PAGE_BITS);
+	MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2);
+	ADD32RtoR(ECX,EAX);
+
+	u8* _full = JS8(0);
+	_vtlb_DynGen_DirectWrite( sz );
+
+	u8* cont = JMP8(0);
+	x86SetJ8(_full);
+	_vtlb_DynGen_IndirectWrite( sz );
+
+	x86SetJ8(cont);
+}
+
+// Generates code for a store instruction, where the address is a known constant.
+void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const )
+{
+	// Important: It's not technically safe to do a const lookup of the VTLB here, since
+	// the VTLB could feasibly be remapped by other recompiled code at any time.
+	// So we're limited in exactly how much we can pre-calculate.
+	void* vmv_ptr = &vmap[addr_const>>VTLB_PAGE_BITS];
+	MOV32MtoR(EAX,(uptr)vmv_ptr);
+	MOV32ItoR(ECX,addr_const);
+	ADD32RtoR(ECX,EAX);	// ecx=ppf
+
+	u8* _full = JS8(0);
+	_vtlb_DynGen_DirectWrite( bits );
+
+	u8* cont = JMP8(0);
+	x86SetJ8(_full);
+	_vtlb_DynGen_IndirectWrite( bits );
+
+	x86SetJ8(cont);
+}
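For orientation, a C-level paraphrase of the dispatch every generator above emits. The direct/indirect split hinges on the sign of ECX after the ADD: a mapping entry for RAM yields a valid host pointer, while a handler page's entry (inferred here from the MOVZX/SUB/CALL sequence; the exact encoding lives in the vtlb core) sets the sign bit and carries a handler index in its low byte. Names like read32_dispatch and readHandlers32 are invented for the sketch:

	typedef u32 (*ReadHandler)( u32 paddr );
	static const u32 VTLB_PAGE_BITS = 12;	// as declared in vtlb.h above
	extern ReadHandler readHandlers32[];	// stands in for RWFT[2][0] in the emitter calls
	extern s32 vmap[];						// per-page offset table (element type assumed)

	static u32 read32_dispatch( u32 addr )
	{
		s32 vmv = vmap[addr >> VTLB_PAGE_BITS];
		s32 ppf = (s32)addr + vmv;			// ADD32RtoR(ECX,EAX)
		if( ppf >= 0 )						// JS8 not taken: mapped memory
			return *(u32*)ppf;				// MOV32RmtoR(EAX,ECX)

		// JS8 taken: handler page. Recover the handler index and guest address.
		u32 hid   = (u8)vmv;				// MOVZX32R8toR(EAX,EAX)
		u32 paddr = (u32)ppf - hid - 0x80000000;	// SUB32RtoR / SUB32ItoR
		return readHandlers32[hid]( paddr );		// CALL32R(EAX)
	}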