From e205999744995b40170d07f853ead9be369b1d57 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Fri, 27 Feb 2009 06:40:05 +0000 Subject: [PATCH] Minor Optimization: Added Const support to Vtlb's Load and Store implementations. git-svn-id: http://pcsx2.googlecode.com/svn/trunk@627 96395faa-99c1-11dd-bbfe-3dabce05a288 --- pcsx2/Interpreter.cpp | 12 +- pcsx2/vtlb.h | 4 + pcsx2/x86/ix86-32/iR5900LoadStore.cpp | 141 ++++++------- pcsx2/x86/ix86-32/recVTLB.cpp | 283 +++++++++++++++++--------- 4 files changed, 276 insertions(+), 164 deletions(-) diff --git a/pcsx2/Interpreter.cpp b/pcsx2/Interpreter.cpp index 5eb2447f5f..83fdec38af 100644 --- a/pcsx2/Interpreter.cpp +++ b/pcsx2/Interpreter.cpp @@ -37,7 +37,6 @@ static std::string disOut; #ifdef PCSX2_DEVBUILD static void debugI() { - //CPU_LOG("%s\n", disR5900Current.getCString()); if (cpuRegs.GPR.n.r0.UD[0] || cpuRegs.GPR.n.r0.UD[1]) Console::Error("R0 is not zero!!!!"); } #else @@ -64,6 +63,17 @@ static void execI() // } //} + // Another method of instruction dumping: + /*if( cpuRegs.cycle > 0x4f24d714 ) + { + //CPU_LOG( "%s\n", disR5900Current.getCString()); + disOut.clear(); + opcode.disasm( disOut ); + disOut += '\n'; + CPU_LOG( disOut.c_str() ); + }*/ + + cpuBlockCycles += opcode.cycles; cpuRegs.pc += 4; diff --git a/pcsx2/vtlb.h b/pcsx2/vtlb.h index 973e4a04c0..84ba374d08 100644 --- a/pcsx2/vtlb.h +++ b/pcsx2/vtlb.h @@ -61,6 +61,10 @@ extern void vtlb_DynGenWrite(u32 sz); extern void vtlb_DynGenRead32(u32 bits, bool sign); extern void vtlb_DynGenRead64(u32 sz); +extern void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const ); +extern void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const ); +extern void vtlb_DynGenRead32_Const( u32 bits, bool sign, u32 addr_const ); + namespace vtlb_private { static const uint VTLB_PAGE_BITS = 12; diff --git a/pcsx2/x86/ix86-32/iR5900LoadStore.cpp b/pcsx2/x86/ix86-32/iR5900LoadStore.cpp index 11c7a0f8d2..9ee0ea9fd4 100644 --- a/pcsx2/x86/ix86-32/iR5900LoadStore.cpp +++ b/pcsx2/x86/ix86-32/iR5900LoadStore.cpp @@ -2081,27 +2081,36 @@ void recLoad64( u32 bits, bool sign ) _deleteEEreg(_Rs_, 1); _eeOnLoadWrite(_Rt_); - EEINST_RESETSIGNEXT(_Rt_); // remove the sign extension -> what does this really do ? + EEINST_RESETSIGNEXT(_Rt_); // remove the sign extension _deleteEEreg(_Rt_, 0); - // Load ECX with the source memory address that we're reading from. - MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] ); - if ( _Imm_ != 0 ) - ADD32ItoR( ECX, _Imm_ ); + // Load EDX with the destination. + // 64/128 bit modes load the result directly into the cpuRegs.GPR struct. - if( bits == 128 ) // force 16 byte alignment on 128 bit reads - AND32I8toR(ECX,0xF0); - - // Load EDX with the destination. 64/128 bit modes load the result directly into - // the cpuRegs.GPR struct. - - if ( _Rt_ ) - MOV32ItoR(EDX, (int)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ] ); + if( _Rt_ ) + MOV32ItoR(EDX, (uptr)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ] ); else - MOV32ItoR(EDX, (int)&dummyValue[0] ); + MOV32ItoR(EDX, (uptr)&dummyValue[0] ); - vtlb_DynGenRead64(bits); + if( IS_EECONSTREG( _Rs_ ) ) + { + u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_; + if( bits == 128 ) srcadr &= ~0x0f; + vtlb_DynGenRead64_Const( bits, srcadr ); + } + else + { + // Load ECX with the source memory address that we're reading from. 
+ MOV32MtoR( ECX, (uptr)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] ); + if ( _Imm_ != 0 ) + ADD32ItoR( ECX, _Imm_ ); + + if( bits == 128 ) // force 16 byte alignment on 128 bit reads + AND32I8toR(ECX,0xF0); + + vtlb_DynGenRead64(bits); + } } void recLoad32(u32 bits,bool sign) @@ -2115,40 +2124,26 @@ void recLoad32(u32 bits,bool sign) _eeOnLoadWrite(_Rt_); _deleteEEreg(_Rt_, 0); - // Load ECX with the source memory address that we're reading from. - MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] ); - if ( _Imm_ != 0 ) - ADD32ItoR( ECX, _Imm_ ); - // 8/16/32 bit modes return the loaded value in EAX. - //MOV32ItoR(EDX, (int)&dummyValue[0] ); - vtlb_DynGenRead32(bits, sign); - - if ( _Rt_ ) + if( IS_EECONSTREG( _Rs_ ) ) { - // Perform sign extension if needed - - //MOV32MtoR( EAX, (int)&dummyValue[0] ); //ewww, lame ! movsx /zx has r/m forms too ... - /*if (bits==8) - { - if (sign) - //MOVSX32M8toR( EAX, (int)&dummyValue[0] ); - MOVSX32R8toR( EAX, EAX ); - //else - //MOVZX32M8toR( EAX, (int)&dummyValue[0] ); - //MOVZX32R8toR( EAX, EAX ); - } - else if (bits==16) - { - if (sign) - //MOVSX32M16toR( EAX, (int)&dummyValue[0] ); - MOVSX32R16toR( EAX, EAX ); - //else - //MOVZX32M16toR( EAX, (int)&dummyValue[0] ); - //MOVZX32R16toR( EAX, EAX ); - }*/ + u32 srcadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_; + vtlb_DynGenRead32_Const( bits, sign, srcadr ); + } + else + { + // Load ECX with the source memory address that we're reading from. + MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] ); + if ( _Imm_ != 0 ) + ADD32ItoR( ECX, _Imm_ ); + + vtlb_DynGenRead32(bits, sign); + } + if( _Rt_ ) + { + // EAX holds the loaded value, so sign extend as needed: if (sign) CDQ(); else @@ -2463,6 +2458,7 @@ void recLQ( void ) ADD32ItoR( ESP, 8 ); */ } + void recStore(u32 sz) { //no int 3? i love to get my hands dirty ;p - Raz @@ -2470,16 +2466,15 @@ void recStore(u32 sz) _deleteEEreg(_Rs_, 1); _deleteEEreg(_Rt_, 1); - - MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] ); - if ( _Imm_ != 0 ) - { - ADD32ItoR( ECX, _Imm_); - } - if (sz==128) - { - AND32I8toR(ECX,0xF0); - } + + // Performance note: Const prop for the store address is good, always. + // Constprop for the value being stored is not really worthwhile (better to use register + // allocation -- simpler code and just as fast) + + + // Load EDX first with the value being written, or the address of the value + // being written (64/128 bit modes). TODO: use register allocation, if the + // value is allocated to a register. if (sz<64) { @@ -2492,23 +2487,29 @@ void recStore(u32 sz) { MOV32ItoR(EDX,(int)&cpuRegs.GPR.r[ _Rt_ ].UL[ 0 ]); } - - - vtlb_DynGenWrite(sz); - /* - if (sz==8) - CALLFunc( (int)memWrite8 ); - else if (sz==16) - CALLFunc( (int)memWrite16 ); - else if (sz==32) - CALLFunc( (int)memWrite32 ); - else if (sz==64) - CALLFunc( (int)memWrite64 ); - else if (sz==128) - CALLFunc( (int)memWrite128 ); - */ + // Load ECX with the destination address, or issue a direct optimized write + // if the address is a constant propagation. 
+ + if( GPR_IS_CONST1( _Rs_ ) ) + { + u32 dstadr = g_cpuConstRegs[_Rs_].UL[0] + _Imm_; + if( sz == 128 ) dstadr &= ~0x0f; + vtlb_DynGenWrite_Const( sz, dstadr ); + } + else + { + MOV32MtoR( ECX, (int)&cpuRegs.GPR.r[ _Rs_ ].UL[ 0 ] ); + if ( _Imm_ != 0 ) + ADD32ItoR(ECX, _Imm_); + + if (sz==128) + AND32I8toR(ECX,0xF0); + + vtlb_DynGenWrite(sz); + } } + //////////////////////////////////////////////////// void recSB( void ) { diff --git a/pcsx2/x86/ix86-32/recVTLB.cpp b/pcsx2/x86/ix86-32/recVTLB.cpp index 8941772390..e7c0f3c047 100644 --- a/pcsx2/x86/ix86-32/recVTLB.cpp +++ b/pcsx2/x86/ix86-32/recVTLB.cpp @@ -27,6 +27,21 @@ using namespace vtlb_private; +// NOTICE: This function *destroys* EAX!! +// Moves 128 bits of memory from the source register ptr to the dest register ptr. +// (used as an equivalent to movaps, when a free XMM register is unavailable for some reason) +void MOV128_MtoM( x86IntRegType destRm, x86IntRegType srcRm ) +{ + MOV32RmtoR(EAX,srcRm); + MOV32RtoRm(destRm,EAX); + MOV32RmtoROffset(EAX,srcRm,4); + MOV32RtoRmOffset(destRm,EAX,4); + MOV32RmtoROffset(EAX,srcRm,8); + MOV32RtoRmOffset(destRm,EAX,8); + MOV32RmtoROffset(EAX,srcRm,12); + MOV32RtoRmOffset(destRm,EAX,12); +} + /* // Pseudo-Code For the following Dynarec Implementations --> @@ -78,18 +93,31 @@ using namespace vtlb_private; */ +////////////////////////////////////////////////////////////////////////////////////////// +// Dynarec Load Implementations -//ecx = addr -//edx = ptr -void vtlb_DynGenRead64(u32 bits) +static void _vtlb_DynGen_DirectRead( u32 bits, bool sign ) { - MOV32RtoR(EAX,ECX); - SHR32ItoR(EAX,VTLB_PAGE_BITS); - MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2); - ADD32RtoR(ECX,EAX); - u8* _fullread=JS8(0); - switch(bits) + switch( bits ) { + case 8: + if( sign ) + MOVSX32Rm8toR(EAX,ECX); + else + MOVZX32Rm8toR(EAX,ECX); + break; + + case 16: + if( sign ) + MOVSX32Rm16toR(EAX,ECX); + else + MOVZX32Rm16toR(EAX,ECX); + break; + + case 32: + MOV32RmtoR(EAX,ECX); + break; + case 64: if( _hasFreeMMXreg() ) { @@ -106,7 +134,7 @@ void vtlb_DynGenRead64(u32 bits) MOV32RmtoROffset(EAX,ECX,4); MOV32RtoRmOffset(EDX,EAX,4); } - break; + break; case 128: if( _hasFreeXMMreg() ) @@ -121,31 +149,25 @@ void vtlb_DynGenRead64(u32 bits) // Could put in an MMX optimization here as well, but no point really. // It's almost never used since there's almost always a free XMM reg. - MOV32RmtoR(EAX,ECX); - MOV32RtoRm(EDX,EAX); - - MOV32RmtoROffset(EAX,ECX,4); - MOV32RtoRmOffset(EDX,EAX,4); - - MOV32RmtoROffset(EAX,ECX,8); - MOV32RtoRmOffset(EDX,EAX,8); - - MOV32RmtoROffset(EAX,ECX,12); - MOV32RtoRmOffset(EDX,EAX,12); + MOV128_MtoM( EDX, ECX ); // dest <- src! 
} - break; + break; jNO_DEFAULT } +} - u8* cont=JMP8(0); - x86SetJ8(_fullread); +static void _vtlb_DynGen_IndirectRead( u32 bits ) +{ int szidx; - switch(bits) + switch( bits ) { - case 64: szidx=3; break; - case 128: szidx=4; break; + case 8: szidx=0; break; + case 16: szidx=1; break; + case 32: szidx=2; break; + case 64: szidx=3; break; + case 128: szidx=4; break; jNO_DEFAULT } @@ -155,12 +177,33 @@ void vtlb_DynGenRead64(u32 bits) MOV32RmSOffsettoR(EAX,EAX,(int)RWFT[szidx][0],2); SUB32ItoR(ECX,0x80000000); CALL32R(EAX); +} + +// Recompiled input registers: +// ecx = source addr to read from +// edx = ptr to dest to write to +void vtlb_DynGenRead64(u32 bits) +{ + jASSUME( bits == 64 || bits == 128 ); + + MOV32RtoR(EAX,ECX); + SHR32ItoR(EAX,VTLB_PAGE_BITS); + MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2); + ADD32RtoR(ECX,EAX); + u8* _fullread = JS8(0); + + _vtlb_DynGen_DirectRead( bits, false ); + u8* cont = JMP8(0); + + x86SetJ8(_fullread); + _vtlb_DynGen_IndirectRead( bits ); x86SetJ8(cont); } -// ecx - source address to read from -// Returns read value in eax. +// Recompiled input registers: +// ecx - source address to read from +// Returns read value in eax. void vtlb_DynGenRead32(u32 bits, bool sign) { jASSUME( bits <= 32 ); @@ -169,49 +212,13 @@ void vtlb_DynGenRead32(u32 bits, bool sign) SHR32ItoR(EAX,VTLB_PAGE_BITS); MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2); ADD32RtoR(ECX,EAX); - u8* _fullread=JS8(0); + u8* _fullread = JS8(0); - switch(bits) - { - case 8: - if( sign ) - MOVSX32Rm8toR(EAX,ECX); - else - MOVZX32Rm8toR(EAX,ECX); - break; + _vtlb_DynGen_DirectRead( bits, sign ); + u8* cont = JMP8(0); - case 16: - if( sign ) - MOVSX32Rm16toR(EAX,ECX); - else - MOVZX32Rm16toR(EAX,ECX); - break; - - case 32: - MOV32RmtoR(EAX,ECX); - break; - - jNO_DEFAULT - } - - u8* cont=JMP8(0); x86SetJ8(_fullread); - int szidx; - - switch(bits) - { - case 8: szidx=0; break; - case 16: szidx=1; break; - case 32: szidx=2; break; - jNO_DEFAULT - } - - MOVZX32R8toR(EAX,EAX); - SUB32RtoR(ECX,EAX); - //eax=[funct+eax] - MOV32RmSOffsettoR(EAX,EAX,(int)RWFT[szidx][0],2); - SUB32ItoR(ECX,0x80000000); - CALL32R(EAX); + _vtlb_DynGen_IndirectRead( bits ); // perform sign extension on the result: @@ -233,25 +240,83 @@ void vtlb_DynGenRead32(u32 bits, bool sign) x86SetJ8(cont); } -void vtlb_DynGenWrite(u32 sz) +void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const ) { - MOV32RtoR(EAX,ECX); - SHR32ItoR(EAX,VTLB_PAGE_BITS); - MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2); - ADD32RtoR(ECX,EAX); - u8* _full=JS8(0); - switch(sz) + jASSUME( bits == 64 || bits == 128 ); + + void* vmv_ptr = &vmap[addr_const>>VTLB_PAGE_BITS]; + + MOV32MtoR(EAX,(uptr)vmv_ptr); + MOV32ItoR(ECX,addr_const); + ADD32RtoR(ECX,EAX); // ecx=ppf + u8* _fullread = JS8(0); + + _vtlb_DynGen_DirectRead( bits, false ); + u8* cont = JMP8(0); + + x86SetJ8(_fullread); + _vtlb_DynGen_IndirectRead( bits ); + + x86SetJ8(cont); +} + +// Recompiled input registers: +// ecx - source address to read from +// Returns read value in eax. 
+void vtlb_DynGenRead32_Const( u32 bits, bool sign, u32 addr_const )
+{
+	jASSUME( bits <= 32 );
+
+	void* vmv_ptr = &vmap[addr_const>>VTLB_PAGE_BITS];
+
+	MOV32MtoR(EAX,(uptr)vmv_ptr);
+	MOV32ItoR(ECX,addr_const);
+	ADD32RtoR(ECX,EAX);	// ecx=ppf
+	u8* _fullread = JS8(0);
+
+	_vtlb_DynGen_DirectRead( bits, sign );
+	u8* cont = JMP8(0);
+
+	x86SetJ8(_fullread);
+	_vtlb_DynGen_IndirectRead( bits );
+
+	// perform sign extension on the result:
+
+	if( bits==8 )
+	{
+		if( sign )
+			MOVSX32R8toR(EAX,EAX);
+		else
+			MOVZX32R8toR(EAX,EAX);
+	}
+	else if( bits==16 )
+	{
+		if( sign )
+			MOVSX32R16toR(EAX,EAX);
+		else
+			MOVZX32R16toR(EAX,EAX);
+	}
+
+	x86SetJ8(cont);
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// Dynarec Store Implementations
+
+static void _vtlb_DynGen_DirectWrite( u32 bits )
+{
+	switch(bits)
 	{
 	//8 , 16, 32 : data on EDX
 	case 8:
 		MOV8RtoRm(ECX,EDX);
-		break;
+	break;
 
 	case 16:
 		MOV16RtoRm(ECX,EDX);
-		break;
+	break;
 
 	case 32:
 		MOV32RtoRm(ECX,EDX);
-		break;
+	break;
 
 	case 64:
 		if( _hasFreeMMXreg() )
@@ -269,7 +334,7 @@ void vtlb_DynGenWrite(u32 sz)
 			MOV32RmtoROffset(EAX,EDX,4);
 			MOV32RtoRmOffset(ECX,EAX,4);
 		}
-		break;
+	break;
 
 	case 128:
 		if( _hasFreeXMMreg() )
@@ -284,23 +349,16 @@ void vtlb_DynGenWrite(u32 sz)
 			// Could put in an MMX optimization here as well, but no point really.
 			// It's almost never used since there's almost always a free XMM reg.
 
-			MOV32RmtoR(EAX,EDX);
-			MOV32RtoRm(ECX,EAX);
-			MOV32RmtoROffset(EAX,EDX,4);
-			MOV32RtoRmOffset(ECX,EAX,4);
-			MOV32RmtoROffset(EAX,EDX,8);
-			MOV32RtoRmOffset(ECX,EAX,8);
-			MOV32RmtoROffset(EAX,EDX,12);
-			MOV32RtoRmOffset(ECX,EAX,12);
+			MOV128_MtoM( ECX, EDX );	// dest <- src!
 		}
-		break;
+	break;
 	}
-	u8* cont=JMP8(0);
-
-	x86SetJ8(_full);
+}
 
+static void _vtlb_DynGen_IndirectWrite( u32 bits )
+{
 	int szidx=0;
-	switch(sz)
+	switch( bits )
 	{
 	case 8:  szidx=0;	break;
 	case 16: szidx=1;	break;
@@ -314,6 +372,45 @@ void vtlb_DynGenWrite(u32 sz)
 	MOV32RmSOffsettoR(EAX,EAX,(int)RWFT[szidx][1],2);
 	SUB32ItoR(ECX,0x80000000);
 	CALL32R(EAX);
+}
+
+void vtlb_DynGenWrite(u32 sz)
+{
+	MOV32RtoR(EAX,ECX);
+	SHR32ItoR(EAX,VTLB_PAGE_BITS);
+	MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2);
+	ADD32RtoR(ECX,EAX);
+	u8* _full=JS8(0);
+
+	_vtlb_DynGen_DirectWrite( sz );
+	u8* cont = JMP8(0);
+
+	x86SetJ8(_full);
+	_vtlb_DynGen_IndirectWrite( sz );
+
+	x86SetJ8(cont);
+}
+
+
+// Generates code for a store instruction, where the address is a known constant.
+void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const )
+{
+	// Important: It's not technically safe to do a const lookup of the VTLB here, since
+	// the VTLB could feasibly be remapped by other recompiled code at any time.
+	// So we're limited in exactly how much we can pre-calculate.
+
+	void* vmv_ptr = &vmap[addr_const>>VTLB_PAGE_BITS];
+
+	MOV32MtoR(EAX,(uptr)vmv_ptr);
+	MOV32ItoR(ECX,addr_const);
+	ADD32RtoR(ECX,EAX);	// ecx=ppf
+	u8* _full = JS8(0);
+
+	_vtlb_DynGen_DirectWrite( bits );
+	u8* cont = JMP8(0);
+
+	x86SetJ8(_full);
+	_vtlb_DynGen_IndirectWrite( bits );
 
 	x86SetJ8(cont);
 }
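
Editor's note: the following is a minimal pseudo-C sketch (not part of the patch) of
the runtime logic that vtlb_DynGenRead32 and vtlb_DynGenRead32_Const both emit, for
readers who don't want to trace the raw emitter calls. The names sketch_Read32 and
handlers32 are hypothetical; the real handler table is RWFT in recVTLB.cpp, and the
real calling convention passes the address in ECX. A vmap entry either biases the
guest address into a direct host pointer (result non-negative) or flags an indirect
handler dispatch (sign bit set, hence the JS8 in the emitters).

	typedef u32 (*ReadHandler32)( u32 addr );
	extern s32 vmap[];                  // one entry per 4K page (VTLB_PAGE_BITS == 12)
	extern ReadHandler32 handlers32[];  // hypothetical stand-in for RWFT[2][0]

	u32 sketch_Read32( u32 addr )
	{
		// The _Const emitters fold this lookup into a single absolute load,
		// since 'addr' is known at recompile time:
		s32 vmv = vmap[addr >> 12];

		s32 ppf = addr + vmv;
		if( ppf >= 0 )
			return *(u32*)(uptr)ppf;    // direct path: ppf is a host pointer

		// Indirect path: the low byte of the vmap entry selects a handler,
		// and the remaining bits (minus the 0x80000000 bias) form the
		// address argument passed to it -- mirroring the MOVZX/SUB/CALL
		// sequence in _vtlb_DynGen_IndirectRead.
		u32 hnd = vmv & 0xff;
		return handlers32[hnd]( (u32)(ppf - hnd) - 0x80000000 );
	}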
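For reference, here is the address setup the const variants eliminate, quoted from the
emitters above. The generic forms must compute the vmap index at runtime:

	MOV32RtoR(EAX,ECX);                      // eax = addr
	SHR32ItoR(EAX,VTLB_PAGE_BITS);           // eax = page index
	MOV32RmSOffsettoR(EAX,EAX,(int)vmap,2);  // eax = vmap[index]  (scale *4)
	ADD32RtoR(ECX,EAX);                      // ecx = ppf

whereas the _Const forms replace the register copy and shift with a single
absolute-address load, since the page index is known at recompile time:

	MOV32MtoR(EAX,(uptr)&vmap[addr_const>>VTLB_PAGE_BITS]);
	MOV32ItoR(ECX,addr_const);
	ADD32RtoR(ECX,EAX);                      // ecx = ppf

Note that the vmap entry itself is still loaded at runtime in both cases: as the
comment in vtlb_DynGenWrite_Const explains, the VTLB can be remapped by other
recompiled code at any time, so only the table index, never the translation result,
can safely be pre-calculated.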