From 859d62d2a744dca763eb9560fb24062b59334126 Mon Sep 17 00:00:00 2001
From: Gregory Hainaut
Date: Fri, 4 Dec 2015 19:36:29 +0100
Subject: [PATCH] ee|iop: use xScopedStackFrame to handle dynarec frame

* Rework MVU a bit to support xScopedStackFrame. The stack frame can
  potentially be optimized further (saving 5 instructions)
* Remove the recompiler stack check; Address Sanitizer is more
  efficient anyway
---
 pcsx2/x86/iR3000A.cpp           | 165 ++------------------------------
 pcsx2/x86/ix86-32/iR5900-32.cpp | 105 ++------------------
 pcsx2/x86/microVU.cpp           |   6 +-
 pcsx2/x86/microVU_Execute.inl   | 163 ++++++++++++-------------------
 4 files changed, 74 insertions(+), 365 deletions(-)

diff --git a/pcsx2/x86/iR3000A.cpp b/pcsx2/x86/iR3000A.cpp
index 9bca128013..10a047642f 100644
--- a/pcsx2/x86/iR3000A.cpp
+++ b/pcsx2/x86/iR3000A.cpp
@@ -120,50 +120,6 @@ static void recEventTest()
 	_cpuEventTest_Shared();
 }
 
-// parameters:
-//   espORebp - 0 for ESP, or 1 for EBP.
-//   regval - current value of the register at the time the fault was detected (predates the
-//            stackframe setup code in this function)
-static void __fastcall StackFrameCheckFailed( int espORebp, int regval )
-{
-	pxFailDev( pxsFmt( L"(R3000A Recompiler Stackframe) Sanity check failed on %ls\n\tCurrent=%d; Saved=%d",
-		(espORebp==0) ? L"ESP" : L"EBP", regval, (espORebp==0) ? s_store_esp : s_store_ebp )
-	);
-
-	// Note: The recompiler will attempt to recover ESP and EBP after returning from this function,
-	// so typically selecting Continue/Ignore/Cancel for this assertion should allow PCSX2 to con-
-	// tinue to run with some degree of stability.
-}
-
-static void _DynGen_StackFrameCheck()
-{
-	if( !IsDevBuild ) return;
-
-	// --------- EBP Here -----------
-
-	xCMP( ebp, ptr[&s_store_ebp] );
-	xForwardJE8 skipassert_ebp;
-
-	xMOV( ecx, 1 );					// 1 specifies EBP
-	xMOV( edx, ebp );
-	xCALL( StackFrameCheckFailed );
-	xMOV( ebp, ptr[&s_store_ebp] );	// half-hearted frame recovery attempt!
-
-	skipassert_ebp.SetTarget();
-
-	// --------- ESP There -----------
-
-	xCMP( esp, ptr[&s_store_esp] );
-	xForwardJE8 skipassert_esp;
-
-	xXOR( ecx, ecx );				// 0 specifies ESP
-	xMOV( edx, esp );
-	xCALL( StackFrameCheckFailed );
-	xMOV( esp, ptr[&s_store_esp] );	// half-hearted frame recovery attempt!
-
-	skipassert_esp.SetTarget();
-}
-
 // The address for all cleared blocks.  It recompiles the current pc and then
 // dispatches to the recompiled block address.
 static DynGenFunc* _DynGen_JITCompile()
@@ -171,7 +127,6 @@ static DynGenFunc* _DynGen_JITCompile()
 	pxAssertMsg( iopDispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITComple.  Thanks." );
 
 	u8* retval = xGetPtr();
-	_DynGen_StackFrameCheck();
 
 	xMOV( ecx, ptr[&psxRegs.pc] );
 	xCALL( iopRecRecompile );
@@ -196,7 +151,6 @@ static DynGenFunc* _DynGen_JITCompileInBlock()
 static DynGenFunc* _DynGen_DispatcherReg()
 {
 	u8* retval = xGetPtr();
-	_DynGen_StackFrameCheck();
 
 	xMOV( eax, ptr[&psxRegs.pc] );
 	xMOV( ebx, eax );
@@ -210,128 +164,21 @@ static DynGenFunc* _DynGen_DispatcherReg()
 // --------------------------------------------------------------------------------------
 //  EnterRecompiledCode - dynamic compilation stub!
 // --------------------------------------------------------------------------------------
-
-// In Release Builds this literally generates the following code:
-//   push edi
-//   push esi
-//   push ebx
-//   jmp DispatcherReg
-//   pop ebx
-//   pop esi
-//   pop edi
-//
-// See notes on why this works in both GCC (aligned stack!) and other compilers (not-so-
-// aligned stack!).  In debug/dev builds the code gen is more complicated, as it constructs
-// ebp stackframe mess, which allows for a complete backtrace from debug breakpoints (yay).
-//
-// Also, if you set PCSX2_IOP_FORCED_ALIGN_STACK to 1, the codegen for MSVC becomes slightly
-// more complicated since it has to perform a full stack alignment on entry.
-//
-
-#if defined(__GNUG__) || defined(__DARWIN__)
-#	define PCSX2_ASSUME_ALIGNED_STACK 1
-#else
-#	define PCSX2_ASSUME_ALIGNED_STACK 0
-#endif
-
-// Set to 0 for a speedup in release builds.
-// [doesn't apply to GCC/Mac, which must always align]
-#define PCSX2_IOP_FORCED_ALIGN_STACK 0 //1
-
-// For overriding stackframe generation options in Debug builds (possibly useful for troubleshooting)
-// Typically this value should be the same as IsDevBuild.
-static const bool GenerateStackFrame = IsDevBuild;
-
 static DynGenFunc* _DynGen_EnterRecompiledCode()
 {
-	u8* retval = xGetPtr();
-
-	bool allocatedStack = GenerateStackFrame || PCSX2_IOP_FORCED_ALIGN_STACK;
-
 	// Optimization: The IOP never uses stack-based parameter invocation, so we can avoid
 	// allocating any room on the stack for it (which is important since the IOP's entry
 	// code gets invoked quite a lot).
 
-	if( allocatedStack )
-	{
-		xPUSH( ebp );
-		xMOV( ebp, esp );
-		xAND( esp, -0x10 );
+	u8* retval = xGetPtr();
 
-		xSUB( esp, 0x20 );
+	{ // Properly scope the frame prologue/epilogue
+		xScopedStackFrame frame(IsDevBuild);
 
-		xMOV( ptr[ebp-12], edi );
-		xMOV( ptr[ebp-8], esi );
-		xMOV( ptr[ebp-4], ebx );
-	}
-	else
-	{
-		// GCC Compiler:
-		// The frame pointer coming in from the EE's event test can be safely assumed to be
-		// aligned, since GCC always aligns stackframes.  While handy in x86-64, where CALL + PUSH EBP
-		// results in a neatly realigned stack on entry to every function, unfortunately in x86-32
-		// this is usually worthless because CALL+PUSH leaves us 8 byte aligned instead (fail).  So
-		// we have to do the usual set of stackframe alignments and simulated callstack mess
-		// *regardless*.
+		xJMP(iopDispatcherReg);
 
-		// MSVC/Intel compilers:
-		// The PCSX2_IOP_FORCED_ALIGN_STACK setting is 0, so we don't care.  Just push regs like
-		// the good old days!  (stack alignment will be indeterminate)
-
-		xPUSH( edi );
-		xPUSH( esi );
-		xPUSH( ebx );
-
-		allocatedStack = false;
-	}
-
-	uptr* imm = NULL;
-	if( allocatedStack )
-	{
-		if( GenerateStackFrame )
-		{
-			// Simulate a CALL function by pushing the call address and EBP onto the stack.
-			// This retains proper stacktrace and stack unwinding (handy in devbuilds!)
-
-			xMOV( ptr32[esp+0x0c], 0xffeeff );
-			imm = (uptr*)(xGetPtr()-4);
-
-			// This part simulates the "normal" stackframe prep of "push ebp, mov ebp, esp"
-			xMOV( ptr32[esp+0x08], ebp );
-			xLEA( ebp, ptr32[esp+0x08] );
-		}
-	}
-
-	if( IsDevBuild )
-	{
-		xMOV( ptr[&s_store_esp], esp );
-		xMOV( ptr[&s_store_ebp], ebp );
-	}
-
-	xJMP( iopDispatcherReg );
-	if( imm != NULL )
-		*imm = (uptr)xGetPtr();
-
-	// ----------------------
-	// ----> Cleanup!  ---->
-
-	iopExitRecompiledCode = (DynGenFunc*)xGetPtr();
-
-	if( allocatedStack )
-	{
-		// pop the nested "simulated call" stackframe, if needed:
-		if( GenerateStackFrame ) xLEAVE();
-		xMOV( edi, ptr[ebp-12] );
-		xMOV( esi, ptr[ebp-8] );
-		xMOV( ebx, ptr[ebp-4] );
-		xLEAVE();
-	}
-	else
-	{
-		xPOP( ebx );
-		xPOP( esi );
-		xPOP( edi );
+		// Save an exit point
+		iopExitRecompiledCode = (DynGenFunc*)xGetPtr();
 	}
 
 	xRET();
diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp
index 0a1da6de6b..a3bd76fd43 100644
--- a/pcsx2/x86/ix86-32/iR5900-32.cpp
+++ b/pcsx2/x86/ix86-32/iR5900-32.cpp
@@ -372,50 +372,6 @@ static void recEventTest()
 	_cpuEventTest_Shared();
 }
 
-// parameters:
-//   espORebp - 0 for ESP, or 1 for EBP.
-//   regval - current value of the register at the time the fault was detected (predates the
-//            stackframe setup code in this function)
-static void __fastcall StackFrameCheckFailed( int espORebp, int regval )
-{
-	pxFailDev( wxsFormat( L"(R5900 Recompiler Stackframe) Sanity check failed on %s\n\tCurrent=%d; Saved=%d",
-		(espORebp==0) ? L"ESP" : L"EBP", regval, (espORebp==0) ? s_store_esp : s_store_ebp )
-	);
-
-	// Note: The recompiler will attempt to recover ESP and EBP after returning from this function,
-	// so typically selecting Continue/Ignore/Cancel for this assertion should allow PCSX2 to con-
-	// tinue to run with some degree of stability.
-}
-
-static void _DynGen_StackFrameCheck()
-{
-	if( !EmuConfig.Cpu.Recompiler.StackFrameChecks ) return;
-
-	// --------- EBP Here -----------
-
-	xCMP( ebp, ptr[&s_store_ebp] );
-	xForwardJE8 skipassert_ebp;
-
-	xMOV( ecx, 1 );					// 1 specifies EBP
-	xMOV( edx, ebp );
-	xCALL( StackFrameCheckFailed );
-	xMOV( ebp, ptr[&s_store_ebp] );	// half-hearted frame recovery attempt!
-
-	skipassert_ebp.SetTarget();
-
-	// --------- ESP There -----------
-
-	xCMP( esp, ptr[&s_store_esp] );
-	xForwardJE8 skipassert_esp;
-
-	xXOR( ecx, ecx );				// 0 specifies ESP
-	xMOV( edx, esp );
-	xCALL( StackFrameCheckFailed );
-	xMOV( esp, ptr[&s_store_esp] );	// half-hearted frame recovery attempt!
-
-	skipassert_esp.SetTarget();
-}
-
 // The address for all cleared blocks.  It recompiles the current pc and then
 // dispatches to the recompiled block address.
 static DynGenFunc* _DynGen_JITCompile()
@@ -423,7 +379,6 @@ static DynGenFunc* _DynGen_JITCompile()
 	pxAssertMsg( DispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITComple.  Thanks." );
 
 	u8* retval = xGetAlignedCallTarget();
-	_DynGen_StackFrameCheck();
 
 	xMOV( ecx, ptr[&cpuRegs.pc] );
 	xCALL( recRecompile );
@@ -448,7 +403,6 @@ static DynGenFunc* _DynGen_JITCompileInBlock()
 static DynGenFunc* _DynGen_DispatcherReg()
 {
 	u8* retval = xGetPtr();		// fallthrough target, can't align it!
-	_DynGen_StackFrameCheck();
 
 	xMOV( eax, ptr[&cpuRegs.pc] );
 	xMOV( ebx, eax );
@@ -471,63 +425,18 @@ static DynGenFunc* _DynGen_DispatcherEvent()
 static DynGenFunc* _DynGen_EnterRecompiledCode()
 {
 	pxAssertDev( DispatcherReg != NULL, "Dynamically generated dispatchers are required prior to generating EnterRecompiledCode!" );
-
+
 	u8* retval = xGetAlignedCallTarget();
 
-	// "standard" frame pointer setup for aligned stack: Record the original
-	//   esp into ebp, and then align esp.  ebp references the original esp base
-	//   for the duration of our function, and is used to restore the original
-	//   esp before returning from the function
+	{ // Properly scope the frame prologue/epilogue
+		xScopedStackFrame frame(IsDevBuild);
 
-	xPUSH( ebp );
-	xMOV( ebp, esp );
-	xAND( esp, -0x10 );
+		xJMP(DispatcherReg);
 
-	// First 0x10 is for esi, edi, etc. Second 0x10 is for the return address and ebp.  The
-	// third 0x10 is an optimization for C-style CDECL calls we might make from the recompiler
-	// (parameters for those calls can be stored there!)  [currently no cdecl functions are
-	// used -- we do everything through __fastcall)
-
-	static const int cdecl_reserve = 0x00;
-	xSUB( esp, 0x20 + cdecl_reserve );
-
-	xMOV( ptr[ebp-12], edi );
-	xMOV( ptr[ebp-8], esi );
-	xMOV( ptr[ebp-4], ebx );
-
-	// Simulate a CALL function by pushing the call address and EBP onto the stack.
-	// (the dummy address here is filled in later right before we generate the LEAVE code)
-	xMOV( ptr32[esp+0x0c+cdecl_reserve], 0xdeadbeef );
-	uptr& imm = *(uptr*)(xGetPtr()-4);
-
-	// This part simulates the "normal" stackframe prep of "push ebp, mov ebp, esp"
-	// It is done here because we can't really generate that stuff from the Dispatchers themselves.
-	xMOV( ptr32[esp+0x08+cdecl_reserve], ebp );
-	xLEA( ebp, ptr32[esp+0x08+cdecl_reserve] );
-
-	if (EmuConfig.Cpu.Recompiler.StackFrameChecks) {
-		xMOV( ptr[&s_store_esp], esp );
-		xMOV( ptr[&s_store_ebp], ebp );
+		// Save an exit point
+		ExitRecompiledCode = (DynGenFunc*)xGetPtr();
 	}
 
-	xJMP( DispatcherReg );
-
-	xAlignCallTarget();
-
-	// This dummy CALL is unreachable code that some debuggers (MSVC2008) need in order to
-	// unwind the stack properly.  This is effectively the call that we simulate above.
-	if( IsDevBuild ) xCALL( DispatcherReg );
-
-	imm = (uptr)xGetPtr();
-	ExitRecompiledCode = (DynGenFunc*)xGetPtr();
-
-	xLEAVE();
-
-	xMOV( edi, ptr[ebp-12] );
-	xMOV( esi, ptr[ebp-8] );
-	xMOV( ebx, ptr[ebp-4] );
-
-	xLEAVE();
 	xRET();
 
 	return (DynGenFunc*)retval;
@@ -1149,8 +1058,6 @@ static u32 scaleblockcycles()
 // setting "g_branch = 2";
 static void iBranchTest(u32 newpc)
 {
-	_DynGen_StackFrameCheck();
-
 	// Check the Event scheduler if our "cycle target" has been reached.
 	// Equiv code to:
 	//  cpuRegs.cycle += blockcycles;
diff --git a/pcsx2/x86/microVU.cpp b/pcsx2/x86/microVU.cpp
index f9e43bbdf2..a4378196fc 100644
--- a/pcsx2/x86/microVU.cpp
+++ b/pcsx2/x86/microVU.cpp
@@ -80,10 +80,8 @@ void mVUreset(microVU& mVU, bool resetReserve) {
 	else          Perf::any.map((uptr)&mVU.dispCache, mVUdispCacheSize, "mVU0 Dispatcher");
 
 	x86SetPtr(mVU.dispCache);
-	mVUdispatcherA(mVU);
-	mVUdispatcherB(mVU);
-	mVUdispatcherC(mVU);
-	mVUdispatcherD(mVU);
+	mVUdispatcherAB(mVU);
+	mVUdispatcherCD(mVU);
 	mVUemitSearch();
 
 	// Clear All Program Data
diff --git a/pcsx2/x86/microVU_Execute.inl b/pcsx2/x86/microVU_Execute.inl
index d7a910e848..0aaaeef15d 100644
--- a/pcsx2/x86/microVU_Execute.inl
+++ b/pcsx2/x86/microVU_Execute.inl
@@ -19,139 +19,96 @@
 // Dispatcher Functions
 //------------------------------------------------------------------
 
-// Generates the code for entering recompiled blocks
-void mVUdispatcherA(mV) {
+// Generates the code for entering/exiting recompiled blocks
+void mVUdispatcherAB(mV) {
 	mVU.startFunct = x86Ptr;
 
-	// Backup cpu state
-	xPUSH(ebp);
-	xPUSH(ebx);
-	xPUSH(esi);
-	xPUSH(edi);
+	{
+		xScopedStackFrame frame(false, true);
 
-	// Align the stackframe (GCC only, since GCC assumes stackframe is always aligned)
-	#ifdef __GNUC__
-	xSUB(esp, 12);
-	#endif
+		// __fastcall = The caller has already put the needed parameters in ecx/edx:
+		if (!isVU1) { xCALL(mVUexecuteVU0); }
+		else        { xCALL(mVUexecuteVU1); }
 
-	// __fastcall = The caller has already put the needed parameters in ecx/edx:
-	if (!isVU1) { xCALL(mVUexecuteVU0); }
-	else        { xCALL(mVUexecuteVU1); }
+		// Load VU's MXCSR state
+		xLDMXCSR(g_sseVUMXCSR);
 
-	// Load VU's MXCSR state
-	xLDMXCSR(g_sseVUMXCSR);
+		// Load Regs
+		xMOV(gprF0, ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL]);
+		xMOV(gprF1, gprF0);
+		xMOV(gprF2, gprF0);
+		xMOV(gprF3, gprF0);
 
-	// Load Regs
-	xMOV(gprF0, ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL]);
-	xMOV(gprF1, gprF0);
-	xMOV(gprF2, gprF0);
-	xMOV(gprF3, gprF0);
+		xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_MAC_FLAG].UL]);
+		xSHUF.PS(xmmT1, xmmT1, 0);
+		xMOVAPS (ptr128[mVU.macFlag], xmmT1);
 
-	xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_MAC_FLAG].UL]);
-	xSHUF.PS(xmmT1, xmmT1, 0);
-	xMOVAPS (ptr128[mVU.macFlag], xmmT1);
+		xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_CLIP_FLAG].UL]);
+		xSHUF.PS(xmmT1, xmmT1, 0);
+		xMOVAPS (ptr128[mVU.clipFlag], xmmT1);
 
-	xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_CLIP_FLAG].UL]);
-	xSHUF.PS(xmmT1, xmmT1, 0);
-	xMOVAPS (ptr128[mVU.clipFlag], xmmT1);
+		xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_P].UL]);
+		xMOVAPS (xmmPQ, ptr128[&mVU.regs().VI[REG_Q].UL]);
+		xSHUF.PS(xmmPQ, xmmT1, 0); // wzyx = PPQQ
 
-	xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_P].UL]);
-	xMOVAPS (xmmPQ, ptr128[&mVU.regs().VI[REG_Q].UL]);
-	xSHUF.PS(xmmPQ, xmmT1, 0); // wzyx = PPQQ
+		// Jump to Recompiled Code Block
+		xJMP(eax);
 
-	// Jump to Recompiled Code Block
-	xJMP(eax);
-	pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
-		"microVU: Dispatcher generation exceeded reserved cache area!");
-}
+		mVU.exitFunct = x86Ptr;
 
-// Generates the code to exit from recompiled blocks
-void mVUdispatcherB(mV) {
-	mVU.exitFunct = x86Ptr;
+		// Load EE's MXCSR state
+		xLDMXCSR(g_sseMXCSR);
 
-	// Load EE's MXCSR state
-	xLDMXCSR(g_sseMXCSR);
-
-	// __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers;
-	//              all other arguments are passed right to left.
-	if (!isVU1) { xCALL(mVUcleanUpVU0); }
-	else        { xCALL(mVUcleanUpVU1); }
-
-	// Unalign the stackframe:
-	#ifdef __GNUC__
-	xADD( esp, 12 );
-	#endif
-
-	// Restore cpu state
-	xPOP(edi);
-	xPOP(esi);
-	xPOP(ebx);
-	xPOP(ebp);
+		// __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers;
+		//              all other arguments are passed right to left.
+		if (!isVU1) { xCALL(mVUcleanUpVU0); }
+		else        { xCALL(mVUcleanUpVU1); }
+	}
 
 	xRET();
+
 	pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
-		"microVU: Dispatcher generation exceeded reserved cache area!");
+	            "microVU: Dispatcher generation exceeded reserved cache area!");
 }
 
-// Generates the code for resuming xgkick
-void mVUdispatcherC(mV) {
+// Generates the code for resuming/exiting xgkick
+void mVUdispatcherCD(mV) {
 	mVU.startFunctXG = x86Ptr;
 
-	// Backup cpu state
-	xPUSH(ebp);
-	xPUSH(ebx);
-	xPUSH(esi);
-	xPUSH(edi);
+	{
+		xScopedStackFrame frame(false, true);
 
-	// Align the stackframe (GCC only, since GCC assumes stackframe is always aligned)
-	#ifdef __GNUC__
-	xSUB(esp, 12);
-	#endif
+		// Load VU's MXCSR state
+		xLDMXCSR(g_sseVUMXCSR);
 
-	// Load VU's MXCSR state
-	xLDMXCSR(g_sseVUMXCSR);
+		mVUrestoreRegs(mVU);
 
-	mVUrestoreRegs(mVU);
+		xMOV(gprF0, ptr32[&mVU.statFlag[0]]);
+		xMOV(gprF1, ptr32[&mVU.statFlag[1]]);
+		xMOV(gprF2, ptr32[&mVU.statFlag[2]]);
+		xMOV(gprF3, ptr32[&mVU.statFlag[3]]);
 
-	xMOV(gprF0, ptr32[&mVU.statFlag[0]]);
-	xMOV(gprF1, ptr32[&mVU.statFlag[1]]);
-	xMOV(gprF2, ptr32[&mVU.statFlag[2]]);
-	xMOV(gprF3, ptr32[&mVU.statFlag[3]]);
+		// Jump to Recompiled Code Block
+		xJMP(ptr32[&mVU.resumePtrXG]);
 
-	// Jump to Recompiled Code Block
-	xJMP(ptr32[&mVU.resumePtrXG]);
-	pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
-		"microVU: Dispatcher generation exceeded reserved cache area!");
-}
+		mVU.exitFunctXG = x86Ptr;
 
-// Generates the code to exit from xgkick
-void mVUdispatcherD(mV) {
-	mVU.exitFunctXG = x86Ptr;
+		//xPOP(gprT1); // Pop return address
+		//xMOV(ptr32[&mVU.resumePtrXG], gprT1);
 
-	//xPOP(gprT1); // Pop return address
-	//xMOV(ptr32[&mVU.resumePtrXG], gprT1);
+		// Backup Status Flag (other regs were backed up on xgkick)
+		xMOV(ptr32[&mVU.statFlag[0]], gprF0);
+		xMOV(ptr32[&mVU.statFlag[1]], gprF1);
+		xMOV(ptr32[&mVU.statFlag[2]], gprF2);
+		xMOV(ptr32[&mVU.statFlag[3]], gprF3);
 
-	// Backup Status Flag (other regs were backed up on xgkick)
-	xMOV(ptr32[&mVU.statFlag[0]], gprF0);
-	xMOV(ptr32[&mVU.statFlag[1]], gprF1);
-	xMOV(ptr32[&mVU.statFlag[2]], gprF2);
-	xMOV(ptr32[&mVU.statFlag[3]], gprF3);
+		// Load EE's MXCSR state
+		xLDMXCSR(g_sseMXCSR);
 
-	// Load EE's MXCSR state
-	xLDMXCSR(g_sseMXCSR);
-
-	// Unalign the stackframe:
-	#ifdef __GNUC__
-	xADD( esp, 12 );
-	#endif
-
-	// Restore cpu state
-	xPOP(edi);
-	xPOP(esi);
-	xPOP(ebx);
-	xPOP(ebp);
+	}
 
 	xRET();
+
 	pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize),
 		"microVU: Dispatcher generation exceeded reserved cache area!");
 }
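
Note on the pattern used throughout this patch: xScopedStackFrame is an RAII
helper for the code emitter. Its constructor emits the frame prologue (saving
the x86-32 callee-saved registers, optionally with a full EBP frame so dev
builds keep usable backtraces), and its destructor emits the matching epilogue
at the point where the C++ scope closes. That is why each rewritten function
wraps the generation in a bare { ... } block and records the exit pointer
(iopExitRecompiledCode, ExitRecompiledCode, mVU.exitFunct) just before the
closing brace: the epilogue lands in the instruction stream exactly at the
saved exit point, so dispatchers that jump there unwind the frame correctly.

The standalone sketch below models the idiom with a toy text emitter. It is
illustrative only: emit(), label(), and the single fullFrame flag are
assumptions made up for this example, not PCSX2's actual x86emitter API (the
real constructor flags appear above as frame(IsDevBuild) and
frame(false, true)).

	// scoped_frame_sketch.cpp -- minimal model of the xScopedStackFrame idiom.
	// Build: g++ scoped_frame_sketch.cpp -o sketch && ./sketch
	#include <cstdio>

	// Stand-in for the x86emitter: "emits" instructions by printing them.
	static void emit(const char* insn)  { std::printf("\t%s\n", insn); }
	static void label(const char* name) { std::printf("%s:\n", name); }

	class ScopedStackFrame {
		bool m_fullFrame; // assumed flag: also build an EBP frame (dev builds)
	public:
		explicit ScopedStackFrame(bool fullFrame) : m_fullFrame(fullFrame) {
			// Constructor: emit the prologue into the code stream.
			if (m_fullFrame) {
				emit("push ebp");
				emit("mov  ebp, esp");
			}
			emit("push ebx"); // callee-saved registers in the x86-32 ABI
			emit("push esi");
			emit("push edi");
		}
		~ScopedStackFrame() {
			// Destructor: emit the epilogue exactly where the C++ scope closes.
			emit("pop  edi"); // restore in reverse order
			emit("pop  esi");
			emit("pop  ebx");
			if (m_fullFrame)
				emit("pop  ebp");
		}
	};

	int main() {
		label("EnterRecompiledCode");
		{ // Mirrors the structure of the patched _DynGen_EnterRecompiledCode().
			ScopedStackFrame frame(true);
			emit("jmp  DispatcherReg"); // body: hand control to the dispatchers
			// The exit point is recorded *inside* the scope, so the epilogue
			// emitted by the destructor is the code the dispatchers return to.
			label("ExitRecompiledCode");
		} // ~ScopedStackFrame() runs here, emitting the epilogue
		emit("ret");
		return 0;
	}

With fullFrame disabled, the printed stream degenerates to essentially the
push/jmp/pop/ret shape that the deleted hand-rolled code generated for release
builds, which is why the patch is mostly a large deletion with no intended
change to the generated stubs.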