From 6e66bea1522f1bb88eeb22944b75e975a996aa35 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Thu, 3 Dec 2015 20:15:52 +0100 Subject: [PATCH 1/3] x86emitter: improve the various ABI wrappers v2: * Add various options to handle the different frames * Basic x86-64 port (untested so likely completely broken) v3: Create a templated xImpl_FastCall object v4: clang compilation fix --- common/include/x86emitter/implement/jmpcall.h | 129 +++++++++++++++ common/include/x86emitter/instructions.h | 15 +- common/src/x86emitter/jmp.cpp | 2 + common/src/x86emitter/x86emitter.cpp | 156 ++++++++---------- 4 files changed, 204 insertions(+), 98 deletions(-) diff --git a/common/include/x86emitter/implement/jmpcall.h b/common/include/x86emitter/implement/jmpcall.h index 7f8ef72744..426d4600dc 100644 --- a/common/include/x86emitter/implement/jmpcall.h +++ b/common/include/x86emitter/implement/jmpcall.h @@ -68,5 +68,134 @@ struct xImpl_JmpCall } }; +// Yes, it is awful: because the template code lives in a header, we get a nice circular dependency. +extern const xImpl_Mov xMOV; +extern const xImpl_JmpCall xCALL; + +struct xImpl_FastCall +{ + // FIXME: the current 64-bit code is mostly a copy/paste; potentially it would require pushing/popping + // some registers. But I think it is enough to handle the first call. + + + // Type unsafety is nice +#ifdef __x86_64__ + +#define XFASTCALL \ + xCALL(func); + +#define XFASTCALL1 \ + xMOV(rdi, a1); \ + xCALL(func); + +#define XFASTCALL2 \ + xMOV(rdi, a1); \ + xMOV(rsi, a2); \ + xCALL(func); + +#else + +#define XFASTCALL \ + xCALL(func); + +#define XFASTCALL1 \ + xMOV(ecx, a1); \ + xCALL(func); + +#define XFASTCALL2 \ + xMOV(ecx, a1); \ + xMOV(edx, a2); \ + xCALL(func); + +#endif + + template< typename T > __fi __always_inline_tmpl_fail + void operator()( T* func, const xRegister32& a1 = xEmptyReg, const xRegister32& a2 = xEmptyReg) const + { +#ifdef __x86_64__ + if (a1.IsEmpty()) { + XFASTCALL; + } else if (a2.IsEmpty()) { + XFASTCALL1; + } else { + XFASTCALL2; + } +#else + if (a1.IsEmpty()) { + XFASTCALL; + } else if (a2.IsEmpty()) { + XFASTCALL1; + } else { + XFASTCALL2; + } +#endif + } + + template< typename T > __fi __always_inline_tmpl_fail + void operator()( T* func, u32 a1, const xRegister32& a2) const + { +#ifdef __x86_64__ + XFASTCALL2; +#else + XFASTCALL2; +#endif + } + + template< typename T > __fi __always_inline_tmpl_fail + void operator()( T* func, const xIndirectVoid& a1) const + { +#ifdef __x86_64__ + XFASTCALL1; +#else + XFASTCALL1; +#endif + } + + template< typename T > __fi __always_inline_tmpl_fail + void operator()( T* func, u32 a1, u32 a2) const + { +#ifdef __x86_64__ + XFASTCALL2; +#else + XFASTCALL2; +#endif + } + + template< typename T > __fi __always_inline_tmpl_fail + void operator()( T* func, u32 a1) const + { +#ifdef __x86_64__ + XFASTCALL1; +#else + XFASTCALL1; +#endif + } + + void operator()(const xIndirect32& func, const xRegister32& a1 = xEmptyReg, const xRegister32& a2 = xEmptyReg) const + { +#ifdef __x86_64__ + if (a1.IsEmpty()) { + XFASTCALL; + } else if (a2.IsEmpty()) { + XFASTCALL1; + } else { + XFASTCALL2; + } +#else + if (a1.IsEmpty()) { + XFASTCALL; + } else if (a2.IsEmpty()) { + XFASTCALL1; + } else { + XFASTCALL2; + } +#endif + } + +#undef XFASTCALL +#undef XFASTCALL1 +#undef XFASTCALL2 +}; + } // End namespace x86Emitter diff --git a/common/include/x86emitter/instructions.h b/common/include/x86emitter/instructions.h index 8d80012ae9..9441204624 100644 --- a/common/include/x86emitter/instructions.h +++ b/common/include/x86emitter/instructions.h @@ -93,6 
+93,7 @@ namespace x86Emitter #else extern const xImpl_JmpCall xCALL; #endif + extern const xImpl_FastCall xFastCall; // ------------------------------------------------------------------------ extern const xImpl_CMov @@ -183,19 +184,15 @@ namespace x86Emitter extern void xINTO(); ////////////////////////////////////////////////////////////////////////////////////////// - // Helper function to handle the various functions ABI - extern void xFastCall(void* func, const xRegister32& a1 = xEmptyReg, const xRegister32& a2 = xEmptyReg); - extern void xFastCall(void* func, const xRegisterSSE& a1, const xRegisterSSE& a2); - extern void xFastCall(void* func, u32 a1, u32 a2); - extern void xFastCall(void* func, u32 a1); - - extern void xStdCall(void* func, u32 a1); - + // Helper object to handle the various functions ABI class xScopedStackFrame { bool m_base_frame; + bool m_save_base_pointer; + int m_offset; - xScopedStackFrame(bool base_frame); + public: + xScopedStackFrame(bool base_frame, bool save_base_pointer = false, int offset = 0); ~xScopedStackFrame(); }; diff --git a/common/src/x86emitter/jmp.cpp b/common/src/x86emitter/jmp.cpp index b2b89ef2c2..c56f0b1abc 100644 --- a/common/src/x86emitter/jmp.cpp +++ b/common/src/x86emitter/jmp.cpp @@ -42,6 +42,8 @@ void xImpl_JmpCall::operator()( const xIndirect16& src ) const { xOpWrite( 0x6 const xImpl_JmpCall xJMP = { true }; const xImpl_JmpCall xCALL = { false }; +const xImpl_FastCall xFastCall = { }; + void xSmartJump::SetTarget() { u8* target = xGetPtr(); diff --git a/common/src/x86emitter/x86emitter.cpp b/common/src/x86emitter/x86emitter.cpp index a1a476e365..8a362ccc96 100644 --- a/common/src/x86emitter/x86emitter.cpp +++ b/common/src/x86emitter/x86emitter.cpp @@ -1022,123 +1022,99 @@ __emitinline void xRestoreReg( const xRegisterSSE& dest ) xMOVDQA( dest, ptr[&xmm_data[dest.Id*2]] ); } -////////////////////////////////////////////////////////////////////////////////////////// -// Helper function to handle the various functions ABI - -__emitinline void xFastCall(void *func, const xRegister32& a1, const xRegister32& a2) -{ -#ifdef __x86_64__ - // NEW ABI - pxAssert(0); -#else - if (!a1.IsEmpty()) - xMOV(ecx, a1); - - if (!a2.IsEmpty()) - xMOV(edx, a2); - - xCALL(func); -#endif -} - -__emitinline void xFastCall(void *func, const xRegisterSSE& a1, const xRegisterSSE& a2) -{ -#ifdef __x86_64__ - // NEW ABI - pxAssert(0); -#else - xMOVD(ecx, a1); - xMOVD(edx, a2); - - xCALL(func); -#endif -} - -__emitinline void xFastCall(void *func, u32 a1, u32 a2) -{ -#ifdef __x86_64__ - // NEW ABI - pxAssert(0); -#else - xMOV(ecx, a1); - xMOV(edx, a2); - - xCALL(func); -#endif -} - -__emitinline void xFastCall(void *func, u32 a1) -{ -#ifdef __x86_64__ - // NEW ABI - pxAssert(0); -#else - xMOV(ecx, a1); - - xCALL(func); -#endif -} - -__emitinline void xStdCall(void *func, u32 a1) -{ -#ifdef __x86_64__ - // NEW ABI - pxAssert(0); -#else - // GCC note: unlike C call, GCC doesn't requires - // strict 16B alignment on std call - xPUSH(a1); - xCALL(func); -#endif -} - ////////////////////////////////////////////////////////////////////////////////////////// // Helper object to handle ABI frame +#ifdef __GNUC__ -xScopedStackFrame::xScopedStackFrame(bool base_frame) +#ifdef __x86_64__ +// GCC ensures/requires stack to be 16 bytes aligned (but when?) +#define ALIGN_STACK(v) xADD(rsp, v) +#else +// GCC ensures/requires stack to be 16 bytes aligned before the call +// Call will store 4 bytes. EDI/ESI/EBX will take another 12 bytes. 
+// EBP will take 4 bytes if m_base_frame is enabled +#define ALIGN_STACK(v) xADD(esp, v) +#endif + +#else + +#define ALIGN_STACK(v) + +#endif + +xScopedStackFrame::xScopedStackFrame(bool base_frame, bool save_base_pointer, int offset) { m_base_frame = base_frame; + m_save_base_pointer = save_base_pointer; + m_offset = offset; #ifdef __x86_64__ - // NEW ABI - pxAssert(0); + + m_offset += 8; // Call stores the return address (8 bytes on x86-64) + + // Note: rbp handling can surely be optimized in 64-bit mode + if (m_base_frame) { + xPUSH( rbp ); + xMOV( rbp, rsp ); + m_offset += 8; + } else if (m_save_base_pointer) { + xPUSH( rbp ); + m_offset += 8; + } + + xPUSH( rbx ); + xPUSH( r12 ); + xPUSH( r13 ); + xPUSH( r14 ); + xPUSH( r15 ); + m_offset += 40; + #else + m_offset += 4; // Call stores the return address (4 bytes) + // Create a new frame if (m_base_frame) { xPUSH( ebp ); xMOV( ebp, esp ); + m_offset += 4; + } else if (m_save_base_pointer) { + xPUSH( ebp ); + m_offset += 4; } // Save the register context xPUSH( edi ); xPUSH( esi ); xPUSH( ebx ); - -#ifdef __GNUC__ - // Realign the stack to 16 byte - if (m_base_frame) { - xSUB( esp, 12); - } -#endif + m_offset += 12; #endif + + ALIGN_STACK(-(16 - m_offset % 16)); } xScopedStackFrame::~xScopedStackFrame() { -#ifdef __x86_64__ - // NEW ABI - pxAssert(0); -#else + ALIGN_STACK(16 - m_offset % 16); -#ifdef __GNUC__ - // Restore the stack (due to the above alignment) - // Potentially it can be restored from ebp +#ifdef __x86_64__ + + // Restore the register context + xPOP( r15 ); + xPOP( r14 ); + xPOP( r13 ); + xPOP( r12 ); + xPOP( rbx ); + + // Destroy the frame if (m_base_frame) { - xADD( esp, 12); + xLEAVE(); + } else if (m_save_base_pointer) { + xPOP( rbp ); } -#endif + +#else // Restore the register context xPOP( ebx ); @@ -1148,6 +1124,8 @@ xScopedStackFrame::~xScopedStackFrame() // Destroy the frame if (m_base_frame) { xLEAVE(); + } else if (m_save_base_pointer) { + xPOP( ebp ); } #endif From 859d62d2a744dca763eb9560fb24062b59334126 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Fri, 4 Dec 2015 19:36:29 +0100 Subject: [PATCH 2/3] ee|iop: use xScopedStackFrame to handle the dynarec frames * Rework microVU a bit to support xScopedStackFrame. Potentially the stack frame can be optimized further (saving 5 instructions) * I removed the recompiler stack check. Address sanitizer is more efficient anyway --- pcsx2/x86/iR3000A.cpp | 165 ++------------------------------ pcsx2/x86/ix86-32/iR5900-32.cpp | 105 ++------------------ pcsx2/x86/microVU.cpp | 6 +- pcsx2/x86/microVU_Execute.inl | 163 ++++++++++++------------------- 4 files changed, 74 insertions(+), 365 deletions(-) diff --git a/pcsx2/x86/iR3000A.cpp b/pcsx2/x86/iR3000A.cpp index 9bca128013..10a047642f 100644 --- a/pcsx2/x86/iR3000A.cpp +++ b/pcsx2/x86/iR3000A.cpp @@ -120,50 +120,6 @@ static void recEventTest() _cpuEventTest_Shared(); } -// parameters: -// espORebp - 0 for ESP, or 1 for EBP. -// regval - current value of the register at the time the fault was detected (predates the -// stackframe setup code in this function) -static void __fastcall StackFrameCheckFailed( int espORebp, int regval ) -{ - pxFailDev( pxsFmt( L"(R3000A Recompiler Stackframe) Sanity check failed on %ls\n\tCurrent=%d; Saved=%d", - (espORebp==0) ? 
s_store_esp : s_store_ebp ) - ); - - // Note: The recompiler will attempt to recover ESP and EBP after returning from this function, - // so typically selecting Continue/Ignore/Cancel for this assertion should allow PCSX2 to con- - // tinue to run with some degree of stability. -} - -static void _DynGen_StackFrameCheck() -{ - if( !IsDevBuild ) return; - - // --------- EBP Here ----------- - - xCMP( ebp, ptr[&s_store_ebp] ); - xForwardJE8 skipassert_ebp; - - xMOV( ecx, 1 ); // 1 specifies EBP - xMOV( edx, ebp ); - xCALL( StackFrameCheckFailed ); - xMOV( ebp, ptr[&s_store_ebp] ); // half-hearted frame recovery attempt! - - skipassert_ebp.SetTarget(); - - // --------- ESP There ----------- - - xCMP( esp, ptr[&s_store_esp] ); - xForwardJE8 skipassert_esp; - - xXOR( ecx, ecx ); // 0 specifies ESP - xMOV( edx, esp ); - xCALL( StackFrameCheckFailed ); - xMOV( esp, ptr[&s_store_esp] ); // half-hearted frame recovery attempt! - - skipassert_esp.SetTarget(); -} - // The address for all cleared blocks. It recompiles the current pc and then // dispatches to the recompiled block address. static DynGenFunc* _DynGen_JITCompile() @@ -171,7 +127,6 @@ static DynGenFunc* _DynGen_JITCompile() pxAssertMsg( iopDispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITComple. Thanks." ); u8* retval = xGetPtr(); - _DynGen_StackFrameCheck(); xMOV( ecx, ptr[&psxRegs.pc] ); xCALL( iopRecRecompile ); @@ -196,7 +151,6 @@ static DynGenFunc* _DynGen_JITCompileInBlock() static DynGenFunc* _DynGen_DispatcherReg() { u8* retval = xGetPtr(); - _DynGen_StackFrameCheck(); xMOV( eax, ptr[&psxRegs.pc] ); xMOV( ebx, eax ); @@ -210,128 +164,21 @@ static DynGenFunc* _DynGen_DispatcherReg() // -------------------------------------------------------------------------------------- // EnterRecompiledCode - dynamic compilation stub! // -------------------------------------------------------------------------------------- - -// In Release Builds this literally generates the following code: -// push edi -// push esi -// push ebx -// jmp DispatcherReg -// pop ebx -// pop esi -// pop edi -// -// See notes on why this works in both GCC (aligned stack!) and other compilers (not-so- -// aligned stack!). In debug/dev builds the code gen is more complicated, as it constructs -// ebp stackframe mess, which allows for a complete backtrace from debug breakpoints (yay). -// -// Also, if you set PCSX2_IOP_FORCED_ALIGN_STACK to 1, the codegen for MSVC becomes slightly -// more complicated since it has to perform a full stack alignment on entry. -// - -#if defined(__GNUG__) || defined(__DARWIN__) -# define PCSX2_ASSUME_ALIGNED_STACK 1 -#else -# define PCSX2_ASSUME_ALIGNED_STACK 0 -#endif - -// Set to 0 for a speedup in release builds. -// [doesn't apply to GCC/Mac, which must always align] -#define PCSX2_IOP_FORCED_ALIGN_STACK 0 //1 - - -// For overriding stackframe generation options in Debug builds (possibly useful for troubleshooting) -// Typically this value should be the same as IsDevBuild. -static const bool GenerateStackFrame = IsDevBuild; - static DynGenFunc* _DynGen_EnterRecompiledCode() { - u8* retval = xGetPtr(); - - bool allocatedStack = GenerateStackFrame || PCSX2_IOP_FORCED_ALIGN_STACK; - // Optimization: The IOP never uses stack-based parameter invocation, so we can avoid // allocating any room on the stack for it (which is important since the IOP's entry // code gets invoked quite a lot). 
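// For reference, the rework below swaps this hand-rolled prologue/epilogue for the RAII helper
// from patch 1: xScopedStackFrame's constructor emits the prologue and its destructor emits the
// epilogue. A rough sketch of what it emits in the 32-bit GCC, base_frame=true case, following
// the m_offset bookkeeping added to x86emitter.cpp (4 ret + 4 ebp + 12 regs = 20, so the
// alignment pad is 16 - 20 % 16 = 12):
//
//   push ebp ; mov ebp, esp          <- m_base_frame
//   push edi ; push esi ; push ebx   <- save the callee-saved registers
//   sub  esp, 12                     <- ALIGN_STACK: keep esp 16-byte aligned (GCC only)
//   ...                              <- body, here a jmp to the dispatcher
//   add  esp, 12                     <- destructor: undo the alignment
//   pop  ebx ; pop esi ; pop edi
//   leave                            <- destroy the frame, restoring ebp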
- if( allocatedStack ) - { - xPUSH( ebp ); - xMOV( ebp, esp ); - xAND( esp, -0x10 ); + u8* retval = xGetPtr(); - xSUB( esp, 0x20 ); + { // Properly scope the frame prologue/epilogue + xScopedStackFrame frame(IsDevBuild); - xMOV( ptr[ebp-12], edi ); - xMOV( ptr[ebp-8], esi ); - xMOV( ptr[ebp-4], ebx ); - } - else - { - // GCC Compiler: - // The frame pointer coming in from the EE's event test can be safely assumed to be - // aligned, since GCC always aligns stackframes. While handy in x86-64, where CALL + PUSH EBP - // results in a neatly realigned stack on entry to every function, unfortunately in x86-32 - // this is usually worthless because CALL+PUSH leaves us 8 byte aligned instead (fail). So - // we have to do the usual set of stackframe alignments and simulated callstack mess - // *regardless*. + xJMP(iopDispatcherReg); - // MSVC/Intel compilers: - // The PCSX2_IOP_FORCED_ALIGN_STACK setting is 0, so we don't care. Just push regs like - // the good old days! (stack alignment will be indeterminate) - - xPUSH( edi ); - xPUSH( esi ); - xPUSH( ebx ); - - allocatedStack = false; - } - - uptr* imm = NULL; - if( allocatedStack ) - { - if( GenerateStackFrame ) - { - // Simulate a CALL function by pushing the call address and EBP onto the stack. - // This retains proper stacktrace and stack unwinding (handy in devbuilds!) - - xMOV( ptr32[esp+0x0c], 0xffeeff ); - imm = (uptr*)(xGetPtr()-4); - - // This part simulates the "normal" stackframe prep of "push ebp, mov ebp, esp" - xMOV( ptr32[esp+0x08], ebp ); - xLEA( ebp, ptr32[esp+0x08] ); - } - } - - if( IsDevBuild ) - { - xMOV( ptr[&s_store_esp], esp ); - xMOV( ptr[&s_store_ebp], ebp ); - } - - xJMP( iopDispatcherReg ); - if( imm != NULL ) - *imm = (uptr)xGetPtr(); - - // ---------------------- - // ----> Cleanup! ----> - - iopExitRecompiledCode = (DynGenFunc*)xGetPtr(); - - if( allocatedStack ) - { - // pop the nested "simulated call" stackframe, if needed: - if( GenerateStackFrame ) xLEAVE(); - xMOV( edi, ptr[ebp-12] ); - xMOV( esi, ptr[ebp-8] ); - xMOV( ebx, ptr[ebp-4] ); - xLEAVE(); - } - else - { - xPOP( ebx ); - xPOP( esi ); - xPOP( edi ); + // Save an exit point + iopExitRecompiledCode = (DynGenFunc*)xGetPtr(); } xRET(); diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index 0a1da6de6b..a3bd76fd43 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -372,50 +372,6 @@ static void recEventTest() _cpuEventTest_Shared(); } -// parameters: -// espORebp - 0 for ESP, or 1 for EBP. -// regval - current value of the register at the time the fault was detected (predates the -// stackframe setup code in this function) -static void __fastcall StackFrameCheckFailed( int espORebp, int regval ) -{ - pxFailDev( wxsFormat( L"(R5900 Recompiler Stackframe) Sanity check failed on %s\n\tCurrent=%d; Saved=%d", - (espORebp==0) ? L"ESP" : L"EBP", regval, (espORebp==0) ? s_store_esp : s_store_ebp ) - ); - - // Note: The recompiler will attempt to recover ESP and EBP after returning from this function, - // so typically selecting Continue/Ignore/Cancel for this assertion should allow PCSX2 to con- - // tinue to run with some degree of stability. 
-} - -static void _DynGen_StackFrameCheck() -{ - if( !EmuConfig.Cpu.Recompiler.StackFrameChecks ) return; - - // --------- EBP Here ----------- - - xCMP( ebp, ptr[&s_store_ebp] ); - xForwardJE8 skipassert_ebp; - - xMOV( ecx, 1 ); // 1 specifies EBP - xMOV( edx, ebp ); - xCALL( StackFrameCheckFailed ); - xMOV( ebp, ptr[&s_store_ebp] ); // half-hearted frame recovery attempt! - - skipassert_ebp.SetTarget(); - - // --------- ESP There ----------- - - xCMP( esp, ptr[&s_store_esp] ); - xForwardJE8 skipassert_esp; - - xXOR( ecx, ecx ); // 0 specifies ESP - xMOV( edx, esp ); - xCALL( StackFrameCheckFailed ); - xMOV( esp, ptr[&s_store_esp] ); // half-hearted frame recovery attempt! - - skipassert_esp.SetTarget(); -} - // The address for all cleared blocks. It recompiles the current pc and then // dispatches to the recompiled block address. static DynGenFunc* _DynGen_JITCompile() @@ -423,7 +379,6 @@ static DynGenFunc* _DynGen_JITCompile() pxAssertMsg( DispatcherReg != NULL, "Please compile the DispatcherReg subroutine *before* JITComple. Thanks." ); u8* retval = xGetAlignedCallTarget(); - _DynGen_StackFrameCheck(); xMOV( ecx, ptr[&cpuRegs.pc] ); xCALL( recRecompile ); @@ -448,7 +403,6 @@ static DynGenFunc* _DynGen_JITCompileInBlock() static DynGenFunc* _DynGen_DispatcherReg() { u8* retval = xGetPtr(); // fallthrough target, can't align it! - _DynGen_StackFrameCheck(); xMOV( eax, ptr[&cpuRegs.pc] ); xMOV( ebx, eax ); @@ -471,63 +425,18 @@ static DynGenFunc* _DynGen_DispatcherEvent() static DynGenFunc* _DynGen_EnterRecompiledCode() { pxAssertDev( DispatcherReg != NULL, "Dynamically generated dispatchers are required prior to generating EnterRecompiledCode!" ); - + u8* retval = xGetAlignedCallTarget(); - // "standard" frame pointer setup for aligned stack: Record the original - // esp into ebp, and then align esp. ebp references the original esp base - // for the duration of our function, and is used to restore the original - // esp before returning from the function + { // Properly scope the frame prologue/epilogue + xScopedStackFrame frame(IsDevBuild); - xPUSH( ebp ); - xMOV( ebp, esp ); - xAND( esp, -0x10 ); + xJMP(DispatcherReg); - // First 0x10 is for esi, edi, etc. Second 0x10 is for the return address and ebp. The - // third 0x10 is an optimization for C-style CDECL calls we might make from the recompiler - // (parameters for those calls can be stored there!) [currently no cdecl functions are - // used -- we do everything through __fastcall) - - static const int cdecl_reserve = 0x00; - xSUB( esp, 0x20 + cdecl_reserve ); - - xMOV( ptr[ebp-12], edi ); - xMOV( ptr[ebp-8], esi ); - xMOV( ptr[ebp-4], ebx ); - - // Simulate a CALL function by pushing the call address and EBP onto the stack. - // (the dummy address here is filled in later right before we generate the LEAVE code) - xMOV( ptr32[esp+0x0c+cdecl_reserve], 0xdeadbeef ); - uptr& imm = *(uptr*)(xGetPtr()-4); - - // This part simulates the "normal" stackframe prep of "push ebp, mov ebp, esp" - // It is done here because we can't really generate that stuff from the Dispatchers themselves. - xMOV( ptr32[esp+0x08+cdecl_reserve], ebp ); - xLEA( ebp, ptr32[esp+0x08+cdecl_reserve] ); - - if (EmuConfig.Cpu.Recompiler.StackFrameChecks) { - xMOV( ptr[&s_store_esp], esp ); - xMOV( ptr[&s_store_ebp], ebp ); + // Save an exit point + ExitRecompiledCode = (DynGenFunc*)xGetPtr(); } - xJMP( DispatcherReg ); - - xAlignCallTarget(); - - // This dummy CALL is unreachable code that some debuggers (MSVC2008) need in order to - // unwind the stack properly. 
This is effectively the call that we simulate above. - if( IsDevBuild ) xCALL( DispatcherReg ); - - imm = (uptr)xGetPtr(); - ExitRecompiledCode = (DynGenFunc*)xGetPtr(); - - xLEAVE(); - - xMOV( edi, ptr[ebp-12] ); - xMOV( esi, ptr[ebp-8] ); - xMOV( ebx, ptr[ebp-4] ); - - xLEAVE(); xRET(); return (DynGenFunc*)retval; @@ -1149,8 +1058,6 @@ static u32 scaleblockcycles() // setting "g_branch = 2"; static void iBranchTest(u32 newpc) { - _DynGen_StackFrameCheck(); - // Check the Event scheduler if our "cycle target" has been reached. // Equiv code to: // cpuRegs.cycle += blockcycles; diff --git a/pcsx2/x86/microVU.cpp b/pcsx2/x86/microVU.cpp index f9e43bbdf2..a4378196fc 100644 --- a/pcsx2/x86/microVU.cpp +++ b/pcsx2/x86/microVU.cpp @@ -80,10 +80,8 @@ void mVUreset(microVU& mVU, bool resetReserve) { else Perf::any.map((uptr)&mVU.dispCache, mVUdispCacheSize, "mVU0 Dispatcher"); x86SetPtr(mVU.dispCache); - mVUdispatcherA(mVU); - mVUdispatcherB(mVU); - mVUdispatcherC(mVU); - mVUdispatcherD(mVU); + mVUdispatcherAB(mVU); + mVUdispatcherCD(mVU); mVUemitSearch(); // Clear All Program Data diff --git a/pcsx2/x86/microVU_Execute.inl b/pcsx2/x86/microVU_Execute.inl index d7a910e848..0aaaeef15d 100644 --- a/pcsx2/x86/microVU_Execute.inl +++ b/pcsx2/x86/microVU_Execute.inl @@ -19,139 +19,96 @@ // Dispatcher Functions //------------------------------------------------------------------ -// Generates the code for entering recompiled blocks -void mVUdispatcherA(mV) { +// Generates the code for entering/exiting recompiled blocks +void mVUdispatcherAB(mV) { mVU.startFunct = x86Ptr; - // Backup cpu state - xPUSH(ebp); - xPUSH(ebx); - xPUSH(esi); - xPUSH(edi); + { + xScopedStackFrame frame(false, true); - // Align the stackframe (GCC only, since GCC assumes stackframe is always aligned) - #ifdef __GNUC__ - xSUB(esp, 12); - #endif + // __fastcall = The caller has already put the needed parameters in ecx/edx: + if (!isVU1) { xCALL(mVUexecuteVU0); } + else { xCALL(mVUexecuteVU1); } - // __fastcall = The caller has already put the needed parameters in ecx/edx: - if (!isVU1) { xCALL(mVUexecuteVU0); } - else { xCALL(mVUexecuteVU1); } + // Load VU's MXCSR state + xLDMXCSR(g_sseVUMXCSR); - // Load VU's MXCSR state - xLDMXCSR(g_sseVUMXCSR); + // Load Regs + xMOV(gprF0, ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL]); + xMOV(gprF1, gprF0); + xMOV(gprF2, gprF0); + xMOV(gprF3, gprF0); - // Load Regs - xMOV(gprF0, ptr32[&mVU.regs().VI[REG_STATUS_FLAG].UL]); - xMOV(gprF1, gprF0); - xMOV(gprF2, gprF0); - xMOV(gprF3, gprF0); + xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_MAC_FLAG].UL]); + xSHUF.PS(xmmT1, xmmT1, 0); + xMOVAPS (ptr128[mVU.macFlag], xmmT1); - xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_MAC_FLAG].UL]); - xSHUF.PS(xmmT1, xmmT1, 0); - xMOVAPS (ptr128[mVU.macFlag], xmmT1); + xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_CLIP_FLAG].UL]); + xSHUF.PS(xmmT1, xmmT1, 0); + xMOVAPS (ptr128[mVU.clipFlag], xmmT1); - xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_CLIP_FLAG].UL]); - xSHUF.PS(xmmT1, xmmT1, 0); - xMOVAPS (ptr128[mVU.clipFlag], xmmT1); + xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_P].UL]); + xMOVAPS (xmmPQ, ptr128[&mVU.regs().VI[REG_Q].UL]); + xSHUF.PS(xmmPQ, xmmT1, 0); // wzyx = PPQQ - xMOVAPS (xmmT1, ptr128[&mVU.regs().VI[REG_P].UL]); - xMOVAPS (xmmPQ, ptr128[&mVU.regs().VI[REG_Q].UL]); - xSHUF.PS(xmmPQ, xmmT1, 0); // wzyx = PPQQ + // Jump to Recompiled Code Block + xJMP(eax); - // Jump to Recompiled Code Block - xJMP(eax); - pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize), - "microVU: Dispatcher generation exceeded reserved 
cache area!"); -} + mVU.exitFunct = x86Ptr; -// Generates the code to exit from recompiled blocks -void mVUdispatcherB(mV) { - mVU.exitFunct = x86Ptr; + // Load EE's MXCSR state + xLDMXCSR(g_sseMXCSR); - // Load EE's MXCSR state - xLDMXCSR(g_sseMXCSR); - - // __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers; - // all other arguments are passed right to left. - if (!isVU1) { xCALL(mVUcleanUpVU0); } - else { xCALL(mVUcleanUpVU1); } - - // Unalign the stackframe: - #ifdef __GNUC__ - xADD( esp, 12 ); - #endif - - // Restore cpu state - xPOP(edi); - xPOP(esi); - xPOP(ebx); - xPOP(ebp); + // __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers; + // all other arguments are passed right to left. + if (!isVU1) { xCALL(mVUcleanUpVU0); } + else { xCALL(mVUcleanUpVU1); } + } xRET(); + pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize), - "microVU: Dispatcher generation exceeded reserved cache area!"); + "microVU: Dispatcher generation exceeded reserved cache area!"); } -// Generates the code for resuming xgkick -void mVUdispatcherC(mV) { +// Generates the code for resuming/exit xgkick +void mVUdispatcherCD(mV) { mVU.startFunctXG = x86Ptr; - // Backup cpu state - xPUSH(ebp); - xPUSH(ebx); - xPUSH(esi); - xPUSH(edi); + { + xScopedStackFrame frame(false, true); - // Align the stackframe (GCC only, since GCC assumes stackframe is always aligned) - #ifdef __GNUC__ - xSUB(esp, 12); - #endif + // Load VU's MXCSR state + xLDMXCSR(g_sseVUMXCSR); - // Load VU's MXCSR state - xLDMXCSR(g_sseVUMXCSR); + mVUrestoreRegs(mVU); - mVUrestoreRegs(mVU); + xMOV(gprF0, ptr32[&mVU.statFlag[0]]); + xMOV(gprF1, ptr32[&mVU.statFlag[1]]); + xMOV(gprF2, ptr32[&mVU.statFlag[2]]); + xMOV(gprF3, ptr32[&mVU.statFlag[3]]); - xMOV(gprF0, ptr32[&mVU.statFlag[0]]); - xMOV(gprF1, ptr32[&mVU.statFlag[1]]); - xMOV(gprF2, ptr32[&mVU.statFlag[2]]); - xMOV(gprF3, ptr32[&mVU.statFlag[3]]); + // Jump to Recompiled Code Block + xJMP(ptr32[&mVU.resumePtrXG]); - // Jump to Recompiled Code Block - xJMP(ptr32[&mVU.resumePtrXG]); - pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize), - "microVU: Dispatcher generation exceeded reserved cache area!"); -} + mVU.exitFunctXG = x86Ptr; -// Generates the code to exit from xgkick -void mVUdispatcherD(mV) { - mVU.exitFunctXG = x86Ptr; + //xPOP(gprT1); // Pop return address + //xMOV(ptr32[&mVU.resumePtrXG], gprT1); - //xPOP(gprT1); // Pop return address - //xMOV(ptr32[&mVU.resumePtrXG], gprT1); + // Backup Status Flag (other regs were backed up on xgkick) + xMOV(ptr32[&mVU.statFlag[0]], gprF0); + xMOV(ptr32[&mVU.statFlag[1]], gprF1); + xMOV(ptr32[&mVU.statFlag[2]], gprF2); + xMOV(ptr32[&mVU.statFlag[3]], gprF3); - // Backup Status Flag (other regs were backed up on xgkick) - xMOV(ptr32[&mVU.statFlag[0]], gprF0); - xMOV(ptr32[&mVU.statFlag[1]], gprF1); - xMOV(ptr32[&mVU.statFlag[2]], gprF2); - xMOV(ptr32[&mVU.statFlag[3]], gprF3); + // Load EE's MXCSR state + xLDMXCSR(g_sseMXCSR); - // Load EE's MXCSR state - xLDMXCSR(g_sseMXCSR); - - // Unalign the stackframe: - #ifdef __GNUC__ - xADD( esp, 12 ); - #endif - - // Restore cpu state - xPOP(edi); - xPOP(esi); - xPOP(ebx); - xPOP(ebp); + } xRET(); + pxAssertDev(xGetPtr() < (mVU.dispCache + mVUdispCacheSize), "microVU: Dispatcher generation exceeded reserved cache area!"); } From e3d5eb5a4e51b7d964aa7c394e0954d0a3e2d330 Mon Sep 17 00:00:00 2001 From: Gregory Hainaut Date: Mon, 11 Jan 2016 08:26:00 +0100 Subject: [PATCH 3/3] core: convert xCALL to xFastCall SuperVU wasn't 
converted (unlikely to be ported to 64 bits). A couple of calls weren't converted because they require extra work, but they are not mandatory (debug/MTVU/...) --- pcsx2/x86/iCOP0.cpp | 19 ++++++------- pcsx2/x86/iFPU.cpp | 6 ++-- pcsx2/x86/iFPUd.cpp | 16 +---------- pcsx2/x86/iMMI.cpp | 2 +- pcsx2/x86/iR3000A.cpp | 28 +++++++++---------- pcsx2/x86/iR3000Atables.cpp | 20 ++++++------- pcsx2/x86/iR5900Misc.cpp | 4 +-- pcsx2/x86/ix86-32/iR5900-32.cpp | 45 ++++++++++++++---------------- pcsx2/x86/ix86-32/iR5900Branch.cpp | 2 +- pcsx2/x86/ix86-32/recVTLB.cpp | 11 +++----- pcsx2/x86/microVU_Branch.inl | 12 ++++---- pcsx2/x86/microVU_Compile.inl | 19 ++++--------- pcsx2/x86/microVU_Execute.inl | 8 +++--- pcsx2/x86/microVU_Lower.inl | 3 +- pcsx2/x86/microVU_Macro.inl | 13 ++++----- 15 files changed, 88 insertions(+), 120 deletions(-) diff --git a/pcsx2/x86/iCOP0.cpp b/pcsx2/x86/iCOP0.cpp index 309bd9b265..a4b672edb5 100644 --- a/pcsx2/x86/iCOP0.cpp +++ b/pcsx2/x86/iCOP0.cpp @@ -112,7 +112,7 @@ void recDI() //xMOV(eax, ptr[&cpuRegs.cycle ]); //xMOV(ptr[&g_nextBranchCycle], eax); - //xCALL((void*)(uptr)Interp::DI ); + //xFastCall((void*)(uptr)Interp::DI ); xMOV(eax, ptr[&cpuRegs.CP0.n.Status]); xTEST(eax, 0x20006); // EXL | ERL | EDI @@ -170,12 +170,12 @@ void recMFC0() case 1: iFlushCall(FLUSH_INTERPRETER); - xCALL( COP0_UpdatePCCR ); + xFastCall(COP0_UpdatePCCR ); xMOV(eax, ptr[&cpuRegs.PERF.n.pcr0]); break; case 3: iFlushCall(FLUSH_INTERPRETER); - xCALL( COP0_UpdatePCCR ); + xFastCall(COP0_UpdatePCCR ); xMOV(eax, ptr[&cpuRegs.PERF.n.pcr1]); break; } @@ -207,8 +207,7 @@ void recMTC0() { case 12: iFlushCall(FLUSH_INTERPRETER); - xMOV( ecx, g_cpuConstRegs[_Rt_].UL[0] ); - xCALL( WriteCP0Status ); + xFastCall(WriteCP0Status, g_cpuConstRegs[_Rt_].UL[0] ); break; case 9: @@ -222,9 +221,9 @@ void recMTC0() { case 0: iFlushCall(FLUSH_INTERPRETER); - xCALL( COP0_UpdatePCCR ); + xFastCall(COP0_UpdatePCCR ); xMOV( ptr32[&cpuRegs.PERF.n.pccr], g_cpuConstRegs[_Rt_].UL[0] ); - xCALL( COP0_DiagnosticPCCR ); + xFastCall(COP0_DiagnosticPCCR ); break; case 1: @@ -257,7 +256,7 @@ void recMTC0() case 12: iFlushCall(FLUSH_INTERPRETER); _eeMoveGPRtoR(ecx, _Rt_); - xCALL( WriteCP0Status ); + xFastCall(WriteCP0Status, ecx ); break; case 9: @@ -271,9 +270,9 @@ void recMTC0() { case 0: iFlushCall(FLUSH_INTERPRETER); - xCALL( COP0_UpdatePCCR ); + xFastCall(COP0_UpdatePCCR ); _eeMoveGPRtoM((uptr)&cpuRegs.PERF.n.pccr, _Rt_); - xCALL( COP0_DiagnosticPCCR ); + xFastCall(COP0_DiagnosticPCCR ); break; case 1: diff --git a/pcsx2/x86/iFPU.cpp b/pcsx2/x86/iFPU.cpp index 9208aaecb5..3513df4c3a 100644 --- a/pcsx2/x86/iFPU.cpp +++ b/pcsx2/x86/iFPU.cpp @@ -92,7 +92,7 @@ static const __aligned16 u32 s_pos[4] = { 0x7fffffff, 0xffffffff, 0xffffffff, 0x void f(); \ void rec##f() { \ iFlushCall(FLUSH_INTERPRETER); \ - xCALL((void*)(uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \ + xFastCall((void*)(uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \ branch = 2; \ } @@ -100,7 +100,7 @@ static const __aligned16 u32 s_pos[4] = { 0x7fffffff, 0xffffffff, 0xffffffff, 0x void f(); \ void rec##f() { \ iFlushCall(FLUSH_INTERPRETER); \ - xCALL((void*)(uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \ + xFastCall((void*)(uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \ } //------------------------------------------------------------------ @@ -550,7 +550,7 @@ void FPU_MUL(int regd, int regt, bool reverseOperands) { xMOVD(ecx, xRegisterSSE(reverseOperands ? regt : regd)); xMOVD(edx, xRegisterSSE(reverseOperands ? 
regd : regt)); - xCALL((void*)(uptr)&FPU_MUL_HACK ); //returns the hacked result or 0 + xFastCall((void*)(uptr)&FPU_MUL_HACK, ecx, edx); //returns the hacked result or 0 xTEST(eax, eax); noHack = JZ8(0); xMOVDZX(xRegisterSSE(regd), eax); diff --git a/pcsx2/x86/iFPUd.cpp b/pcsx2/x86/iFPUd.cpp index fd60ade533..ee397d9054 100644 --- a/pcsx2/x86/iFPUd.cpp +++ b/pcsx2/x86/iFPUd.cpp @@ -89,20 +89,6 @@ namespace DOUBLE { #define FPUflagSO 0X00000010 #define FPUflagSU 0X00000008 -#define REC_FPUBRANCH(f) \ - void f(); \ - void rec##f() { \ - iFlushCall(FLUSH_INTERPRETER); \ - xCALL((void*)(uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \ - branch = 2; \ -} - -#define REC_FPUFUNC(f) \ - void f(); \ - void rec##f() { \ - iFlushCall(FLUSH_INTERPRETER); \ - xCALL((void*)(uptr)R5900::Interpreter::OpcodeImpl::COP1::f); \ -} //------------------------------------------------------------------ //------------------------------------------------------------------ @@ -416,7 +402,7 @@ void FPU_MUL(int info, int regd, int sreg, int treg, bool acc) { xMOVD(ecx, xRegisterSSE(sreg)); xMOVD(edx, xRegisterSSE(treg)); - xCALL((void*)(uptr)&FPU_MUL_HACK ); //returns the hacked result or 0 + xFastCall((void*)(uptr)&FPU_MUL_HACK, ecx, edx); //returns the hacked result or 0 xTEST(eax, eax); noHack = JZ8(0); xMOVDZX(xRegisterSSE(regd), eax); diff --git a/pcsx2/x86/iMMI.cpp b/pcsx2/x86/iMMI.cpp index eb99ab0f30..5d24c67e9c 100644 --- a/pcsx2/x86/iMMI.cpp +++ b/pcsx2/x86/iMMI.cpp @@ -185,7 +185,7 @@ void recPMFHL() // fall to interp _deleteEEreg(_Rd_, 0); iFlushCall(FLUSH_INTERPRETER); // since calling CALLFunc - xCALL((void*)(uptr)R5900::Interpreter::OpcodeImpl::MMI::PMFHL ); + xFastCall((void*)(uptr)R5900::Interpreter::OpcodeImpl::MMI::PMFHL ); break; case 0x03: // LH diff --git a/pcsx2/x86/iR3000A.cpp b/pcsx2/x86/iR3000A.cpp index 10a047642f..de1f88b377 100644 --- a/pcsx2/x86/iR3000A.cpp +++ b/pcsx2/x86/iR3000A.cpp @@ -128,8 +128,7 @@ static DynGenFunc* _DynGen_JITCompile() u8* retval = xGetPtr(); - xMOV( ecx, ptr[&psxRegs.pc] ); - xCALL( iopRecRecompile ); + xFastCall(iopRecRecompile, ptr[&psxRegs.pc] ); xMOV( eax, ptr[&psxRegs.pc] ); xMOV( ebx, eax ); @@ -199,7 +198,7 @@ static void _DynGen_Dispatchers() // Place the EventTest and DispatcherReg stuff at the top, because they get called the // most and stand to benefit from strong alignment and direct referencing. iopDispatcherEvent = (DynGenFunc*)xGetPtr(); - xCALL( recEventTest ); + xFastCall(recEventTest ); iopDispatcherReg = _DynGen_DispatcherReg(); iopJITCompile = _DynGen_JITCompile(); @@ -523,11 +522,11 @@ void psxRecompileCodeConst1(R3000AFNPTR constcode, R3000AFNPTR_INFO noconstcode) } if (debug) - xCALL(debug); + xFastCall(debug); #endif irxHLE hle = irxImportHLE(libname, index); if (hle) { - xCALL(hle); + xFastCall(hle); xCMP(eax, 0); xJNE(iopDispatcherReg); } @@ -907,7 +906,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch) xSUB(ptr32[&iopCycleEE], eax); xJLE(iopExitRecompiledCode); - xCALL(iopEventTest); + xFastCall(iopEventTest); if( newpc != 0xffffffff ) { @@ -929,7 +928,7 @@ static void iPsxBranchTest(u32 newpc, u32 cpuBranch) xSUB(eax, ptr32[&g_iopNextEventCycle]); xForwardJS nointerruptpending; - xCALL(iopEventTest); + xFastCall(iopEventTest); if( newpc != 0xffffffff ) { xCMP(ptr32[&psxRegs.pc], newpc); @@ -964,9 +963,9 @@ void rpsxSYSCALL() xMOV(ptr32[&psxRegs.pc], psxpc - 4); _psxFlushCall(FLUSH_NODESTROY); - xMOV( ecx, 0x20 ); // exception code - xMOV( edx, psxbranch==1 ); // branch delay slot? 
- xCALL( psxException ); + //xMOV( ecx, 0x20 ); // exception code + //xMOV( edx, psxbranch==1 ); // branch delay slot? + xFastCall(psxException, 0x20, psxbranch == 1 ); xCMP(ptr32[&psxRegs.pc], psxpc-4); j8Ptr[0] = JE8(0); @@ -987,9 +986,9 @@ void rpsxBREAK() xMOV(ptr32[&psxRegs.pc], psxpc - 4); _psxFlushCall(FLUSH_NODESTROY); - xMOV( ecx, 0x24 ); // exception code - xMOV( edx, psxbranch==1 ); // branch delay slot? - xCALL( psxException ); + //xMOV( ecx, 0x24 ); // exception code + //xMOV( edx, psxbranch==1 ); // branch delay slot? + xFastCall(psxException, 0x24, psxbranch == 1 ); xCMP(ptr32[&psxRegs.pc], psxpc-4); j8Ptr[0] = JE8(0); @@ -1102,8 +1101,7 @@ static void __fastcall iopRecRecompile( const u32 startpc ) if( IsDebugBuild ) { - xMOV(ecx, psxpc); - xCALL(PreBlockCheck); + xFastCall(PreBlockCheck, psxpc); } // go until the next branch diff --git a/pcsx2/x86/iR3000Atables.cpp b/pcsx2/x86/iR3000Atables.cpp index 07c5ffc78e..1aca465f5d 100644 --- a/pcsx2/x86/iR3000Atables.cpp +++ b/pcsx2/x86/iR3000Atables.cpp @@ -32,7 +32,7 @@ extern u32 g_psxMaxRecMem; static void rpsx##f() { \ xMOV(ptr32[&psxRegs.code], (u32)psxRegs.code); \ _psxFlushCall(FLUSH_EVERYTHING); \ - xCALL((void*)(uptr)psx##f); \ + xFastCall((void*)(uptr)psx##f); \ PSX_DEL_CONST(_Rt_); \ /* branch = 2; */\ } @@ -626,7 +626,7 @@ static void rpsxLB() xMOV(ecx, ptr[&psxRegs.GPR.r[_Rs_]]); if (_Imm_) xADD(ecx, _Imm_); - xCALL( iopMemRead8 ); // returns value in EAX + xFastCall(iopMemRead8, ecx ); // returns value in EAX if (_Rt_) { xMOVSX(eax, al); xMOV(ptr[&psxRegs.GPR.r[_Rt_]], eax); @@ -642,7 +642,7 @@ static void rpsxLBU() xMOV(ecx, ptr[&psxRegs.GPR.r[_Rs_]]); if (_Imm_) xADD(ecx, _Imm_); - xCALL( iopMemRead8 ); // returns value in EAX + xFastCall(iopMemRead8, ecx ); // returns value in EAX if (_Rt_) { xMOVZX(eax, al); xMOV(ptr[&psxRegs.GPR.r[_Rt_]], eax); @@ -658,7 +658,7 @@ static void rpsxLH() xMOV(ecx, ptr[&psxRegs.GPR.r[_Rs_]]); if (_Imm_) xADD(ecx, _Imm_); - xCALL( iopMemRead16 ); // returns value in EAX + xFastCall(iopMemRead16, ecx ); // returns value in EAX if (_Rt_) { xMOVSX(eax, ax); xMOV(ptr[&psxRegs.GPR.r[_Rt_]], eax); @@ -674,7 +674,7 @@ static void rpsxLHU() xMOV(ecx, ptr[&psxRegs.GPR.r[_Rs_]]); if (_Imm_) xADD(ecx, _Imm_); - xCALL( iopMemRead16 ); // returns value in EAX + xFastCall(iopMemRead16, ecx ); // returns value in EAX if (_Rt_) { xMOVZX(eax, ax); xMOV(ptr[&psxRegs.GPR.r[_Rt_]], eax); @@ -695,7 +695,7 @@ static void rpsxLW() xTEST(ecx, 0x10000000); j8Ptr[0] = JZ8(0); - xCALL( iopMemRead32 ); // returns value in EAX + xFastCall(iopMemRead32, ecx ); // returns value in EAX if (_Rt_) { xMOV(ptr[&psxRegs.GPR.r[_Rt_]], eax); } @@ -721,7 +721,7 @@ static void rpsxSB() xMOV(ecx, ptr[&psxRegs.GPR.r[_Rs_]]); if (_Imm_) xADD(ecx, _Imm_); xMOV( edx, ptr[&psxRegs.GPR.r[_Rt_]] ); - xCALL( iopMemWrite8 ); + xFastCall(iopMemWrite8, ecx, edx ); } static void rpsxSH() @@ -732,7 +732,7 @@ static void rpsxSH() xMOV(ecx, ptr[&psxRegs.GPR.r[_Rs_]]); if (_Imm_) xADD(ecx, _Imm_); xMOV( edx, ptr[&psxRegs.GPR.r[_Rt_]] ); - xCALL( iopMemWrite16 ); + xFastCall(iopMemWrite16, ecx, edx ); } static void rpsxSW() @@ -743,7 +743,7 @@ static void rpsxSW() xMOV(ecx, ptr[&psxRegs.GPR.r[_Rs_]]); if (_Imm_) xADD(ecx, _Imm_); xMOV( edx, ptr[&psxRegs.GPR.r[_Rt_]] ); - xCALL( iopMemWrite32 ); + xFastCall(iopMemWrite32, ecx, edx ); } //// SLL @@ -1371,7 +1371,7 @@ void rpsxRFE() // Test the IOP's INTC status, so that any pending ints get raised. 
_psxFlushCall(0); - xCALL((void*)(uptr)&iopTestIntc ); + xFastCall((void*)(uptr)&iopTestIntc ); } // R3000A tables diff --git a/pcsx2/x86/iR5900Misc.cpp b/pcsx2/x86/iR5900Misc.cpp index 2a54a6875b..d9f0b28317 100644 --- a/pcsx2/x86/iR5900Misc.cpp +++ b/pcsx2/x86/iR5900Misc.cpp @@ -71,7 +71,7 @@ namespace OpcodeImpl { // xMOV(ptr32[&cpuRegs.code], cpuRegs.code ); // xMOV(ptr32[&cpuRegs.pc], pc ); // iFlushCall(FLUSH_EVERYTHING); -// xCALL((void*)(uptr)CACHE ); +// xFastCall((void*)(uptr)CACHE ); // //branch = 2; // // xCMP(ptr32[(u32*)((int)&cpuRegs.pc)], pc); @@ -203,7 +203,7 @@ void recMTSAH() //xMOV(ptr32[&cpuRegs.code], (u32)cpuRegs.code ); //xMOV(ptr32[&cpuRegs.pc], (u32)pc ); //iFlushCall(FLUSH_EVERYTHING); - //xCALL((void*)(uptr)R5900::Interpreter::OpcodeImpl::CACHE ); + //xFastCall((void*)(uptr)R5900::Interpreter::OpcodeImpl::CACHE ); //branch = 2; } diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp index a3bd76fd43..8cb340c409 100644 --- a/pcsx2/x86/ix86-32/iR5900-32.cpp +++ b/pcsx2/x86/ix86-32/iR5900-32.cpp @@ -340,7 +340,7 @@ void recBranchCall( void (*func)() ) void recCall( void (*func)() ) { iFlushCall(FLUSH_INTERPRETER); - xCALL(func); + xFastCall(func); } // ===================================================================================================== @@ -380,8 +380,7 @@ static DynGenFunc* _DynGen_JITCompile() u8* retval = xGetAlignedCallTarget(); - xMOV( ecx, ptr[&cpuRegs.pc] ); - xCALL( recRecompile ); + xFastCall(recRecompile, ptr[&cpuRegs.pc] ); xMOV( eax, ptr[&cpuRegs.pc] ); xMOV( ebx, eax ); @@ -417,7 +416,7 @@ static DynGenFunc* _DynGen_DispatcherEvent() { u8* retval = xGetPtr(); - xCALL( recEventTest ); + xFastCall(recEventTest ); return (DynGenFunc*)retval; } @@ -446,7 +445,7 @@ static DynGenFunc* _DynGen_DispatchBlockDiscard() { u8* retval = xGetPtr(); xEMMS(); - xCALL(dyna_block_discard); + xFastCall(dyna_block_discard); xJMP(ExitRecompiledCode); return (DynGenFunc*)retval; } @@ -455,7 +454,7 @@ static DynGenFunc* _DynGen_DispatchPageReset() { u8* retval = xGetPtr(); xEMMS(); - xCALL(dyna_page_reset); + xFastCall(dyna_page_reset); xJMP(ExitRecompiledCode); return (DynGenFunc*)retval; } @@ -916,7 +915,7 @@ void SetBranchReg( u32 reg ) // xCMP(ptr32[&cpuRegs.pc], 0); // j8Ptr[5] = JNE8(0); -// xCALL((void*)(uptr)tempfn); +// xFastCall((void*)(uptr)tempfn); // x86SetJ8( j8Ptr[5] ); iFlushCall(FLUSH_EVERYTHING); @@ -1201,11 +1200,11 @@ void recMemcheck(u32 op, u32 bits, bool store) if (bits == 128) xAND(ecx, ~0x0F); - xCALL(standardizeBreakpointAddress); + xFastCall(standardizeBreakpointAddress, ecx); xMOV(ecx,eax); xMOV(edx,eax); xADD(edx,bits/8); - + // ecx = access address // edx = access address+size @@ -1220,11 +1219,11 @@ void recMemcheck(u32 op, u32 bits, bool store) continue; // logic: memAddress < bpEnd && bpStart < memAddress+memSize - + xMOV(eax,standardizeBreakpointAddress(checks[i].end)); xCMP(ecx,eax); // address < end xForwardJGE8 next1; // if address >= end then goto next1 - + xMOV(eax,standardizeBreakpointAddress(checks[i].start)); xCMP(eax,edx); // start < address+size xForwardJGE8 next2; // if start >= address+size then goto next2 @@ -1232,10 +1231,10 @@ void recMemcheck(u32 op, u32 bits, bool store) // hit the breakpoint if (checks[i].result & MEMCHECK_LOG) { xMOV(edx, store); - xCALL(&dynarecMemLogcheck); + xFastCall(dynarecMemLogcheck, ecx, edx); } if (checks[i].result & MEMCHECK_BREAK) { - xCALL(&dynarecMemcheck); + xFastCall(dynarecMemcheck); } next1.SetTarget(); @@ -1248,7 +1247,7 @@ void encodeBreakpoint() 
if (isBreakpointNeeded(pc) != 0) { iFlushCall(FLUSH_EVERYTHING|FLUSH_PC); - xCALL(&dynarecCheckBreakpoint); + xFastCall(dynarecCheckBreakpoint); } } @@ -1297,7 +1296,7 @@ void recompileNextInstruction(int delayslot) s_pCode = (int *)PSM( pc ); pxAssert(s_pCode); - + if( IsDebugBuild ) xMOV(eax, pc); // acts as a tag for delimiting recompiled instructions when viewing x86 disasm. @@ -1660,7 +1659,7 @@ static void __fastcall recRecompile( const u32 startpc ) if (0x8000d618 == startpc) DbgCon.WriteLn("Compiling block @ 0x%08x", startpc); - + s_pCurBlock = PC_GETBLOCK(startpc); pxAssert(s_pCurBlock->GetFnptr() == (uptr)JITCompile @@ -1674,14 +1673,14 @@ static void __fastcall recRecompile( const u32 startpc ) pxAssert(s_pCurBlockEx); if (g_SkipBiosHack && HWADDR(startpc) == EELOAD_START) { - xCALL(eeloadReplaceOSDSYS); + xFastCall(eeloadReplaceOSDSYS); xCMP(ptr32[&cpuRegs.pc], startpc); xJNE(DispatcherReg); } // this is the only way patches get applied, doesn't depend on a hack if (HWADDR(startpc) == ElfEntry) { - xCALL(eeGameStarting); + xFastCall(eeGameStarting); // Apply patch as soon as possible. Normally it is done in // eeGameStarting but first block is already compiled. // @@ -1711,20 +1710,18 @@ static void __fastcall recRecompile( const u32 startpc ) // [TODO] : These must be enabled from the GUI or INI to be used, otherwise the // code that calls PreBlockCheck will not be generated. - xMOV(ecx, pc); - xCALL(PreBlockCheck); + xFastCall(PreBlockCheck, pc); } if (EmuConfig.Gamefixes.GoemonTlbHack) { if (pc == 0x33ad48 || pc == 0x35060c) { // 0x33ad48 and 0x35060c are the return address of the function (0x356250) that populate the TLB cache - xCALL(GoemonPreloadTlb); + xFastCall(GoemonPreloadTlb); } else if (pc == 0x3563b8) { // Game will unmap some virtual addresses. If a constant address were hardcoded in the block, we would be in a bad situation. AtomicExchange( eeRecNeedsReset, true ); // 0x3563b8 is the start address of the function that invalidate entry in TLB cache - xMOV(ecx, ptr[&cpuRegs.GPR.n.a0.UL[ 0 ] ]); - xCALL(GoemonUnloadTlb); + xFastCall(GoemonUnloadTlb, ptr[&cpuRegs.GPR.n.a0.UL[0]]); } } @@ -1745,7 +1742,7 @@ static void __fastcall recRecompile( const u32 startpc ) while(1) { BASEBLOCK* pblock = PC_GETBLOCK(i); - + // stop before breakpoints if (isBreakpointNeeded(i) != 0 || isMemcheckNeeded(i) != 0) { diff --git a/pcsx2/x86/ix86-32/iR5900Branch.cpp b/pcsx2/x86/ix86-32/iR5900Branch.cpp index dd8861ea76..e9c6cebe24 100644 --- a/pcsx2/x86/ix86-32/iR5900Branch.cpp +++ b/pcsx2/x86/ix86-32/iR5900Branch.cpp @@ -396,7 +396,7 @@ EERECOMPILE_CODE0(BNEL, XMMINFO_READS|XMMINFO_READT); // xMOV(ptr32[(u32*)((int)&cpuRegs.code)], cpuRegs.code ); // xMOV(ptr32[(u32*)((int)&cpuRegs.pc)], pc ); // iFlushCall(FLUSH_EVERYTHING); -// xCALL((void*)(int)BLTZAL ); +// xFastCall((void*)(int)BLTZAL ); // branch = 2; //} diff --git a/pcsx2/x86/ix86-32/recVTLB.cpp b/pcsx2/x86/ix86-32/recVTLB.cpp index aace596c2c..3b0eb0e797 100644 --- a/pcsx2/x86/ix86-32/recVTLB.cpp +++ b/pcsx2/x86/ix86-32/recVTLB.cpp @@ -310,7 +310,7 @@ void vtlb_dynarec_init() // jump to the indirect handler, which is a __fastcall C++ function. 
// [ecx is address, edx is data] - xCALL( ptr32[(eax*4) + vtlbdata.RWFT[bits][mode]] ); + xFastCall(ptr32[(eax*4) + vtlbdata.RWFT[bits][mode]], ecx, edx); if (!mode) { @@ -406,8 +406,7 @@ void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const ) } iFlushCall(FLUSH_FULLVTLB); - xMOV( ecx, paddr ); - xCALL( vtlbdata.RWFT[szidx][0][handler] ); + xFastCall( vtlbdata.RWFT[szidx][0][handler], paddr ); } } @@ -470,8 +469,7 @@ void vtlb_DynGenRead32_Const( u32 bits, bool sign, u32 addr_const ) else { iFlushCall(FLUSH_FULLVTLB); - xMOV( ecx, paddr ); - xCALL( vtlbdata.RWFT[szidx][0][handler] ); + xFastCall( vtlbdata.RWFT[szidx][0][handler], paddr ); // perform sign extension on the result: @@ -561,8 +559,7 @@ void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const ) } iFlushCall(FLUSH_FULLVTLB); - xMOV( ecx, paddr ); - xCALL( vtlbdata.RWFT[szidx][1][handler] ); + xFastCall( vtlbdata.RWFT[szidx][1][handler], paddr, edx ); } } diff --git a/pcsx2/x86/microVU_Branch.inl b/pcsx2/x86/microVU_Branch.inl index 11d588bb71..b26ec3f729 100644 --- a/pcsx2/x86/microVU_Branch.inl +++ b/pcsx2/x86/microVU_Branch.inl @@ -57,8 +57,8 @@ void mVUDTendProgram(mV, microFlagCycles* mFC, int isEbit) { mVU_XGKICK_DELAY(mVU); } if (doEarlyExit(mVU)) { - if (!isVU1) xCALL(mVU0clearlpStateJIT); - else xCALL(mVU1clearlpStateJIT); + if (!isVU1) xFastCall(mVU0clearlpStateJIT); + else xFastCall(mVU1clearlpStateJIT); } } @@ -117,9 +117,9 @@ void mVUendProgram(mV, microFlagCycles* mFC, int isEbit) { } if (doEarlyExit(mVU)) { if (!isVU1) - xCALL(mVU0clearlpStateJIT); + xFastCall(mVU0clearlpStateJIT); else - xCALL(mVU1clearlpStateJIT); + xFastCall(mVU1clearlpStateJIT); } } @@ -192,8 +192,8 @@ void normJumpCompile(mV, microFlagCycles& mFC, bool isEvilJump) { xJMP(mVU.exitFunct); } - if (!mVU.index) xCALL(mVUcompileJIT<0>); //(u32 startPC, uptr pState) - else xCALL(mVUcompileJIT<1>); + if (!mVU.index) xFastCall(mVUcompileJIT<0>, gprT2, gprT3); //(u32 startPC, uptr pState) + else xFastCall(mVUcompileJIT<1>, gprT2, gprT3); mVUrestoreRegs(mVU); xJMP(gprT1); // Jump to rec-code address diff --git a/pcsx2/x86/microVU_Compile.inl b/pcsx2/x86/microVU_Compile.inl index cb8318ab69..264527d6c5 100644 --- a/pcsx2/x86/microVU_Compile.inl +++ b/pcsx2/x86/microVU_Compile.inl @@ -194,10 +194,8 @@ __fi void handleBadOp(mV, int count) { #ifdef PCSX2_DEVBUILD if (mVUinfo.isBadOp) { mVUbackupRegs(mVU, true); - xMOV(gprT2, mVU.prog.cur->idx); - xMOV(gprT3, xPC); - if (!isVU1) xCALL(mVUbadOp0); - else xCALL(mVUbadOp1); + if (!isVU1) xFastCall(mVUbadOp0, mVU.prog.cur->idx, xPC); + else xFastCall(mVUbadOp1, mVU.prog.cur->idx, xPC); mVUrestoreRegs(mVU, true); } #endif @@ -345,9 +343,8 @@ void mVUsetCycles(mV) { void mVUdebugPrintBlocks(microVU& mVU, bool isEndPC) { if (mVUdebugNow) { mVUbackupRegs(mVU, true); - xMOV(gprT2, xPC); - if (isEndPC) xCALL(mVUprintPC2); - else xCALL(mVUprintPC1); + if (isEndPC) xFastCall(mVUprintPC2, xPC); + else xFastCall(mVUprintPC1, xPC); mVUrestoreRegs(mVU, true); } } @@ -375,9 +372,7 @@ void mVUtestCycles(microVU& mVU) { // TEST32ItoM((uptr)&mVU.regs().flags, VUFLAG_MFLAGSET); // xFowardJZ32 vu0jmp; // mVUbackupRegs(mVU, true); - // xMOV(gprT2, mVU.prog.cur->idx); - // xMOV(gprT3, xPC); - // xCALL(mVUwarning0); // VU0 is allowed early exit for COP2 Interlock Simulation + // xFastCall(mVUwarning0, mVU.prog.cur->idx, xPC); // VU0 is allowed early exit for COP2 Interlock Simulation // mVUrestoreRegs(mVU, true); mVUsavePipelineState(mVU); mVUendProgram(mVU, NULL, 0); @@ -385,9 +380,7 @@ void mVUtestCycles(microVU& mVU) { } else { 
mVUbackupRegs(mVU, true); - xMOV(gprT2, mVU.prog.cur->idx); - xMOV(gprT3, xPC); - xCALL(mVUwarning1); + xFastCall(mVUwarning1, mVU.prog.cur->idx, xPC); mVUrestoreRegs(mVU, true); mVUsavePipelineState(mVU); mVUendProgram(mVU, NULL, 0); diff --git a/pcsx2/x86/microVU_Execute.inl b/pcsx2/x86/microVU_Execute.inl index 0aaaeef15d..3c88f2e2e3 100644 --- a/pcsx2/x86/microVU_Execute.inl +++ b/pcsx2/x86/microVU_Execute.inl @@ -27,8 +27,8 @@ void mVUdispatcherAB(mV) { xScopedStackFrame frame(false, true); // __fastcall = The caller has already put the needed parameters in ecx/edx: - if (!isVU1) { xCALL(mVUexecuteVU0); } - else { xCALL(mVUexecuteVU1); } + if (!isVU1) { xFastCall(mVUexecuteVU0, ecx, edx); } + else { xFastCall(mVUexecuteVU1, ecx, edx); } // Load VU's MXCSR state xLDMXCSR(g_sseVUMXCSR); @@ -61,8 +61,8 @@ void mVUdispatcherAB(mV) { // __fastcall = The first two DWORD or smaller arguments are passed in ECX and EDX registers; // all other arguments are passed right to left. - if (!isVU1) { xCALL(mVUcleanUpVU0); } - else { xCALL(mVUcleanUpVU1); } + if (!isVU1) { xFastCall(mVUcleanUpVU0); } + else { xFastCall(mVUcleanUpVU1); } } xRET(); diff --git a/pcsx2/x86/microVU_Lower.inl b/pcsx2/x86/microVU_Lower.inl index 80bc07e995..ddb42cb26d 100644 --- a/pcsx2/x86/microVU_Lower.inl +++ b/pcsx2/x86/microVU_Lower.inl @@ -1219,8 +1219,7 @@ static __fi void mVU_XGKICK_DELAY(mV) { xMOV (ptr32[&mVU.resumePtrXG], (uptr)xGetPtr() + 10 + 6); xJcc32(Jcc_NotZero, (uptr)mVU.exitFunctXG - ((uptr)xGetPtr()+6)); #endif - xMOV(gprT2, ptr32[&mVU.VIxgkick]); - xCALL(mVU_XGKICK_); + xFastCall(mVU_XGKICK_, ptr32[&mVU.VIxgkick]); mVUrestoreRegs(mVU); } diff --git a/pcsx2/x86/microVU_Macro.inl b/pcsx2/x86/microVU_Macro.inl index a725ddc02f..214f9319e7 100644 --- a/pcsx2/x86/microVU_Macro.inl +++ b/pcsx2/x86/microVU_Macro.inl @@ -249,15 +249,15 @@ void recBC2TL() { _setupBranchTest(JZ32, true); } void COP2_Interlock(bool mBitSync) { if (cpuRegs.code & 1) { iFlushCall(FLUSH_EVERYTHING | FLUSH_PC); - if (mBitSync) xCALL(_vu0WaitMicro); - else xCALL(_vu0FinishMicro); + if (mBitSync) xFastCall(_vu0WaitMicro); + else xFastCall(_vu0FinishMicro); } } void TEST_FBRST_RESET(FnType_Void* resetFunct, int vuIndex) { xTEST(eax, (vuIndex) ? 0x200 : 0x002); xForwardJZ8 skip; - xCALL(resetFunct); + xFastCall(resetFunct); xMOV(eax, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]); skip.SetTarget(); } @@ -316,8 +316,8 @@ static void recCTC2() { xMOV(ecx, ptr32[&cpuRegs.GPR.r[_Rt_].UL[0]]); } else xXOR(ecx, ecx); - xCALL(vu1ExecMicro); - xCALL(vif1VUFinish); + xFastCall(vu1ExecMicro, ecx); + xFastCall(vif1VUFinish); break; case REG_FBRST: if (!_Rt_) { @@ -336,8 +336,7 @@ static void recCTC2() { // Executing vu0 block here fixes the intro of Ratchet and Clank // sVU's COP2 has a comment that "Donald Duck" needs this too... if (_Rd_) _eeMoveGPRtoM((uptr)&vu0Regs.VI[_Rd_].UL, _Rt_); - xMOV(ecx, (uptr)CpuVU0); - xCALL(BaseVUmicroCPU::ExecuteBlockJIT); + xFastCall(BaseVUmicroCPU::ExecuteBlockJIT, (uptr)CpuVU0); break; } }
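For reference, a minimal before/after sketch of the conversion pattern applied throughout this patch. The argument-register mapping comes from the XFASTCALL macros in jmpcall.h (ecx/edx under 32-bit __fastcall, rdi/rsi under the x86-64 SysV ABI), and the constants mirror the rpsxSYSCALL hunk above:

// Before: set up the __fastcall argument registers by hand, then emit the call.
xMOV( ecx, 0x20 );           // exception code
xMOV( edx, psxbranch == 1 ); // branch delay slot?
xCALL( psxException );

// After: one wrapper emits the same MOV/MOV/CALL sequence and picks the correct
// argument registers for the target ABI.
xFastCall( psxException, 0x20, psxbranch == 1 );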