diff --git a/pcsx2/IopMem.cpp b/pcsx2/IopMem.cpp
index 10989ac42c..f44df6ac9f 100644
--- a/pcsx2/IopMem.cpp
+++ b/pcsx2/IopMem.cpp
@@ -37,7 +37,7 @@ static const uint m_psxMemSize =
 void psxMemAlloc()
 {
 	if( m_psxAllMem == NULL )
-		m_psxAllMem = vtlb_malloc( m_psxMemSize, 4096, 0x21000000 );
+		m_psxAllMem = vtlb_malloc( m_psxMemSize, 4096 );
 
 	if( m_psxAllMem == NULL)
 		throw Exception::OutOfMemory( "psxMemAlloc > failed allocating memory for the IOP processor." );
 
diff --git a/pcsx2/Memory.cpp b/pcsx2/Memory.cpp
index aef7d3c627..a5eb775e2a 100644
--- a/pcsx2/Memory.cpp
+++ b/pcsx2/Memory.cpp
@@ -618,7 +618,7 @@ static u8* m_psAllMem = NULL;
 void memAlloc()
 {
 	if( m_psAllMem == NULL )
-		m_psAllMem = vtlb_malloc( m_allMemSize, 4096, 0x2400000 );
+		m_psAllMem = vtlb_malloc( m_allMemSize, 4096 );
 
 	if( m_psAllMem == NULL)
 		throw Exception::OutOfMemory( "memAlloc > failed to allocate PS2's base ram/rom/scratchpad." );
 
diff --git a/pcsx2/SPR.cpp b/pcsx2/SPR.cpp
index 5c5291fad0..8166ba3ba4 100644
--- a/pcsx2/SPR.cpp
+++ b/pcsx2/SPR.cpp
@@ -76,6 +76,9 @@ int _SPR0chain()
 {
 	memcpy_fast((u8*)pMem, &PS2MEM_SCRATCH[spr0->sadr & 0x3fff], spr0->qwc << 4);
 
+	// Clear dependent EE recompiler blocks, if necessary [needed for BTS protection system]
+	Cpu->Clear( spr0->madr, spr0->qwc << 2 );
+
 	// clear VU mem also!
 	TestClearVUs(spr0->madr, spr0->qwc << 2); // Wtf is going on here? AFAIK, only VIF should affect VU micromem (cottonvibes)
 
@@ -121,6 +124,7 @@ void _SPR0interleave()
 {
 		// clear VU mem also!
 		TestClearVUs(spr0->madr, spr0->qwc << 2);
+		Cpu->Clear( spr0->madr, spr0->qwc << 2 );
 
 		memcpy_fast((u8*)pMem, &PS2MEM_SCRATCH[spr0->sadr & 0x3fff], spr0->qwc << 4);
 	}
 	spr0->sadr += spr0->qwc * 16;
 
diff --git a/pcsx2/System.cpp b/pcsx2/System.cpp
index ee1a7e5731..8197d76f20 100644
--- a/pcsx2/System.cpp
+++ b/pcsx2/System.cpp
@@ -168,6 +168,7 @@ bool SysAllocateMem()
 	try
 	{
+		vtlb_Core_Alloc();
 		memAlloc();
 		psxMemAlloc();
 		vuMicroMemAlloc();
 
@@ -271,6 +272,7 @@ void SysShutdownMem()
 	vuMicroMemShutdown();
 	psxMemShutdown();
 	memShutdown();
+	vtlb_Core_Shutdown();
 }
 
 //////////////////////////////////////////////////////////////////////////////////////////
 
diff --git a/pcsx2/VUmicroMem.cpp b/pcsx2/VUmicroMem.cpp
index bdcef00d31..ce2e32add1 100644
--- a/pcsx2/VUmicroMem.cpp
+++ b/pcsx2/VUmicroMem.cpp
@@ -83,7 +83,7 @@ static const uint m_vuMemSize =
 void vuMicroMemAlloc()
 {
 	if( m_vuAllMem == NULL )
-		m_vuAllMem = vtlb_malloc( m_vuMemSize, 16, 0x28000000 );
+		m_vuAllMem = vtlb_malloc( m_vuMemSize, 16 );
 
 	if( m_vuAllMem == NULL )
 		throw Exception::OutOfMemory( "vuMicroMemInit > Failed to allocate VUmicro memory." );
 
diff --git a/pcsx2/vtlb.cpp b/pcsx2/vtlb.cpp
index f34df81bf4..058f1dcbed 100644
--- a/pcsx2/vtlb.cpp
+++ b/pcsx2/vtlb.cpp
@@ -61,7 +61,6 @@ vtlbHandler UnmappedVirtHandler1;
 vtlbHandler UnmappedPhyHandler0;
 vtlbHandler UnmappedPhyHandler1;
 
-
 /*
 	__asm
 	{
@@ -87,10 +86,22 @@ callfunction:
 		jmp [readfunctions8-0x800000+eax];
 }*/
 
-/////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////
 // Interpreter Implementations of VTLB Memory Operations.
 // See recVTLB.cpp for the dynarec versions.
 
+// ------------------------------------------------------------------------
+// Helper for the BTS manual protection system.  Sets a bit based on the given address,
+// marking that piece of PS2 memory as 'dirty.'
+
+static void memwritebits(u8* ptr)
+{
+	u32 offs=ptr-vtlbdata.alloc_base;
+	offs/=16;
+	vtlbdata.alloc_bits[offs/8] |= 1 << (offs%8);
+}
+
+// ------------------------------------------------------------------------
 // Interpreted VTLB lookup for 8, 16, and 32 bit accesses
 template< u32 DataSize, typename DataType >
 __forceinline DataType __fastcall MemOp_r0(u32 addr)
@@ -117,6 +128,7 @@ __forceinline DataType __fastcall MemOp_r0(u32 addr)
 	}
 }
 
+// ------------------------------------------------------------------------
 // Interpreted VTLB lookup for 64 and 128 bit accesses.
 template< u32 DataSize, typename DataType >
 __forceinline void __fastcall MemOp_r1(u32 addr, DataType* data)
@@ -148,6 +160,7 @@ __forceinline void __fastcall MemOp_r1(u32 addr, DataType* data)
 	}
 }
 
+// ------------------------------------------------------------------------
 template< u32 DataSize, typename DataType >
 __forceinline void __fastcall MemOp_w0(u32 addr, DataType data)
 {
@@ -155,6 +168,7 @@ __forceinline void __fastcall MemOp_w0(u32 addr, DataType data)
 	s32 ppf=addr+vmv;
 	if (!(ppf<0))
 	{
+		memwritebits((u8*)ppf);
 		*reinterpret_cast<DataType*>(ppf)=data;
 	}
 	else
@@ -174,6 +188,8 @@ __forceinline void __fastcall MemOp_w0(u32 addr, DataType data)
 		}
 	}
 }
+
+// ------------------------------------------------------------------------
 template< u32 DataSize, typename DataType >
 __forceinline void __fastcall MemOp_w1(u32 addr,const DataType* data)
 {
@@ -182,6 +198,7 @@ __forceinline void __fastcall MemOp_w1(u32 addr,const DataType* data)
 	s32 ppf=addr+vmv;
 	if (!(ppf<0))
 	{
+		memwritebits((u8*)ppf);
 		*reinterpret_cast<DataType*>(ppf)=*data;
 		if (DataSize==128)
 			*reinterpret_cast<DataType*>(ppf+8)=data[1];
@@ -202,7 +219,6 @@ __forceinline void __fastcall MemOp_w1(u32 addr,const DataType* data)
 	}
 }
 
-
 mem8_t __fastcall vtlb_memRead8(u32 mem)
 {
 	return MemOp_r0<8,mem8_t>(mem);
 
@@ -328,7 +344,7 @@ void __fastcall vtlbDefaultPhyWrite64(u32 addr,const mem64_t* data) { Console::E
 void __fastcall vtlbDefaultPhyWrite128(u32 addr,const mem128_t* data) { Console::Error("vtlbDefaultPhyWrite128: 0x%X",params addr); verify(false); }
 
-/////////////////////////////////////////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////////
 // VTLB Public API -- Init/Term/RegisterHandler stuff
 //
 
@@ -361,6 +377,7 @@ vtlbHandler vtlb_RegisterHandler( vtlbMemR8FP* r8,vtlbMemR16FP* r16,vtlbMemR32FP
 	return rv;
 }
 
+//////////////////////////////////////////////////////////////////////////////////////////
 // Maps the given handler (created with vtlb_RegisterHandler) to the specified memory region.
 // New mappings always assume priority over previous mappings, so place "generic" mappings for
 // large areas of memory first, and then specialize specific small regions of memory afterward.
 
@@ -500,7 +517,8 @@ void vtlb_VMapUnmap(u32 vaddr,u32 sz)
 	}
 }
 
-// Clears vtlb handlers and memory mappings.
+//////////////////////////////////////////////////////////////////////////////////////////
+// vtlb_Init -- Clears vtlb handlers and memory mappings.
 void vtlb_Init()
 {
 	vtlbHandlerCount=0;
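
// Aside: a standalone sketch of the bit math memwritebits() implements -- one 'dirty'
// bit per 16-byte quadword of the arena, eight such bits packed into each byte of
// alloc_bits.  The names here (arena, mark_dirty, is_dirty) are hypothetical stand-ins
// for illustration only, not part of the patch:

#include <cstdint>
#include <cassert>

static uint8_t arena[4096];           // stand-in for vtlbdata.alloc_base
static uint8_t dirty[4096 / 16 / 8];  // stand-in for vtlbdata.alloc_bits

static void mark_dirty(const uint8_t* ptr)
{
	uint32_t offs = (uint32_t)(ptr - arena) / 16;  // quadword index
	dirty[offs / 8] |= 1 << (offs % 8);            // byte/bit split, as in memwritebits
}

static bool is_dirty(const uint8_t* ptr)
{
	uint32_t offs = (uint32_t)(ptr - arena) / 16;
	return (dirty[offs / 8] >> (offs % 8)) & 1;
}

int main()
{
	mark_dirty(&arena[0x130]);         // quadword 19 -> byte 2, bit 3
	assert(is_dirty(&arena[0x13f]));   // same 16-byte block reads back dirty
	assert(!is_dirty(&arena[0x140]));  // the next block is untouched
	return 0;
}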
@@ -540,7 +558,8 @@ void vtlb_Init()
 	vtlb_VMapUnmap((VTLB_VMAP_ITEMS-1)*VTLB_PAGE_SIZE,VTLB_PAGE_SIZE);
 }
 
-// Performs a COP0-level reset of the PS2's TLB.
+//////////////////////////////////////////////////////////////////////////////////////////
+// vtlb_Reset -- Performs a COP0-level reset of the PS2's TLB.
 // This function should probably be part of the COP0 rather than here in VTLB.
 void vtlb_Reset()
 {
@@ -552,30 +571,65 @@ void vtlb_Term()
 	//nothing to do for now
 }
 
+//////////////////////////////////////////////////////////////////////////////////////////
+// Reserves the vtlb core allocation used by various emulation components!
+//
+void vtlb_Core_Alloc()
+{
+	if( vtlbdata.alloc_base != NULL ) return;
+
+	vtlbdata.alloc_current = 0;
+
+#ifdef __LINUX__
+	vtlbdata.alloc_base = SysMmapEx( 0x16000000, VTLB_ALLOC_SIZE, 0x80000000, "Vtlb" );
+#else
+	// Win32 just needs this, since malloc always maps below 2GB.
+	vtlbdata.alloc_base = (u8*)_aligned_malloc( VTLB_ALLOC_SIZE, 4096 );
+	if( vtlbdata.alloc_base == NULL )
+		throw Exception::OutOfMemory( "Fatal Error: could not allocate 42Meg buffer for PS2's mappable system ram." );
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+//
+void vtlb_Core_Shutdown()
+{
+	if( vtlbdata.alloc_base == NULL ) return;
+
+#ifdef __LINUX__
+	SafeSysMunmap( vtlbdata.alloc_base, VTLB_ALLOC_SIZE );
+#else
+	// Make sure to unprotect the memory first, since CrtDebug will try to write to it.
+	HostSys::MemProtect( vtlbdata.alloc_base, VTLB_ALLOC_SIZE, Protect_ReadWrite );
+	safe_aligned_free( vtlbdata.alloc_base );
+#endif
+
+}
+
+//////////////////////////////////////////////////////////////////////////////////////////
+// This function allocates a memory block that is compatible with the Vtlb's requirements
 // for memory locations.  The Vtlb requires the topmost bit (Sign bit) of the memory
 // pointer to be cleared.  Some operating systems and/or implementations of malloc do that,
 // but others do not.  So use this instead to allocate the memory correctly for your
 // platform.
-u8* vtlb_malloc( uint size, uint align, uptr tryBaseAddress )
+//
+u8* vtlb_malloc( uint size, uint align )
 {
-#ifdef __LINUX__
-	return SysMmapEx( tryBaseAddress, size, 0x80000000, "Vtlb" );
-#else
-	// Win32 just needs this, since malloc always maps below 2GB.
-	return (u8*)_aligned_malloc(size, align);
-#endif
+	vtlbdata.alloc_current += align-1;
+	vtlbdata.alloc_current &= ~(align-1);
+
+	int rv = vtlbdata.alloc_current;
+	vtlbdata.alloc_current += size;
+	return &vtlbdata.alloc_base[rv];
 }
 
+//////////////////////////////////////////////////////////////////////////////////////////
+//
 void vtlb_free( void* pmem, uint size )
 {
-	if( pmem == NULL ) return;
-
-#ifdef __LINUX__
-	SafeSysMunmap( pmem, size );
-#else
-	// Make sure and unprotect memory first, since CrtDebug will try to write to it.
-	HostSys::MemProtect( pmem, size, Protect_ReadWrite );
-	safe_aligned_free( pmem );
-#endif
+	// Does nothing anymore!  Alloc/dealloc is now handled by vtlb_Core_Alloc /
+	// vtlb_Core_Shutdown.  The stub is left in place in case it becomes useful again
+	// at a later date.
+
+	return;
 }
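
// Aside: a minimal sketch of the bump-allocation scheme the new vtlb_malloc uses --
// round the cursor up to 'align' (a power of two), hand out the slice, never free.
// The arena/arena_pos names and the allocation sizes below are hypothetical; the real
// arena is vtlbdata.alloc_base, VTLB_ALLOC_SIZE bytes long:

#include <cstdint>
#include <cstdio>

static uint8_t  arena[0x2900000];
static uint32_t arena_pos = 0;

static uint8_t* bump_alloc(uint32_t size, uint32_t align)
{
	arena_pos = (arena_pos + align - 1) & ~(align - 1);  // align-up, as in vtlb_malloc
	uint8_t* rv = &arena[arena_pos];
	arena_pos += size;                                   // bump past the slice
	return rv;
}

int main()
{
	// Mirrors the call order in SysAllocateMem: EE ram/rom, IOP, then VU micromem.
	uint8_t* ee  = bump_alloc(0x2400000, 4096);
	uint8_t* iop = bump_alloc(0x211000, 4096);  // sizes illustrative only
	uint8_t* vu  = bump_alloc(0x14000, 16);
	printf("ee=+0x%x iop=+0x%x vu=+0x%x used=0x%x\n",
		(unsigned)(ee-arena), (unsigned)(iop-arena), (unsigned)(vu-arena), arena_pos);
	return 0;
}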
diff --git a/pcsx2/vtlb.h b/pcsx2/vtlb.h
index 5571d6b835..0eaaf5d90e 100644
--- a/pcsx2/vtlb.h
+++ b/pcsx2/vtlb.h
@@ -23,10 +23,12 @@ typedef void __fastcall vtlbMemW128FP(u32 addr,const mem128_t* data);
 
 typedef u32 vtlbHandler;
 
+extern void vtlb_Core_Alloc();
+extern void vtlb_Core_Shutdown();
 extern void vtlb_Init();
 extern void vtlb_Reset();
 extern void vtlb_Term();
-extern u8* vtlb_malloc( uint size, uint align, uptr tryBaseAddress );
+extern u8* vtlb_malloc( uint size, uint align );
 extern void vtlb_free( void* pmem, uint size );
 
@@ -67,6 +69,8 @@ extern void vtlb_DynGenRead32_Const( u32 bits, bool sign, u32 addr_const );
 
 namespace vtlb_private
 {
+	static const uint VTLB_ALLOC_SIZE = 0x2900000;	//this is a bit more than required
+
 	static const uint VTLB_PAGE_BITS = 12;
 	static const uint VTLB_PAGE_MASK = 4095;
 	static const uint VTLB_PAGE_SIZE = 4096;
 
@@ -77,6 +81,11 @@ namespace vtlb_private
 
 	struct MapData
 	{
+		u8 alloc_bits[VTLB_ALLOC_SIZE/16/8];
+
+		u8* alloc_base;		//base of the memory array
+		int alloc_current;	//current base
+
 		s32 pmap[VTLB_PMAP_ITEMS];	//512KB
 		s32 vmap[VTLB_VMAP_ITEMS];   //4MB
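
// Aside: a quick size sanity-check for the alloc_bits table added to MapData above --
// one bit tracks one 16-byte quadword, so the table costs 1/128th of the arena
// (0x52000 bytes, i.e. 328 KiB, for the 0x2900000-byte arena).  VTLB_ALLOC_SIZE here
// simply mirrors the value from vtlb.h:

#include <cstdint>

static const uint32_t VTLB_ALLOC_SIZE = 0x2900000;
static uint8_t alloc_bits[VTLB_ALLOC_SIZE / 16 / 8];

static_assert(sizeof(alloc_bits) == 0x52000, "one dirty bit per 16 bytes of arena");

int main() { return 0; }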
diff --git a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
index d8a1aa780e..f11c2294aa 100644
--- a/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
+++ b/pcsx2/windows/VCprojects/pcsx2_2008.vcproj
@@ -2883,7 +2883,7 @@
 [vcproj XML hunk and the following diff's file header lost in extraction]
 if (LOWORD(wParam) >= ID_LANGS && LOWORD(wParam) <= (ID_LANGS + langsMax))
@@ -989,9 +985,7 @@ void CreateMainMenu() {
 	ADDMENUITEM(0,_("Print cdvd &Info"), ID_CDVDPRINT);
 	ADDMENUITEM(0,_("Close GS Window on Esc"), ID_CLOSEGS);
 	ADDSEPARATOR(0);
-#ifndef _DEBUG
 	ADDMENUITEM(0,_("Enable &Profiler"), ID_PROFILER);
-#endif
 	ADDMENUITEM(0,_("Enable &Patches"), ID_PATCHES);
 	ADDMENUITEM(0,_("Enable &Console"), ID_CONSOLE);
 	ADDSEPARATOR(0);
 
diff --git a/pcsx2/windows/debugger.rc b/pcsx2/windows/debugger.rc
index 23fc66572d..6bb2e5b2bd 100644
--- a/pcsx2/windows/debugger.rc
+++ b/pcsx2/windows/debugger.rc
@@ -7,7 +7,8 @@
 //
 // Generated from the TEXTINCLUDE 2 resource.
 //
-#include "afxresmw.h"
+#include "afxresmw.h"
+
 /////////////////////////////////////////////////////////////////////////////
 #undef APSTUDIO_READONLY_SYMBOLS
 
@@ -899,7 +900,8 @@ END
 //
 // Generated from the TEXTINCLUDE 3 resource.
 //
-
+
+
 /////////////////////////////////////////////////////////////////////////////
 #endif    // not APSTUDIO_INVOKED
 
diff --git a/pcsx2/x86/ix86-32/iR5900-32.cpp b/pcsx2/x86/ix86-32/iR5900-32.cpp
index 36bbfb7bfc..67c1b1e8ce 100644
--- a/pcsx2/x86/ix86-32/iR5900-32.cpp
+++ b/pcsx2/x86/ix86-32/iR5900-32.cpp
@@ -418,6 +418,9 @@ static void recAlloc()
 	x86FpuState = FPU_STATE;
 }
 
+PCSX2_ALIGNED16( static u16 manual_page[Ps2MemSize::Base >> 12] );
+PCSX2_ALIGNED16( static u8 manual_counter[Ps2MemSize::Base >> 12] );
+
 ////////////////////////////////////////////////////
 void recResetEE( void )
 {
@@ -427,6 +430,8 @@ void recResetEE( void )
 	memset_8<0xcc, REC_CACHEMEM>(recMem); // 0xcc is INT3
 	memzero_ptr( m_recBlockAlloc );
+	memzero_obj( manual_page );
+	memzero_obj( manual_counter );
 	ClearRecLUT((BASEBLOCK*)m_recBlockAlloc,
 		(((Ps2MemSize::Base + Ps2MemSize::Rom + Ps2MemSize::Rom1) / 4)));
 
@@ -720,7 +725,6 @@ static void ClearRecLUT(BASEBLOCK* base, int count)
 		base[i].SetFnptr((uptr)JITCompile);
 }
 
-// Returns the offset to the next instruction after any cleared memory
 void recClear(u32 addr, u32 size)
 {
 	BASEBLOCKEX* pexblock;
 
@@ -1256,14 +1260,16 @@ void badespfn() {
 
 void __fastcall dyna_block_discard(u32 start,u32 sz)
 {
-	DevCon::WriteLn("dyna_block_discard %08X , count %d", params start,sz);
-	Cpu->Clear(start,sz);
+	DevCon::WriteLn("dyna_block_discard .. start: %08X count=%d", params start,sz);
+	Cpu->Clear(start, sz);
 }
 
-void __fastcall dyna_block_reset(u32 start,u32 sz)
+
+void __fastcall dyna_page_reset(u32 start,u32 sz)
 {
-	DevCon::WriteLn("dyna_block_reset %08X , count %d", params start,sz);
+	DevCon::WriteLn("dyna_page_reset .. start=%08X count=%d", params start,sz);
 	Cpu->Clear(start & ~0xfffUL, 0x400);
+	manual_counter[start >> 10]++;
 	mmap_MarkCountedRamPage(PSM(start), start & ~0xfffUL);
 }
 
@@ -1490,98 +1496,6 @@ StartRecomp:
 			// instruction being analyzed.
 			if( usecop2 ) vucycle++;
 
-			// peephole optimizations //
-#ifdef PCSX2_VM_COISSUE
-			if( i < s_nEndBlock-4 && recompileCodeSafe(i) ) {
-				u32 curcode = cpuRegs.code;
-				u32 nextcode = *(u32*)PSM(i+4);
-				if( _eeIsLoadStoreCoIssue(curcode, nextcode) && recBSC_co[curcode>>26] != NULL ) {
-
-					// rs has to be the same, and cannot be just written
-					if( ((curcode >> 21) & 0x1F) == ((nextcode >> 21) & 0x1F) && !_eeLoadWritesRs(curcode) ) {
-
-						if( _eeIsLoadStoreCoX(curcode) && ((nextcode>>16)&0x1f) != ((curcode>>21)&0x1f) ) {
-							// see how many stores there are
-							u32 j;
-							// use xmmregs since only supporting lwc1,lq,swc1,sq
-							for(j = i+8; j < s_nEndBlock && j < i+4*iREGCNT_XMM; j += 4 ) {
-								u32 nncode = *(u32*)PSM(j);
-								if( (nncode>>26) != (curcode>>26) || ((curcode>>21)&0x1f) != ((nncode>>21)&0x1f) ||
-									_eeLoadWritesRs(nncode))
-									break;
-							}
-
-							if( j > i+8 ) {
-								u32 num = (j-i)>>2; // number of stores that can coissue
-								assert( num <= iREGCNT_XMM );
-
-								g_pCurInstInfo[0].numpeeps = num-1;
-								g_pCurInstInfo[0].info |= EEINSTINFO_COREC;
-
-								while(i < j-4) {
-									g_pCurInstInfo++;
-									g_pCurInstInfo[0].info |= EEINSTINFO_NOREC;
-									i += 4;
-								}
-
-								continue;
-							}
-
-							// fall through
-						}
-
-						// unaligned loadstores
-
-						// if LWL, check if LWR and that offsets are +3 away
-						switch(curcode >> 26) {
-							case 0x22: // LWL
-								if( (nextcode>>26) != 0x26 || ((s16)nextcode)+3 != (s16)curcode )
-									continue;
-								break;
-							case 0x26: // LWR
-								if( (nextcode>>26) != 0x22 || ((s16)nextcode) != (s16)curcode+3 )
-									continue;
-								break;
-
-							case 0x2a: // SWL
-								if( (nextcode>>26) != 0x2e || ((s16)nextcode)+3 != (s16)curcode )
-									continue;
-								break;
-							case 0x2e: // SWR
-								if( (nextcode>>26) != 0x2a || ((s16)nextcode) != (s16)curcode+3 )
-									continue;
-								break;
-
-							case 0x1a: // LDL
-								if( (nextcode>>26) != 0x1b || ((s16)nextcode)+7 != (s16)curcode )
-									continue;
-								break;
-							case 0x1b: // LDR
-								if( (nextcode>>26) != 0x1a || ((s16)nextcode) != (s16)curcode+7 )
-									continue;
-								break;
-
-							case 0x2c: // SDL
-								if( (nextcode>>26) != 0x2d || ((s16)nextcode)+7 != (s16)curcode )
-									continue;
-								break;
-							case 0x2d: // SDR
-								if( (nextcode>>26) != 0x2c || ((s16)nextcode) != (s16)curcode+7 )
-									continue;
-								break;
-						}
-
-						// good enough
-						g_pCurInstInfo[0].info |= EEINSTINFO_COREC;
-						g_pCurInstInfo[0].numpeeps = 1;
-						g_pCurInstInfo[1].info |= EEINSTINFO_NOREC;
-						g_pCurInstInfo++;
-						i += 4;
-						continue;
-					}
-				}
-			}
-#endif // end peephole
 		}
 	// This *is* important because g_pCurInstInfo is checked a bit later on and
 	// if it's not equal to s_pInstCache it handles recompilation differently.
 
@@ -1611,7 +1525,6 @@ StartRecomp:
 		iDumpBlock(startpc, recPtr);
 #endif
 
-	static u16 manual_page[Ps2MemSize::Base >> 12];
 	u32 sz=(s_nEndBlock-startpc)>>2;
 	u32 inpage_ptr=HWADDR(startpc);
 
@@ -1631,31 +1544,76 @@ StartRecomp:
 			}
 			else
 			{
+				// import the vtlbdata (alloc_bits, alloc_base, etc.):
+				using namespace vtlb_private;
+
 				MOV32ItoR(ECX, inpage_ptr);
 				MOV32ItoR(EDX, pgsz);
+
+				u32 mask=0;
+				u32 writen=0;
+				u32 writen_start=0;
 				u32 lpc=inpage_ptr;
 				u32 stg=pgsz;
+
 				while(stg>0)
 				{
-					// was dyna_block_discard_recmem.  See note in recResetEE for details.
-					CMP32ItoM((uptr)PSM(lpc),*(u32*)PSM(lpc));
-					JNE32(((u32)&dyna_block_discard)- ( (u32)x86Ptr + 6 ));
+					u32 bit = (lpc>>4) & 7;
+					if (mask==0)
+					{
+						//writen=bit;
+						writen_start=(((u8*)PSM(lpc)-vtlbdata.alloc_base)>>4)/8;
+					}
+					mask |= 1 << bit;
 
-					stg-=4;
-					lpc+=4;
+					if (bit==31)
+					{
+						vtlbdata.alloc_bits[writen_start]&=~mask;
+						xTEST( ptr32[&vtlbdata.alloc_bits[writen_start]], mask );	// auto-optimizes to imm8 when applicable.
+						xJNZ( dyna_block_discard );
+						//SysPrintf("%08X %d %d\n",mask,pgsz,pgsz>>4);
+						mask = 0;
+					}
+
+					//writen++;
+
+					if (stg<=16)
+					{
+						lpc += stg;
+						stg = 0;
+					}
+					else
+					{
+						lpc += 16;
+						stg -= 16;
+					}
 				}
-				if (startpc != 0x81fc0) {
+
+				if (mask)
+				{
+					vtlbdata.alloc_bits[writen_start] &= ~mask;
+					xTEST( ptr32[&vtlbdata.alloc_bits[writen_start]], mask );	// auto-optimizes to imm8 when applicable.
+					xJNZ( dyna_block_discard );
+					//SysPrintf("%08X %d %d\n",mask,pgsz,pgsz>>4);
+					mask = 0;
+				}
+
+				if( startpc != 0x81fc0 && manual_counter[inpage_ptr >> 12] <= 4 )
+				{
+					// Reset the page a limited number of times only; after that it
+					// stays a manually-checked block, which avoids reset thrashing.
+
 					xADD(ptr16[&manual_page[inpage_ptr >> 12]], 1);
-					xJC( dyna_block_reset );
+					xJC( dyna_page_reset );
 				}
 
 				DbgCon::WriteLn("Manual block @ %08X : %08X %d %d %d %d", params startpc,inpage_ptr,pgsz,0x1000-inpage_offs,inpage_sz,sz*4);
 			}
 		}
-		inpage_ptr+=pgsz;
-		inpage_sz-=pgsz;
+		inpage_ptr += pgsz;
+		inpage_sz -= pgsz;
 	}
 
 	// finally recompile //
 
diff --git a/pcsx2/x86/ix86-32/recVTLB.cpp b/pcsx2/x86/ix86-32/recVTLB.cpp
index 19e8fe8408..2f43fa3686 100644
--- a/pcsx2/x86/ix86-32/recVTLB.cpp
+++ b/pcsx2/x86/ix86-32/recVTLB.cpp
@@ -23,31 +23,88 @@
 #include "iCore.h"
 #include "iR5900.h"
+#include "ix86\ix86_internal.h"
 
 using namespace vtlb_private;
 using namespace x86Emitter;
 
-// NOTICE: This function *destroys* EAX!!
-// Moves 128 bits of memory from the source register ptr to the dest register ptr.
-// (used as an equivalent to movaps, when a free XMM register is unavailable for some reason)
-void MOV128_MtoM( x86IntRegType destRm, x86IntRegType srcRm )
+//////////////////////////////////////////////////////////////////////////////////////////
+// iAllocRegSSE -- allocates an xmm register.  If no xmm register is available, xmm0 is
+// saved into g_globalXMMData and returned as a free register.
+//
+class iAllocRegSSE
 {
-	// (this is one of my test cases for the new emitter --air)
+protected:
+	xRegisterSSE m_reg;
+	bool m_free;
 
-	xAddressReg src( srcRm );
-	xAddressReg dest( destRm );
+public:
+	iAllocRegSSE() :
+		m_reg( xmm0 ),
+		m_free( !!_hasFreeXMMreg() )
+	{
+		if( m_free )
+			m_reg = xRegisterSSE( _allocTempXMMreg( XMMT_INT, -1 ) );
+		else
+			xStoreReg( m_reg );
+	}
 
-	xMOV( eax, ptr[src] );
-	xMOV( ptr[dest], eax );
+	~iAllocRegSSE()
+	{
+		if( m_free )
+			_freeXMMreg( m_reg.Id );
+		else
+			xRestoreReg( m_reg );
+	}
+
+	operator xRegisterSSE() const { return m_reg; }
+};
 
-	xMOV( eax, ptr[src+4] );
-	xMOV( ptr[dest+4], eax );
+//////////////////////////////////////////////////////////////////////////////////////////
+// Moves 128 bits from point B to point A, using SSE's MOVAPS (or MOVDQA).
+// This instruction always uses an SSE register, even if all registers are allocated!  It
+// saves an SSE register to memory first, performs the copy, and restores the register.
+//
+void iMOV128_SSE( const ModSibBase& destRm, const ModSibBase& srcRm )
+{
+	iAllocRegSSE reg;
+	xMOVDQA( reg, srcRm );
+	xMOVDQA( destRm, reg );
+}
 
-	xMOV( eax, ptr[src+8] );
-	xMOV( ptr[dest+8], eax );
+//////////////////////////////////////////////////////////////////////////////////////////
+// Moves 64 bits of data from point B to point A, using either MMX, SSE, or x86 registers
+// if neither MMX nor SSE is available to the task.
+//
+// Optimizations: This method uses MMX if the cpu is in MMX mode, or SSE if it's in FPU
+// mode (saving on potential EMMS uses).
+//
+void iMOV64_Smart( const ModSibBase& destRm, const ModSibBase& srcRm )
+{
+	if( (x86FpuState == FPU_STATE) && _hasFreeXMMreg() )
+	{
+		// Move things using MOVLPS:
+		xRegisterSSE reg( _allocTempXMMreg( XMMT_INT, -1 ) );
+		xMOVL.PS( reg, srcRm );
+		xMOVL.PS( destRm, reg );
+		_freeXMMreg( reg.Id );
+		return;
+	}
 
-	xMOV( eax, ptr[src+12] );
-	xMOV( ptr[dest+12], eax );
+	if( _hasFreeMMXreg() )
+	{
+		xRegisterMMX reg( _allocMMXreg(-1, MMX_TEMP, 0) );
+		xMOVQ( reg, srcRm );
+		xMOVQ( destRm, reg );
+		_freeMMXreg( reg.Id );
+	}
+	else
+	{
+		xMOV( eax, srcRm );
+		xMOV( destRm, eax );
+		xMOV( eax, srcRm+4 );
+		xMOV( destRm+4, eax );
+	}
 }
 
 /*
@@ -127,38 +184,11 @@ static void _vtlb_DynGen_DirectRead( u32 bits, bool sign )
 		break;
 
 	case 64:
-		if( _hasFreeMMXreg() )
-		{
-			const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
-			MOVQRmtoR(freereg,ECX);
-			MOVQRtoRm(EDX,freereg);
-			_freeMMXreg(freereg);
-		}
-		else
-		{
-			MOV32RmtoR(EAX,ECX);
-			MOV32RtoRm(EDX,EAX);
-
-			MOV32RmtoR(EAX,ECX,4);
-			MOV32RtoRm(EDX,EAX,4);
-		}
+		iMOV64_Smart(ptr[edx],ptr[ecx]);
 		break;
 
	case 128:
-		if( _hasFreeXMMreg() )
-		{
-			const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
-			SSE2_MOVDQARmtoR(freereg,ECX);
-			SSE2_MOVDQARtoRm(EDX,freereg);
-			_freeXMMreg(freereg);
-		}
-		else
-		{
-			// Could put in an MMX optimization here as well, but no point really.
-			// It's almost never used since there's almost always a free XMM reg.
-
-			MOV128_MtoM( EDX, ECX );		// dest <- src!
-		}
+		iMOV128_SSE(ptr[edx],ptr[ecx]);
 		break;
 
 		jNO_DEFAULT
 
@@ -262,39 +292,11 @@ void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const )
 		switch( bits )
 		{
 			case 64:
-				if( _hasFreeMMXreg() )
-				{
-					const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
-					MOVQMtoR(freereg,ppf);
-					MOVQRtoRm(EDX,freereg);
-					_freeMMXreg(freereg);
-				}
-				else
-				{
-					MOV32MtoR(EAX,ppf);
-					MOV32RtoRm(EDX,EAX);
-
-					MOV32MtoR(EAX,ppf+4);
-					MOV32RtoRm(EDX,EAX,4);
-				}
+				iMOV64_Smart(ptr[edx],ptr[ppf]);
 			break;
 
 			case 128:
-				if( _hasFreeXMMreg() )
-				{
-					const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
-					SSE2_MOVDQA_M128_to_XMM( freereg, ppf );
-					SSE2_MOVDQARtoRm(EDX,freereg);
-					_freeXMMreg(freereg);
-				}
-				else
-				{
-					// Could put in an MMX optimization here as well, but no point really.
-					// It's almost never used since there's almost always a free XMM reg.
-
-					MOV32ItoR( ECX, ppf );
-					MOV128_MtoM( EDX, ECX );		// dest <- src!
-				}
+				iMOV128_SSE(ptr[edx],ptr[ppf]);
 			break;
 
 			jNO_DEFAULT
 
@@ -415,40 +417,21 @@ static void _vtlb_DynGen_DirectWrite( u32 bits )
 		break;
 
 	case 64:
-		if( _hasFreeMMXreg() )
-		{
-			const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
-			MOVQRmtoR(freereg,EDX);
-			MOVQRtoRm(ECX,freereg);
-			_freeMMXreg( freereg );
-		}
-		else
-		{
-			MOV32RmtoR(EAX,EDX);
-			MOV32RtoRm(ECX,EAX);
-
-			MOV32RmtoR(EAX,EDX,4);
-			MOV32RtoRm(ECX,EAX,4);
-		}
+		iMOV64_Smart(ptr[ecx],ptr[edx]);
 		break;
 
 	case 128:
-		if( _hasFreeXMMreg() )
-		{
-			const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
-			SSE2_MOVDQARmtoR(freereg,EDX);
-			SSE2_MOVDQARtoRm(ECX,freereg);
-			_freeXMMreg( freereg );
-		}
-		else
-		{
-			// Could put in an MMX optimization here as well, but no point really.
-			// It's almost never used since there's almost always a free XMM reg.
-
-			MOV128_MtoM( ECX, EDX );		// dest <- src!
-		}
+		iMOV128_SSE(ptr[ecx],ptr[edx]);
 		break;
 	}
+
+	xSHR( ecx, 4 );
+
+	uptr alloc_base = (uptr)vtlbdata.alloc_base;
+	u8* bits_base = vtlbdata.alloc_bits;
+	bits_base -= (alloc_base>>4)/8;		//in bytes
+
+	xBTS( ecx, bits_base );
 }
 
 // ------------------------------------------------------------------------
@@ -514,39 +497,11 @@ void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const )
 			break;
 
 			case 64:
-				if( _hasFreeMMXreg() )
-				{
-					const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
-					MOVQRmtoR(freereg,EDX);
-					MOVQRtoM(ppf,freereg);
-					_freeMMXreg( freereg );
-				}
-				else
-				{
-					MOV32RmtoR(EAX,EDX);
-					MOV32RtoM(ppf,EAX);
-
-					MOV32RmtoR(EAX,EDX,4);
-					MOV32RtoM(ppf+4,EAX);
-				}
+				iMOV64_Smart( ptr[ppf], ptr[edx] );
 			break;
 
 			case 128:
-				if( _hasFreeXMMreg() )
-				{
-					const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
-					SSE2_MOVDQARmtoR(freereg,EDX);
-					SSE2_MOVDQA_XMM_to_M128(ppf,freereg);
-					_freeXMMreg( freereg );
-				}
-				else
-				{
-					// Could put in an MMX optimization here as well, but no point really.
-					// It's almost never used since there's almost always a free XMM reg.
-
-					MOV32ItoR( ECX, ppf );
-					MOV128_MtoM( ECX, EDX );		// dest <- src!
-				}
+				iMOV128_SSE( ptr[ppf], ptr[edx] );
 			break;
 		}
 
@@ -571,3 +526,4 @@ void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const )
 		CALLFunc( (int)vtlbdata.RWFT[szidx][1][handler] );
 	}
 }
+
diff --git a/pcsx2/x86/ix86/ix86_instructions.h b/pcsx2/x86/ix86/ix86_instructions.h
index 783adc6435..230ca93a5e 100644
--- a/pcsx2/x86/ix86/ix86_instructions.h
+++ b/pcsx2/x86/ix86/ix86_instructions.h
@@ -35,6 +35,9 @@
 namespace x86Emitter
 {
+	extern void xStoreReg( const xRegisterSSE& src );
+	extern void xRestoreReg( const xRegisterSSE& dest );
+
 	// ------------------------------------------------------------------------
 	// Group 1 Instruction Class
 
diff --git a/pcsx2/x86/ix86/ix86_legacy_instructions.h b/pcsx2/x86/ix86/ix86_legacy_instructions.h
index 8e41e58f4c..ad91a6a51f 100644
--- a/pcsx2/x86/ix86/ix86_legacy_instructions.h
+++ b/pcsx2/x86/ix86/ix86_legacy_instructions.h
@@ -677,8 +677,6 @@ extern void CDQE( void );
 extern void LAHF();
 extern void SAHF();
 
-extern void BT32ItoR( x86IntRegType to, u8 from );
-extern void BTR32ItoR( x86IntRegType to, u8 from );
 extern void BSRRtoR(x86IntRegType to, x86IntRegType from);
 extern void BSWAP32R( x86IntRegType to );
 
diff --git a/pcsx2/x86/ix86/ix86_tools.cpp b/pcsx2/x86/ix86/ix86_tools.cpp
index 91a37d1333..f4abbb3231 100644
--- a/pcsx2/x86/ix86/ix86_tools.cpp
+++ b/pcsx2/x86/ix86/ix86_tools.cpp
@@ -30,9 +30,22 @@ u8 g_globalXMMSaved = 0;
 PCSX2_ALIGNED16( static u64 g_globalMMXData[8] );
 PCSX2_ALIGNED16( static u64 g_globalXMMData[2*iREGCNT_XMM] );
 
+namespace x86Emitter
+{
+	void xStoreReg( const xRegisterSSE& src )
+	{
+		xMOVDQA( &g_globalXMMData[src.Id], src );
+	}
+
+	void xRestoreReg( const xRegisterSSE& dest )
+	{
+		xMOVDQA( dest, &g_globalXMMData[dest.Id] );
+	}
+}
+
 /////////////////////////////////////////////////////////////////////
-// SetCPUState -- for assugnment of SSE roundmodes and clampmodes.
+// SetCPUState -- for assignment of SSE roundmodes and clampmodes.
 
 u32 g_sseMXCSR = DEFAULT_sseMXCSR;
 u32 g_sseVUMXCSR = DEFAULT_sseVUMXCSR;
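
// Aside: why the pre-biased bits_base in _vtlb_DynGen_DirectWrite works.  BTS treats
// its memory operand as a flat bit array, so with ecx = host_addr >> 4 it touches
// byte (host_addr>>4)/8 past the biased base -- and because alloc_base is 4096-byte
// aligned, that lands on exactly the bit memwritebits() would set.  A sketch with
// hypothetical values checking the arithmetic:

#include <cstdint>
#include <cassert>

int main()
{
	const uintptr_t alloc_base = 0x20004000;  // any 4096-aligned host address
	const uintptr_t host_addr  = alloc_base + 0x12345;

	// Interpreter path, as in memwritebits(): arena-relative quadword index.
	uintptr_t offs     = (host_addr - alloc_base) / 16;
	uintptr_t int_byte = offs / 8;
	uintptr_t int_bit  = offs % 8;

	// Recompiler path: flat bit index (host_addr >> 4) against a base pointer
	// pre-biased by -(alloc_base>>4)/8 bytes, as done just before the xBTS.
	uintptr_t bts_index = host_addr >> 4;
	uintptr_t bts_byte  = bts_index / 8 - (alloc_base >> 4) / 8;
	uintptr_t bts_bit   = bts_index % 8;

	// alloc_base's 4096-byte alignment makes (alloc_base>>4) a multiple of 8,
	// so both paths select the same byte and bit of alloc_bits.
	assert(int_byte == bts_byte && int_bit == bts_bit);
	return 0;
}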