Merged drk||Raziel's "BTS Manual Protection" enhancement for the vtlb into /trunk, and combined it with Pseudonim's "Manual Block Clear" enhancement for an ideal two-phase protection system.

Most things should be a bit faster with this new system.  The system is more balanced than the previous one, in that it provides a better overall performance across most games, but some specific FMVs (like Disgaea 2's) will be a bit slower.  On the other hand, others like DQ8 and Kingdom Hearts 2 FMVs get a big speedup.  Almost all in-game stuff should be either the same or faster now.

Set a bunch of ignores for TortoiseSVN users, as suggested in Issue 166.

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@1083 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2009-04-29 04:24:46 +00:00
commit 31f0be6eb8
15 changed files with 271 additions and 278 deletions

View File

@ -37,7 +37,7 @@ static const uint m_psxMemSize =
void psxMemAlloc()
{
if( m_psxAllMem == NULL )
m_psxAllMem = vtlb_malloc( m_psxMemSize, 4096, 0x21000000 );
m_psxAllMem = vtlb_malloc( m_psxMemSize, 4096 );
if( m_psxAllMem == NULL)
throw Exception::OutOfMemory( "psxMemAlloc > failed allocating memory for the IOP processor." );

View File

@ -618,7 +618,7 @@ static u8* m_psAllMem = NULL;
void memAlloc()
{
if( m_psAllMem == NULL )
m_psAllMem = vtlb_malloc( m_allMemSize, 4096, 0x2400000 );
m_psAllMem = vtlb_malloc( m_allMemSize, 4096 );
if( m_psAllMem == NULL)
throw Exception::OutOfMemory( "memAlloc > failed to allocate PS2's base ram/rom/scratchpad." );

View File

@ -76,6 +76,9 @@ int _SPR0chain()
{
memcpy_fast((u8*)pMem, &PS2MEM_SCRATCH[spr0->sadr & 0x3fff], spr0->qwc << 4);
// Clear dependent EE recompiler blocks, if necessary [needed for BTS protection system]
Cpu->Clear( spr0->madr, spr0->qwc << 2 );
// clear VU mem also!
TestClearVUs(spr0->madr, spr0->qwc << 2); // Wtf is going on here? AFAIK, only VIF should affect VU micromem (cottonvibes)
@ -121,6 +124,7 @@ void _SPR0interleave()
{
// clear VU mem also!
TestClearVUs(spr0->madr, spr0->qwc << 2);
Cpu->Clear( spr0->madr, spr0->qwc << 2 );
memcpy_fast((u8*)pMem, &PS2MEM_SCRATCH[spr0->sadr & 0x3fff], spr0->qwc << 4);
}
spr0->sadr += spr0->qwc * 16;

View File

@ -168,6 +168,7 @@ bool SysAllocateMem()
try
{
vtlb_Core_Alloc();
memAlloc();
psxMemAlloc();
vuMicroMemAlloc();
@ -271,6 +272,7 @@ void SysShutdownMem()
vuMicroMemShutdown();
psxMemShutdown();
memShutdown();
vtlb_Core_Shutdown();
}
//////////////////////////////////////////////////////////////////////////////////////////

View File

@ -83,7 +83,7 @@ static const uint m_vuMemSize =
void vuMicroMemAlloc()
{
if( m_vuAllMem == NULL )
m_vuAllMem = vtlb_malloc( m_vuMemSize, 16, 0x28000000 );
m_vuAllMem = vtlb_malloc( m_vuMemSize, 16 );
if( m_vuAllMem == NULL )
throw Exception::OutOfMemory( "vuMicroMemInit > Failed to allocate VUmicro memory." );

View File

@ -61,7 +61,6 @@ vtlbHandler UnmappedVirtHandler1;
vtlbHandler UnmappedPhyHandler0;
vtlbHandler UnmappedPhyHandler1;
/*
__asm
{
@ -87,10 +86,22 @@ callfunction:
jmp [readfunctions8-0x800000+eax];
}*/
/////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////
// Interpreter Implementations of VTLB Memory Operations.
// See recVTLB.cpp for the dynarec versions.
// ------------------------------------------------------------------------
// Helper for the BTS manual protection system. Sets a bit based on the given address,
// marking that piece of PS2 memory as 'dirty.'
//
static void memwritebits(u8* ptr)
{
	// alloc_bits keeps one bit per 16-byte chunk of the vtlb core allocation, so
	// convert the host pointer into a chunk index relative to alloc_base first.
	const u32 chunk = (u32)(ptr - vtlbdata.alloc_base) >> 4;

	// Eight chunks share each byte of the bitmap: pick the byte, then set the bit.
	vtlbdata.alloc_bits[chunk >> 3] |= 1 << (chunk & 7);
}
// ------------------------------------------------------------------------
// Interpreted VTLB lookup for 8, 16, and 32 bit accesses
template<int DataSize,typename DataType>
__forceinline DataType __fastcall MemOp_r0(u32 addr)
@ -117,6 +128,7 @@ __forceinline DataType __fastcall MemOp_r0(u32 addr)
}
}
// ------------------------------------------------------------------------
// Interpreted VTLB lookup for 64 and 128 bit accesses.
template<int DataSize,typename DataType>
__forceinline void __fastcall MemOp_r1(u32 addr, DataType* data)
@ -148,6 +160,7 @@ __forceinline void __fastcall MemOp_r1(u32 addr, DataType* data)
}
}
// ------------------------------------------------------------------------
template<int DataSize,typename DataType>
__forceinline void __fastcall MemOp_w0(u32 addr, DataType data)
{
@ -155,6 +168,7 @@ __forceinline void __fastcall MemOp_w0(u32 addr, DataType data)
s32 ppf=addr+vmv;
if (!(ppf<0))
{
memwritebits((u8*)ppf);
*reinterpret_cast<DataType*>(ppf)=data;
}
else
@ -174,6 +188,8 @@ __forceinline void __fastcall MemOp_w0(u32 addr, DataType data)
}
}
}
// ------------------------------------------------------------------------
template<int DataSize,typename DataType>
__forceinline void __fastcall MemOp_w1(u32 addr,const DataType* data)
{
@ -182,6 +198,7 @@ __forceinline void __fastcall MemOp_w1(u32 addr,const DataType* data)
s32 ppf=addr+vmv;
if (!(ppf<0))
{
memwritebits((u8*)ppf);
*reinterpret_cast<DataType*>(ppf)=*data;
if (DataSize==128)
*reinterpret_cast<DataType*>(ppf+8)=data[1];
@ -202,7 +219,6 @@ __forceinline void __fastcall MemOp_w1(u32 addr,const DataType* data)
}
}
mem8_t __fastcall vtlb_memRead8(u32 mem)
{
return MemOp_r0<8,mem8_t>(mem);
@ -328,7 +344,7 @@ void __fastcall vtlbDefaultPhyWrite64(u32 addr,const mem64_t* data) { Console::E
void __fastcall vtlbDefaultPhyWrite128(u32 addr,const mem128_t* data) { Console::Error("vtlbDefaultPhyWrite128: 0x%X",params addr); verify(false); }
/////////////////////////////////////////////////////////////////////////
//////////////////////////////////////////////////////////////////////////////////////////
// VTLB Public API -- Init/Term/RegisterHandler stuff
//
@ -361,6 +377,7 @@ vtlbHandler vtlb_RegisterHandler( vtlbMemR8FP* r8,vtlbMemR16FP* r16,vtlbMemR32FP
return rv;
}
//////////////////////////////////////////////////////////////////////////////////////////
// Maps the given handler (created with vtlb_RegisterHandler) to the specified memory region.
// New mappings always assume priority over previous mappings, so place "generic" mappings for
// large areas of memory first, and then specialize specific small regions of memory afterward.
@ -500,7 +517,8 @@ void vtlb_VMapUnmap(u32 vaddr,u32 sz)
}
}
// Clears vtlb handlers and memory mappings.
//////////////////////////////////////////////////////////////////////////////////////////
// vtlb_init -- Clears vtlb handlers and memory mappings.
void vtlb_Init()
{
vtlbHandlerCount=0;
@ -540,7 +558,8 @@ void vtlb_Init()
vtlb_VMapUnmap((VTLB_VMAP_ITEMS-1)*VTLB_PAGE_SIZE,VTLB_PAGE_SIZE);
}
// Performs a COP0-level reset of the PS2's TLB.
//////////////////////////////////////////////////////////////////////////////////////////
// vtlb_Reset -- Performs a COP0-level reset of the PS2's TLB.
// This function should probably be part of the COP0 rather than here in VTLB.
void vtlb_Reset()
{
@ -552,30 +571,65 @@ void vtlb_Term()
//nothing to do for now
}
//////////////////////////////////////////////////////////////////////////////////////////
// Reserves the vtlb core allocation used by various emulation components!
//
void vtlb_Core_Alloc()
{
	// The core buffer is reserved once; later calls keep the existing allocation.
	if( vtlbdata.alloc_base != NULL ) return;

	// Restart the bump offset that vtlb_malloc advances on each sub-allocation.
	vtlbdata.alloc_current = 0;

#ifdef __LINUX__
	vtlbdata.alloc_base = SysMmapEx( 0x16000000, VTLB_ALLOC_SIZE, 0x80000000, "Vtlb" );
#else
	// Win32 just needs this, since malloc always maps below 2GB.
	vtlbdata.alloc_base = (u8*)_aligned_malloc( VTLB_ALLOC_SIZE, 4096 );
#endif

	// Failure must be detected here on every platform: vtlb_malloc returns
	// &alloc_base[offset], which is non-NULL even when alloc_base itself is NULL,
	// so the NULL checks made by vtlb_malloc's callers cannot catch it.
	// NOTE(review): assumes SysMmapEx signals failure by returning NULL -- confirm.
	if( vtlbdata.alloc_base == NULL )
		throw Exception::OutOfMemory( "Fatal Error: could not allocate 42Meg buffer for PS2's mappable system ram." );
}
//////////////////////////////////////////////////////////////////////////////////////////
//
void vtlb_Core_Shutdown()
{
	// Releases the buffer reserved by vtlb_Core_Alloc; nothing to do if it was
	// never allocated.
	if( vtlbdata.alloc_base != NULL )
	{
#ifdef __LINUX__
		SafeSysMunmap( vtlbdata.alloc_base, VTLB_ALLOC_SIZE );
#else
		// Make sure and unprotect memory first, since CrtDebug will try to write to it.
		HostSys::MemProtect( vtlbdata.alloc_base, VTLB_ALLOC_SIZE, Protect_ReadWrite );
		safe_aligned_free( vtlbdata.alloc_base );
#endif
	}
}
//////////////////////////////////////////////////////////////////////////////////////////
// This function allocates memory blocks which are compatible with the Vtlb's requirements
// for memory locations. The Vtlb requires the topmost bit (Sign bit) of the memory
// pointer to be cleared. Some operating systems and/or implementations of malloc do that,
// but others do not. So use this instead to allocate the memory correctly for your
// platform.
u8* vtlb_malloc( uint size, uint align, uptr tryBaseAddress )
//
u8* vtlb_malloc( uint size, uint align )
{
#ifdef __LINUX__
return SysMmapEx( tryBaseAddress, size, 0x80000000, "Vtlb" );
#else
// Win32 just needs this, since malloc always maps below 2GB.
return (u8*)_aligned_malloc(size, align);
#endif
vtlbdata.alloc_current += align-1;
vtlbdata.alloc_current &= ~(align-1);
int rv = vtlbdata.alloc_current;
vtlbdata.alloc_current += size;
return &vtlbdata.alloc_base[rv];
}
//////////////////////////////////////////////////////////////////////////////////////////
//
void vtlb_free( void* pmem, uint size )
{
if( pmem == NULL ) return;
#ifdef __LINUX__
SafeSysMunmap( pmem, size );
#else
// Make sure and unprotect memory first, since CrtDebug will try to write to it.
HostSys::MemProtect( pmem, size, Protect_ReadWrite );
safe_aligned_free( pmem );
#endif
// Does nothing anymore! Alloc/dealloc is now handled by vtlb_Core_Alloc /
// vtlb_Core_Shutdown. Placebo is left in place in case it becomes useful again
// at a later date.
return;
}

View File

@ -23,10 +23,12 @@ typedef void __fastcall vtlbMemW128FP(u32 addr,const mem128_t* data);
typedef u32 vtlbHandler;
extern void vtlb_Core_Alloc();
extern void vtlb_Core_Shutdown();
extern void vtlb_Init();
extern void vtlb_Reset();
extern void vtlb_Term();
extern u8* vtlb_malloc( uint size, uint align, uptr tryBaseAddress );
extern u8* vtlb_malloc( uint size, uint align );
extern void vtlb_free( void* pmem, uint size );
@ -67,6 +69,8 @@ extern void vtlb_DynGenRead32_Const( u32 bits, bool sign, u32 addr_const );
namespace vtlb_private
{
static const uint VTLB_ALLOC_SIZE = 0x2900000; //this is a bit more than required
static const uint VTLB_PAGE_BITS = 12;
static const uint VTLB_PAGE_MASK = 4095;
static const uint VTLB_PAGE_SIZE = 4096;
@ -77,6 +81,11 @@ namespace vtlb_private
struct MapData
{
u8 alloc_bits[VTLB_ALLOC_SIZE/16/8];
u8* alloc_base; //base of the memory array
int alloc_current; //current base
s32 pmap[VTLB_PMAP_ITEMS]; //512KB
s32 vmap[VTLB_VMAP_ITEMS]; //4MB

View File

@ -2883,7 +2883,7 @@
</Filter>
</Filter>
<Filter
Name="Dynarec Emitter"
Name="x86Emitter"
>
<File
RelativePath="..\..\x86\ix86\ix86.cpp"

View File

@ -202,10 +202,8 @@ void WinRun()
_doPluginOverride( "DEV9", g_Startup.dev9dll, Config.DEV9 );
#ifndef _DEBUG
if( Config.Profiler )
ProfilerInit();
#endif
InitCPUTicks();
@ -800,7 +798,6 @@ LRESULT WINAPI MainWndProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam)
SaveConfig();
break;
#ifndef _DEBUG
case ID_PROFILER:
Config.Profiler = !Config.Profiler;
if( Config.Profiler )
@ -815,7 +812,6 @@ LRESULT WINAPI MainWndProc(HWND hWnd, UINT msg, WPARAM wParam, LPARAM lParam)
}
SaveConfig();
break;
#endif
default:
if (LOWORD(wParam) >= ID_LANGS && LOWORD(wParam) <= (ID_LANGS + langsMax))
@ -989,9 +985,7 @@ void CreateMainMenu() {
ADDMENUITEM(0,_("Print cdvd &Info"), ID_CDVDPRINT);
ADDMENUITEM(0,_("Close GS Window on Esc"), ID_CLOSEGS);
ADDSEPARATOR(0);
#ifndef _DEBUG
ADDMENUITEM(0,_("Enable &Profiler"), ID_PROFILER);
#endif
ADDMENUITEM(0,_("Enable &Patches"), ID_PATCHES);
ADDMENUITEM(0,_("Enable &Console"), ID_CONSOLE);
ADDSEPARATOR(0);

View File

@ -7,7 +7,8 @@
//
// Generated from the TEXTINCLUDE 2 resource.
//
#include "afxresmw.h"
#include "afxresmw.h"
/////////////////////////////////////////////////////////////////////////////
#undef APSTUDIO_READONLY_SYMBOLS
@ -899,7 +900,8 @@ END
//
// Generated from the TEXTINCLUDE 3 resource.
//
/////////////////////////////////////////////////////////////////////////////
#endif // not APSTUDIO_INVOKED

View File

@ -418,6 +418,9 @@ static void recAlloc()
x86FpuState = FPU_STATE;
}
PCSX2_ALIGNED16( static u16 manual_page[Ps2MemSize::Base >> 12] );
PCSX2_ALIGNED16( static u8 manual_counter[Ps2MemSize::Base >> 12] );
////////////////////////////////////////////////////
void recResetEE( void )
{
@ -427,6 +430,8 @@ void recResetEE( void )
memset_8<0xcc, REC_CACHEMEM>(recMem); // 0xcc is INT3
memzero_ptr<m_recBlockAllocSize>( m_recBlockAlloc );
memzero_obj( manual_page );
memzero_obj( manual_counter );
ClearRecLUT((BASEBLOCK*)m_recBlockAlloc,
(((Ps2MemSize::Base + Ps2MemSize::Rom + Ps2MemSize::Rom1) / 4)));
@ -720,7 +725,6 @@ static void ClearRecLUT(BASEBLOCK* base, int count)
base[i].SetFnptr((uptr)JITCompile);
}
// Returns the offset to the next instruction after any cleared memory
void recClear(u32 addr, u32 size)
{
BASEBLOCKEX* pexblock;
@ -1256,14 +1260,16 @@ void badespfn() {
void __fastcall dyna_block_discard(u32 start,u32 sz)
{
DevCon::WriteLn("dyna_block_discard %08X , count %d", params start,sz);
Cpu->Clear(start,sz);
DevCon::WriteLn("dyna_block_discard .. start: %08X count=%d", params start,sz);
Cpu->Clear(start, sz);
}
void __fastcall dyna_block_reset(u32 start,u32 sz)
void __fastcall dyna_page_reset(u32 start,u32 sz)
{
DevCon::WriteLn("dyna_block_reset %08X , count %d", params start,sz);
DevCon::WriteLn("dyna_page_reset .. start=%08X count=%d", params start,sz);
Cpu->Clear(start & ~0xfffUL, 0x400);
manual_counter[start >> 10]++;
mmap_MarkCountedRamPage(PSM(start), start & ~0xfffUL);
}
@ -1490,98 +1496,6 @@ StartRecomp:
// instruction being analyzed.
if( usecop2 ) vucycle++;
// peephole optimizations //
#ifdef PCSX2_VM_COISSUE
if( i < s_nEndBlock-4 && recompileCodeSafe(i) ) {
u32 curcode = cpuRegs.code;
u32 nextcode = *(u32*)PSM(i+4);
if( _eeIsLoadStoreCoIssue(curcode, nextcode) && recBSC_co[curcode>>26] != NULL ) {
// rs has to be the same, and cannot be just written
if( ((curcode >> 21) & 0x1F) == ((nextcode >> 21) & 0x1F) && !_eeLoadWritesRs(curcode) ) {
if( _eeIsLoadStoreCoX(curcode) && ((nextcode>>16)&0x1f) != ((curcode>>21)&0x1f) ) {
// see how many stores there are
u32 j;
// use xmmregs since only supporting lwc1,lq,swc1,sq
for(j = i+8; j < s_nEndBlock && j < i+4*iREGCNT_XMM; j += 4 ) {
u32 nncode = *(u32*)PSM(j);
if( (nncode>>26) != (curcode>>26) || ((curcode>>21)&0x1f) != ((nncode>>21)&0x1f) ||
_eeLoadWritesRs(nncode))
break;
}
if( j > i+8 ) {
u32 num = (j-i)>>2; // number of stores that can coissue
assert( num <= iREGCNT_XMM );
g_pCurInstInfo[0].numpeeps = num-1;
g_pCurInstInfo[0].info |= EEINSTINFO_COREC;
while(i < j-4) {
g_pCurInstInfo++;
g_pCurInstInfo[0].info |= EEINSTINFO_NOREC;
i += 4;
}
continue;
}
// fall through
}
// unaligned loadstores
// if LWL, check if LWR and that offsets are +3 away
switch(curcode >> 26) {
case 0x22: // LWL
if( (nextcode>>26) != 0x26 || ((s16)nextcode)+3 != (s16)curcode )
continue;
break;
case 0x26: // LWR
if( (nextcode>>26) != 0x22 || ((s16)nextcode) != (s16)curcode+3 )
continue;
break;
case 0x2a: // SWL
if( (nextcode>>26) != 0x2e || ((s16)nextcode)+3 != (s16)curcode )
continue;
break;
case 0x2e: // SWR
if( (nextcode>>26) != 0x2a || ((s16)nextcode) != (s16)curcode+3 )
continue;
break;
case 0x1a: // LDL
if( (nextcode>>26) != 0x1b || ((s16)nextcode)+7 != (s16)curcode )
continue;
break;
case 0x1b: // LWR
if( (nextcode>>26) != 0x1aa || ((s16)nextcode) != (s16)curcode+7 )
continue;
break;
case 0x2c: // SWL
if( (nextcode>>26) != 0x2d || ((s16)nextcode)+7 != (s16)curcode )
continue;
break;
case 0x2d: // SWR
if( (nextcode>>26) != 0x2c || ((s16)nextcode) != (s16)curcode+7 )
continue;
break;
}
// good enough
g_pCurInstInfo[0].info |= EEINSTINFO_COREC;
g_pCurInstInfo[0].numpeeps = 1;
g_pCurInstInfo[1].info |= EEINSTINFO_NOREC;
g_pCurInstInfo++;
i += 4;
continue;
}
}
}
#endif // end peephole
}
// This *is* important because g_pCurInstInfo is checked a bit later on and
// if it's not equal to s_pInstCache it handles recompilation differently.
@ -1611,7 +1525,6 @@ StartRecomp:
iDumpBlock(startpc, recPtr);
#endif
static u16 manual_page[Ps2MemSize::Base >> 12];
u32 sz=(s_nEndBlock-startpc)>>2;
u32 inpage_ptr=HWADDR(startpc);
@ -1631,31 +1544,76 @@ StartRecomp:
}
else
{
// import the vtlbdata (alloc_bits and alloc_base and stuff):
using namespace vtlb_private;
MOV32ItoR(ECX, inpage_ptr);
MOV32ItoR(EDX, pgsz);
u32 mask=0;
u32 writen=0;
u32 writen_start=0;
u32 lpc=inpage_ptr;
u32 stg=pgsz;
while(stg>0)
{
// was dyna_block_discard_recmem. See note in recResetEE for details.
CMP32ItoM((uptr)PSM(lpc),*(u32*)PSM(lpc));
JNE32(((u32)&dyna_block_discard)- ( (u32)x86Ptr + 6 ));
u32 bit = (lpc>>4) & 7;
if (mask==0)
{
//writen=bit;
writen_start=(((u8*)PSM(lpc)-vtlbdata.alloc_base)>>4)/8;
}
mask |= 1 << bit;
stg-=4;
lpc+=4;
if (bit==31)
{
vtlbdata.alloc_bits[writen_start]&=~mask;
xTEST( ptr32[&vtlbdata.alloc_bits[writen_start]], mask ); // auto-optimizes to imm8 when applicable.
xJNZ( dyna_block_discard );
//SysPrintf("%08X %d %d\n",mask,pgsz,pgsz>>4);
mask = 0;
}
//writen++;
if (stg<=16)
{
lpc += stg;
stg = 0;
}
else
{
lpc += 16;
stg -= 16;
}
}
if (startpc != 0x81fc0) {
if (mask)
{
vtlbdata.alloc_bits[writen_start] &= ~mask;
xTEST( ptr32[&vtlbdata.alloc_bits[writen_start]], mask ); // auto-optimizes to imm8 when applicable.
xJNZ( dyna_block_discard );
//SysPrintf("%08X %d %d\n",mask,pgsz,pgsz>>4);
mask = 0;
}
if( startpc != 0x81fc0 && manual_counter[inpage_ptr >> 12] <= 4 )
{
// Commented out until we replace it with a smarter algo that only
// recompiles blocks a limited number of times.
xADD(ptr16[&manual_page[inpage_ptr >> 12]], 1);
xJC( dyna_block_reset );
xJC( dyna_page_reset );
}
DbgCon::WriteLn("Manual block @ %08X : %08X %d %d %d %d", params
startpc,inpage_ptr,pgsz,0x1000-inpage_offs,inpage_sz,sz*4);
}
}
inpage_ptr+=pgsz;
inpage_sz-=pgsz;
inpage_ptr += pgsz;
inpage_sz -= pgsz;
}
// finally recompile //

View File

@ -23,31 +23,88 @@
#include "iCore.h"
#include "iR5900.h"
#include "ix86\ix86_internal.h"
using namespace vtlb_private;
using namespace x86Emitter;
// NOTICE: This function *destroys* EAX!!
// Moves 128 bits of memory from the source register ptr to the dest register ptr.
// (used as an equivalent to movaps, when a free XMM register is unavailable for some reason)
void MOV128_MtoM( x86IntRegType destRm, x86IntRegType srcRm )
//////////////////////////////////////////////////////////////////////////////////////////
// iAllocRegSSE -- allocates an xmm register. If no xmm register is available, xmm0 is
// saved into g_globalXMMData and returned as a free register.
//
class iAllocRegSSE
{
// (this is one of my test cases for the new emitter --air)
protected:
xRegisterSSE m_reg;
bool m_free;
xAddressReg src( srcRm );
xAddressReg dest( destRm );
public:
iAllocRegSSE() :
m_reg( xmm0 ),
m_free( !!_hasFreeXMMreg() )
{
if( m_free )
m_reg = xRegisterSSE( _allocTempXMMreg( XMMT_INT, -1 ) );
else
xStoreReg( m_reg );
}
xMOV( eax, ptr[src] );
xMOV( ptr[dest], eax );
~iAllocRegSSE()
{
if( m_free )
_freeXMMreg( m_reg.Id );
else
xRestoreReg( m_reg );
}
operator xRegisterSSE() const { return m_reg; }
};
xMOV( eax, ptr[src+4] );
xMOV( ptr[dest+4], eax );
//////////////////////////////////////////////////////////////////////////////////////////
// Moves 128 bits from point B to point A, using SSE's MOVAPS (or MOVDQA).
// This instruction always uses an SSE register, even if all registers are allocated! It
// saves an SSE register to memory first, performs the copy, and restores the register.
//
void iMOV128_SSE( const ModSibBase& destRm, const ModSibBase& srcRm )
{
iAllocRegSSE reg;
xMOVDQA( reg, srcRm );
xMOVDQA( destRm, reg );
}
xMOV( eax, ptr[src+8] );
xMOV( ptr[dest+8], eax );
//////////////////////////////////////////////////////////////////////////////////////////
// Moves 64 bits of data from point B to point A, using either MMX, SSE, or x86 registers
// if neither MMX nor SSE is available to the task.
//
// Optimizations: This method uses MMX if the cpu is in MMX mode, or SSE if it's in FPU
// mode (saving on potential EMMS uses).
//
void iMOV64_Smart( const ModSibBase& destRm, const ModSibBase& srcRm )
{
if( (x86FpuState == FPU_STATE) && _hasFreeXMMreg() )
{
// Move things using MOVLPS:
xRegisterSSE reg( _allocTempXMMreg( XMMT_INT, -1 ) );
xMOVL.PS( reg, srcRm );
xMOVL.PS( destRm, reg );
_freeXMMreg( reg.Id );
return;
}
xMOV( eax, ptr[src+12] );
xMOV( ptr[dest+12], eax );
if( _hasFreeMMXreg() )
{
xRegisterMMX reg( _allocMMXreg(-1, MMX_TEMP, 0) );
xMOVQ( reg, srcRm );
xMOVQ( destRm, reg );
_freeMMXreg( reg.Id );
}
else
{
xMOV( eax, srcRm );
xMOV( destRm, eax );
xMOV( eax, srcRm+4 );
xMOV( destRm+4, eax );
}
}
/*
@ -127,38 +184,11 @@ static void _vtlb_DynGen_DirectRead( u32 bits, bool sign )
break;
case 64:
if( _hasFreeMMXreg() )
{
const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
MOVQRmtoR(freereg,ECX);
MOVQRtoRm(EDX,freereg);
_freeMMXreg(freereg);
}
else
{
MOV32RmtoR(EAX,ECX);
MOV32RtoRm(EDX,EAX);
MOV32RmtoR(EAX,ECX,4);
MOV32RtoRm(EDX,EAX,4);
}
iMOV64_Smart(ptr[edx],ptr[ecx]);
break;
case 128:
if( _hasFreeXMMreg() )
{
const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
SSE2_MOVDQARmtoR(freereg,ECX);
SSE2_MOVDQARtoRm(EDX,freereg);
_freeXMMreg(freereg);
}
else
{
// Could put in an MMX optimization here as well, but no point really.
// It's almost never used since there's almost always a free XMM reg.
MOV128_MtoM( EDX, ECX ); // dest <- src!
}
iMOV128_SSE(ptr[edx],ptr[ecx]);
break;
jNO_DEFAULT
@ -262,39 +292,11 @@ void vtlb_DynGenRead64_Const( u32 bits, u32 addr_const )
switch( bits )
{
case 64:
if( _hasFreeMMXreg() )
{
const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
MOVQMtoR(freereg,ppf);
MOVQRtoRm(EDX,freereg);
_freeMMXreg(freereg);
}
else
{
MOV32MtoR(EAX,ppf);
MOV32RtoRm(EDX,EAX);
MOV32MtoR(EAX,ppf+4);
MOV32RtoRm(EDX,EAX,4);
}
iMOV64_Smart(ptr[edx],ptr[ppf]);
break;
case 128:
if( _hasFreeXMMreg() )
{
const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
SSE2_MOVDQA_M128_to_XMM( freereg, ppf );
SSE2_MOVDQARtoRm(EDX,freereg);
_freeXMMreg(freereg);
}
else
{
// Could put in an MMX optimization here as well, but no point really.
// It's almost never used since there's almost always a free XMM reg.
MOV32ItoR( ECX, ppf );
MOV128_MtoM( EDX, ECX ); // dest <- src!
}
iMOV128_SSE(ptr[edx],ptr[ppf]);
break;
jNO_DEFAULT
@ -415,40 +417,21 @@ static void _vtlb_DynGen_DirectWrite( u32 bits )
break;
case 64:
if( _hasFreeMMXreg() )
{
const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
MOVQRmtoR(freereg,EDX);
MOVQRtoRm(ECX,freereg);
_freeMMXreg( freereg );
}
else
{
MOV32RmtoR(EAX,EDX);
MOV32RtoRm(ECX,EAX);
MOV32RmtoR(EAX,EDX,4);
MOV32RtoRm(ECX,EAX,4);
}
iMOV64_Smart(ptr[ecx],ptr[edx]);
break;
case 128:
if( _hasFreeXMMreg() )
{
const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
SSE2_MOVDQARmtoR(freereg,EDX);
SSE2_MOVDQARtoRm(ECX,freereg);
_freeXMMreg( freereg );
}
else
{
// Could put in an MMX optimization here as well, but no point really.
// It's almost never used since there's almost always a free XMM reg.
MOV128_MtoM( ECX, EDX ); // dest <- src!
}
iMOV128_SSE(ptr[ecx],ptr[edx]);
break;
}
xSHR( ecx, 4 );
uptr alloc_base = (uptr)vtlbdata.alloc_base;
u8* bits_base = vtlbdata.alloc_bits;
bits_base -= (alloc_base>>4)/8; //in bytes
xBTS( ecx, bits_base );
}
// ------------------------------------------------------------------------
@ -514,39 +497,11 @@ void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const )
break;
case 64:
if( _hasFreeMMXreg() )
{
const int freereg = _allocMMXreg(-1, MMX_TEMP, 0);
MOVQRmtoR(freereg,EDX);
MOVQRtoM(ppf,freereg);
_freeMMXreg( freereg );
}
else
{
MOV32RmtoR(EAX,EDX);
MOV32RtoM(ppf,EAX);
MOV32RmtoR(EAX,EDX,4);
MOV32RtoM(ppf+4,EAX);
}
iMOV64_Smart( ptr[ppf], ptr[edx] );
break;
case 128:
if( _hasFreeXMMreg() )
{
const int freereg = _allocTempXMMreg( XMMT_INT, -1 );
SSE2_MOVDQARmtoR(freereg,EDX);
SSE2_MOVDQA_XMM_to_M128(ppf,freereg);
_freeXMMreg( freereg );
}
else
{
// Could put in an MMX optimization here as well, but no point really.
// It's almost never used since there's almost always a free XMM reg.
MOV32ItoR( ECX, ppf );
MOV128_MtoM( ECX, EDX ); // dest <- src!
}
iMOV128_SSE( ptr[ppf], ptr[edx] );
break;
}
@ -571,3 +526,4 @@ void vtlb_DynGenWrite_Const( u32 bits, u32 addr_const )
CALLFunc( (int)vtlbdata.RWFT[szidx][1][handler] );
}
}

View File

@ -35,6 +35,9 @@
namespace x86Emitter
{
extern void xStoreReg( const xRegisterSSE& src );
extern void xRestoreReg( const xRegisterSSE& dest );
// ------------------------------------------------------------------------
// Group 1 Instruction Class

View File

@ -677,8 +677,6 @@ extern void CDQE( void );
extern void LAHF();
extern void SAHF();
extern void BT32ItoR( x86IntRegType to, u8 from );
extern void BTR32ItoR( x86IntRegType to, u8 from );
extern void BSRRtoR(x86IntRegType to, x86IntRegType from);
extern void BSWAP32R( x86IntRegType to );

View File

@ -30,9 +30,22 @@ u8 g_globalXMMSaved = 0;
PCSX2_ALIGNED16( static u64 g_globalMMXData[8] );
PCSX2_ALIGNED16( static u64 g_globalXMMData[2*iREGCNT_XMM] );
namespace x86Emitter
{
	// g_globalXMMData is a 16-byte aligned array of u64 holding two entries
	// (128 bits) per XMM register, so register N's save slot starts at element
	// N*2.  Indexing by Id alone would hand every register other than xmm0 a
	// slot that is only 8-byte aligned (MOVDQA needs 16) and that overlaps the
	// neighbouring register's slot.

	// Emits a MOVDQA that saves the given SSE register into its global save slot.
	void xStoreReg( const xRegisterSSE& src )
	{
		xMOVDQA( &g_globalXMMData[src.Id*2], src );
	}

	// Emits a MOVDQA that reloads the given SSE register from its global save slot.
	void xRestoreReg( const xRegisterSSE& dest )
	{
		xMOVDQA( dest, &g_globalXMMData[dest.Id*2] );
	}
}
/////////////////////////////////////////////////////////////////////
// SetCPUState -- for assugnment of SSE roundmodes and clampmodes.
// SetCPUState -- for assignment of SSE roundmodes and clampmodes.
u32 g_sseMXCSR = DEFAULT_sseMXCSR;
u32 g_sseVUMXCSR = DEFAULT_sseVUMXCSR;