Made several improvements and cleanups to the memzero API - including better linux version of the header. Also changed memcpy_amd_ / memcpy_fast to use __fastcall convention.

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@639 a6443dda-0b58-4228-96e9-037be469359c
This commit is contained in:
Jake.Stine 2009-01-26 15:45:20 +00:00 committed by Gregory Hainaut
parent c63a7dc032
commit b2eb1cad4f
26 changed files with 446 additions and 228 deletions

View File

@ -419,7 +419,7 @@ s32 cdvdReadConfig(u8* config)
((cdvd.COffset == 2) && (cdvd.CBlockIndex >= 7))
)
{
memzero_air<16>(config);
memzero_ptr<16>(config);
return 0;
}
@ -490,7 +490,7 @@ void cdvdReadKey(u8 arg0, u16 arg1, u32 arg2, u8* key) {
key_14 = ((numbers & 0x003E0) >> 2) | 0x04; // numbers = F8 extra = 04 unused = 03
// clear key values
memzero_air<16>(key);
memzero_ptr<16>(key);
// store key values
key[ 0] = (key_0_3&0x000000FF)>> 0;

View File

@ -63,8 +63,8 @@ void CDVDFS_init(){
cdReadMode.datapattern = CdSecS2048; //isofs driver only needs
//2KB sectors
memset(fd_table, 0, sizeof(fd_table));
memset(fd_used, 0, 16*sizeof(int));
memzero_obj( fd_table );
memzero_obj( fd_used );
inited = TRUE;

View File

@ -520,7 +520,7 @@ void cdrReadInterrupt() {
if (cdr.RErr == -1) {
CDR_LOG(" err\n");
memzero_air<2340>(cdr.Transfer);
memzero_ptr<2340>(cdr.Transfer);
cdr.Stat = DiskError;
cdr.Result[0]|= 0x01;
ReadTrack();

View File

@ -113,7 +113,7 @@ static __forceinline void cpuRcntSet()
void rcntInit() {
int i;
memset(counters, 0, sizeof(counters));
memzero_obj(counters);
for (i=0; i<4; i++) {
counters[i].rate = 2;

View File

@ -136,7 +136,7 @@ void IPUProcessInterrupt()
// Register accesses (run on EE thread)
int ipuInit()
{
memzero_air<sizeof(IPUregisters)>(ipuRegs);
memzero_ptr<sizeof(IPUregisters)>(ipuRegs);
memzero_obj(g_BP);
//other stuff
@ -154,7 +154,7 @@ int ipuInit()
void ipuReset()
{
memzero_air<sizeof(IPUregisters)>(ipuRegs);
memzero_ptr<sizeof(IPUregisters)>(ipuRegs);
g_nDMATransfer = 0;
}
@ -384,7 +384,7 @@ static void ipuBCLR(u32 val) {
g_BP.IFC = 0;
ipuRegs->ctrl.BUSY = 0;
ipuRegs->cmd.BUSY = 0;
memzero_air<80>(readbits);
memzero_ptr<80>(readbits);
IPU_LOG("Clear IPU input FIFO. Set Bit offset=0x%X\n", g_BP.BP);
}

View File

@ -1054,8 +1054,8 @@ void mpeg2sliceIDEC(void* pdone)
decoder->coded_block_pattern = 0x3F;//all 6 blocks
//ipuRegs->ctrl.CBP = 0x3f;
memzero_air<sizeof(macroblock_8)>(decoder->mb8);
memzero_air<sizeof(rgb32)>(decoder->rgb32);
memzero_ptr<sizeof(macroblock_8)>(decoder->mb8);
memzero_ptr<sizeof(rgb32)>(decoder->rgb32);
slice_intra_DCT (decoder, 0, (u8*)decoder->mb8->Y, DCT_stride);
slice_intra_DCT (decoder, 0, (u8*)decoder->mb8->Y + 8, DCT_stride);
@ -1194,8 +1194,8 @@ void mpeg2_slice(void* pdone)
*(int*)pdone = 0;
ipuRegs->ctrl.ECD = 0;
memzero_air<sizeof(macroblock_8)>(decoder->mb8);
memzero_air<sizeof(macroblock_16)>(decoder->mb16);
memzero_ptr<sizeof(macroblock_8)>(decoder->mb8);
memzero_ptr<sizeof(macroblock_16)>(decoder->mb16);
bitstream_init (decoder);

View File

@ -35,7 +35,7 @@ void psxMemAlloc()
void psxMemReset()
{
memzero_air<Ps2MemSize::IopRam>(psxM);
memzero_ptr<Ps2MemSize::IopRam>(psxM);
}
void psxMemShutdown()
@ -404,8 +404,8 @@ void psxMemReset()
DbgCon::Status( "psxMemReset > Resetting core memory!" );
memzero_air<0x10000 * sizeof(uptr) * 2>( psxMemWLUT ); // clears both allocations, RLUT and WLUT
memzero_air<m_psxMemSize>( m_psxAllMem );
memzero_ptr<0x10000 * sizeof(uptr) * 2>( psxMemWLUT ); // clears both allocations, RLUT and WLUT
memzero_ptr<m_psxMemSize>( m_psxAllMem );
// Trick! We're accessing RLUT here through WLUT, since it's the non-const pointer.
// So the ones with a 1 prefixed (ala 0x18000, etc) are RLUT tables.

View File

@ -16,55 +16,55 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#ifndef _PCSX2_MEMZERO_H_
#define _PCSX2_MEMZERO_H_
#ifndef _LNX_MEMZERO_H_
#define _LNX_MEMZERO_H_
// This stubs out that memzero Windows specific stuff Air seems to have added
// all over, to allow Linux to compile. I may actually try to translate the file at
// some point, but for now, lets just use memset.
// This header contains non-optimized implementation of memzero_ptr and memset8_obj,
// memset16_obj, etc.
template< size_t bytes >
static __forceinline void memzero_air( void *dest )
template< u32 data, typename T >
static __forceinline void memset32_obj( T& obj )
{
memset(dest, 0, bytes);
// this function works on 32-bit aligned lengths of data only.
// If the data length is not a factor of 32 bits, the C++ optimizing compiler will
// probably just generate mysteriously broken code in Release builds. ;)
jASSUME( (sizeof(T) & 0x3) == 0 );
u32* dest = (u32*)&obj;
for( int i=sizeof(T)>>2; i; --i, ++dest )
*dest = data;
}
template< u8 data, size_t bytes >
static __forceinline void memset_8( void *dest )
// Zero-clears 'size' bytes starting at 'dest'.  Non-optimized Linux fallback:
// simply defers to the compiler-provided memset.  'size' must be a compile-time
// constant (template parameter), matching the Windows optimized version's API.
template< uint size >
static __forceinline void memzero_ptr( void* dest )
{
	memset( dest, 0, size );
}
template< u16 data, size_t bytes >
static __forceinline void memset_16( void *dest )
{
memset(dest, data, bytes);
}
template< u32 data, size_t bytes >
static __forceinline void memset_32( void *dest )
{
memset(dest, data, bytes);
}
// This method can clear any object-like entity -- which is anything that is not a pointer.
// Structures, static arrays, etc.  No need to pass sizeof() explicitly; the template
// deduces the object's size automatically.
template< typename T >
static __forceinline void memzero_obj( T& obj )
{
	memset( &obj, 0, sizeof( T ) );
}
template< uint data, typename T >
static __forceinline void memset_obj( T& object )
template< u8 data, typename T >
static __forceinline void memset8_obj( T& obj )
{
if( data <= 0xff )
memset_8<(u8)data, sizeof(T)>( &object );
else if( data <= 0xffff )
memset_16<(u16)data, sizeof(T)>( &object );
// Aligned sizes use the optimized 32 bit inline memset. Unaligned sizes use memset.
if( (sizeof(T) & 0x3) != 0 )
memset( &obj, data, sizeof( T ) );
else
memset_32<(u32)data, sizeof(T)>( &object );
memset32_obj<data + (data<<8) + (data<<16) + (data<<24)>( obj );
}
// Fills an object with the given 16-bit value, replicated across its whole extent.
// Note: object sizes must be a multiple of 2 bytes (enforced by _memset16_unaligned's
// internal assertion for the unaligned path).
template< u16 data, typename T >
static __forceinline void memset16_obj( T& obj )
{
	if( (sizeof(T) & 0x3) != 0 )
		// Fixed: call matches the declared name _memset16_unaligned (was
		// _memset_16_unaligned) and was missing its terminating semicolon.
		_memset16_unaligned( &obj, data, sizeof( T ) );
	else
		// Replicate the 16-bit value into both halves of a 32-bit word.
		memset32_obj<data + (data<<16)>( obj );
}
#endif

View File

@ -68,8 +68,6 @@ BIOS
extern u32 maxrecmem;
extern int rdram_devices, rdram_sdevid;
extern void * memcpy_fast(void *dest, const void *src, size_t n);
//#define FULLTLB
int MemMode = 0; // 0 is Kernel Mode, 1 is Supervisor Mode, 2 is User Mode
@ -518,7 +516,7 @@ void vm_Reset()
{
jASSUME( memLUT != NULL );
memzero_air<sizeof(PSMEMORYMAP)*0x100000>(memLUT);
memzero_ptr<sizeof(PSMEMORYMAP)*0x100000>(memLUT);
for (int i=0; i<0x02000; i++) memLUT[i + 0x00000] = initMemoryMap(&s_psM.aPFNs[i], &s_psM.aVFNs[i]);
for (int i=2; i<0x00010; i++) memLUT[i + 0x10000] = initMemoryMap(&s_psHw.aPFNs[i], &s_psHw.aVFNs[i]);
for (int i=0; i<0x00800; i++) memLUT[i + 0x1c000] = initMemoryMap(&s_psxM.aPFNs[(i & 0x1ff)], &s_psxM.aVFNs[(i & 0x1ff)]);
@ -2707,8 +2705,8 @@ void memReset()
mprotect(PS2EMEM_EROM, Ps2MemSize::ERom, PROT_READ|PROT_WRITE);
# endif
memzero_air<Ps2MemSize::Base>(PS2MEM_BASE);
memzero_air<Ps2MemSize::Scratch>(PS2MEM_SCRATCH);
memzero_ptr<Ps2MemSize::Base>(PS2MEM_BASE);
memzero_ptr<Ps2MemSize::Scratch>(PS2MEM_SCRATCH);
vm_Reset();
#else
@ -2728,7 +2726,7 @@ void memReset()
// rest of the emu is not really set up to support a "soft" reset of that sort
// we opt for the hard/safe version.
memzero_air<m_allMemSize>( m_psAllMem );
memzero_ptr<m_allMemSize>( m_psAllMem );
#ifdef ENABLECACHE
memset(pCache,0,sizeof(_cacheS)*64);
#endif

View File

@ -808,3 +808,12 @@ u64 GetCPUTicks()
return ((u64)t.tv_sec*GetTickFrequency())+t.tv_usec;
#endif
}
// Fills a buffer with a 16-bit value.  'size' is a length in *bytes* (callers such
// as memset16_obj pass sizeof(T)), and must be an even number -- asserted below.
// Fallback path for object sizes that are not 32-bit aligned.
void _memset16_unaligned( void* dest, u16 data, size_t size )
{
	jASSUME( (size & 0x1) == 0 );

	u16* dst = (u16*)dest;
	// Fixed: iterate over u16 *elements* (size>>1), not raw bytes.  The original
	// loop ran 'size' times, writing 2*size bytes and overrunning the buffer.
	// Also use size_t for the counter to avoid a narrowing signed conversion.
	for( size_t i = size >> 1; i; --i, ++dst )
		*dst = data;
}

View File

@ -227,22 +227,24 @@ extern u8 g_globalXMMSaved;
#define FreezeXMMRegs(save) if( g_EEFreezeRegs ) { FreezeXMMRegs_(save); }
#define FreezeMMXRegs(save) if( g_EEFreezeRegs ) { FreezeMMXRegs_(save); }
void _memset16_unaligned( void* dest, u16 data, size_t size );
#if defined(_WIN32) && !defined(__x86_64__)
// faster memcpy
extern void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes);
extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t qwc);
extern void * memcpy_amd_(void *dest, const void *src, size_t n);
#include "windows/memzero.h"
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t n);
# include "windows/memzero.h"
# define memcpy_fast memcpy_amd_
#define memcpy_fast memcpy_amd_
//#define memcpy_fast memcpy //Dont use normal memcpy, it has sse in 2k5!
#else
// for now disable linux fast memcpy
// for now linux uses the GCC memcpy/memset implementations.
#define memcpy_fast memcpy
#define memcpy_raz_ memcpy
#define memcpy_raz_u memcpy
#include "Linux/memzero.h"
#endif

View File

@ -86,9 +86,9 @@ void cpuReset()
psxMemReset();
vuMicroMemReset();
memset(&cpuRegs, 0, sizeof(cpuRegs));
memset(&fpuRegs, 0, sizeof(fpuRegs));
memset(&tlb, 0, sizeof(tlb));
memzero_obj(cpuRegs);
memzero_obj(fpuRegs);
memzero_obj(tlb);
cpuRegs.pc = 0xbfc00000; ///set pc reg to stack
cpuRegs.CP0.n.Config = 0x440;

View File

@ -153,7 +153,7 @@ void SIO_CommandWrite(u8 value,int way) {
PAD_LOG("RESET MEMORY CARD\n");
sio.bufcount = 8;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[3] = sio.terminator;
sio.buf[2] = '+';
sio.mcdst = 99;
@ -161,7 +161,7 @@ void SIO_CommandWrite(u8 value,int way) {
break;
case 0x12: // RESET
sio.bufcount = 8;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[3] = sio.terminator;
sio.buf[2] = '+';
sio.mcdst = 99;
@ -171,7 +171,7 @@ void SIO_CommandWrite(u8 value,int way) {
break;
case 0x81: // COMMIT
sio.bufcount = 8;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.mcdst = 99;
sio.buf[3] = sio.terminator;
sio.buf[2] = '+';
@ -187,7 +187,7 @@ void SIO_CommandWrite(u8 value,int way) {
case 0x22:
case 0x23: // SECTOR SET
sio.bufcount = 8; sio.mcdst = 99; sio.sector=0; sio.k=0;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio2.packet.recvVal3 = 0x8c;
sio.buf[8]=sio.terminator;
sio.buf[7]='+';
@ -201,7 +201,7 @@ void SIO_CommandWrite(u8 value,int way) {
break;
case 0x26:
sio.bufcount = 12; sio.mcdst = 99; sio2.packet.recvVal3 = 0x83;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
memcpy(&sio.buf[2], &mc_command_0x26, sizeof(mc_command_0x26));
sio.buf[12]=sio.terminator;
MEMCARDS_LOG("MC(%d) command 0x%02X\n", ((sio.CtrlReg&0x2000)>>13)+1, value);
@ -210,7 +210,7 @@ void SIO_CommandWrite(u8 value,int way) {
case 0x28:
case 0xBF:
sio.bufcount = 4; sio.mcdst = 99; sio2.packet.recvVal3 = 0x8b;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[4]=sio.terminator;
sio.buf[3]='+';
MEMCARDS_LOG("MC(%d) command 0x%02X\n", ((sio.CtrlReg&0x2000)>>13)+1, value);
@ -223,7 +223,7 @@ void SIO_CommandWrite(u8 value,int way) {
if(value==0x43) sio.lastsector = sio.sector; // Reading
sio.bufcount =133; sio.mcdst = 99;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[133]=sio.terminator;
sio.buf[132]='+';
MEMCARDS_LOG("MC(%d) command 0x%02X\n", ((sio.CtrlReg&0x2000)>>13)+1, value);
@ -237,24 +237,24 @@ void SIO_CommandWrite(u8 value,int way) {
case 0xf3:
case 0xf7:
sio.bufcount = 4; sio.mcdst = 99;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[4]=sio.terminator;
sio.buf[3]='+';
MEMCARDS_LOG("MC(%d) command 0x%02X\n", ((sio.CtrlReg&0x2000)>>13)+1, value);
break;
case 0x52:
sio.rdwr = 1; memset_obj<0xff>(sio.buf);
sio.rdwr = 1; memset8_obj<0xff>(sio.buf);
sio.buf[sio.bufcount]=sio.terminator; sio.buf[sio.bufcount-1]='+';
MEMCARDS_LOG("MC(%d) command 0x%02X\n", ((sio.CtrlReg&0x2000)>>13)+1, value);
break;
case 0x57:
sio.rdwr = 2; memset_obj<0xff>(sio.buf);
sio.rdwr = 2; memset8_obj<0xff>(sio.buf);
sio.buf[sio.bufcount]=sio.terminator; sio.buf[sio.bufcount-1]='+';
MEMCARDS_LOG("MC(%d) command 0x%02X\n", ((sio.CtrlReg&0x2000)>>13)+1, value);
break;
default:
sio.mcdst = 0;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[sio.bufcount]=sio.terminator; sio.buf[sio.bufcount-1]='+';
MEMCARDS_LOG("Unknown MC(%d) command 0x%02X\n", ((sio.CtrlReg&0x2000)>>13)+1, value);
}
@ -313,7 +313,7 @@ void SIO_CommandWrite(u8 value,int way) {
case 0x42:
if (sio.parp==2) {
sio.bufcount=5+value;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[sio.bufcount-1]='+';
sio.buf[sio.bufcount]=sio.terminator;
MEMCARDS_LOG("MC(%d) WRITE command 0x%02X\n\n\n\n\n", ((sio.CtrlReg&0x2000)>>13)+1, value);
@ -387,7 +387,7 @@ void SIO_CommandWrite(u8 value,int way) {
case 17:
case 19:
sio.bufcount=13;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[12] = 0; // Xor value of data from index 4 to 11
sio.buf[3]='+';
sio.buf[13] = sio.terminator;
@ -396,13 +396,13 @@ void SIO_CommandWrite(u8 value,int way) {
case 7:
case 11:
sio.bufcount=13;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[12]='+';
sio.buf[13] = sio.terminator;
break;
default:
sio.bufcount=4;
memset_obj<0xff>(sio.buf);
memset8_obj<0xff>(sio.buf);
sio.buf[3]='+';
sio.buf[4] = sio.terminator;
}
@ -628,7 +628,7 @@ void SaveMcd(int mcd, const u8 *data, u32 adr, int size) {
void EraseMcd(int mcd, u32 adr) {
u8 data[528*16];
memset_obj<0xff>(data); // clears to -1's
memset8_obj<0xff>(data); // clears to -1's
if(mcd == 1)
{
SeekMcd(MemoryCard1, adr);

View File

@ -180,13 +180,6 @@ void vuMicroMemReset()
jASSUME( VU0.Mem != NULL );
jASSUME( VU1.Mem != NULL );
/*#ifdef PCSX2_VIRTUAL_MEM
memLUT[0x11000].aPFNs = &s_psVuMem.aPFNs[0]; memLUT[0x11000].aVFNs = &s_psVuMem.aVFNs[0];
memLUT[0x11001].aPFNs = &s_psVuMem.aPFNs[0]; memLUT[0x11001].aVFNs = &s_psVuMem.aVFNs[0];
memLUT[0x11002].aPFNs = &s_psVuMem.aPFNs[0]; memLUT[0x11002].aVFNs = &s_psVuMem.aVFNs[0];
memLUT[0x11003].aPFNs = &s_psVuMem.aPFNs[0]; memLUT[0x11003].aVFNs = &s_psVuMem.aVFNs[0];
#endif*/
// === VU0 Initialization ===
memzero_obj(VU0.ACC);
memzero_obj(VU0.VF);
@ -196,8 +189,8 @@ void vuMicroMemReset()
VU0.VF[0].f.z = 0.0f;
VU0.VF[0].f.w = 1.0f;
VU0.VI[0].UL = 0;
memzero_air<4*1024>(VU0.Mem);
memzero_air<4*1024>(VU0.Micro);
memzero_ptr<4*1024>(VU0.Mem);
memzero_ptr<4*1024>(VU0.Micro);
/* this is kinda tricky, maxmem is set to 0x4400 here,
tho it's not 100% accurate, since the mem goes from
@ -219,8 +212,8 @@ void vuMicroMemReset()
VU1.VF[0].f.z = 0.0f;
VU1.VF[0].f.w = 1.0f;
VU1.VI[0].UL = 0;
memzero_air<16*1024>(VU1.Mem);
memzero_air<16*1024>(VU1.Micro);
memzero_ptr<16*1024>(VU1.Mem);
memzero_ptr<16*1024>(VU1.Micro);
VU1.maxmem = -1;//16*1024-4;
VU1.maxmicro = 16*1024-4;
@ -247,7 +240,7 @@ void SaveState::vuMicroFreeze()
else
{
// Old versions stored the VIregs as 32 bit values...
memset( VU0.VI, 0, sizeof( VU0.VI ) );
memzero_obj( VU0.VI );
for(int i=0; i<32; i++ )
Freeze( VU0.VI[i].UL );
}
@ -263,7 +256,7 @@ void SaveState::vuMicroFreeze()
else
{
// Old versions stored the VIregs as 32 bit values...
memset( VU1.VI, 0, sizeof( VU1.VI ) );
memzero_obj( VU1.VI );
for(int i=0; i<32; i++ )
Freeze( VU1.VI[i].UL );
}

View File

@ -68,8 +68,6 @@ static const unsigned int VIF1dmanum = 1;
int g_vifCycles = 0;
int path3hack = 0;
extern void * memcpy_fast(void *dest, const void *src, size_t n);
typedef void (*UNPACKFUNCTYPE)( u32 *dest, u32 *data, int size );
typedef int (*UNPACKPARTFUNCTYPESSE)( u32 *dest, u32 *data, int size );
extern void (*Vif1CMDTLB[82])();
@ -856,9 +854,8 @@ static int Vif0TransSTRow(u32 *data){ // STROW
case 3: pmem[8] = data[2]; pmem2[2] = data[2];
case 2: pmem[4] = data[1]; pmem2[1] = data[1];
case 1: pmem[0] = data[0]; pmem2[0] = data[0]; break;
#ifdef _MSC_VER
default: __assume(0);
#endif
jNO_DEFAULT
}
vif0.tag.addr += ret;
vif0.tag.size -= ret;
@ -878,9 +875,8 @@ static int Vif0TransSTCol(u32 *data){ // STCOL
case 3: pmem[8] = data[2]; pmem2[2] = data[2];
case 2: pmem[4] = data[1]; pmem2[1] = data[1];
case 1: pmem[0] = data[0]; pmem2[0] = data[0]; break;
#ifdef _MSC_VER
default: __assume(0);
#endif
jNO_DEFAULT
}
vif0.tag.addr += ret;
vif0.tag.size -= ret;

View File

@ -334,7 +334,7 @@ int WINAPI WinMain(HINSTANCE hInstance, HINSTANCE hPrevInstance, LPSTR lpCmdLine
textdomain(PACKAGE);
#endif
memset(&g_TestRun, 0, sizeof(g_TestRun));
memzero_obj(g_TestRun);
_getcwd( g_WorkingFolder, g_MaxPath );
@ -429,8 +429,8 @@ BOOL Open_File_Proc( std::string& outstr )
char szFileTitle[ g_MaxPath ];
char * filter = "ELF Files (*.ELF)\0*.ELF\0ALL Files (*.*)\0*.*\0";
memset( &szFileName, 0, sizeof( szFileName ) );
memset( &szFileTitle, 0, sizeof( szFileTitle ) );
memzero_obj( szFileName );
memzero_obj( szFileTitle );
ofn.lStructSize = sizeof( OPENFILENAME );
ofn.hwndOwner = gApp.hWnd;

View File

@ -543,8 +543,8 @@ void OnStates_LoadOther()
char szFileTitle[g_MaxPath];
char szFilter[g_MaxPath];
memset(&szFileName, 0, sizeof(szFileName));
memset(&szFileTitle, 0, sizeof(szFileTitle));
memzero_obj( szFileName );
memzero_obj( szFileTitle );
strcpy(szFilter, _("PCSX2 State Format"));
strcatz(szFilter, "*.*;*.*");
@ -575,8 +575,8 @@ void OnStates_SaveOther()
char szFileTitle[g_MaxPath];
char szFilter[g_MaxPath];
memset(&szFileName, 0, sizeof(szFileName));
memset(&szFileTitle, 0, sizeof(szFileTitle));
memzero_obj( szFileName );
memzero_obj( szFileTitle );
strcpy(szFilter, _("PCSX2 State Format"));
strcatz(szFilter, "*.*;*.*");

View File

@ -16,44 +16,77 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
*/
#ifndef _PCSX2_MEMZERO_H_
#define _PCSX2_MEMZERO_H_
#ifndef _WIN_MEMZERO_H_
#define _WIN_MEMZERO_H_
// This is an implementation of the memzero_air fast memset routine (for zero-clears only).
// It uses templates so that it generates very efficient and compact inline code for clears.
// These functions are meant for memset operations of constant length only.
// For dynamic length clears, use the C-compiler provided memset instead.
// MemZero Code Strategies:
// I use a trick to help the MSVC compiler optimize it's asm code better. The compiler
// won't optimize local variables very well because it insists in storing them on the
// stack and then loading them out of the stack when I use them from inline ASM, and
// it won't allow me to use template parameters in inline asm code either. But I can
// assign the template parameters to enums, and then use the enums from asm code.
// Yeah, silly, but it works. :D (air)
// All methods defined in this header use template in combination with the aforementioned
// enumerations to generate very efficient and compact inlined code. These optimized
// memsets work on the theory that most uses of memset involve static arrays and
// structures, which are constant in size, thus allowing us to generate optimal compile-
// time code for each use of the function.
// Notes on XMM0's "storage" area (_xmm_backup):
// Unfortunately there's no way to guarantee alignment for this variable. If I use the
// __declspec(aligned(16)) decorator, MSVC fails to inline the function since stack
// alignment requires prep work. And for the same reason it's not possible to check the
// alignment of the stack at compile time, so I'm forced to use movups to store and
// retrieve xmm0.
// This is an implementation of the memzero_ptr fast memset routine (for zero-clears only).
template< size_t bytes >
static __forceinline void memzero_air( void *dest )
static __forceinline void memzero_ptr( void *dest )
{
if( bytes == 0 ) return;
u64 _xmm_backup[2];
// This function only works on 32-bit alignments. For anything else we just fall back
// on the compiler-provided implementation of memset...
enum half_local
if( (bytes & 0x3) != 0 )
{
memset( dest, 0, bytes );
return;
}
enum
{
remainder = bytes & 127,
bytes128 = bytes / 128
};
// Initial check -- if the length is not a multiple of 16 then fall back on
// using rep movsd methods. Handling these unaligned writes in a more efficient
// manner isn't necessary in pcsx2.
// using rep movsd methods. Handling these unaligned clears in a more efficient
// manner isn't necessary in pcsx2 (meaning they aren't used in speed-critical
// scenarios).
if( (bytes & 0xf) == 0 )
{
u64 _xmm_backup[2];
if( ((uptr)dest & 0xf) != 0 )
{
// UNALIGNED COPY MODE.
// For unaligned copies we have a threshold of at least 128 vectors. Anything
// less and it's probably better off just falling back on the rep movsd.
if( bytes128 >128 )
if( bytes128 > 128 )
{
__asm
{
movups _xmm_backup,xmm0;
mov eax,bytes128
mov ecx,dest
pxor xmm0,xmm0
mov eax,bytes128
align 16
@ -99,9 +132,9 @@ static __forceinline void memzero_air( void *dest )
__asm
{
movups _xmm_backup,xmm0;
mov eax,bytes128
mov ecx,dest
pxor xmm0,xmm0
mov eax,bytes128
align 16
@ -143,37 +176,26 @@ static __forceinline void memzero_air( void *dest )
jASSUME( (bytes & 0x3) == 0 );
jASSUME( ((uptr)dest & 0x3) == 0 );
enum __local
enum
{
remdat = bytes>>2
};
// This case statement handles 5 special-case sizes (small blocks)
// in addition to the generic large block.
// in addition to the generic large block that uses rep stosd.
switch( remdat )
{
case 1:
__asm
{
mov edi, dest
xor eax, eax
mov edi, eax
}
*(u32*)dest = 0;
return;
case 2:
_asm
{
mov edi, dest
xor eax, eax
stosd
stosd
}
*(u64*)dest = 0;
return;
case 3:
_asm
__asm
{
mov edi, dest
xor eax, eax
@ -184,7 +206,7 @@ static __forceinline void memzero_air( void *dest )
return;
case 4:
_asm
__asm
{
mov edi, dest
xor eax, eax
@ -196,7 +218,7 @@ static __forceinline void memzero_air( void *dest )
return;
case 5:
_asm
__asm
{
mov edi, dest
xor eax, eax
@ -220,11 +242,21 @@ static __forceinline void memzero_air( void *dest )
}
}
// An optimized memset for 8 bit destination data.
template< u8 data, size_t bytes >
static __forceinline void memset_8( void *dest )
{
if( bytes == 0 ) return;
if( (bytes & 0x3) != 0 )
{
// unaligned data length. No point in doing an optimized inline version (too complicated!)
// So fall back on the compiler implementation:
memset( dest, data, bytes );
return;
}
//u64 _xmm_backup[2];
/*static const size_t remainder = bytes & 127;
@ -274,25 +306,74 @@ static __forceinline void memset_8( void *dest )
{
movups xmm0,[_xmm_backup];
}
}
else*/
}*/
// This function only works on 32-bit alignments of data copied.
jASSUME( (bytes & 0x3) == 0 );
enum
{
// This function only works on 32-bit alignments of data copied.
jASSUME( (bytes & 0x3) == 0 );
remdat = bytes>>2,
data32 = data + (data<<8) + (data<<16) + (data<<24)
};
enum local
{
remdat = bytes>>2,
data32 = data + (data<<8) + (data<<16) + (data<<24)
};
// macro to execute the x86/32 "stosd" copies.
switch( remdat )
{
case 1:
*(u32*)dest = data32;
return;
__asm
{
mov eax, data32
mov ecx, remdat
mov edi, dest
rep stosd
}
case 2:
((u32*)dest)[0] = data32;
((u32*)dest)[1] = data32;
return;
case 3:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
}
return;
case 4:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
}
return;
case 5:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
stosd;
}
return;
default:
__asm
{
mov ecx, remdat;
mov edi, dest;
mov eax, data32;
rep stosd;
}
return;
}
}
@ -301,24 +382,86 @@ static __forceinline void memset_16( void *dest )
{
if( bytes == 0 ) return;
if( (bytes & 0x1) != 0 )
throw Exception::LogicError( "Invalid parameter passed to memset_16 - data length is not a multiple of 16 or 32 bits." );
if( (bytes & 0x3) != 0 )
{
// Unaligned data length. No point in doing an optimized inline version (too complicated with
// remainders and such).
_memset16_unaligned( dest, data, bytes );
return;
}
//u64 _xmm_backup[2];
{
// This function only works on 32-bit alignments of data copied.
jASSUME( (bytes & 0x3) == 0 );
// This function only works on 32-bit alignments of data copied.
jASSUME( (bytes & 0x3) == 0 );
enum local
{
remdat = bytes>>2,
data32 = data + (data<<16)
};
__asm
{
mov eax, data32
mov ecx, remdat
mov edi, dest
rep stosd
}
enum
{
remdat = bytes>>2,
data32 = data + (data<<16)
};
// macro to execute the x86/32 "stosd" copies.
switch( remdat )
{
case 1:
*(u32*)dest = data32;
return;
case 2:
((u32*)dest)[0] = data32;
((u32*)dest)[1] = data32;
return;
case 3:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
}
return;
case 4:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
}
return;
case 5:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
stosd;
}
return;
default:
__asm
{
mov ecx, remdat;
mov edi, dest;
mov eax, data32;
rep stosd;
}
return
}
}
@ -327,24 +470,86 @@ static __forceinline void memset_32( void *dest )
{
if( bytes == 0 ) return;
if( (bytes & 0x3) != 0 )
throw Exception::LogicError( "Invalid parameter passed to memset_32 - data length is not a multiple of 32 bits." );
//u64 _xmm_backup[2];
{
// This function only works on 32-bit alignments of data copied.
jASSUME( (bytes & 0x3) == 0 );
// This function only works on 32-bit alignments of data copied.
// If the data length is not a factor of 32 bits, the C++ optimizing compiler will
// probably just generate mysteriously broken code in Release builds. ;)
enum local
{
remdat = bytes>>2,
data32 = data
};
__asm
{
mov eax, data32
mov ecx, remdat
mov edi, dest
rep stosd
}
jASSUME( (bytes & 0x3) == 0 );
enum
{
remdat = bytes>>2,
data32 = data
};
// macro to execute the x86/32 "stosd" copies.
switch( remdat )
{
case 1:
*(u32*)dest = data32;
return;
case 2:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
}
return;
case 3:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
}
return;
case 4:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
}
return;
case 5:
__asm
{
mov edi, dest;
mov eax, data32;
stosd;
stosd;
stosd;
stosd;
stosd;
}
return;
default:
__asm
{
mov ecx, remdat;
mov edi, dest;
mov eax, data32;
rep stosd;
}
return
}
}
@ -354,18 +559,29 @@ static __forceinline void memset_32( void *dest )
template< typename T >
static __forceinline void memzero_obj( T& object )
{
memzero_air<sizeof(T)>( &object );
memzero_ptr<sizeof(T)>( &object );
}
template< uint data, typename T >
static __forceinline void memset_obj( T& object )
// This method clears an object with the given 8 bit value.
template< u8 data, typename T >
static __forceinline void memset8_obj( T& object )
{
if( data <= 0xff )
memset_8<(u8)data, sizeof(T)>( &object );
else if( data <= 0xffff )
memset_16<(u16)data, sizeof(T)>( &object );
else
memset_32<(u32)data, sizeof(T)>( &object );
memset_8<data, sizeof(T)>( &object );
}
#endif
// This method clears an object with the given 16 bit value.
template< u16 data, typename T >
static __forceinline void memset16_obj( T& object )
{
memset_16<data, sizeof(T)>( &object );
}
// This method clears an object with the given 32 bit value.
template< u32 data, typename T >
static __forceinline void memset32_obj( T& object )
{
memset_32<data, sizeof(T)>( &object );
}
#endif

View File

@ -89,16 +89,18 @@ void checkregs()
__declspec(align(16)) static u8 _xmm_backup[16*2];
//this one checks for alligments too ...
// this one checks for alignments too ...
__declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes)
{
// If src is aligned, use memcpy_raz instead:
__asm
{
test edx,0xf;
jz memcpy_raz_;
}
//THIS CODE IS COPY PASTED FROM memcpy_raz_
// MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :)
#define MOVSRC movups
__asm
{
@ -112,7 +114,7 @@ __declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size
cmp eax,127;
jna _loop_1;
//unrolled version also toiches xmm1, save it :)
//unrolled version also touches xmm1, save it :)
movaps [_xmm_backup+0x10],xmm1;
//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
@ -178,14 +180,17 @@ cleanup:
}
#undef MOVSRC
}
//Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
//These functions are optimised for medium-small transfer sizes (<2048, >=128).No prefetching is used since the reads are linear
//and the cache logic can predict em :)
//this implementation use forward copy, in 128 byte blocks, and then does the remaining in 16 byte blocks :)
//MOVSRC = opcode used to read.I use the same code for the unaligned version, with a different define :)
#define MOVSRC movaps
// Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
// This function is optimized for medium-small transfer sizes (<2048, >=128). No prefetching is
// used since the reads are linear and the cache logic can predict em :)
__declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
{
// Code Implementation Notes:
// Uses a forward copy, in 128 byte blocks, and then does the remaining in 16 byte blocks :)
// MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :)
#define MOVSRC movaps
__asm
{
//Reads before reads, to avoid stalls
@ -270,16 +275,15 @@ cleanup:
//////////////////////////////////////////////////////////////////////////
// Fast memcpy as coded by AMD.
void * memcpy_amd_(void *dest, const void *src, size_t n)
// This function clobbers all MMX registers, and is generally not optimal for short memory
// copies due to the amount of overhead required to test for alignments, copy length,
// and other ABI overhead.
void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
#ifdef _DEBUG
__asm call checkregs
#endif
__asm {
mov edi, ecx ; destination
mov esi, edx ; source
mov ecx, [n] ; number of bytes to copy
mov edi, [dest] ; destination
mov esi, [src] ; source
mov ebx, ecx ; keep a copy of count
cld
@ -473,7 +477,7 @@ $memcpy_last_few: ; dword aligned from before movsd's
$memcpy_final:
emms ; clean up the MMX state
sfence ; flush the write buffer
mov eax, [dest] ; ret value = destination pointer
//mov eax, [dest] ; ret value = destination pointer
}
}

View File

@ -18,7 +18,7 @@
#include "PrecompiledHeader.h"
#include "System.h"
#include "Misc.h"
#include "iR5900.h"
#include "Vif.h"
#include "VU.h"
@ -83,7 +83,7 @@ using namespace std;
static int s_xmmchecknext = 0;
void _initXMMregs() {
memset(xmmregs, 0, sizeof(xmmregs));
memzero_obj( xmmregs );
g_xmmAllocCounter = 0;
s_xmmchecknext = 0;
}

View File

@ -163,7 +163,7 @@ static void iIopDumpBlock( int startpc, u8 * ptr )
// write the instruction info
fprintf(f, "\n\nlive0 - %x, lastuse - %x used - %x\n", EEINST_LIVE0, EEINST_LASTUSE, EEINST_USED);
memset(used, 0, sizeof(used));
memzero_obj(used);
numused = 0;
for(i = 0; i < ARRAYSIZE(s_pInstCache->regs); ++i) {
if( s_pInstCache->regs[i] & EEINST_USED ) {

View File

@ -384,7 +384,7 @@ void SuperVUAnalyzeOp(VURegs *VU, _vuopinfo *info, _VURegsNum* pCodeRegs)
// check upper flags
if (ptr[1] & 0x80000000) { // I flag
info->cycle = vucycle;
memzero_air<sizeof(lregs)>(lregs);
memzero_ptr<sizeof(lregs)>(lregs);
}
else {
@ -1454,4 +1454,4 @@ void SetVUNanMode(int mode)
{
g_VuNanHandling = mode;
if ( mode ) SysPrintf("enabling vunan mode");
}
}

View File

@ -158,7 +158,7 @@ struct VuBlockHeader
class VuInstruction
{
public:
VuInstruction() { memzero_air<sizeof(VuInstruction)>(this); nParentPc = -1; vicached = -1; }
VuInstruction() { memzero_ptr<sizeof(VuInstruction)>(this); nParentPc = -1; vicached = -1; }
int nParentPc; // used for syncing with flag writes, -1 for no parent
@ -419,7 +419,7 @@ void SuperVUReset(int vuindex)
{
DbgCon::Status( "SuperVU reset > Resetting recompiler memory and structures." );
memset(s_recVUMem, 0xcd, VU_EXESIZE);
memzero_air<SUPERVU_STACKSIZE>(recVUStack);
memzero_ptr<SUPERVU_STACKSIZE>(recVUStack);
s_recVUPtr = s_recVUMem;
}

View File

@ -17,7 +17,7 @@
*/
#include "PrecompiledHeader.h"
#include "System.h"
#include "Misc.h"
#include "iR5900.h"
#include "Vif.h"
#include "VU.h"
@ -37,7 +37,7 @@ int g_x86checknext;
// use special x86 register allocation for ia32
void _initX86regs() {
memset(x86regs, 0, sizeof(x86regs));
memzero_obj(x86regs);
g_x86AllocCounter = 0;
g_x86checknext = 0;
}
@ -402,7 +402,7 @@ static int s_mmxchecknext = 0;
void _initMMXregs()
{
memset(mmxregs, 0, sizeof(mmxregs));
memzero_obj(mmxregs);
g_mmxAllocCounter = 0;
s_mmxchecknext = 0;
}

View File

@ -175,7 +175,7 @@ static void iDumpBlock( int startpc, u8 * ptr )
fprintf(f, "\n\nlive0 - %x, live1 - %x, live2 - %x, lastuse - %x\nmmx - %x, xmm - %x, used - %x\n",
EEINST_LIVE0, EEINST_LIVE1, EEINST_LIVE2, EEINST_LASTUSE, EEINST_MMX, EEINST_XMM, EEINST_USED);
memset(used, 0, sizeof(used));
memzero_obj(used);
numused = 0;
for(i = 0; i < ARRAYSIZE(s_pInstCache->regs); ++i) {
if( s_pInstCache->regs[i] & EEINST_USED ) {
@ -184,7 +184,7 @@ static void iDumpBlock( int startpc, u8 * ptr )
}
}
memset(fpuused, 0, sizeof(fpuused));
memzero_obj(fpuused);
fpunumused = 0;
for(i = 0; i < ARRAYSIZE(s_pInstCache->fpuregs); ++i) {
if( s_pInstCache->fpuregs[i] & EEINST_USED ) {

View File

@ -239,7 +239,7 @@ void cpudetectInit()
int num;
char str[50];
memset( cpuinfo.x86ID, 0, sizeof( cpuinfo.x86ID ) );
memzero_obj( cpuinfo.x86ID );
cpuinfo.x86Family = 0;
cpuinfo.x86Model = 0;
cpuinfo.x86PType = 0;
@ -326,7 +326,7 @@ void cpudetectInit()
if ( cpuinfo.x86ID[ 0 ] == 'G' ){ cputype=0;}//trick lines but if you know a way better ;p
if ( cpuinfo.x86ID[ 0 ] == 'A' ){ cputype=1;}
memset(cpuinfo.x86Fam, 0, sizeof(cpuinfo.x86Fam));
memzero_obj( cpuinfo.x86Fam );
iCpuId( 0x80000002, (u32*)cpuinfo.x86Fam);
iCpuId( 0x80000003, (u32*)(cpuinfo.x86Fam+16));
iCpuId( 0x80000004, (u32*)(cpuinfo.x86Fam+32));