From 44f5117d2453dfa284585accc3d91fa9e7d3fbf9 Mon Sep 17 00:00:00 2001 From: "Jake.Stine" Date: Tue, 27 Jan 2009 05:12:54 +0000 Subject: [PATCH] Optimized memcpy_fast. In addition to being quite a bit faster, it also auto-preserves mmx registers now. So I was also able to remove almost every instance of FreezeMMXRegs (all except those used to guard the GS plugin calls). memcpy_fast (aka memcpy_amd_) is now faster than memcpy_raz for *all* scenarios, so it's been made the new default. git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@642 a6443dda-0b58-4228-96e9-037be469359c --- pcsx2/CDVD.cpp | 9 -- pcsx2/CDVDiso.cpp | 6 - pcsx2/CDVDisodrv.cpp | 4 - pcsx2/CdRom.cpp | 4 - pcsx2/GS.cpp | 4 +- pcsx2/Hw.h | 46 +++--- pcsx2/MTGS.cpp | 2 +- pcsx2/Misc.h | 19 ++- pcsx2/SPR.cpp | 6 - pcsx2/VifDma.cpp | 8 +- pcsx2/windows/WinThreads.cpp | 3 + pcsx2/x86/fast_routines.cpp | 297 ++++++++++++++++++++++++----------- pcsx2/x86/iVUmicroLower.cpp | 7 +- pcsx2/x86/iVUzerorec.cpp | 2 - 14 files changed, 248 insertions(+), 169 deletions(-) diff --git a/pcsx2/CDVD.cpp b/pcsx2/CDVD.cpp index e5daf56318..39078a2829 100644 --- a/pcsx2/CDVD.cpp +++ b/pcsx2/CDVD.cpp @@ -865,8 +865,6 @@ int cdvdReadSector() { return -1; } - FreezeMMXRegs(1); - const u32 madr = HW_DMA3_MADR; // if raw dvd sector 'fill in the blanks' @@ -935,7 +933,6 @@ int cdvdReadSector() { HW_DMA3_BCR_H16-= (cdvd.BlockSize / (HW_DMA3_BCR_L16*4)); HW_DMA3_MADR+= cdvd.BlockSize; - FreezeMMXRegs(0); return 0; } @@ -2024,9 +2021,7 @@ void cdvdWrite16(u8 rt) // SCOMMAND if (cdvd.mg_size + cdvd.ParamC > cdvd.mg_maxsize) cdvd.Result[0] = 0x80; else{ - FreezeMMXRegs(1); memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC); - FreezeMMXRegs(0); cdvd.mg_size += cdvd.ParamC; cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error } @@ -2034,11 +2029,9 @@ void cdvdWrite16(u8 rt) // SCOMMAND case 0x8E: // sceMgReadData SetResultSize( std::min(16, cdvd.mg_size) ); - FreezeMMXRegs(1); memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC); cdvd.mg_size -= cdvd.ResultC; memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size); - FreezeMMXRegs(0); break; case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;) @@ -2089,9 +2082,7 @@ fail_pol_cal: SetResultSize(3);//in:0 { int bit_ofs = mg_BIToffset(cdvd.mg_buffer); - FreezeMMXRegs(1); memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]); - FreezeMMXRegs(0); } cdvd.mg_maxsize = 0; // don't allow any write cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data diff --git a/pcsx2/CDVDiso.cpp b/pcsx2/CDVDiso.cpp index a9401b5b5b..5b26848f50 100644 --- a/pcsx2/CDVDiso.cpp +++ b/pcsx2/CDVDiso.cpp @@ -189,7 +189,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){ return 0; buff = CDVDgetBuffer(); if (buff==NULL) return 0; - FreezeMMXRegs(1); switch (mode->datapattern){ case CdSecS2048: memcpy_fast((void*)((uptr)buf+2048*i), buff, 2048);break;//only data @@ -198,7 +197,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){ case CdSecS2340: memcpy_fast((void*)((uptr)buf+2340*i), buff, 2340);break;//without sync } - FreezeMMXRegs(0); } return 1; } @@ -216,9 +214,7 @@ int DvdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){ // switch (mode->datapattern){ // case CdSecS2064: ((u32*)buf)[0] = i + 0x30000; - FreezeMMXRegs(1); memcpy_fast((u8*)buf+12, buff, 2048); - FreezeMMXRegs(0); buf = (char*)buf + 2064; break; // default: // return 0; @@ -253,9 +249,7 @@ int 
CDVD_GetVolumeDescriptor(void){ if ((localVolDesc.filesystemType == 1) || (localVolDesc.filesystemType == 2)) { - FreezeMMXRegs(1); memcpy_fast(&CDVolDesc, &localVolDesc, sizeof(cdVolDesc)); - FreezeMMXRegs(0); } } else diff --git a/pcsx2/CDVDisodrv.cpp b/pcsx2/CDVDisodrv.cpp index 12bc083921..6f699e857a 100644 --- a/pcsx2/CDVDisodrv.cpp +++ b/pcsx2/CDVDisodrv.cpp @@ -188,9 +188,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){ RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n"); return 0; } - FreezeMMXRegs(1); memcpy_fast(buffer, lb + off_sector, ssize); - FreezeMMXRegs(0); } if (asize) if (CdRead(asector, asize >> 11, buffer+ssize, &cdReadMode) != TRUE){ RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n"); @@ -201,9 +199,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){ RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n"); return 0; } - FreezeMMXRegs(1); memcpy_fast(buffer+ssize+asize, lb, esize); - FreezeMMXRegs(0); } /*********************** // Now work out where we want to start reading from diff --git a/pcsx2/CdRom.cpp b/pcsx2/CdRom.cpp index 5985ade57d..583cb15e1d 100644 --- a/pcsx2/CdRom.cpp +++ b/pcsx2/CdRom.cpp @@ -527,9 +527,7 @@ void cdrReadInterrupt() { CDREAD_INT((cdr.Mode & 0x80) ? (cdReadTime / 2) : cdReadTime); return; } - FreezeMMXRegs(1); memcpy_fast(cdr.Transfer, buf+12, 2340); - FreezeMMXRegs(0); cdr.Stat = DataReady; CDR_LOG(" %x:%x:%x\n", cdr.Transfer[0], cdr.Transfer[1], cdr.Transfer[2]); @@ -923,9 +921,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) { } cdsize = (bcr & 0xffff) * 4; - FreezeMMXRegs(1); memcpy_fast((u8*)PSXM(madr), cdr.pTransfer, cdsize); - FreezeMMXRegs(0); psxCpu->Clear(madr, cdsize/4); cdr.pTransfer+=cdsize; diff --git a/pcsx2/GS.cpp b/pcsx2/GS.cpp index 024f2b5500..916c58b6ca 100644 --- a/pcsx2/GS.cpp +++ b/pcsx2/GS.cpp @@ -575,11 +575,11 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc) { pendmem = (pendmem&~0xfff)-16; } - memcpy_raz_(pgsmem, pMem, pendmem-(u32)gif->madr+16); + memcpy_aligned(pgsmem, pMem, pendmem-(u32)gif->madr+16); } else #endif - memcpy_raz_(pgsmem, pMem, sizetoread); + memcpy_aligned(pgsmem, pMem, sizetoread); mtgsThread->SendDataPacket(); } diff --git a/pcsx2/Hw.h b/pcsx2/Hw.h index 2c1a83c52c..609f509f19 100644 --- a/pcsx2/Hw.h +++ b/pcsx2/Hw.h @@ -329,7 +329,7 @@ static __forceinline u8* dmaGetAddr(u32 mem) #else - +// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits) static __forceinline void *dmaGetAddr(u32 addr) { u8 *ptr; @@ -355,35 +355,17 @@ void hwShutdown(); // hw read functions extern u8 hwRead8 (u32 mem); -int hwConstRead8 (u32 x86reg, u32 mem, u32 sign); - extern u16 hwRead16(u32 mem); -int hwConstRead16(u32 x86reg, u32 mem, u32 sign); - extern u32 hwRead32(u32 mem); -int hwConstRead32(u32 x86reg, u32 mem); - -u64 hwRead64(u32 mem); -void hwConstRead64(u32 mem, int mmreg); - -void hwRead128(u32 mem, u64 *out); -void hwConstRead128(u32 mem, int xmmreg); +extern u64 hwRead64(u32 mem); +extern void hwRead128(u32 mem, u64 *out); // hw write functions -void hwWrite8 (u32 mem, u8 value); -void hwConstWrite8 (u32 mem, int mmreg); - -void hwWrite16(u32 mem, u16 value); -void hwConstWrite16(u32 mem, int mmreg); - -void hwWrite32(u32 mem, u32 value); -void hwConstWrite32(u32 mem, int mmreg); - -void hwWrite64(u32 mem, u64 value); -void hwConstWrite64(u32 mem, int mmreg); - -void hwWrite128(u32 mem, const u64 *value); -void hwConstWrite128(u32 mem, int xmmreg); +extern void hwWrite8 (u32 mem, u8 value); +extern void hwWrite16(u32 mem, u16 
value); +extern void hwWrite32(u32 mem, u32 value); +extern void hwWrite64(u32 mem, u64 value); +extern void hwWrite128(u32 mem, const u64 *value); void hwIntcIrq(int n); void hwDmacIrq(int n); @@ -394,6 +376,18 @@ int hwMFIFOWrite(u32 addr, u8 *data, u32 size); int hwDmacSrcChainWithStack(DMACh *dma, int id); int hwDmacSrcChain(DMACh *dma, int id); +int hwConstRead8 (u32 x86reg, u32 mem, u32 sign); +int hwConstRead16(u32 x86reg, u32 mem, u32 sign); +int hwConstRead32(u32 x86reg, u32 mem); +void hwConstRead64(u32 mem, int mmreg); +void hwConstRead128(u32 mem, int xmmreg); + +void hwConstWrite8 (u32 mem, int mmreg); +void hwConstWrite16(u32 mem, int mmreg); +void hwConstWrite32(u32 mem, int mmreg); +void hwConstWrite64(u32 mem, int mmreg); +void hwConstWrite128(u32 mem, int xmmreg); + #ifdef PCSX2_VIRTUAL_MEM void iMemRead32Check(); #endif diff --git a/pcsx2/MTGS.cpp b/pcsx2/MTGS.cpp index 943796a310..a367df92d6 100644 --- a/pcsx2/MTGS.cpp +++ b/pcsx2/MTGS.cpp @@ -430,7 +430,7 @@ int mtgsThreadObject::Callback() { Console::WriteLn("MTGS > Thread Started, Opening GS Plugin..."); - memcpy_raz_( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) ); + memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) ); GSsetBaseMem( m_gsMem ); m_returncode = GSopen((void *)&pDsp, "PCSX2", 1); diff --git a/pcsx2/Misc.h b/pcsx2/Misc.h index e8a5503070..9f6cb3f1ad 100644 --- a/pcsx2/Misc.h +++ b/pcsx2/Misc.h @@ -230,12 +230,19 @@ extern u8 g_globalXMMSaved; void _memset16_unaligned( void* dest, u16 data, size_t size ); #if defined(_WIN32) && !defined(__x86_64__) - // faster memcpy - extern void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes); - extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t qwc); - extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t n); + + // The new simplified memcpy_amd_ is now faster than memcpy_raz_. + // memcpy_amd_ also does mmx register saving, negating the need for freezeregs (code cleanup!) + // Additionally, using one single memcpy implementation keeps the code cache cleaner. 
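As a concrete illustration of what this consolidation changes at the call sites touched by this patch (the buffer names below are placeholders, not actual PCSX2 symbols): code no longer needs to bracket copies with FreezeMMXRegs, because memcpy_amd_ saves and restores the MMX registers it touches internally.

    // old call-site pattern (removed throughout this patch):
    FreezeMMXRegs(1);
    memcpy_fast(dstBuf, srcBuf, byteCount);
    FreezeMMXRegs(0);

    // new call-site pattern -- the copy routine protects mm0-mm3 on its own:
    memcpy_fast(dstBuf, srcBuf, byteCount);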
+ + //extern void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes); + //extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes); + //extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes); + extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes); + # include "windows/memzero.h" # define memcpy_fast memcpy_amd_ +# define memcpy_aligned memcpy_amd_ #else @@ -243,6 +250,10 @@ void _memset16_unaligned( void* dest, u16 data, size_t size ); #define memcpy_fast memcpy #define memcpy_raz_ memcpy #define memcpy_raz_u memcpy + + #define memcpy_aligned memcpy + #define memcpy_raz_u memcpy + #include "Linux/memzero.h" #endif diff --git a/pcsx2/SPR.cpp b/pcsx2/SPR.cpp index 0813cc9ca2..d32f479b56 100644 --- a/pcsx2/SPR.cpp +++ b/pcsx2/SPR.cpp @@ -226,13 +226,11 @@ extern void mfifoGIFtransfer(int); #define gif ((DMACh*)&PS2MEM_HW[0xA000]) void dmaSPR0() { // fromSPR int qwc = spr0->qwc; - FreezeMMXRegs(1); SPR_LOG("dmaSPR0 chcr = %lx, madr = %lx, qwc = %lx, sadr = %lx\n", spr0->chcr, spr0->madr, spr0->qwc, spr0->sadr); _dmaSPR0(); - FreezeMMXRegs(0); if ((psHu32(DMAC_CTRL) & 0xC) == 0xC) { // GIF MFIFO if((spr0->madr & ~psHu32(DMAC_RBSR)) != psHu32(DMAC_RBOR)) SysPrintf("GIF MFIFO Write outside MFIFO area\n"); spr0->madr = psHu32(DMAC_RBOR) + (spr0->madr & psHu32(DMAC_RBSR)); @@ -308,7 +306,6 @@ void _SPR1interleave() { void dmaSPR1() { // toSPR - FreezeMMXRegs(1); #ifdef SPR_LOG SPR_LOG("dmaSPR1 chcr = 0x%x, madr = 0x%x, qwc = 0x%x\n" " tadr = 0x%x, sadr = 0x%x\n", @@ -325,7 +322,6 @@ void dmaSPR1() { // toSPR // Transfer Dn_QWC from Dn_MADR to SPR1 SPR1chain(); CPU_INT(9, cycles); - FreezeMMXRegs(0); return; } else if ((spr1->chcr & 0xc) == 0x4){ int cycles = 0; @@ -338,7 +334,6 @@ void dmaSPR1() { // toSPR // Transfer Dn_QWC from Dn_MADR to SPR1 SPR1chain(); CPU_INT(9, cycles); - FreezeMMXRegs(0); return; } // Chain Mode @@ -382,7 +377,6 @@ void dmaSPR1() { // toSPR } else { // Interleave Mode _SPR1interleave(); } - FreezeMMXRegs(0); } diff --git a/pcsx2/VifDma.cpp b/pcsx2/VifDma.cpp index 17e6b528b5..141ab8b468 100644 --- a/pcsx2/VifDma.cpp +++ b/pcsx2/VifDma.cpp @@ -574,9 +574,7 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma // v4-32 if(vifRegs->mode == 0 && !(vifRegs->code & 0x10000000) && vif->usn == 0){ vifRegs->num -= size>>4; - FreezeMMXRegs(1); memcpy_fast((u8*)dest, cdata, size); - FreezeMMXRegs(0); size = 0; //unpacktotal += GetCPUTick()-basetick; return; @@ -814,9 +812,7 @@ static __forceinline void _vif0mpgTransfer(u32 addr, u32 *data, int size) { fclose(f); }*/ if (memcmp(VU0.Micro + addr, data, size << 2)) { - FreezeMMXRegs(1); memcpy_fast(VU0.Micro + addr, data, size << 2); - FreezeMMXRegs(0); CpuVU0->Clear(addr, size); } } @@ -1490,9 +1486,7 @@ static __forceinline void _vif1mpgTransfer(u32 addr, u32 *data, int size) { }*/ assert( VU1.Micro > 0 ); if (memcmp(VU1.Micro + addr, data, size << 2)) { - FreezeMMXRegs(1); memcpy_fast(VU1.Micro + addr, data, size << 2); - FreezeMMXRegs(0); CpuVU1->Clear(addr, size); } } @@ -1644,7 +1638,7 @@ static int Vif1TransDirectHL(u32 *data){ { //unaligned copy.VIF handling is -very- messy, so i'l use this code til i fix it :) const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, ret<<2 ); - memcpy_raz_u( mtgsThread->GetDataPacketPtr(), data, count ); + memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count ); mtgsThread->SendDataPacket(); } else { diff --git a/pcsx2/windows/WinThreads.cpp 
b/pcsx2/windows/WinThreads.cpp index 1faa20c157..94ff95796c 100644 --- a/pcsx2/windows/WinThreads.cpp +++ b/pcsx2/windows/WinThreads.cpp @@ -49,6 +49,9 @@ namespace Threading } cpuinfo.LogicalCores = CPUs; + if( LogicalCoresPerPhysicalCPU > CPUs) // for 1-socket HTT-disabled machines + LogicalCoresPerPhysicalCPU = CPUs; + cpuinfo.PhysicalCores = ( CPUs / LogicalCoresPerPhysicalCPU ) * PhysicalCoresPerPhysicalCPU; ptw32_smp_system = ( cpuinfo.LogicalCores > 1 ) ? TRUE : FALSE; } diff --git a/pcsx2/x86/fast_routines.cpp b/pcsx2/x86/fast_routines.cpp index 99ff1a920c..c561a0e0ab 100644 --- a/pcsx2/x86/fast_routines.cpp +++ b/pcsx2/x86/fast_routines.cpp @@ -88,35 +88,27 @@ void checkregs() #endif -__declspec(align(16)) static u8 _xmm_backup[16*2]; +PCSX2_ALIGNED16( static u8 _xmm_backup[16*2] ); +PCSX2_ALIGNED16( static u8 _mmx_backup[8*4] ); -// this one checks for alignments too ... -__declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes) +static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes) { - // If src is aligned, use memcpy_raz instead: - __asm - { - test edx,0xf; - jz memcpy_raz_; - } - // MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :) - #define MOVSRC movups + #define MOVSRC movdqu + #define MOVDST movdqa + __asm { //Reads before reads, to avoid stalls mov eax,[esp+4]; //Make sure to save xmm0, it must be preserved ... - movaps [_xmm_backup+0x00],xmm0; + movaps [_xmm_backup],xmm0; //if >=128 bytes use 128 byte unrolled loop //i use cmp ..,127 + jna because 127 is encodable using the simm8 form cmp eax,127; jna _loop_1; - //unrolled version also touches xmm1, save it :) - movaps [_xmm_backup+0x10],xmm1; - //since this is a common branch target it could be good to align it -- no idea if it has any effect :p align 16 @@ -124,34 +116,111 @@ __declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size _loop_8: MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls - MOVSRC xmm1,[edx+0x10]; + MOVDST [ecx+0x00],xmm0; //then write :p + MOVSRC xmm0,[edx+0x10]; + MOVDST [ecx+0x10],xmm0; sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding - movaps [ecx+0x00],xmm0; //then write :p - movaps [ecx+0x10],xmm1; sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding MOVSRC xmm0,[edx+0x20-128]; - MOVSRC xmm1,[edx+0x30-128]; + MOVDST [ecx+0x20-128],xmm0; + MOVSRC xmm0,[edx+0x30-128]; + MOVDST [ecx+0x30-128],xmm0; add eax,-128; //eax won't be used for a while, so update it here. 
add/-128 for simm8 encoding - movaps [ecx+0x20-128],xmm0; - movaps [ecx+0x30-128],xmm1; MOVSRC xmm0,[edx+0x40-128]; - MOVSRC xmm1,[edx+0x50-128]; - movaps [ecx+0x40-128],xmm0; - movaps [ecx+0x50-128],xmm1; + MOVDST [ecx+0x40-128],xmm0; + MOVSRC xmm0,[edx+0x50-128]; + MOVDST [ecx+0x50-128],xmm0; MOVSRC xmm0,[edx+0x60-128]; - MOVSRC xmm1,[edx+0x70-128]; - movaps [ecx+0x60-128],xmm0; - movaps [ecx+0x70-128],xmm1; + MOVDST [ecx+0x60-128],xmm0; + MOVSRC xmm0,[edx+0x70-128]; + MOVDST [ecx+0x70-128],xmm0; //127~ja, 127 is encodable as simm8 :) cmp eax,127; ja _loop_8; - //restore xmm1 :) - movaps xmm1,[_xmm_backup+0x10]; + //direct copy for 0~7 qwords + //in order to avoid the inc/dec of all 3 registers + //i use negative relative addressing from the top of the buffers + //[top-current index] + +_loop_1: + //prepare the regs for 'negative relative addressing' + add edx,eax; + add ecx,eax; + neg eax; + jz cleanup; //exit if nothing to do + +_loop_1_inner: + MOVSRC xmm0,[edx+eax]; + MOVDST [ecx+eax],xmm0; + + add eax,16; //while the offset is still negative we have data to copy + js _loop_1_inner; + + //done ! +cleanup: + //restore xmm and exit ~) + movaps xmm0,[_xmm_backup]; + ret 4; + } + #undef MOVSRC + #undef MOVDST +} + + +static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes) +{ + // MOVDST = opcode used to read. I use the same code for the aligned version, with a different define :) + #define MOVSRC movaps + #define MOVDST movups + __asm + { + //Reads before reads, to avoid stalls + mov eax,[esp+4]; + //Make sure to save xmm0, it must be preserved ... + movaps [_xmm_backup],xmm0; + + //if >=128 bytes use 128 byte unrolled loop + //i use cmp ..,127 + jna because 127 is encodable using the simm8 form + cmp eax,127; + jna _loop_1; + + //since this is a common branch target it could be good to align it -- no idea if it has any effect :p + align 16 + + //128 byte unrolled loop +_loop_8: + + MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls + MOVDST [ecx+0x00],xmm0; //then write :p + MOVSRC xmm0,[edx+0x10]; + MOVDST [ecx+0x10],xmm0; + sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding + sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding + + MOVSRC xmm0,[edx+0x20-128]; + MOVDST [ecx+0x20-128],xmm0; + MOVSRC xmm0,[edx+0x30-128]; + MOVDST [ecx+0x30-128],xmm0; + add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding + + MOVSRC xmm0,[edx+0x40-128]; + MOVDST [ecx+0x40-128],xmm0; + MOVSRC xmm0,[edx+0x50-128]; + MOVDST [ecx+0x50-128],xmm0; + + MOVSRC xmm0,[edx+0x60-128]; + MOVDST [ecx+0x60-128],xmm0; + MOVSRC xmm0,[edx+0x70-128]; + MOVDST [ecx+0x70-128],xmm0; + + //127~ja, 127 is encodable as simm8 :) + cmp eax,127; + ja _loop_8; //direct copy for 0~7 qwords //in order to avoid the inc/dec of all 3 registers @@ -168,22 +237,24 @@ _loop_1: _loop_1_inner: MOVSRC xmm0,[edx+eax]; movaps [ecx+eax],xmm0; - + add eax,16; //while the offset is still negative we have data to copy js _loop_1_inner; //done ! cleanup: //restore xmm and exit ~) - movaps xmm0,[_xmm_backup+0x00]; + movaps xmm0,[_xmm_backup]; ret 4; } #undef MOVSRC + #undef MOVDST } + // Custom memcpy, only for 16 byte aligned stuff (used for mtgs) // This function is optimized for medium-small transfer sizes (<2048, >=128). 
No prefetching is // used since the reads are linear and the cache logic can predict em :) - +// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster. __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes) { // Code Implementation Notes: @@ -191,21 +262,19 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_ // MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :) #define MOVSRC movaps + #define MOVDST movaps __asm { //Reads before reads, to avoid stalls mov eax,[esp+4]; //Make sure to save xmm0, it must be preserved ... - movaps [_xmm_backup+0x00],xmm0; + movaps [_xmm_backup],xmm0; //if >=128 bytes use 128 byte unrolled loop //i use cmp ..,127 + jna because 127 is encodable using the simm8 form cmp eax,127; jna _loop_1; - //unrolled version also toiches xmm1, save it :) - movaps [_xmm_backup+0x10],xmm1; - //since this is a common branch target it could be good to align it -- no idea if it has any effect :p align 16 @@ -213,35 +282,32 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_ _loop_8: MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls - MOVSRC xmm1,[edx+0x10]; + MOVDST [ecx+0x00],xmm0; //then write :p + MOVSRC xmm0,[edx+0x10]; + MOVDST [ecx+0x10],xmm0; sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding - movaps [ecx+0x00],xmm0; //then write :p - movaps [ecx+0x10],xmm1; sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding MOVSRC xmm0,[edx+0x20-128]; - MOVSRC xmm1,[edx+0x30-128]; + MOVDST [ecx+0x20-128],xmm0; + MOVSRC xmm0,[edx+0x30-128]; + MOVDST [ecx+0x30-128],xmm0; add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding - movaps [ecx+0x20-128],xmm0; - movaps [ecx+0x30-128],xmm1; MOVSRC xmm0,[edx+0x40-128]; - MOVSRC xmm1,[edx+0x50-128]; - movaps [ecx+0x40-128],xmm0; - movaps [ecx+0x50-128],xmm1; + MOVDST [ecx+0x40-128],xmm0; + MOVSRC xmm0,[edx+0x50-128]; + MOVDST [ecx+0x50-128],xmm0; MOVSRC xmm0,[edx+0x60-128]; - MOVSRC xmm1,[edx+0x70-128]; - movaps [ecx+0x60-128],xmm0; - movaps [ecx+0x70-128],xmm1; + MOVDST [ecx+0x60-128],xmm0; + MOVSRC xmm0,[edx+0x70-128]; + MOVDST [ecx+0x70-128],xmm0; //127~ja, 127 is encodable as simm8 :) cmp eax,127; ja _loop_8; - //restore xmm1 :) - movaps xmm1,[_xmm_backup+0x10]; - //direct copy for 0~7 qwords //in order to avoid the inc/dec of all 3 registers //i use negative relative addressing from the top of the buffers @@ -256,7 +322,7 @@ _loop_1: _loop_1_inner: MOVSRC xmm0,[edx+eax]; - movaps [ecx+eax],xmm0; + MOVDST [ecx+eax],xmm0; add eax,16; //while the offset is still negative we have data to copy js _loop_1_inner; @@ -264,44 +330,64 @@ _loop_1_inner: //done ! cleanup: //restore xmm and exit ~) - movaps xmm0,[_xmm_backup+0x00]; + movaps xmm0,[_xmm_backup]; ret 4; } + #undef MOVSRC + #undef MOVDST } -#undef MOVSRC +// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate. +__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes) +{ + if( ((uptr)src & 0xf) == 0 ) + memcpy_raz_( dest, src, bytes ); + else + _memcpy_raz_usrc( dest, src, bytes ); +} + +// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate. 
+__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes) +{ + if( ((uptr)dest & 0xf) == 0 ) + memcpy_raz_( dest, src, bytes ); + else + _memcpy_raz_udst( dest, src, bytes ); +} ////////////////////////////////////////////////////////////////////////// -// Fast memcpy as coded by AMD. - -// This function clobbers all MMX registers, and is generally not optimal for short memory -// copies due to the amount of overhead required to test for alignments, copy length, -// and other ABI overhead. -void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) +// Fast memcpy as coded by AMD, and thn improved by air. +// +// This routine preserves mmx registers! It's the complete real deal! +__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n) { - __asm { + __asm + { + push edi + push esi + mov edi, ecx ; destination mov esi, edx ; source - mov ecx, [n] ; number of bytes to copy - mov ebx, ecx ; keep a copy of count + mov ecx, [esp+12] ; number of bytes to copy + mov eax, ecx ; keep a copy of count cld - cmp ecx, TINY_BLOCK_COPY + cmp eax, TINY_BLOCK_COPY jb $memcpy_ic_3 ; tiny? skip mmx copy - cmp ecx, 32*1024 ; don't align between 32k-64k because + cmp eax, 32*1024 ; don't align between 32k-64k because jbe $memcpy_do_align ; it appears to be slower - cmp ecx, 64*1024 + cmp eax, 64*1024 jbe $memcpy_align_done $memcpy_do_align: - mov ecx, 8 ; a trick that's faster than rep movsb... - sub ecx, edi ; align destination to qword - and ecx, 111b ; get the low bits - sub ebx, ecx ; update copy count - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_align_done - jmp ecx ; jump to array of movsb's + mov eax, 8 ; a trick that's faster than rep movsb... + sub eax, edi ; align destination to qword + and eax, 111b ; get the low bits + sub ecx, eax ; update copy count + neg eax ; set up to jump into the array + add eax, offset $memcpy_align_done + jmp eax ; jump to array of movsb's align 4 movsb @@ -314,13 +400,18 @@ align 4 movsb $memcpy_align_done: ; destination is dword aligned - mov ecx, ebx ; number of bytes left to copy - shr ecx, 6 ; get 64-byte block count + mov eax, ecx ; number of bytes left to copy + shr eax, 6 ; get 64-byte block count jz $memcpy_ic_2 ; finish the last few bytes - cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy + cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy jae $memcpy_uc_test + movq [_mmx_backup+0x00],mm0 + movq [_mmx_backup+0x08],mm1 + movq [_mmx_backup+0x10],mm2 + movq [_mmx_backup+0x18],mm3 + // This is small block copy that uses the MMX registers to copy 8 bytes // at a time. It uses the "unrolled loop" optimization, and also uses // the software prefetch instruction to get the data into the cache. @@ -348,30 +439,39 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy add esi, 64 ; update source pointer add edi, 64 ; update destination pointer - dec ecx ; count down + dec eax ; count down jnz $memcpy_ic_1 ; last 64-byte block? 
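    ; in-cache loop done -- the added movq lines below restore mm0-mm3 from _mmx_backup,
    ; which is what lets every caller drop its FreezeMMXRegs() bracketing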
+ movq mm0,[_mmx_backup+0x00] + movq mm1,[_mmx_backup+0x08] + movq mm2,[_mmx_backup+0x10] + movq mm3,[_mmx_backup+0x18] + $memcpy_ic_2: - mov ecx, ebx ; has valid low 6 bits of the byte count + mov eax, ecx ; has valid low 6 bits of the byte count $memcpy_ic_3: - shr ecx, 2 ; dword count - and ecx, 1111b ; only look at the "remainder" bits - neg ecx ; set up to jump into the array - add ecx, offset $memcpy_last_few - jmp ecx ; jump to array of movsd's + shr eax, 2 ; dword count + and eax, 1111b ; only look at the "remainder" bits + neg eax ; set up to jump into the array + add eax, offset $memcpy_last_few + jmp eax ; jump to array of movsd's $memcpy_uc_test: - cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy + /*cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy jae $memcpy_bp_1 - -$memcpy_64_test: - or ecx, ecx ; tail end of block prefetch will jump here +$memcpy_64_test:*/ + or eax, eax ; tail end of block prefetch will jump here jz $memcpy_ic_2 ; no more 64-byte blocks left // For larger blocks, which will spill beyond the cache, it's faster to // use the Streaming Store instruction MOVNTQ. This write instruction // bypasses the cache and writes straight to main memory. This code also // uses the software prefetch instruction to pre-read the data. + + movq [_mmx_backup+0x00],mm0 + movq [_mmx_backup+0x08],mm1 + movq [_mmx_backup+0x10],mm2 + align 16 $memcpy_uc_1: ; 64-byte blocks, uncached copy @@ -394,17 +494,25 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy movq mm1,[esi-8] movntq [edi-24], mm2 movntq [edi-16], mm0 - dec ecx + dec eax movntq [edi-8], mm1 jnz $memcpy_uc_1 ; last 64-byte block? - jmp $memcpy_ic_2 ; almost done + movq mm0,[_mmx_backup+0x00] + movq mm1,[_mmx_backup+0x08] + movq mm2,[_mmx_backup+0x10] + + jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed) // For the largest size blocks, a special technique called Block Prefetch // can be used to accelerate the read operations. Block Prefetch reads // one address per cache line, for a series of cache lines, in a short loop. // This is faster than using software prefetch. The technique is great for // getting maximum read bandwidth, especially in DDR memory systems. + +// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to +// help keep the code cache footprint of memcpy_fast to a minimum. +/* $memcpy_bp_1: ; large blocks, block prefetch copy cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop? @@ -447,6 +555,7 @@ $memcpy_bp_3: jnz $memcpy_bp_3 ; keep copying sub ecx, CACHEBLOCK ; update the 64-byte block count jmp $memcpy_bp_1 ; keep processing chunks +*/ // The smallest copy uses the X86 "movsd" instruction, in an optimized // form which is an "unrolled loop". Then it handles the last few bytes. 
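The comment above describes the tail path shown in the hunk below. A plain-C sketch of what that path computes (illustrative only; the real code jumps into an unrolled table of movsd instructions rather than looping): the bytes left over after the 64-byte blocks are copied as up to 15 dwords followed by up to 3 single bytes.

    #include <cstring>   // memcpy
    #include <cstddef>   // size_t

    static void memcpy_tail_sketch(unsigned char* d, const unsigned char* s, size_t n)
    {
        size_t dwords = (n >> 2) & 0xF;   // shr eax,2 / and eax,1111b -- dwords in the low 6 bits
        size_t bytes  = n & 0x3;          // and eax,11b -- "the last few cows must come home"
        for (; dwords; --dwords, d += 4, s += 4)
            memcpy(d, s, 4);              // stands in for one unrolled movsd
        for (; bytes; --bytes)
            *d++ = *s++;                  // stands in for rep movsb
    }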
@@ -469,8 +578,8 @@ align 4 movsd $memcpy_last_few: ; dword aligned from before movsd's - mov ecx, ebx ; has valid low 2 bits of the byte count - and ecx, 11b ; the last few cows must come home + mov eax, ecx ; has valid low 2 bits of the byte count + and eax, 11b ; the last few cows must come home jz $memcpy_final ; no more, let's leave rep movsb ; the last 1, 2, or 3 bytes @@ -479,10 +588,14 @@ $memcpy_final: sfence ; flush the write buffer //mov eax, [dest] ; ret value = destination pointer + pop esi + pop edi + + ret 4 } } -// mmx memcpy implementation, size has to be a multiple of 8 +// mmx mem-compare implementation, size has to be a multiple of 8 // returns 0 is equal, nonzero value if not equal // ~10 times faster than standard memcmp // (zerofrog) diff --git a/pcsx2/x86/iVUmicroLower.cpp b/pcsx2/x86/iVUmicroLower.cpp index 42e8a22eb1..ca68eb3bc0 100644 --- a/pcsx2/x86/iVUmicroLower.cpp +++ b/pcsx2/x86/iVUmicroLower.cpp @@ -1977,12 +1977,7 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr) //if( size > 0 ) { u8* pmem = mtgsThread->GetDataPacketPtr(); - //FreezeMMXRegs(1); - //memcpy_fast(pmem, (u8*)pMem+addr, size); - //FreezeMMXRegs(0); - - // we can use the faster memcpy_raz_ here (src/dest are garaunteed to be aligned) - memcpy_raz_(pmem, (u8*)pMem+addr, size); + memcpy_aligned(pmem, (u8*)pMem+addr, size); mtgsThread->SendDataPacket(); } } diff --git a/pcsx2/x86/iVUzerorec.cpp b/pcsx2/x86/iVUzerorec.cpp index eeee16d8f4..fa11268ec3 100644 --- a/pcsx2/x86/iVUzerorec.cpp +++ b/pcsx2/x86/iVUzerorec.cpp @@ -880,9 +880,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex) #ifdef SUPERVU_CACHING //memxor_mmx(r.checksum, &VU->Micro[r.start], r.size); r.pmem = malloc(r.size); - FreezeMMXRegs(1); memcpy_fast(r.pmem, &VU->Micro[r.start], r.size); - FreezeMMXRegs(0); #endif s_pFnHeader->ranges.push_back(r); }
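For reference, a minimal portable sketch (standard SSE2 intrinsics, not the PCSX2 code itself) of the idea behind the new _usrc wrapper above: probe the source pointer's alignment once, then run either an all-aligned loop or a loop pairing unaligned loads with aligned stores. It assumes, as the raz routines do, a 16-byte-aligned destination and a size that is a multiple of 16; the 128-byte unrolling and the negative-index tail trick of the real routines are omitted.

    #include <emmintrin.h>   // SSE2: _mm_load_si128 / _mm_loadu_si128 / _mm_store_si128
    #include <cstdint>
    #include <cstddef>

    static void memcpy_raz_usrc_sketch(void* dest, const void* src, size_t bytes)
    {
        char*       d = static_cast<char*>(dest);
        const char* s = static_cast<const char*>(src);

        if ((reinterpret_cast<uintptr_t>(s) & 0xF) == 0)
        {
            // source is aligned too -- equivalent of the movaps/movdqa path
            for (size_t i = 0; i < bytes; i += 16)
                _mm_store_si128(reinterpret_cast<__m128i*>(d + i),
                                _mm_load_si128(reinterpret_cast<const __m128i*>(s + i)));
        }
        else
        {
            // unaligned source -- movdqu loads with movdqa stores, as in _memcpy_raz_usrc
            for (size_t i = 0; i < bytes; i += 16)
                _mm_store_si128(reinterpret_cast<__m128i*>(d + i),
                                _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i)));
        }
    }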