Optimized memcpy_fast. In addition to being quite a bit faster, it also auto-preserves mmx registers now. So I was also able to remove almost every instance of FreezeMMXRegs (all except those used to guard the GS plugin calls). memcpy_fast (aka memcpy_amd_) is now faster than memcpy_raz for *all* scenarios, so it's been made the new default.

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@642 a6443dda-0b58-4228-96e9-037be469359c
Jake.Stine 2009-01-27 05:12:54 +00:00 committed by Gregory Hainaut
parent 4781be9e59
commit 44f5117d24
14 changed files with 248 additions and 169 deletions
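For reference, the call-site cleanup this enables looks like the following sketch (copy_block_before/copy_block_after are hypothetical illustration-only helpers; the memcpy_amd_ prototype matches the one declared in the headers changed below):

#include <cstddef>

// Stand-in declarations, for illustration only; the real ones live in the PCSX2 headers.
extern void FreezeMMXRegs(int freeze);
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
#define memcpy_fast memcpy_amd_

void copy_block_before(void *dst, const void *src, size_t bytes)
{
	// Old pattern: memcpy_fast clobbered the MMX registers, so every call site
	// had to bracket the copy with FreezeMMXRegs() to protect recompiler state.
	FreezeMMXRegs(1);
	memcpy_fast(dst, src, bytes);
	FreezeMMXRegs(0);
}

void copy_block_after(void *dst, const void *src, size_t bytes)
{
	// New pattern: memcpy_amd_ backs up and restores mm0-mm3 internally,
	// so the guards can be dropped (kept only around GS plugin calls).
	memcpy_fast(dst, src, bytes);
}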

View File

@@ -865,8 +865,6 @@ int cdvdReadSector() {
 return -1;
 }
-FreezeMMXRegs(1);
 const u32 madr = HW_DMA3_MADR;
 // if raw dvd sector 'fill in the blanks'
@@ -935,7 +933,6 @@ int cdvdReadSector() {
 HW_DMA3_BCR_H16-= (cdvd.BlockSize / (HW_DMA3_BCR_L16*4));
 HW_DMA3_MADR+= cdvd.BlockSize;
-FreezeMMXRegs(0);
 return 0;
 }
@@ -2024,9 +2021,7 @@ void cdvdWrite16(u8 rt) // SCOMMAND
 if (cdvd.mg_size + cdvd.ParamC > cdvd.mg_maxsize)
 cdvd.Result[0] = 0x80;
 else{
-FreezeMMXRegs(1);
 memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
-FreezeMMXRegs(0);
 cdvd.mg_size += cdvd.ParamC;
 cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
 }
@@ -2034,11 +2029,9 @@ void cdvdWrite16(u8 rt) // SCOMMAND
 case 0x8E: // sceMgReadData
 SetResultSize( std::min(16, cdvd.mg_size) );
-FreezeMMXRegs(1);
 memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
 cdvd.mg_size -= cdvd.ResultC;
 memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
-FreezeMMXRegs(0);
 break;
 case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;)
@@ -2089,9 +2082,7 @@ fail_pol_cal:
 SetResultSize(3);//in:0
 {
 int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
-FreezeMMXRegs(1);
 memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
-FreezeMMXRegs(0);
 }
 cdvd.mg_maxsize = 0; // don't allow any write
 cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data

View File

@@ -189,7 +189,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 return 0;
 buff = CDVDgetBuffer();
 if (buff==NULL) return 0;
-FreezeMMXRegs(1);
 switch (mode->datapattern){
 case CdSecS2048:
 memcpy_fast((void*)((uptr)buf+2048*i), buff, 2048);break;//only data
@@ -198,7 +197,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 case CdSecS2340:
 memcpy_fast((void*)((uptr)buf+2340*i), buff, 2340);break;//without sync
 }
-FreezeMMXRegs(0);
 }
 return 1;
 }
@@ -216,9 +214,7 @@ int DvdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 // switch (mode->datapattern){
 // case CdSecS2064:
 ((u32*)buf)[0] = i + 0x30000;
-FreezeMMXRegs(1);
 memcpy_fast((u8*)buf+12, buff, 2048);
-FreezeMMXRegs(0);
 buf = (char*)buf + 2064; break;
 // default:
 // return 0;
@@ -253,9 +249,7 @@ int CDVD_GetVolumeDescriptor(void){
 if ((localVolDesc.filesystemType == 1) ||
 (localVolDesc.filesystemType == 2))
 {
-FreezeMMXRegs(1);
 memcpy_fast(&CDVolDesc, &localVolDesc, sizeof(cdVolDesc));
-FreezeMMXRegs(0);
 }
 }
 else

View File

@@ -188,9 +188,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
 RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
 return 0;
 }
-FreezeMMXRegs(1);
 memcpy_fast(buffer, lb + off_sector, ssize);
-FreezeMMXRegs(0);
 }
 if (asize) if (CdRead(asector, asize >> 11, buffer+ssize, &cdReadMode) != TRUE){
 RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
@@ -201,9 +199,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
 RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
 return 0;
 }
-FreezeMMXRegs(1);
 memcpy_fast(buffer+ssize+asize, lb, esize);
-FreezeMMXRegs(0);
 }
 /***********************
 // Now work out where we want to start reading from

View File

@@ -527,9 +527,7 @@ void cdrReadInterrupt() {
 CDREAD_INT((cdr.Mode & 0x80) ? (cdReadTime / 2) : cdReadTime);
 return;
 }
-FreezeMMXRegs(1);
 memcpy_fast(cdr.Transfer, buf+12, 2340);
-FreezeMMXRegs(0);
 cdr.Stat = DataReady;
 CDR_LOG(" %x:%x:%x\n", cdr.Transfer[0], cdr.Transfer[1], cdr.Transfer[2]);
@@ -923,9 +921,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
 }
 cdsize = (bcr & 0xffff) * 4;
-FreezeMMXRegs(1);
 memcpy_fast((u8*)PSXM(madr), cdr.pTransfer, cdsize);
-FreezeMMXRegs(0);
 psxCpu->Clear(madr, cdsize/4);
 cdr.pTransfer+=cdsize;

View File

@@ -575,11 +575,11 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc)
 {
 pendmem = (pendmem&~0xfff)-16;
 }
-memcpy_raz_(pgsmem, pMem, pendmem-(u32)gif->madr+16);
+memcpy_aligned(pgsmem, pMem, pendmem-(u32)gif->madr+16);
 }
 else
 #endif
-memcpy_raz_(pgsmem, pMem, sizetoread);
+memcpy_aligned(pgsmem, pMem, sizetoread);
 mtgsThread->SendDataPacket();
 }

View File

@@ -329,7 +329,7 @@ static __forceinline u8* dmaGetAddr(u32 mem)
 #else
+// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits)
 static __forceinline void *dmaGetAddr(u32 addr) {
 u8 *ptr;
@@ -355,35 +355,17 @@ void hwShutdown();
 // hw read functions
 extern u8 hwRead8 (u32 mem);
-int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
 extern u16 hwRead16(u32 mem);
-int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
 extern u32 hwRead32(u32 mem);
-int hwConstRead32(u32 x86reg, u32 mem);
-u64 hwRead64(u32 mem);
-void hwConstRead64(u32 mem, int mmreg);
-void hwRead128(u32 mem, u64 *out);
-void hwConstRead128(u32 mem, int xmmreg);
+extern u64 hwRead64(u32 mem);
+extern void hwRead128(u32 mem, u64 *out);
 // hw write functions
-void hwWrite8 (u32 mem, u8 value);
-void hwConstWrite8 (u32 mem, int mmreg);
-void hwWrite16(u32 mem, u16 value);
-void hwConstWrite16(u32 mem, int mmreg);
-void hwWrite32(u32 mem, u32 value);
-void hwConstWrite32(u32 mem, int mmreg);
-void hwWrite64(u32 mem, u64 value);
-void hwConstWrite64(u32 mem, int mmreg);
-void hwWrite128(u32 mem, const u64 *value);
-void hwConstWrite128(u32 mem, int xmmreg);
+extern void hwWrite8 (u32 mem, u8 value);
+extern void hwWrite16(u32 mem, u16 value);
+extern void hwWrite32(u32 mem, u32 value);
+extern void hwWrite64(u32 mem, u64 value);
+extern void hwWrite128(u32 mem, const u64 *value);
 void hwIntcIrq(int n);
 void hwDmacIrq(int n);
@@ -394,6 +376,18 @@ int hwMFIFOWrite(u32 addr, u8 *data, u32 size);
 int hwDmacSrcChainWithStack(DMACh *dma, int id);
 int hwDmacSrcChain(DMACh *dma, int id);
+int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
+int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
+int hwConstRead32(u32 x86reg, u32 mem);
+void hwConstRead64(u32 mem, int mmreg);
+void hwConstRead128(u32 mem, int xmmreg);
+void hwConstWrite8 (u32 mem, int mmreg);
+void hwConstWrite16(u32 mem, int mmreg);
+void hwConstWrite32(u32 mem, int mmreg);
+void hwConstWrite64(u32 mem, int mmreg);
+void hwConstWrite128(u32 mem, int xmmreg);
 #ifdef PCSX2_VIRTUAL_MEM
 void iMemRead32Check();
 #endif

View File

@@ -430,7 +430,7 @@ int mtgsThreadObject::Callback()
 {
 Console::WriteLn("MTGS > Thread Started, Opening GS Plugin...");
-memcpy_raz_( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
+memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
 GSsetBaseMem( m_gsMem );
 m_returncode = GSopen((void *)&pDsp, "PCSX2", 1);

View File

@@ -230,12 +230,19 @@ extern u8 g_globalXMMSaved;
 void _memset16_unaligned( void* dest, u16 data, size_t size );
 #if defined(_WIN32) && !defined(__x86_64__)
-// faster memcpy
-extern void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes);
-extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t qwc);
-extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t n);
+// The new simplified memcpy_amd_ is now faster than memcpy_raz_.
+// memcpy_amd_ also does mmx register saving, negating the need for freezeregs (code cleanup!)
+// Additionally, using one single memcpy implementation keeps the code cache cleaner.
+//extern void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes);
+//extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes);
+//extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes);
+extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
 # include "windows/memzero.h"
 # define memcpy_fast memcpy_amd_
+# define memcpy_aligned memcpy_amd_
 #else
@@ -243,6 +250,10 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
 #define memcpy_fast memcpy
 #define memcpy_raz_ memcpy
 #define memcpy_raz_u memcpy
+#define memcpy_aligned memcpy
+#define memcpy_raz_u memcpy
 #include "Linux/memzero.h"
 #endif

View File

@@ -226,13 +226,11 @@ extern void mfifoGIFtransfer(int);
 #define gif ((DMACh*)&PS2MEM_HW[0xA000])
 void dmaSPR0() { // fromSPR
 int qwc = spr0->qwc;
-FreezeMMXRegs(1);
 SPR_LOG("dmaSPR0 chcr = %lx, madr = %lx, qwc = %lx, sadr = %lx\n",
 spr0->chcr, spr0->madr, spr0->qwc, spr0->sadr);
 _dmaSPR0();
-FreezeMMXRegs(0);
 if ((psHu32(DMAC_CTRL) & 0xC) == 0xC) { // GIF MFIFO
 if((spr0->madr & ~psHu32(DMAC_RBSR)) != psHu32(DMAC_RBOR)) SysPrintf("GIF MFIFO Write outside MFIFO area\n");
 spr0->madr = psHu32(DMAC_RBOR) + (spr0->madr & psHu32(DMAC_RBSR));
@@ -308,7 +306,6 @@ void _SPR1interleave() {
 void dmaSPR1() { // toSPR
-FreezeMMXRegs(1);
 #ifdef SPR_LOG
 SPR_LOG("dmaSPR1 chcr = 0x%x, madr = 0x%x, qwc = 0x%x\n"
 " tadr = 0x%x, sadr = 0x%x\n",
@@ -325,7 +322,6 @@ void dmaSPR1() { // toSPR
 // Transfer Dn_QWC from Dn_MADR to SPR1
 SPR1chain();
 CPU_INT(9, cycles);
-FreezeMMXRegs(0);
 return;
 } else if ((spr1->chcr & 0xc) == 0x4){
 int cycles = 0;
@@ -338,7 +334,6 @@ void dmaSPR1() { // toSPR
 // Transfer Dn_QWC from Dn_MADR to SPR1
 SPR1chain();
 CPU_INT(9, cycles);
-FreezeMMXRegs(0);
 return;
 }
 // Chain Mode
@@ -382,7 +377,6 @@ void dmaSPR1() { // toSPR
 } else { // Interleave Mode
 _SPR1interleave();
 }
-FreezeMMXRegs(0);
 }

View File

@@ -574,9 +574,7 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
 // v4-32
 if(vifRegs->mode == 0 && !(vifRegs->code & 0x10000000) && vif->usn == 0){
 vifRegs->num -= size>>4;
-FreezeMMXRegs(1);
 memcpy_fast((u8*)dest, cdata, size);
-FreezeMMXRegs(0);
 size = 0;
 //unpacktotal += GetCPUTick()-basetick;
 return;
@@ -814,9 +812,7 @@ static __forceinline void _vif0mpgTransfer(u32 addr, u32 *data, int size) {
 fclose(f);
 }*/
 if (memcmp(VU0.Micro + addr, data, size << 2)) {
-FreezeMMXRegs(1);
 memcpy_fast(VU0.Micro + addr, data, size << 2);
-FreezeMMXRegs(0);
 CpuVU0->Clear(addr, size);
 }
 }
@@ -1490,9 +1486,7 @@ static __forceinline void _vif1mpgTransfer(u32 addr, u32 *data, int size) {
 }*/
 assert( VU1.Micro > 0 );
 if (memcmp(VU1.Micro + addr, data, size << 2)) {
-FreezeMMXRegs(1);
 memcpy_fast(VU1.Micro + addr, data, size << 2);
-FreezeMMXRegs(0);
 CpuVU1->Clear(addr, size);
 }
 }
@@ -1644,7 +1638,7 @@ static int Vif1TransDirectHL(u32 *data){
 {
 //unaligned copy.VIF handling is -very- messy, so i'l use this code til i fix it :)
 const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, ret<<2 );
-memcpy_raz_u( mtgsThread->GetDataPacketPtr(), data, count );
+memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count );
 mtgsThread->SendDataPacket();
 }
 else {

View File

@@ -49,6 +49,9 @@ namespace Threading
 }
 cpuinfo.LogicalCores = CPUs;
+if( LogicalCoresPerPhysicalCPU > CPUs) // for 1-socket HTT-disabled machines
+LogicalCoresPerPhysicalCPU = CPUs;
 cpuinfo.PhysicalCores = ( CPUs / LogicalCoresPerPhysicalCPU ) * PhysicalCoresPerPhysicalCPU;
 ptw32_smp_system = ( cpuinfo.LogicalCores > 1 ) ? TRUE : FALSE;
 }

View File

@@ -88,35 +88,27 @@ void checkregs()
 #endif
-__declspec(align(16)) static u8 _xmm_backup[16*2];
-// this one checks for alignments too ...
-__declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes)
+PCSX2_ALIGNED16( static u8 _xmm_backup[16*2] );
+PCSX2_ALIGNED16( static u8 _mmx_backup[8*4] );
+static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
 {
-// If src is aligned, use memcpy_raz instead:
-__asm
-{
-test edx,0xf;
-jz memcpy_raz_;
-}
 // MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :)
-#define MOVSRC movups
+#define MOVSRC movdqu
+#define MOVDST movdqa
 __asm
 {
 //Reads before reads, to avoid stalls
 mov eax,[esp+4];
 //Make sure to save xmm0, it must be preserved ...
-movaps [_xmm_backup+0x00],xmm0;
+movaps [_xmm_backup],xmm0;
 //if >=128 bytes use 128 byte unrolled loop
 //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
 cmp eax,127;
 jna _loop_1;
-//unrolled version also touches xmm1, save it :)
-movaps [_xmm_backup+0x10],xmm1;
 //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
 align 16
@@ -124,34 +116,111 @@ __declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size
 _loop_8:
 MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
-MOVSRC xmm1,[edx+0x10];
+MOVDST [ecx+0x00],xmm0; //then write :p
+MOVSRC xmm0,[edx+0x10];
+MOVDST [ecx+0x10],xmm0;
 sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
-movaps [ecx+0x00],xmm0; //then write :p
-movaps [ecx+0x10],xmm1;
 sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
 MOVSRC xmm0,[edx+0x20-128];
-MOVSRC xmm1,[edx+0x30-128];
+MOVDST [ecx+0x20-128],xmm0;
+MOVSRC xmm0,[edx+0x30-128];
+MOVDST [ecx+0x30-128],xmm0;
 add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
-movaps [ecx+0x20-128],xmm0;
-movaps [ecx+0x30-128],xmm1;
 MOVSRC xmm0,[edx+0x40-128];
-MOVSRC xmm1,[edx+0x50-128];
-movaps [ecx+0x40-128],xmm0;
-movaps [ecx+0x50-128],xmm1;
+MOVDST [ecx+0x40-128],xmm0;
+MOVSRC xmm0,[edx+0x50-128];
+MOVDST [ecx+0x50-128],xmm0;
 MOVSRC xmm0,[edx+0x60-128];
-MOVSRC xmm1,[edx+0x70-128];
-movaps [ecx+0x60-128],xmm0;
-movaps [ecx+0x70-128],xmm1;
+MOVDST [ecx+0x60-128],xmm0;
+MOVSRC xmm0,[edx+0x70-128];
+MOVDST [ecx+0x70-128],xmm0;
 //127~ja, 127 is encodable as simm8 :)
 cmp eax,127;
 ja _loop_8;
-//restore xmm1 :)
-movaps xmm1,[_xmm_backup+0x10];
+//direct copy for 0~7 qwords
+//in order to avoid the inc/dec of all 3 registers
+//i use negative relative addressing from the top of the buffers
+//[top-current index]
+_loop_1:
+//prepare the regs for 'negative relative addressing'
+add edx,eax;
+add ecx,eax;
+neg eax;
+jz cleanup; //exit if nothing to do
+_loop_1_inner:
+MOVSRC xmm0,[edx+eax];
+MOVDST [ecx+eax],xmm0;
+add eax,16; //while the offset is still negative we have data to copy
+js _loop_1_inner;
+//done !
+cleanup:
+//restore xmm and exit ~)
+movaps xmm0,[_xmm_backup];
+ret 4;
+}
+#undef MOVSRC
+#undef MOVDST
+}
+static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes)
+{
+// MOVDST = opcode used to read. I use the same code for the aligned version, with a different define :)
+#define MOVSRC movaps
+#define MOVDST movups
+__asm
+{
+//Reads before reads, to avoid stalls
+mov eax,[esp+4];
+//Make sure to save xmm0, it must be preserved ...
+movaps [_xmm_backup],xmm0;
+//if >=128 bytes use 128 byte unrolled loop
+//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
+cmp eax,127;
+jna _loop_1;
+//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
+align 16
+//128 byte unrolled loop
+_loop_8:
+MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
+MOVDST [ecx+0x00],xmm0; //then write :p
+MOVSRC xmm0,[edx+0x10];
+MOVDST [ecx+0x10],xmm0;
+sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
+sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
+MOVSRC xmm0,[edx+0x20-128];
+MOVDST [ecx+0x20-128],xmm0;
+MOVSRC xmm0,[edx+0x30-128];
+MOVDST [ecx+0x30-128],xmm0;
+add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
+MOVSRC xmm0,[edx+0x40-128];
+MOVDST [ecx+0x40-128],xmm0;
+MOVSRC xmm0,[edx+0x50-128];
+MOVDST [ecx+0x50-128],xmm0;
+MOVSRC xmm0,[edx+0x60-128];
+MOVDST [ecx+0x60-128],xmm0;
+MOVSRC xmm0,[edx+0x70-128];
+MOVDST [ecx+0x70-128],xmm0;
+//127~ja, 127 is encodable as simm8 :)
+cmp eax,127;
+ja _loop_8;
 //direct copy for 0~7 qwords
 //in order to avoid the inc/dec of all 3 registers
@@ -168,22 +237,24 @@ _loop_1:
 _loop_1_inner:
 MOVSRC xmm0,[edx+eax];
 movaps [ecx+eax],xmm0;
 add eax,16; //while the offset is still negative we have data to copy
 js _loop_1_inner;
 //done !
 cleanup:
 //restore xmm and exit ~)
-movaps xmm0,[_xmm_backup+0x00];
+movaps xmm0,[_xmm_backup];
 ret 4;
 }
 #undef MOVSRC
+#undef MOVDST
 }
 // Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
 // This function is optimized for medium-small transfer sizes (<2048, >=128). No prefetching is
 // used since the reads are linear and the cache logic can predict em :)
+// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster.
 __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
 {
 // Code Implementation Notes:
@@ -191,21 +262,19 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
 // MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :)
 #define MOVSRC movaps
+#define MOVDST movaps
 __asm
 {
 //Reads before reads, to avoid stalls
 mov eax,[esp+4];
 //Make sure to save xmm0, it must be preserved ...
-movaps [_xmm_backup+0x00],xmm0;
+movaps [_xmm_backup],xmm0;
 //if >=128 bytes use 128 byte unrolled loop
 //i use cmp ..,127 + jna because 127 is encodable using the simm8 form
 cmp eax,127;
 jna _loop_1;
-//unrolled version also toiches xmm1, save it :)
-movaps [_xmm_backup+0x10],xmm1;
 //since this is a common branch target it could be good to align it -- no idea if it has any effect :p
 align 16
@@ -213,35 +282,32 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
 _loop_8:
 MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
-MOVSRC xmm1,[edx+0x10];
+MOVDST [ecx+0x00],xmm0; //then write :p
+MOVSRC xmm0,[edx+0x10];
+MOVDST [ecx+0x10],xmm0;
 sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
-movaps [ecx+0x00],xmm0; //then write :p
-movaps [ecx+0x10],xmm1;
 sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
 MOVSRC xmm0,[edx+0x20-128];
-MOVSRC xmm1,[edx+0x30-128];
+MOVDST [ecx+0x20-128],xmm0;
+MOVSRC xmm0,[edx+0x30-128];
+MOVDST [ecx+0x30-128],xmm0;
 add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
-movaps [ecx+0x20-128],xmm0;
-movaps [ecx+0x30-128],xmm1;
 MOVSRC xmm0,[edx+0x40-128];
-MOVSRC xmm1,[edx+0x50-128];
-movaps [ecx+0x40-128],xmm0;
-movaps [ecx+0x50-128],xmm1;
+MOVDST [ecx+0x40-128],xmm0;
+MOVSRC xmm0,[edx+0x50-128];
+MOVDST [ecx+0x50-128],xmm0;
 MOVSRC xmm0,[edx+0x60-128];
-MOVSRC xmm1,[edx+0x70-128];
-movaps [ecx+0x60-128],xmm0;
-movaps [ecx+0x70-128],xmm1;
+MOVDST [ecx+0x60-128],xmm0;
+MOVSRC xmm0,[edx+0x70-128];
+MOVDST [ecx+0x70-128],xmm0;
 //127~ja, 127 is encodable as simm8 :)
 cmp eax,127;
 ja _loop_8;
-//restore xmm1 :)
-movaps xmm1,[_xmm_backup+0x10];
 //direct copy for 0~7 qwords
 //in order to avoid the inc/dec of all 3 registers
 //i use negative relative addressing from the top of the buffers
@@ -256,7 +322,7 @@ _loop_1:
 _loop_1_inner:
 MOVSRC xmm0,[edx+eax];
-movaps [ecx+eax],xmm0;
+MOVDST [ecx+eax],xmm0;
 add eax,16; //while the offset is still negative we have data to copy
 js _loop_1_inner;
@@ -264,44 +330,64 @@ _loop_1_inner:
 //done !
 cleanup:
 //restore xmm and exit ~)
-movaps xmm0,[_xmm_backup+0x00];
+movaps xmm0,[_xmm_backup];
 ret 4;
 }
+#undef MOVSRC
+#undef MOVDST
 }
-#undef MOVSRC
+// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate.
+__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
+{
+if( ((uptr)src & 0xf) == 0 )
+memcpy_raz_( dest, src, bytes );
+else
+_memcpy_raz_usrc( dest, src, bytes );
+}
+// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate.
+__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes)
+{
+if( ((uptr)dest & 0xf) == 0 )
+memcpy_raz_( dest, src, bytes );
+else
+_memcpy_raz_udst( dest, src, bytes );
+}
 //////////////////////////////////////////////////////////////////////////
-// Fast memcpy as coded by AMD.
+// Fast memcpy as coded by AMD, and thn improved by air.
 //
-// This function clobbers all MMX registers, and is generally not optimal for short memory
-// copies due to the amount of overhead required to test for alignments, copy length,
-// and other ABI overhead.
-void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
+// This routine preserves mmx registers! It's the complete real deal!
+__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
 {
-__asm {
+__asm
+{
+push edi
+push esi
 mov edi, ecx ; destination
 mov esi, edx ; source
-mov ecx, [n] ; number of bytes to copy
-mov ebx, ecx ; keep a copy of count
+mov ecx, [esp+12] ; number of bytes to copy
+mov eax, ecx ; keep a copy of count
 cld
-cmp ecx, TINY_BLOCK_COPY
+cmp eax, TINY_BLOCK_COPY
 jb $memcpy_ic_3 ; tiny? skip mmx copy
-cmp ecx, 32*1024 ; don't align between 32k-64k because
+cmp eax, 32*1024 ; don't align between 32k-64k because
 jbe $memcpy_do_align ; it appears to be slower
-cmp ecx, 64*1024
+cmp eax, 64*1024
 jbe $memcpy_align_done
 $memcpy_do_align:
-mov ecx, 8 ; a trick that's faster than rep movsb...
-sub ecx, edi ; align destination to qword
-and ecx, 111b ; get the low bits
-sub ebx, ecx ; update copy count
-neg ecx ; set up to jump into the array
-add ecx, offset $memcpy_align_done
-jmp ecx ; jump to array of movsb's
+mov eax, 8 ; a trick that's faster than rep movsb...
+sub eax, edi ; align destination to qword
+and eax, 111b ; get the low bits
+sub ecx, eax ; update copy count
+neg eax ; set up to jump into the array
+add eax, offset $memcpy_align_done
+jmp eax ; jump to array of movsb's
 align 4
 movsb
@@ -314,13 +400,18 @@
 movsb
 $memcpy_align_done: ; destination is dword aligned
-mov ecx, ebx ; number of bytes left to copy
-shr ecx, 6 ; get 64-byte block count
+mov eax, ecx ; number of bytes left to copy
+shr eax, 6 ; get 64-byte block count
 jz $memcpy_ic_2 ; finish the last few bytes
-cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
+cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
 jae $memcpy_uc_test
+movq [_mmx_backup+0x00],mm0
+movq [_mmx_backup+0x08],mm1
+movq [_mmx_backup+0x10],mm2
+movq [_mmx_backup+0x18],mm3
 // This is small block copy that uses the MMX registers to copy 8 bytes
 // at a time. It uses the "unrolled loop" optimization, and also uses
 // the software prefetch instruction to get the data into the cache.
@@ -348,30 +439,39 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
 add esi, 64 ; update source pointer
 add edi, 64 ; update destination pointer
-dec ecx ; count down
+dec eax ; count down
 jnz $memcpy_ic_1 ; last 64-byte block?
+movq mm0,[_mmx_backup+0x00]
+movq mm1,[_mmx_backup+0x08]
+movq mm2,[_mmx_backup+0x10]
+movq mm3,[_mmx_backup+0x18]
 $memcpy_ic_2:
-mov ecx, ebx ; has valid low 6 bits of the byte count
+mov eax, ecx ; has valid low 6 bits of the byte count
 $memcpy_ic_3:
-shr ecx, 2 ; dword count
-and ecx, 1111b ; only look at the "remainder" bits
-neg ecx ; set up to jump into the array
-add ecx, offset $memcpy_last_few
-jmp ecx ; jump to array of movsd's
+shr eax, 2 ; dword count
+and eax, 1111b ; only look at the "remainder" bits
+neg eax ; set up to jump into the array
+add eax, offset $memcpy_last_few
+jmp eax ; jump to array of movsd's
 $memcpy_uc_test:
-cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
+/*cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
 jae $memcpy_bp_1
-$memcpy_64_test:
-or ecx, ecx ; tail end of block prefetch will jump here
+$memcpy_64_test:*/
+or eax, eax ; tail end of block prefetch will jump here
 jz $memcpy_ic_2 ; no more 64-byte blocks left
 // For larger blocks, which will spill beyond the cache, it's faster to
 // use the Streaming Store instruction MOVNTQ. This write instruction
 // bypasses the cache and writes straight to main memory. This code also
 // uses the software prefetch instruction to pre-read the data.
+movq [_mmx_backup+0x00],mm0
+movq [_mmx_backup+0x08],mm1
+movq [_mmx_backup+0x10],mm2
 align 16
 $memcpy_uc_1: ; 64-byte blocks, uncached copy
@@ -394,17 +494,25 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
 movq mm1,[esi-8]
 movntq [edi-24], mm2
 movntq [edi-16], mm0
-dec ecx
+dec eax
 movntq [edi-8], mm1
 jnz $memcpy_uc_1 ; last 64-byte block?
-jmp $memcpy_ic_2 ; almost done
+movq mm0,[_mmx_backup+0x00]
+movq mm1,[_mmx_backup+0x08]
+movq mm2,[_mmx_backup+0x10]
+jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
 // For the largest size blocks, a special technique called Block Prefetch
 // can be used to accelerate the read operations. Block Prefetch reads
 // one address per cache line, for a series of cache lines, in a short loop.
 // This is faster than using software prefetch. The technique is great for
 // getting maximum read bandwidth, especially in DDR memory systems.
+// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
+// help keep the code cache footprint of memcpy_fast to a minimum.
+/*
 $memcpy_bp_1: ; large blocks, block prefetch copy
 cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
@@ -447,6 +555,7 @@ $memcpy_bp_3:
 jnz $memcpy_bp_3 ; keep copying
 sub ecx, CACHEBLOCK ; update the 64-byte block count
 jmp $memcpy_bp_1 ; keep processing chunks
+*/
 // The smallest copy uses the X86 "movsd" instruction, in an optimized
 // form which is an "unrolled loop". Then it handles the last few bytes.
@@ -469,8 +578,8 @@
 movsd
 $memcpy_last_few: ; dword aligned from before movsd's
-mov ecx, ebx ; has valid low 2 bits of the byte count
-and ecx, 11b ; the last few cows must come home
+mov eax, ecx ; has valid low 2 bits of the byte count
+and eax, 11b ; the last few cows must come home
 jz $memcpy_final ; no more, let's leave
 rep movsb ; the last 1, 2, or 3 bytes
@@ -479,10 +588,14 @@ $memcpy_final:
 sfence ; flush the write buffer
 //mov eax, [dest] ; ret value = destination pointer
+pop esi
+pop edi
+ret 4
 }
 }
-// mmx memcpy implementation, size has to be a multiple of 8
+// mmx mem-compare implementation, size has to be a multiple of 8
 // returns 0 is equal, nonzero value if not equal
 // ~10 times faster than standard memcmp
 // (zerofrog)

View File

@@ -1977,12 +1977,7 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 //if( size > 0 )
 {
 u8* pmem = mtgsThread->GetDataPacketPtr();
-//FreezeMMXRegs(1);
-//memcpy_fast(pmem, (u8*)pMem+addr, size);
-//FreezeMMXRegs(0);
-// we can use the faster memcpy_raz_ here (src/dest are garaunteed to be aligned)
-memcpy_raz_(pmem, (u8*)pMem+addr, size);
+memcpy_aligned(pmem, (u8*)pMem+addr, size);
 mtgsThread->SendDataPacket();
 }
 }

View File

@@ -880,9 +880,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
 #ifdef SUPERVU_CACHING
 //memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
 r.pmem = malloc(r.size);
-FreezeMMXRegs(1);
 memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
-FreezeMMXRegs(0);
 #endif
 s_pFnHeader->ranges.push_back(r);
 }