mirror of https://github.com/PCSX2/pcsx2.git
Optimized memcpy_fast. In addition to being quite a bit faster, it also auto-preserves mmx registers now. So I was also able to remove almost every instance of FreezeMMXRegs (all except those used to guard the GS plugin calls). memcpy_fast (aka memcpy_amd_) is now faster than memcpy_raz for *all* scenarios, so it's been made the new default.
git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@642 a6443dda-0b58-4228-96e9-037be469359c
Commit 44f5117d24 (parent 4781be9e59)
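As a minimal sketch of the call-site cleanup this commit performs (the CopyBlock_* wrappers below are hypothetical; FreezeMMXRegs, memcpy_fast and memcpy_amd_ are the real PCSX2 symbols): previously every copy had to be bracketed with FreezeMMXRegs so the recompiler's MMX state survived the call, whereas the new memcpy_amd_ backs up and restores the MMX registers it touches internally, so the guards can simply be dropped.

// Hypothetical caller, before this commit: the guard saves/restores the
// recompiler's MMX registers around the copy.
static void CopyBlock_old(u8* dst, const u8* src, size_t bytes)
{
    FreezeMMXRegs(1);              // save MMX state
    memcpy_fast(dst, src, bytes);  // copy may clobber mm0-mm7
    FreezeMMXRegs(0);              // restore MMX state
}

// Same hypothetical caller after this commit: memcpy_fast (memcpy_amd_)
// preserves the MMX registers it uses, so no guard is required.
static void CopyBlock_new(u8* dst, const u8* src, size_t bytes)
{
    memcpy_fast(dst, src, bytes);
}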
@@ -865,8 +865,6 @@ int cdvdReadSector() {
return -1;
}

FreezeMMXRegs(1);
const u32 madr = HW_DMA3_MADR;

// if raw dvd sector 'fill in the blanks'

@@ -935,7 +933,6 @@ int cdvdReadSector() {
HW_DMA3_BCR_H16-= (cdvd.BlockSize / (HW_DMA3_BCR_L16*4));
HW_DMA3_MADR+= cdvd.BlockSize;
FreezeMMXRegs(0);

return 0;
}

@@ -2024,9 +2021,7 @@ void cdvdWrite16(u8 rt) // SCOMMAND
if (cdvd.mg_size + cdvd.ParamC > cdvd.mg_maxsize)
cdvd.Result[0] = 0x80;
else{
FreezeMMXRegs(1);
memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
FreezeMMXRegs(0);
cdvd.mg_size += cdvd.ParamC;
cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
}

@@ -2034,11 +2029,9 @@ void cdvdWrite16(u8 rt) // SCOMMAND
case 0x8E: // sceMgReadData
SetResultSize( std::min(16, cdvd.mg_size) );
FreezeMMXRegs(1);
memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
cdvd.mg_size -= cdvd.ResultC;
memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
FreezeMMXRegs(0);
break;

case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;)

@@ -2089,9 +2082,7 @@ fail_pol_cal:
SetResultSize(3);//in:0
{
int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
FreezeMMXRegs(1);
memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
FreezeMMXRegs(0);
}
cdvd.mg_maxsize = 0; // don't allow any write
cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data

@@ -189,7 +189,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
return 0;
buff = CDVDgetBuffer();
if (buff==NULL) return 0;
FreezeMMXRegs(1);
switch (mode->datapattern){
case CdSecS2048:
memcpy_fast((void*)((uptr)buf+2048*i), buff, 2048);break;//only data

@@ -198,7 +197,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
case CdSecS2340:
memcpy_fast((void*)((uptr)buf+2340*i), buff, 2340);break;//without sync
}
FreezeMMXRegs(0);
}
return 1;
}

@@ -216,9 +214,7 @@ int DvdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
// switch (mode->datapattern){
// case CdSecS2064:
((u32*)buf)[0] = i + 0x30000;
FreezeMMXRegs(1);
memcpy_fast((u8*)buf+12, buff, 2048);
FreezeMMXRegs(0);
buf = (char*)buf + 2064; break;
// default:
// return 0;

@@ -253,9 +249,7 @@ int CDVD_GetVolumeDescriptor(void){
if ((localVolDesc.filesystemType == 1) ||
(localVolDesc.filesystemType == 2))
{
FreezeMMXRegs(1);
memcpy_fast(&CDVolDesc, &localVolDesc, sizeof(cdVolDesc));
FreezeMMXRegs(0);
}
}
else

@@ -188,9 +188,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
return 0;
}
FreezeMMXRegs(1);
memcpy_fast(buffer, lb + off_sector, ssize);
FreezeMMXRegs(0);
}
if (asize) if (CdRead(asector, asize >> 11, buffer+ssize, &cdReadMode) != TRUE){
RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");

@@ -201,9 +199,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
return 0;
}
FreezeMMXRegs(1);
memcpy_fast(buffer+ssize+asize, lb, esize);
FreezeMMXRegs(0);
}
/***********************
// Now work out where we want to start reading from

@@ -527,9 +527,7 @@ void cdrReadInterrupt() {
CDREAD_INT((cdr.Mode & 0x80) ? (cdReadTime / 2) : cdReadTime);
return;
}
FreezeMMXRegs(1);
memcpy_fast(cdr.Transfer, buf+12, 2340);
FreezeMMXRegs(0);
cdr.Stat = DataReady;

CDR_LOG(" %x:%x:%x\n", cdr.Transfer[0], cdr.Transfer[1], cdr.Transfer[2]);

@@ -923,9 +921,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
}

cdsize = (bcr & 0xffff) * 4;
FreezeMMXRegs(1);
memcpy_fast((u8*)PSXM(madr), cdr.pTransfer, cdsize);
FreezeMMXRegs(0);
psxCpu->Clear(madr, cdsize/4);
cdr.pTransfer+=cdsize;

@@ -575,11 +575,11 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc)
{
pendmem = (pendmem&~0xfff)-16;
}
memcpy_raz_(pgsmem, pMem, pendmem-(u32)gif->madr+16);
memcpy_aligned(pgsmem, pMem, pendmem-(u32)gif->madr+16);
}
else
#endif
memcpy_raz_(pgsmem, pMem, sizetoread);
memcpy_aligned(pgsmem, pMem, sizetoread);

mtgsThread->SendDataPacket();
}

pcsx2/Hw.h
@@ -329,7 +329,7 @@ static __forceinline u8* dmaGetAddr(u32 mem)

#else

// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits)
static __forceinline void *dmaGetAddr(u32 addr) {
u8 *ptr;

@@ -355,35 +355,17 @@ void hwShutdown();

// hw read functions
extern u8 hwRead8 (u32 mem);
int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);

extern u16 hwRead16(u32 mem);
int hwConstRead16(u32 x86reg, u32 mem, u32 sign);

extern u32 hwRead32(u32 mem);
int hwConstRead32(u32 x86reg, u32 mem);

u64 hwRead64(u32 mem);
void hwConstRead64(u32 mem, int mmreg);

void hwRead128(u32 mem, u64 *out);
void hwConstRead128(u32 mem, int xmmreg);
extern u64 hwRead64(u32 mem);
extern void hwRead128(u32 mem, u64 *out);

// hw write functions
void hwWrite8 (u32 mem, u8 value);
void hwConstWrite8 (u32 mem, int mmreg);

void hwWrite16(u32 mem, u16 value);
void hwConstWrite16(u32 mem, int mmreg);

void hwWrite32(u32 mem, u32 value);
void hwConstWrite32(u32 mem, int mmreg);

void hwWrite64(u32 mem, u64 value);
void hwConstWrite64(u32 mem, int mmreg);

void hwWrite128(u32 mem, const u64 *value);
void hwConstWrite128(u32 mem, int xmmreg);
extern void hwWrite8 (u32 mem, u8 value);
extern void hwWrite16(u32 mem, u16 value);
extern void hwWrite32(u32 mem, u32 value);
extern void hwWrite64(u32 mem, u64 value);
extern void hwWrite128(u32 mem, const u64 *value);

void hwIntcIrq(int n);
void hwDmacIrq(int n);

@@ -394,6 +376,18 @@ int hwMFIFOWrite(u32 addr, u8 *data, u32 size);
int hwDmacSrcChainWithStack(DMACh *dma, int id);
int hwDmacSrcChain(DMACh *dma, int id);

int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
int hwConstRead32(u32 x86reg, u32 mem);
void hwConstRead64(u32 mem, int mmreg);
void hwConstRead128(u32 mem, int xmmreg);

void hwConstWrite8 (u32 mem, int mmreg);
void hwConstWrite16(u32 mem, int mmreg);
void hwConstWrite32(u32 mem, int mmreg);
void hwConstWrite64(u32 mem, int mmreg);
void hwConstWrite128(u32 mem, int xmmreg);

#ifdef PCSX2_VIRTUAL_MEM
void iMemRead32Check();
#endif

@@ -430,7 +430,7 @@ int mtgsThreadObject::Callback()
{
Console::WriteLn("MTGS > Thread Started, Opening GS Plugin...");

memcpy_raz_( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
GSsetBaseMem( m_gsMem );

m_returncode = GSopen((void *)&pDsp, "PCSX2", 1);

pcsx2/Misc.h
@@ -230,12 +230,19 @@ extern u8 g_globalXMMSaved;
void _memset16_unaligned( void* dest, u16 data, size_t size );

#if defined(_WIN32) && !defined(__x86_64__)
// faster memcpy
extern void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes);
extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t qwc);
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t n);

// The new simplified memcpy_amd_ is now faster than memcpy_raz_.
// memcpy_amd_ also does mmx register saving, negating the need for freezeregs (code cleanup!)
// Additionally, using one single memcpy implementation keeps the code cache cleaner.

//extern void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes);
//extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes);
//extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes);
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);

# include "windows/memzero.h"
# define memcpy_fast memcpy_amd_
# define memcpy_aligned memcpy_amd_

#else

@@ -243,6 +250,10 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
#define memcpy_fast memcpy
#define memcpy_raz_ memcpy
#define memcpy_raz_u memcpy

#define memcpy_aligned memcpy
#define memcpy_raz_u memcpy

#include "Linux/memzero.h"

#endif

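A hedged usage sketch of how the two entry points defined above are intended to be chosen after this change (ExampleAlignedCopy/ExampleGenericCopy are hypothetical helpers; on Win32 both macros resolve to memcpy_amd_, elsewhere to plain memcpy):

// Hypothetical helpers illustrating the intended distinction between the macros.
static void ExampleAlignedCopy(void* dst, const void* src, size_t bytes)
{
    // both buffers known to be 16-byte aligned (e.g. GS packet memory)
    memcpy_aligned(dst, src, bytes);
}

static void ExampleGenericCopy(void* dst, const void* src, size_t bytes)
{
    // general-purpose path; alignment is handled inside memcpy_amd_
    memcpy_fast(dst, src, bytes);
}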
@@ -226,13 +226,11 @@ extern void mfifoGIFtransfer(int);
#define gif ((DMACh*)&PS2MEM_HW[0xA000])
void dmaSPR0() { // fromSPR
int qwc = spr0->qwc;
FreezeMMXRegs(1);

SPR_LOG("dmaSPR0 chcr = %lx, madr = %lx, qwc = %lx, sadr = %lx\n",
spr0->chcr, spr0->madr, spr0->qwc, spr0->sadr);

_dmaSPR0();
FreezeMMXRegs(0);
if ((psHu32(DMAC_CTRL) & 0xC) == 0xC) { // GIF MFIFO
if((spr0->madr & ~psHu32(DMAC_RBSR)) != psHu32(DMAC_RBOR)) SysPrintf("GIF MFIFO Write outside MFIFO area\n");
spr0->madr = psHu32(DMAC_RBOR) + (spr0->madr & psHu32(DMAC_RBSR));

@@ -308,7 +306,6 @@ void _SPR1interleave() {
void dmaSPR1() { // toSPR

FreezeMMXRegs(1);
#ifdef SPR_LOG
SPR_LOG("dmaSPR1 chcr = 0x%x, madr = 0x%x, qwc = 0x%x\n"
" tadr = 0x%x, sadr = 0x%x\n",

@@ -325,7 +322,6 @@ void dmaSPR1() { // toSPR
// Transfer Dn_QWC from Dn_MADR to SPR1
SPR1chain();
CPU_INT(9, cycles);
FreezeMMXRegs(0);
return;
} else if ((spr1->chcr & 0xc) == 0x4){
int cycles = 0;

@@ -338,7 +334,6 @@ void dmaSPR1() { // toSPR
// Transfer Dn_QWC from Dn_MADR to SPR1
SPR1chain();
CPU_INT(9, cycles);
FreezeMMXRegs(0);
return;
}
// Chain Mode

@@ -382,7 +377,6 @@ void dmaSPR1() { // toSPR
} else { // Interleave Mode
_SPR1interleave();
}
FreezeMMXRegs(0);

}

@@ -574,9 +574,7 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
// v4-32
if(vifRegs->mode == 0 && !(vifRegs->code & 0x10000000) && vif->usn == 0){
vifRegs->num -= size>>4;
FreezeMMXRegs(1);
memcpy_fast((u8*)dest, cdata, size);
FreezeMMXRegs(0);
size = 0;
//unpacktotal += GetCPUTick()-basetick;
return;

@@ -814,9 +812,7 @@ static __forceinline void _vif0mpgTransfer(u32 addr, u32 *data, int size) {
fclose(f);
}*/
if (memcmp(VU0.Micro + addr, data, size << 2)) {
FreezeMMXRegs(1);
memcpy_fast(VU0.Micro + addr, data, size << 2);
FreezeMMXRegs(0);
CpuVU0->Clear(addr, size);
}
}

@@ -1490,9 +1486,7 @@ static __forceinline void _vif1mpgTransfer(u32 addr, u32 *data, int size) {
}*/
assert( VU1.Micro > 0 );
if (memcmp(VU1.Micro + addr, data, size << 2)) {
FreezeMMXRegs(1);
memcpy_fast(VU1.Micro + addr, data, size << 2);
FreezeMMXRegs(0);
CpuVU1->Clear(addr, size);
}
}

@@ -1644,7 +1638,7 @@ static int Vif1TransDirectHL(u32 *data){
{
//unaligned copy.VIF handling is -very- messy, so i'l use this code til i fix it :)
const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, ret<<2 );
memcpy_raz_u( mtgsThread->GetDataPacketPtr(), data, count );
memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count );
mtgsThread->SendDataPacket();
}
else {

@@ -49,6 +49,9 @@ namespace Threading
}

cpuinfo.LogicalCores = CPUs;
if( LogicalCoresPerPhysicalCPU > CPUs) // for 1-socket HTT-disabled machines
LogicalCoresPerPhysicalCPU = CPUs;

cpuinfo.PhysicalCores = ( CPUs / LogicalCoresPerPhysicalCPU ) * PhysicalCoresPerPhysicalCPU;
ptw32_smp_system = ( cpuinfo.LogicalCores > 1 ) ? TRUE : FALSE;
}

@@ -88,35 +88,27 @@ void checkregs()
#endif

__declspec(align(16)) static u8 _xmm_backup[16*2];
PCSX2_ALIGNED16( static u8 _xmm_backup[16*2] );
PCSX2_ALIGNED16( static u8 _mmx_backup[8*4] );

// this one checks for alignments too ...
__declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes)
static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
{
// If src is aligned, use memcpy_raz instead:
__asm
{
test edx,0xf;
jz memcpy_raz_;
}

// MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :)
#define MOVSRC movups
#define MOVSRC movdqu
#define MOVDST movdqa

__asm
{
//Reads before reads, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup+0x00],xmm0;
movaps [_xmm_backup],xmm0;

//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;

//unrolled version also touches xmm1, save it :)
movaps [_xmm_backup+0x10],xmm1;

//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16

@@ -124,34 +116,111 @@ __declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size
_loop_8:

MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVSRC xmm1,[edx+0x10];
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
movaps [ecx+0x00],xmm0; //then write :p
movaps [ecx+0x10],xmm1;
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding

MOVSRC xmm0,[edx+0x20-128];
MOVSRC xmm1,[edx+0x30-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
movaps [ecx+0x20-128],xmm0;
movaps [ecx+0x30-128],xmm1;

MOVSRC xmm0,[edx+0x40-128];
MOVSRC xmm1,[edx+0x50-128];
movaps [ecx+0x40-128],xmm0;
movaps [ecx+0x50-128],xmm1;
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;

MOVSRC xmm0,[edx+0x60-128];
MOVSRC xmm1,[edx+0x70-128];
movaps [ecx+0x60-128],xmm0;
movaps [ecx+0x70-128],xmm1;
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;

//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;

//restore xmm1 :)
movaps xmm1,[_xmm_backup+0x10];
//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers
//i use negative relative addressing from the top of the buffers
//[top-current index]

_loop_1:
//prepare the regs for 'negative relative addressing'
add edx,eax;
add ecx,eax;
neg eax;
jz cleanup; //exit if nothing to do

_loop_1_inner:
MOVSRC xmm0,[edx+eax];
MOVDST [ecx+eax],xmm0;

add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;

//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}

static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes)
{
// MOVDST = opcode used to read. I use the same code for the aligned version, with a different define :)
#define MOVSRC movaps
#define MOVDST movups
__asm
{
//Reads before reads, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup],xmm0;

//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;

//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16

//128 byte unrolled loop
_loop_8:

MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding

MOVSRC xmm0,[edx+0x20-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding

MOVSRC xmm0,[edx+0x40-128];
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;

MOVSRC xmm0,[edx+0x60-128];
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;

//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;

//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers

@@ -168,22 +237,24 @@ _loop_1:
_loop_1_inner:
MOVSRC xmm0,[edx+eax];
movaps [ecx+eax],xmm0;

add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;

//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup+0x00];
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}

// Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
// This function is optimized for medium-small transfer sizes (<2048, >=128). No prefetching is
// used since the reads are linear and the cache logic can predict em :)

// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster.
__declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
{
// Code Implementation Notes:

@@ -191,21 +262,19 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_

// MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :)
#define MOVSRC movaps
#define MOVDST movaps
__asm
{
//Reads before reads, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup+0x00],xmm0;
movaps [_xmm_backup],xmm0;

//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;

//unrolled version also toiches xmm1, save it :)
movaps [_xmm_backup+0x10],xmm1;

//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16

@@ -213,35 +282,32 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
_loop_8:

MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVSRC xmm1,[edx+0x10];
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
movaps [ecx+0x00],xmm0; //then write :p
movaps [ecx+0x10],xmm1;
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding

MOVSRC xmm0,[edx+0x20-128];
MOVSRC xmm1,[edx+0x30-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
movaps [ecx+0x20-128],xmm0;
movaps [ecx+0x30-128],xmm1;

MOVSRC xmm0,[edx+0x40-128];
MOVSRC xmm1,[edx+0x50-128];
movaps [ecx+0x40-128],xmm0;
movaps [ecx+0x50-128],xmm1;
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;

MOVSRC xmm0,[edx+0x60-128];
MOVSRC xmm1,[edx+0x70-128];
movaps [ecx+0x60-128],xmm0;
movaps [ecx+0x70-128],xmm1;
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;

//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;

//restore xmm1 :)
movaps xmm1,[_xmm_backup+0x10];

//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers
//i use negative relative addressing from the top of the buffers

@@ -256,7 +322,7 @@ _loop_1:

_loop_1_inner:
MOVSRC xmm0,[edx+eax];
movaps [ecx+eax],xmm0;
MOVDST [ecx+eax],xmm0;

add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;

@@ -264,44 +330,64 @@ _loop_1_inner:
//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup+0x00];
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}

#undef MOVSRC
// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate.
__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
{
if( ((uptr)src & 0xf) == 0 )
memcpy_raz_( dest, src, bytes );
else
_memcpy_raz_usrc( dest, src, bytes );
}

// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate.
__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes)
{
if( ((uptr)dest & 0xf) == 0 )
memcpy_raz_( dest, src, bytes );
else
_memcpy_raz_udst( dest, src, bytes );
}

//////////////////////////////////////////////////////////////////////////
// Fast memcpy as coded by AMD.

// This function clobbers all MMX registers, and is generally not optimal for short memory
// copies due to the amount of overhead required to test for alignments, copy length,
// and other ABI overhead.
void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
// Fast memcpy as coded by AMD, and thn improved by air.
//
// This routine preserves mmx registers! It's the complete real deal!
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
__asm {
__asm
{
push edi
push esi

mov edi, ecx ; destination
mov esi, edx ; source
mov ecx, [n] ; number of bytes to copy
mov ebx, ecx ; keep a copy of count
mov ecx, [esp+12] ; number of bytes to copy
mov eax, ecx ; keep a copy of count

cld
cmp ecx, TINY_BLOCK_COPY
cmp eax, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy

cmp ecx, 32*1024 ; don't align between 32k-64k because
cmp eax, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp ecx, 64*1024
cmp eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's
mov eax, 8 ; a trick that's faster than rep movsb...
sub eax, edi ; align destination to qword
and eax, 111b ; get the low bits
sub ecx, eax ; update copy count
neg eax ; set up to jump into the array
add eax, offset $memcpy_align_done
jmp eax ; jump to array of movsb's

align 4
movsb

@@ -314,13 +400,18 @@ align 4
movsb

$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
mov eax, ecx ; number of bytes left to copy
shr eax, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes

cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test

movq [_mmx_backup+0x00],mm0
movq [_mmx_backup+0x08],mm1
movq [_mmx_backup+0x10],mm2
movq [_mmx_backup+0x18],mm3

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.

@@ -348,30 +439,39 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy

add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
dec eax ; count down
jnz $memcpy_ic_1 ; last 64-byte block?

movq mm0,[_mmx_backup+0x00]
movq mm1,[_mmx_backup+0x08]
movq mm2,[_mmx_backup+0x10]
movq mm3,[_mmx_backup+0x18]

$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
mov eax, ecx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
shr eax, 2 ; dword count
and eax, 1111b ; only look at the "remainder" bits
neg eax ; set up to jump into the array
add eax, offset $memcpy_last_few
jmp eax ; jump to array of movsd's

$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
/*cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1

$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
$memcpy_64_test:*/
or eax, eax ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.

movq [_mmx_backup+0x00],mm0
movq [_mmx_backup+0x08],mm1
movq [_mmx_backup+0x10],mm2

align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy

@@ -394,17 +494,25 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
dec eax
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?

jmp $memcpy_ic_2 ; almost done
movq mm0,[_mmx_backup+0x00]
movq mm1,[_mmx_backup+0x08]
movq mm2,[_mmx_backup+0x10]

jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.

// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
// help keep the code cache footprint of memcpy_fast to a minimum.
/*
$memcpy_bp_1: ; large blocks, block prefetch copy

cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?

@@ -447,6 +555,7 @@ $memcpy_bp_3:
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
*/

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.

@@ -469,8 +578,8 @@ align 4
movsd

$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
mov eax, ecx ; has valid low 2 bits of the byte count
and eax, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes

@@ -479,10 +588,14 @@ $memcpy_final:
sfence ; flush the write buffer
//mov eax, [dest] ; ret value = destination pointer

pop esi
pop edi

ret 4
}
}

// mmx memcpy implementation, size has to be a multiple of 8
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 is equal, nonzero value if not equal
// ~10 times faster than standard memcmp
// (zerofrog)

@@ -1977,12 +1977,7 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
//if( size > 0 )
{
u8* pmem = mtgsThread->GetDataPacketPtr();
//FreezeMMXRegs(1);
//memcpy_fast(pmem, (u8*)pMem+addr, size);
//FreezeMMXRegs(0);

// we can use the faster memcpy_raz_ here (src/dest are garaunteed to be aligned)
memcpy_raz_(pmem, (u8*)pMem+addr, size);
memcpy_aligned(pmem, (u8*)pMem+addr, size);
mtgsThread->SendDataPacket();
}
}

@@ -880,9 +880,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
#ifdef SUPERVU_CACHING
//memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
r.pmem = malloc(r.size);
FreezeMMXRegs(1);
memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
FreezeMMXRegs(0);
#endif
s_pFnHeader->ranges.push_back(r);
}