mirror of https://github.com/PCSX2/pcsx2.git
Optimized memcpy_fast. In addition to being quite a bit faster, it now auto-preserves the MMX registers, so I was also able to remove almost every instance of FreezeMMXRegs (all except those used to guard the GS plugin calls). memcpy_fast (aka memcpy_amd_) is now faster than memcpy_raz_ in *all* scenarios, so it has been made the new default.
git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@642 a6443dda-0b58-4228-96e9-037be469359c
This commit is contained in:
parent 4781be9e59, commit 44f5117d24
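The practical effect at call sites is easiest to see side by side. A minimal sketch of the pattern this commit removes throughout the tree (dst, src, and bytes are placeholder names, not code from the repository):

	// Before: memcpy_amd_ clobbered the MMX registers, so every call had to
	// be wrapped in FreezeMMXRegs() guards to protect recompiler state.
	FreezeMMXRegs(1);
	memcpy_fast(dst, src, bytes);
	FreezeMMXRegs(0);

	// After: memcpy_amd_ saves and restores mm0-mm3 internally (see the
	// _mmx_backup buffer added further down), so a bare call is safe.
	memcpy_fast(dst, src, bytes);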
@@ -865,8 +865,6 @@ int cdvdReadSector() {
 		return -1;
 	}
 
-	FreezeMMXRegs(1);
-
 	const u32 madr = HW_DMA3_MADR;
 
 	// if raw dvd sector 'fill in the blanks'
@@ -935,7 +933,6 @@ int cdvdReadSector() {
 
 	HW_DMA3_BCR_H16-= (cdvd.BlockSize / (HW_DMA3_BCR_L16*4));
 	HW_DMA3_MADR+= cdvd.BlockSize;
-	FreezeMMXRegs(0);
 
 	return 0;
 }
@@ -2024,9 +2021,7 @@ void cdvdWrite16(u8 rt) // SCOMMAND
 			if (cdvd.mg_size + cdvd.ParamC > cdvd.mg_maxsize)
 				cdvd.Result[0] = 0x80;
 			else{
-				FreezeMMXRegs(1);
 				memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
-				FreezeMMXRegs(0);
 				cdvd.mg_size += cdvd.ParamC;
 				cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
 			}
@@ -2034,11 +2029,9 @@ void cdvdWrite16(u8 rt) // SCOMMAND
 
 		case 0x8E: // sceMgReadData
 			SetResultSize( std::min(16, cdvd.mg_size) );
-			FreezeMMXRegs(1);
 			memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
 			cdvd.mg_size -= cdvd.ResultC;
 			memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
-			FreezeMMXRegs(0);
 			break;
 
 		case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;)
@@ -2089,9 +2082,7 @@ fail_pol_cal:
 			SetResultSize(3);//in:0
 			{
 				int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
-				FreezeMMXRegs(1);
 				memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
-				FreezeMMXRegs(0);
 			}
 			cdvd.mg_maxsize = 0; // don't allow any write
 			cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data
@@ -189,7 +189,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 			return 0;
 		buff = CDVDgetBuffer();
 		if (buff==NULL) return 0;
-		FreezeMMXRegs(1);
 		switch (mode->datapattern){
 			case CdSecS2048:
 				memcpy_fast((void*)((uptr)buf+2048*i), buff, 2048);break;//only data
@@ -198,7 +197,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 			case CdSecS2340:
 				memcpy_fast((void*)((uptr)buf+2340*i), buff, 2340);break;//without sync
 		}
-		FreezeMMXRegs(0);
 	}
 	return 1;
 }
@@ -216,9 +214,7 @@ int DvdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
 //		switch (mode->datapattern){
 //			case CdSecS2064:
 				((u32*)buf)[0] = i + 0x30000;
-				FreezeMMXRegs(1);
 				memcpy_fast((u8*)buf+12, buff, 2048);
-				FreezeMMXRegs(0);
 				buf = (char*)buf + 2064; break;
 //			default:
 //				return 0;
@@ -253,9 +249,7 @@ int CDVD_GetVolumeDescriptor(void){
 			if ((localVolDesc.filesystemType == 1) ||
 				(localVolDesc.filesystemType == 2))
 			{
-				FreezeMMXRegs(1);
 				memcpy_fast(&CDVolDesc, &localVolDesc, sizeof(cdVolDesc));
-				FreezeMMXRegs(0);
 			}
 		}
 		else
@@ -188,9 +188,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
 			RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
 			return 0;
 		}
-		FreezeMMXRegs(1);
 		memcpy_fast(buffer, lb + off_sector, ssize);
-		FreezeMMXRegs(0);
 	}
 	if (asize) if (CdRead(asector, asize >> 11, buffer+ssize, &cdReadMode) != TRUE){
 		RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
@@ -201,9 +199,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
 			RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
 			return 0;
 		}
-		FreezeMMXRegs(1);
 		memcpy_fast(buffer+ssize+asize, lb, esize);
-		FreezeMMXRegs(0);
 	}
 	/***********************
 	// Now work out where we want to start reading from
@@ -527,9 +527,7 @@ void cdrReadInterrupt() {
 		CDREAD_INT((cdr.Mode & 0x80) ? (cdReadTime / 2) : cdReadTime);
 		return;
 	}
-	FreezeMMXRegs(1);
 	memcpy_fast(cdr.Transfer, buf+12, 2340);
-	FreezeMMXRegs(0);
 	cdr.Stat = DataReady;
 
 	CDR_LOG(" %x:%x:%x\n", cdr.Transfer[0], cdr.Transfer[1], cdr.Transfer[2]);
@@ -923,9 +921,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
 	}
 
 	cdsize = (bcr & 0xffff) * 4;
-	FreezeMMXRegs(1);
 	memcpy_fast((u8*)PSXM(madr), cdr.pTransfer, cdsize);
-	FreezeMMXRegs(0);
 	psxCpu->Clear(madr, cdsize/4);
 	cdr.pTransfer+=cdsize;
 
@@ -575,11 +575,11 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc)
 			{
 				pendmem = (pendmem&~0xfff)-16;
 			}
-			memcpy_raz_(pgsmem, pMem, pendmem-(u32)gif->madr+16);
+			memcpy_aligned(pgsmem, pMem, pendmem-(u32)gif->madr+16);
 		}
 		else
 #endif
-		memcpy_raz_(pgsmem, pMem, sizetoread);
+		memcpy_aligned(pgsmem, pMem, sizetoread);
 
 		mtgsThread->SendDataPacket();
 	}
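Note on the rename in this hunk: memcpy_raz_ required 16-byte-aligned buffers, and the new memcpy_aligned name keeps that contract explicit at the call site. Per the pcsx2/Misc.h hunks further down, it is only an alias, not a new routine:

	// From this commit's Misc.h changes -- memcpy_aligned maps onto the one
	// remaining implementation per platform:
	#if defined(_WIN32) && !defined(__x86_64__)
	# define memcpy_aligned memcpy_amd_
	#else
	#define memcpy_aligned memcpy
	#endif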
pcsx2/Hw.h (46 lines changed)
@@ -329,7 +329,7 @@ static __forceinline u8* dmaGetAddr(u32 mem)
 
 #else
 
-
+// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits)
 static __forceinline void *dmaGetAddr(u32 addr) {
 	u8 *ptr;
 
@@ -355,35 +355,17 @@ void hwShutdown();
 
 // hw read functions
 extern u8 hwRead8 (u32 mem);
-int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
-
 extern u16 hwRead16(u32 mem);
-int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
-
 extern u32 hwRead32(u32 mem);
-int hwConstRead32(u32 x86reg, u32 mem);
-
-u64 hwRead64(u32 mem);
-void hwConstRead64(u32 mem, int mmreg);
-
-void hwRead128(u32 mem, u64 *out);
-void hwConstRead128(u32 mem, int xmmreg);
+extern u64 hwRead64(u32 mem);
+extern void hwRead128(u32 mem, u64 *out);
 
 // hw write functions
-void hwWrite8 (u32 mem, u8 value);
-void hwConstWrite8 (u32 mem, int mmreg);
-
-void hwWrite16(u32 mem, u16 value);
-void hwConstWrite16(u32 mem, int mmreg);
-
-void hwWrite32(u32 mem, u32 value);
-void hwConstWrite32(u32 mem, int mmreg);
-
-void hwWrite64(u32 mem, u64 value);
-void hwConstWrite64(u32 mem, int mmreg);
-
-void hwWrite128(u32 mem, const u64 *value);
-void hwConstWrite128(u32 mem, int xmmreg);
+extern void hwWrite8 (u32 mem, u8 value);
+extern void hwWrite16(u32 mem, u16 value);
+extern void hwWrite32(u32 mem, u32 value);
+extern void hwWrite64(u32 mem, u64 value);
+extern void hwWrite128(u32 mem, const u64 *value);
 
 void hwIntcIrq(int n);
 void hwDmacIrq(int n);
@@ -394,6 +376,18 @@ int hwMFIFOWrite(u32 addr, u8 *data, u32 size);
 int hwDmacSrcChainWithStack(DMACh *dma, int id);
 int hwDmacSrcChain(DMACh *dma, int id);
 
+int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
+int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
+int hwConstRead32(u32 x86reg, u32 mem);
+void hwConstRead64(u32 mem, int mmreg);
+void hwConstRead128(u32 mem, int xmmreg);
+
+void hwConstWrite8 (u32 mem, int mmreg);
+void hwConstWrite16(u32 mem, int mmreg);
+void hwConstWrite32(u32 mem, int mmreg);
+void hwConstWrite64(u32 mem, int mmreg);
+void hwConstWrite128(u32 mem, int xmmreg);
+
 #ifdef PCSX2_VIRTUAL_MEM
 void iMemRead32Check();
 #endif
@@ -430,7 +430,7 @@ int mtgsThreadObject::Callback()
 {
 	Console::WriteLn("MTGS > Thread Started, Opening GS Plugin...");
 
-	memcpy_raz_( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
+	memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
 	GSsetBaseMem( m_gsMem );
 
 	m_returncode = GSopen((void *)&pDsp, "PCSX2", 1);
pcsx2/Misc.h (19 lines changed)
@@ -230,12 +230,19 @@ extern u8 g_globalXMMSaved;
 void _memset16_unaligned( void* dest, u16 data, size_t size );
 
 #if defined(_WIN32) && !defined(__x86_64__)
-// faster memcpy
-extern void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes);
-extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t qwc);
-extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t n);
+// The new simplified memcpy_amd_ is now faster than memcpy_raz_.
+// memcpy_amd_ also does mmx register saving, negating the need for freezeregs (code cleanup!)
+// Additionally, using one single memcpy implementation keeps the code cache cleaner.
+
+//extern void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes);
+//extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes);
+//extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes);
+extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
 
 # include "windows/memzero.h"
 # define memcpy_fast memcpy_amd_
+# define memcpy_aligned memcpy_amd_
 
 #else
 
@@ -243,6 +250,10 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
 #define memcpy_fast memcpy
 #define memcpy_raz_ memcpy
 #define memcpy_raz_u memcpy
+
+#define memcpy_aligned memcpy
+#define memcpy_raz_u memcpy
+
 
 #include "Linux/memzero.h"
 
 #endif
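A short usage sketch of the macro scheme above (hypothetical call site; the helper names are illustrative, and both buffers are assumed 16-byte aligned):

	u8* dst = GetAlignedBuffer();   // hypothetical helpers, both assumed to
	u8* src = GetAlignedSource();   // return 16-byte-aligned pointers

	memcpy_fast(dst, src, 2048);    // general purpose: memcpy_amd_ on Win32/x86, plain memcpy elsewhere
	memcpy_aligned(dst, src, 2048); // same mapping; the name documents the alignment guarantee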
@@ -226,13 +226,11 @@ extern void mfifoGIFtransfer(int);
 #define gif ((DMACh*)&PS2MEM_HW[0xA000])
 void dmaSPR0() { // fromSPR
 	int qwc = spr0->qwc;
-	FreezeMMXRegs(1);
 
 	SPR_LOG("dmaSPR0 chcr = %lx, madr = %lx, qwc = %lx, sadr = %lx\n",
 			spr0->chcr, spr0->madr, spr0->qwc, spr0->sadr);
 
 	_dmaSPR0();
-	FreezeMMXRegs(0);
 	if ((psHu32(DMAC_CTRL) & 0xC) == 0xC) { // GIF MFIFO
 		if((spr0->madr & ~psHu32(DMAC_RBSR)) != psHu32(DMAC_RBOR)) SysPrintf("GIF MFIFO Write outside MFIFO area\n");
 		spr0->madr = psHu32(DMAC_RBOR) + (spr0->madr & psHu32(DMAC_RBSR));
@@ -308,7 +306,6 @@ void _SPR1interleave() {
 void dmaSPR1() { // toSPR
 
 
-	FreezeMMXRegs(1);
 #ifdef SPR_LOG
 	SPR_LOG("dmaSPR1 chcr = 0x%x, madr = 0x%x, qwc = 0x%x\n"
 			" tadr = 0x%x, sadr = 0x%x\n",
@@ -325,7 +322,6 @@ void dmaSPR1() { // toSPR
 		// Transfer Dn_QWC from Dn_MADR to SPR1
 		SPR1chain();
 		CPU_INT(9, cycles);
-		FreezeMMXRegs(0);
 		return;
 	} else if ((spr1->chcr & 0xc) == 0x4){
 		int cycles = 0;
@@ -338,7 +334,6 @@ void dmaSPR1() { // toSPR
 		// Transfer Dn_QWC from Dn_MADR to SPR1
 		SPR1chain();
 		CPU_INT(9, cycles);
-		FreezeMMXRegs(0);
 		return;
 	}
 	// Chain Mode
@@ -382,7 +377,6 @@ void dmaSPR1() { // toSPR
 	} else { // Interleave Mode
 		_SPR1interleave();
 	}
-	FreezeMMXRegs(0);
 
 }
 
@@ -574,9 +574,7 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
 	// v4-32
 	if(vifRegs->mode == 0 && !(vifRegs->code & 0x10000000) && vif->usn == 0){
 		vifRegs->num -= size>>4;
-		FreezeMMXRegs(1);
 		memcpy_fast((u8*)dest, cdata, size);
-		FreezeMMXRegs(0);
 		size = 0;
 		//unpacktotal += GetCPUTick()-basetick;
 		return;
@@ -814,9 +812,7 @@ static __forceinline void _vif0mpgTransfer(u32 addr, u32 *data, int size) {
 		fclose(f);
 	}*/
 	if (memcmp(VU0.Micro + addr, data, size << 2)) {
-		FreezeMMXRegs(1);
 		memcpy_fast(VU0.Micro + addr, data, size << 2);
-		FreezeMMXRegs(0);
 		CpuVU0->Clear(addr, size);
 	}
 }
@@ -1490,9 +1486,7 @@ static __forceinline void _vif1mpgTransfer(u32 addr, u32 *data, int size) {
 	}*/
 	assert( VU1.Micro > 0 );
 	if (memcmp(VU1.Micro + addr, data, size << 2)) {
-		FreezeMMXRegs(1);
 		memcpy_fast(VU1.Micro + addr, data, size << 2);
-		FreezeMMXRegs(0);
 		CpuVU1->Clear(addr, size);
 	}
 }
@@ -1644,7 +1638,7 @@ static int Vif1TransDirectHL(u32 *data){
 	{
 		//unaligned copy.VIF handling is -very- messy, so i'l use this code til i fix it :)
 		const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, ret<<2 );
-		memcpy_raz_u( mtgsThread->GetDataPacketPtr(), data, count );
+		memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count );
 		mtgsThread->SendDataPacket();
 	}
 	else {
@@ -49,6 +49,9 @@ namespace Threading
 	}
 
 	cpuinfo.LogicalCores = CPUs;
+	if( LogicalCoresPerPhysicalCPU > CPUs) // for 1-socket HTT-disabled machines
+		LogicalCoresPerPhysicalCPU = CPUs;
+
 	cpuinfo.PhysicalCores = ( CPUs / LogicalCoresPerPhysicalCPU ) * PhysicalCoresPerPhysicalCPU;
 	ptw32_smp_system = ( cpuinfo.LogicalCores > 1 ) ? TRUE : FALSE;
 }
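Why the clamp matters -- a worked example with hypothetical values for a 1-socket machine that still advertises HyperThreading capability while HTT is disabled in the BIOS:

	int CPUs = 1;                          // visible logical CPUs
	int LogicalCoresPerPhysicalCPU = 2;    // what CPUID still reports
	int PhysicalCoresPerPhysicalCPU = 1;

	// Without the clamp: 1 / 2 == 0, so PhysicalCores would compute to 0.
	if (LogicalCoresPerPhysicalCPU > CPUs)
		LogicalCoresPerPhysicalCPU = CPUs; // clamp added by this hunk -> 1
	int PhysicalCores = (CPUs / LogicalCoresPerPhysicalCPU)
	                  * PhysicalCoresPerPhysicalCPU;  // == 1, as expected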
@@ -88,35 +88,27 @@ void checkregs()
 #endif
 
 
-__declspec(align(16)) static u8 _xmm_backup[16*2];
+PCSX2_ALIGNED16( static u8 _xmm_backup[16*2] );
+PCSX2_ALIGNED16( static u8 _mmx_backup[8*4] );
 
-// this one checks for alignments too ...
-__declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes)
+static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
 {
-	// If src is aligned, use memcpy_raz instead:
-	__asm
-	{
-		test edx,0xf;
-		jz memcpy_raz_;
-	}
-
 	// MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :)
-	#define MOVSRC movups
+	#define MOVSRC movdqu
+	#define MOVDST movdqa
 
 	__asm
 	{
 		//Reads before reads, to avoid stalls
 		mov eax,[esp+4];
 		//Make sure to save xmm0, it must be preserved ...
-		movaps [_xmm_backup+0x00],xmm0;
+		movaps [_xmm_backup],xmm0;
 
 		//if >=128 bytes use 128 byte unrolled loop
 		//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
 		cmp eax,127;
 		jna _loop_1;
 
-		//unrolled version also touches xmm1, save it :)
-		movaps [_xmm_backup+0x10],xmm1;
-
 		//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
 		align 16
 
@@ -124,34 +116,111 @@ __declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size
 _loop_8:
 
 		MOVSRC xmm0,[edx+0x00];		//read first to avoid read-after-write stalls
-		MOVSRC xmm1,[edx+0x10];
+		MOVDST [ecx+0x00],xmm0;		//then write :p
+		MOVSRC xmm0,[edx+0x10];
+		MOVDST [ecx+0x10],xmm0;
 		sub edx,-128;				//edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
-		movaps [ecx+0x00],xmm0;		//then write :p
-		movaps [ecx+0x10],xmm1;
 		sub ecx,-128;				//ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
 
 		MOVSRC xmm0,[edx+0x20-128];
-		MOVSRC xmm1,[edx+0x30-128];
+		MOVDST [ecx+0x20-128],xmm0;
+		MOVSRC xmm0,[edx+0x30-128];
+		MOVDST [ecx+0x30-128],xmm0;
 		add eax,-128;				//eax won't be used for a while, so update it here. add/-128 for simm8 encoding
-		movaps [ecx+0x20-128],xmm0;
-		movaps [ecx+0x30-128],xmm1;
 
 		MOVSRC xmm0,[edx+0x40-128];
-		MOVSRC xmm1,[edx+0x50-128];
-		movaps [ecx+0x40-128],xmm0;
-		movaps [ecx+0x50-128],xmm1;
+		MOVDST [ecx+0x40-128],xmm0;
+		MOVSRC xmm0,[edx+0x50-128];
+		MOVDST [ecx+0x50-128],xmm0;
 
 		MOVSRC xmm0,[edx+0x60-128];
-		MOVSRC xmm1,[edx+0x70-128];
-		movaps [ecx+0x60-128],xmm0;
-		movaps [ecx+0x70-128],xmm1;
+		MOVDST [ecx+0x60-128],xmm0;
+		MOVSRC xmm0,[edx+0x70-128];
+		MOVDST [ecx+0x70-128],xmm0;
 
 		//127~ja, 127 is encodable as simm8 :)
 		cmp eax,127;
 		ja _loop_8;
 
-		//restore xmm1 :)
-		movaps xmm1,[_xmm_backup+0x10];
+		//direct copy for 0~7 qwords
+		//in order to avoid the inc/dec of all 3 registers
+		//i use negative relative addressing from the top of the buffers
+		//[top-current index]
+
+_loop_1:
+		//prepare the regs for 'negative relative addressing'
+		add edx,eax;
+		add ecx,eax;
+		neg eax;
+		jz cleanup;		//exit if nothing to do
+
+_loop_1_inner:
+		MOVSRC xmm0,[edx+eax];
+		MOVDST [ecx+eax],xmm0;
+
+		add eax,16;		//while the offset is still negative we have data to copy
+		js _loop_1_inner;
+
+		//done !
+cleanup:
+		//restore xmm and exit ~)
+		movaps xmm0,[_xmm_backup];
+		ret 4;
+	}
+	#undef MOVSRC
+	#undef MOVDST
+}
+
+
+static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes)
+{
+	// MOVDST = opcode used to read. I use the same code for the aligned version, with a different define :)
+	#define MOVSRC movaps
+	#define MOVDST movups
+	__asm
+	{
+		//Reads before reads, to avoid stalls
+		mov eax,[esp+4];
+		//Make sure to save xmm0, it must be preserved ...
+		movaps [_xmm_backup],xmm0;
+
+		//if >=128 bytes use 128 byte unrolled loop
+		//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
+		cmp eax,127;
+		jna _loop_1;
+
+		//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
+		align 16
+
+		//128 byte unrolled loop
+_loop_8:
+
+		MOVSRC xmm0,[edx+0x00];		//read first to avoid read-after-write stalls
+		MOVDST [ecx+0x00],xmm0;		//then write :p
+		MOVSRC xmm0,[edx+0x10];
+		MOVDST [ecx+0x10],xmm0;
+		sub edx,-128;				//edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
+		sub ecx,-128;				//ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
+
+		MOVSRC xmm0,[edx+0x20-128];
+		MOVDST [ecx+0x20-128],xmm0;
+		MOVSRC xmm0,[edx+0x30-128];
+		MOVDST [ecx+0x30-128],xmm0;
+		add eax,-128;				//eax won't be used for a while, so update it here. add/-128 for simm8 encoding
+
+		MOVSRC xmm0,[edx+0x40-128];
+		MOVDST [ecx+0x40-128],xmm0;
+		MOVSRC xmm0,[edx+0x50-128];
+		MOVDST [ecx+0x50-128],xmm0;
+
+		MOVSRC xmm0,[edx+0x60-128];
+		MOVDST [ecx+0x60-128],xmm0;
+		MOVSRC xmm0,[edx+0x70-128];
+		MOVDST [ecx+0x70-128],xmm0;
+
+		//127~ja, 127 is encodable as simm8 :)
+		cmp eax,127;
+		ja _loop_8;
 
 		//direct copy for 0~7 qwords
 		//in order to avoid the inc/dec of all 3 registers
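The 'negative relative addressing' tail loop added above is worth spelling out. A C sketch of the same idea (illustrative only; the real code is the inline assembly in this hunk):

	#include <string.h>

	// Copy 'bytes' (a multiple of 16) by advancing both pointers to the end
	// once, then walking a single negative index up toward zero. Only one
	// register (the index) changes per iteration, and the sign of the index
	// update doubles as the loop condition -- no separate compare needed.
	static void tail_copy(char* dst, const char* src, long bytes)
	{
	    dst += bytes;
	    src += bytes;
	    for (long i = -bytes; i != 0; i += 16)   // i < 0 while data remains
	        memcpy(dst + i, src + i, 16);        // stands in for MOVSRC/MOVDST
	}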
@@ -168,22 +237,24 @@
 _loop_1_inner:
 		MOVSRC xmm0,[edx+eax];
 		movaps [ecx+eax],xmm0;
 
 		add eax,16;		//while the offset is still negative we have data to copy
 		js _loop_1_inner;
 
 		//done !
 cleanup:
 		//restore xmm and exit ~)
-		movaps xmm0,[_xmm_backup+0x00];
+		movaps xmm0,[_xmm_backup];
 		ret 4;
 	}
 	#undef MOVSRC
+	#undef MOVDST
 }
 
 // Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
 // This function is optimized for medium-small transfer sizes (<2048, >=128). No prefetching is
 // used since the reads are linear and the cache logic can predict em :)
+// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster.
 __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
 {
 	// Code Implementation Notes:
@@ -191,21 +262,19 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
 
 	// MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :)
 	#define MOVSRC movaps
+	#define MOVDST movaps
 	__asm
 	{
 		//Reads before reads, to avoid stalls
 		mov eax,[esp+4];
 		//Make sure to save xmm0, it must be preserved ...
-		movaps [_xmm_backup+0x00],xmm0;
+		movaps [_xmm_backup],xmm0;
 
 		//if >=128 bytes use 128 byte unrolled loop
 		//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
 		cmp eax,127;
 		jna _loop_1;
 
-		//unrolled version also toiches xmm1, save it :)
-		movaps [_xmm_backup+0x10],xmm1;
-
 		//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
 		align 16
 
@@ -213,35 +282,32 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
 _loop_8:
 
 		MOVSRC xmm0,[edx+0x00];		//read first to avoid read-after-write stalls
-		MOVSRC xmm1,[edx+0x10];
+		MOVDST [ecx+0x00],xmm0;		//then write :p
+		MOVSRC xmm0,[edx+0x10];
+		MOVDST [ecx+0x10],xmm0;
 		sub edx,-128;				//edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
-		movaps [ecx+0x00],xmm0;		//then write :p
-		movaps [ecx+0x10],xmm1;
 		sub ecx,-128;				//ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
 
 		MOVSRC xmm0,[edx+0x20-128];
-		MOVSRC xmm1,[edx+0x30-128];
+		MOVDST [ecx+0x20-128],xmm0;
+		MOVSRC xmm0,[edx+0x30-128];
+		MOVDST [ecx+0x30-128],xmm0;
 		add eax,-128;				//eax won't be used for a while, so update it here. add/-128 for simm8 encoding
-		movaps [ecx+0x20-128],xmm0;
-		movaps [ecx+0x30-128],xmm1;
 
 		MOVSRC xmm0,[edx+0x40-128];
-		MOVSRC xmm1,[edx+0x50-128];
-		movaps [ecx+0x40-128],xmm0;
-		movaps [ecx+0x50-128],xmm1;
+		MOVDST [ecx+0x40-128],xmm0;
+		MOVSRC xmm0,[edx+0x50-128];
+		MOVDST [ecx+0x50-128],xmm0;
 
 		MOVSRC xmm0,[edx+0x60-128];
-		MOVSRC xmm1,[edx+0x70-128];
-		movaps [ecx+0x60-128],xmm0;
-		movaps [ecx+0x70-128],xmm1;
+		MOVDST [ecx+0x60-128],xmm0;
+		MOVSRC xmm0,[edx+0x70-128];
+		MOVDST [ecx+0x70-128],xmm0;
 
 		//127~ja, 127 is encodable as simm8 :)
 		cmp eax,127;
 		ja _loop_8;
 
-		//restore xmm1 :)
-		movaps xmm1,[_xmm_backup+0x10];
-
 		//direct copy for 0~7 qwords
 		//in order to avoid the inc/dec of all 3 registers
 		//i use negative relative addressing from the top of the buffers
@@ -256,7 +322,7 @@ _loop_1:
 
 _loop_1_inner:
 		MOVSRC xmm0,[edx+eax];
-		movaps [ecx+eax],xmm0;
+		MOVDST [ecx+eax],xmm0;
 
 		add eax,16;		//while the offset is still negative we have data to copy
 		js _loop_1_inner;
@@ -264,44 +330,64 @@ _loop_1_inner:
 		//done !
 cleanup:
 		//restore xmm and exit ~)
-		movaps xmm0,[_xmm_backup+0x00];
+		movaps xmm0,[_xmm_backup];
 		ret 4;
 	}
+	#undef MOVSRC
+	#undef MOVDST
 }
 
-#undef MOVSRC
+// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate.
+__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
+{
+	if( ((uptr)src & 0xf) == 0 )
+		memcpy_raz_( dest, src, bytes );
+	else
+		_memcpy_raz_usrc( dest, src, bytes );
+}
+
+// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate.
+__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes)
+{
+	if( ((uptr)dest & 0xf) == 0 )
+		memcpy_raz_( dest, src, bytes );
+	else
+		_memcpy_raz_udst( dest, src, bytes );
+}
+
 
 //////////////////////////////////////////////////////////////////////////
-// Fast memcpy as coded by AMD.
-// This function clobbers all MMX registers, and is generally not optimal for short memory
-// copies due to the amount of overhead required to test for alignments, copy length,
-// and other ABI overhead.
-void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
+// Fast memcpy as coded by AMD, and thn improved by air.
+//
+// This routine preserves mmx registers! It's the complete real deal!
+__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
 {
-	__asm {
+	__asm
+	{
+		push edi
+		push esi
+
 		mov edi, ecx		; destination
 		mov esi, edx		; source
-		mov ecx, [n]		; number of bytes to copy
-		mov ebx, ecx		; keep a copy of count
+		mov ecx, [esp+12]	; number of bytes to copy
+		mov eax, ecx		; keep a copy of count
 
 		cld
-		cmp ecx, TINY_BLOCK_COPY
+		cmp eax, TINY_BLOCK_COPY
 		jb $memcpy_ic_3		; tiny? skip mmx copy
 
-		cmp ecx, 32*1024	; don't align between 32k-64k because
+		cmp eax, 32*1024	; don't align between 32k-64k because
 		jbe $memcpy_do_align	; it appears to be slower
-		cmp ecx, 64*1024
+		cmp eax, 64*1024
 		jbe $memcpy_align_done
 $memcpy_do_align:
-		mov ecx, 8			; a trick that's faster than rep movsb...
-		sub ecx, edi		; align destination to qword
-		and ecx, 111b		; get the low bits
-		sub ebx, ecx		; update copy count
-		neg ecx				; set up to jump into the array
-		add ecx, offset $memcpy_align_done
-		jmp ecx				; jump to array of movsb's
+		mov eax, 8			; a trick that's faster than rep movsb...
+		sub eax, edi		; align destination to qword
+		and eax, 111b		; get the low bits
+		sub ecx, eax		; update copy count
+		neg eax				; set up to jump into the array
+		add eax, offset $memcpy_align_done
+		jmp eax				; jump to array of movsb's
 
 align 4
 		movsb
@@ -314,13 +400,18 @@ align 4
 		movsb
 
 $memcpy_align_done:			; destination is dword aligned
-		mov ecx, ebx		; number of bytes left to copy
-		shr ecx, 6			; get 64-byte block count
+		mov eax, ecx		; number of bytes left to copy
+		shr eax, 6			; get 64-byte block count
 		jz $memcpy_ic_2		; finish the last few bytes
 
-		cmp ecx, IN_CACHE_COPY/64	; too big 4 cache? use uncached copy
+		cmp eax, IN_CACHE_COPY/64	; too big 4 cache? use uncached copy
 		jae $memcpy_uc_test
 
+		movq [_mmx_backup+0x00],mm0
+		movq [_mmx_backup+0x08],mm1
+		movq [_mmx_backup+0x10],mm2
+		movq [_mmx_backup+0x18],mm3
+
 // This is small block copy that uses the MMX registers to copy 8 bytes
 // at a time.  It uses the "unrolled loop" optimization, and also uses
 // the software prefetch instruction to get the data into the cache.
@@ -348,30 +439,39 @@ $memcpy_ic_1:				; 64-byte block copies, in-cache copy
 
 		add esi, 64			; update source pointer
 		add edi, 64			; update destination pointer
-		dec ecx				; count down
+		dec eax				; count down
 		jnz $memcpy_ic_1	; last 64-byte block?
 
+		movq mm0,[_mmx_backup+0x00]
+		movq mm1,[_mmx_backup+0x08]
+		movq mm2,[_mmx_backup+0x10]
+		movq mm3,[_mmx_backup+0x18]
+
 $memcpy_ic_2:
-		mov ecx, ebx		; has valid low 6 bits of the byte count
+		mov eax, ecx		; has valid low 6 bits of the byte count
 $memcpy_ic_3:
-		shr ecx, 2			; dword count
-		and ecx, 1111b		; only look at the "remainder" bits
-		neg ecx				; set up to jump into the array
-		add ecx, offset $memcpy_last_few
-		jmp ecx				; jump to array of movsd's
+		shr eax, 2			; dword count
+		and eax, 1111b		; only look at the "remainder" bits
+		neg eax				; set up to jump into the array
+		add eax, offset $memcpy_last_few
+		jmp eax				; jump to array of movsd's
 
 $memcpy_uc_test:
-		cmp ecx, UNCACHED_COPY/64	; big enough? use block prefetch copy
-		jae $memcpy_bp_1
-$memcpy_64_test:
-		or ecx, ecx			; tail end of block prefetch will jump here
+		/*cmp ecx, UNCACHED_COPY/64	; big enough? use block prefetch copy
+		jae $memcpy_bp_1
+$memcpy_64_test:*/
+		or eax, eax			; tail end of block prefetch will jump here
 		jz $memcpy_ic_2		; no more 64-byte blocks left
 
 // For larger blocks, which will spill beyond the cache, it's faster to
 // use the Streaming Store instruction MOVNTQ.  This write instruction
 // bypasses the cache and writes straight to main memory.  This code also
 // uses the software prefetch instruction to pre-read the data.
 
+		movq [_mmx_backup+0x00],mm0
+		movq [_mmx_backup+0x08],mm1
+		movq [_mmx_backup+0x10],mm2
+
 align 16
 $memcpy_uc_1:				; 64-byte blocks, uncached copy
 
@@ -394,17 +494,25 @@ $memcpy_uc_1:				; 64-byte blocks, uncached copy
 		movq mm1,[esi-8]
 		movntq [edi-24], mm2
 		movntq [edi-16], mm0
-		dec ecx
+		dec eax
 		movntq [edi-8], mm1
 		jnz $memcpy_uc_1	; last 64-byte block?
 
-		jmp $memcpy_ic_2	; almost done
+		movq mm0,[_mmx_backup+0x00]
+		movq mm1,[_mmx_backup+0x08]
+		movq mm2,[_mmx_backup+0x10]
+
+		jmp $memcpy_ic_2	; almost done (not needed because large copy below was removed)
 
 // For the largest size blocks, a special technique called Block Prefetch
 // can be used to accelerate the read operations.  Block Prefetch reads
 // one address per cache line, for a series of cache lines, in a short loop.
 // This is faster than using software prefetch.  The technique is great for
 // getting maximum read bandwidth, especially in DDR memory systems.
 
+// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
+// help keep the code cache footprint of memcpy_fast to a minimum.
+/*
 $memcpy_bp_1:				; large blocks, block prefetch copy
 
 		cmp ecx, CACHEBLOCK	; big enough to run another prefetch loop?
@@ -447,6 +555,7 @@ $memcpy_bp_3:
 		jnz $memcpy_bp_3	; keep copying
 		sub ecx, CACHEBLOCK	; update the 64-byte block count
 		jmp $memcpy_bp_1	; keep processing chunks
+*/
 
 // The smallest copy uses the X86 "movsd" instruction, in an optimized
 // form which is an "unrolled loop".  Then it handles the last few bytes.
@@ -469,8 +578,8 @@ align 4
 		movsd
 
 $memcpy_last_few:			; dword aligned from before movsd's
-		mov ecx, ebx		; has valid low 2 bits of the byte count
-		and ecx, 11b		; the last few cows must come home
+		mov eax, ecx		; has valid low 2 bits of the byte count
+		and eax, 11b		; the last few cows must come home
 		jz $memcpy_final	; no more, let's leave
 		rep movsb			; the last 1, 2, or 3 bytes
 
@@ -479,10 +588,14 @@ $memcpy_final:
 		sfence				; flush the write buffer
 		//mov eax, [dest]	; ret value = destination pointer
 
+		pop esi
+		pop edi
+
+		ret 4
 	}
 }
 
-// mmx memcpy implementation, size has to be a multiple of 8
+// mmx mem-compare implementation, size has to be a multiple of 8
 // returns 0 is equal, nonzero value if not equal
 // ~10 times faster than standard memcmp
 // (zerofrog)
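A hedged usage note on the two dispatch wrappers (memcpy_raz_usrc / memcpy_raz_udst) added in the hunks above: each performs one cheap pointer test and falls back to the fully aligned memcpy_raz_ when it can. Sketch of a call site (variable names hypothetical):

	// dest is guaranteed 16-byte aligned here, src may or may not be:
	memcpy_raz_usrc(ringbuffer_dest, incoming_vif_data, packet_bytes);
	// The wrapper expands to roughly:
	//   if (((uptr)incoming_vif_data & 0xf) == 0) memcpy_raz_(...);      // fully aligned fast path
	//   else                                      _memcpy_raz_usrc(...); // movdqu loads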
@@ -1977,12 +1977,7 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
 	//if( size > 0 )
 	{
 		u8* pmem = mtgsThread->GetDataPacketPtr();
-		//FreezeMMXRegs(1);
-		//memcpy_fast(pmem, (u8*)pMem+addr, size);
-		//FreezeMMXRegs(0);
-
-		// we can use the faster memcpy_raz_ here (src/dest are garaunteed to be aligned)
-		memcpy_raz_(pmem, (u8*)pMem+addr, size);
+		memcpy_aligned(pmem, (u8*)pMem+addr, size);
 		mtgsThread->SendDataPacket();
 	}
 }
@@ -880,9 +880,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
 #ifdef SUPERVU_CACHING
 		//memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
 		r.pmem = malloc(r.size);
-		FreezeMMXRegs(1);
 		memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
-		FreezeMMXRegs(0);
 #endif
 		s_pFnHeader->ranges.push_back(r);
 	}