Optimized memcpy_fast. In addition to being quite a bit faster, it now auto-preserves MMX registers, so I was able to remove almost every instance of FreezeMMXRegs (all except those used to guard the GS plugin calls). memcpy_fast (aka memcpy_amd_) is now faster than memcpy_raz_ for *all* scenarios, so it's been made the new default.

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@642 a6443dda-0b58-4228-96e9-037be469359c
Jake.Stine 2009-01-27 05:12:54 +00:00 committed by Gregory Hainaut
parent 4781be9e59
commit 44f5117d24
14 changed files with 248 additions and 169 deletions
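
For reference, a minimal before/after sketch of the call-site cleanup this enables. It assumes the usual PCSX2 declarations (FreezeMMXRegs, memcpy_fast, u8) are in scope; the two wrapper functions are hypothetical and only illustrate the pattern removed throughout the diffs below.

static void CopySectorOld(u8* dst, const u8* src, size_t bytes)
{
	FreezeMMXRegs(1);             // previously required: park the recompiler's MMX state
	memcpy_fast(dst, src, bytes);
	FreezeMMXRegs(0);             // ...and hand it back afterwards
}

static void CopySectorNew(u8* dst, const u8* src, size_t bytes)
{
	memcpy_fast(dst, src, bytes); // memcpy_amd_ now backs up and restores mm0-mm3 itself
}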

View File

@ -865,8 +865,6 @@ int cdvdReadSector() {
return -1;
}
FreezeMMXRegs(1);
const u32 madr = HW_DMA3_MADR;
// if raw dvd sector 'fill in the blanks'
@ -935,7 +933,6 @@ int cdvdReadSector() {
HW_DMA3_BCR_H16-= (cdvd.BlockSize / (HW_DMA3_BCR_L16*4));
HW_DMA3_MADR+= cdvd.BlockSize;
FreezeMMXRegs(0);
return 0;
}
@ -2024,9 +2021,7 @@ void cdvdWrite16(u8 rt) // SCOMMAND
if (cdvd.mg_size + cdvd.ParamC > cdvd.mg_maxsize)
cdvd.Result[0] = 0x80;
else{
FreezeMMXRegs(1);
memcpy_fast(cdvd.mg_buffer + cdvd.mg_size, cdvd.Param, cdvd.ParamC);
FreezeMMXRegs(0);
cdvd.mg_size += cdvd.ParamC;
cdvd.Result[0] = 0; // 0 complete ; 1 busy ; 0x80 error
}
@ -2034,11 +2029,9 @@ void cdvdWrite16(u8 rt) // SCOMMAND
case 0x8E: // sceMgReadData
SetResultSize( std::min(16, cdvd.mg_size) );
FreezeMMXRegs(1);
memcpy_fast(cdvd.Result, cdvd.mg_buffer, cdvd.ResultC);
cdvd.mg_size -= cdvd.ResultC;
memcpy_fast(cdvd.mg_buffer, cdvd.mg_buffer+cdvd.ResultC, cdvd.mg_size);
FreezeMMXRegs(0);
break;
case 0x88: // secrman: __mechacon_auth_0x88 //for now it is the same; so, fall;)
@ -2089,9 +2082,7 @@ fail_pol_cal:
SetResultSize(3);//in:0
{
int bit_ofs = mg_BIToffset(cdvd.mg_buffer);
FreezeMMXRegs(1);
memcpy_fast(cdvd.mg_buffer, &cdvd.mg_buffer[bit_ofs], 8+16*cdvd.mg_buffer[bit_ofs+4]);
FreezeMMXRegs(0);
}
cdvd.mg_maxsize = 0; // don't allow any write
cdvd.mg_size = 8+16*cdvd.mg_buffer[4];//new offset, i just moved the data

View File

@ -189,7 +189,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
return 0;
buff = CDVDgetBuffer();
if (buff==NULL) return 0;
FreezeMMXRegs(1);
switch (mode->datapattern){
case CdSecS2048:
memcpy_fast((void*)((uptr)buf+2048*i), buff, 2048);break;//only data
@ -198,7 +197,6 @@ int CdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
case CdSecS2340:
memcpy_fast((void*)((uptr)buf+2340*i), buff, 2340);break;//without sync
}
FreezeMMXRegs(0);
}
return 1;
}
@ -216,9 +214,7 @@ int DvdRead(u32 lsn, u32 sectors, void *buf, CdRMode *mode){
// switch (mode->datapattern){
// case CdSecS2064:
((u32*)buf)[0] = i + 0x30000;
FreezeMMXRegs(1);
memcpy_fast((u8*)buf+12, buff, 2048);
FreezeMMXRegs(0);
buf = (char*)buf + 2064; break;
// default:
// return 0;
@ -253,9 +249,7 @@ int CDVD_GetVolumeDescriptor(void){
if ((localVolDesc.filesystemType == 1) ||
(localVolDesc.filesystemType == 2))
{
FreezeMMXRegs(1);
memcpy_fast(&CDVolDesc, &localVolDesc, sizeof(cdVolDesc));
FreezeMMXRegs(0);
}
}
else

View File

@ -188,9 +188,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
return 0;
}
FreezeMMXRegs(1);
memcpy_fast(buffer, lb + off_sector, ssize);
FreezeMMXRegs(0);
}
if (asize) if (CdRead(asector, asize >> 11, buffer+ssize, &cdReadMode) != TRUE){
RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
@ -201,9 +199,7 @@ int CDVDFS_read( int fd, char *buffer, int size ){
RPC_LOG("[CDVDisodrv: ] Couldn't Read from file for some reason\n");
return 0;
}
FreezeMMXRegs(1);
memcpy_fast(buffer+ssize+asize, lb, esize);
FreezeMMXRegs(0);
}
/***********************
// Now work out where we want to start reading from

View File

@ -527,9 +527,7 @@ void cdrReadInterrupt() {
CDREAD_INT((cdr.Mode & 0x80) ? (cdReadTime / 2) : cdReadTime);
return;
}
FreezeMMXRegs(1);
memcpy_fast(cdr.Transfer, buf+12, 2340);
FreezeMMXRegs(0);
cdr.Stat = DataReady;
CDR_LOG(" %x:%x:%x\n", cdr.Transfer[0], cdr.Transfer[1], cdr.Transfer[2]);
@ -923,9 +921,7 @@ void psxDma3(u32 madr, u32 bcr, u32 chcr) {
}
cdsize = (bcr & 0xffff) * 4;
FreezeMMXRegs(1);
memcpy_fast((u8*)PSXM(madr), cdr.pTransfer, cdsize);
FreezeMMXRegs(0);
psxCpu->Clear(madr, cdsize/4);
cdr.pTransfer+=cdsize;

View File

@ -575,11 +575,11 @@ static void WRITERING_DMA(u32 *pMem, u32 qwc)
{
pendmem = (pendmem&~0xfff)-16;
}
memcpy_raz_(pgsmem, pMem, pendmem-(u32)gif->madr+16);
memcpy_aligned(pgsmem, pMem, pendmem-(u32)gif->madr+16);
}
else
#endif
memcpy_raz_(pgsmem, pMem, sizetoread);
memcpy_aligned(pgsmem, pMem, sizetoread);
mtgsThread->SendDataPacket();
}

View File

@ -329,7 +329,7 @@ static __forceinline u8* dmaGetAddr(u32 mem)
#else
// Note: Dma addresses are guaranteed to be aligned to 16 bytes (128 bits)
static __forceinline void *dmaGetAddr(u32 addr) {
u8 *ptr;
@ -355,35 +355,17 @@ void hwShutdown();
// hw read functions
extern u8 hwRead8 (u32 mem);
int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
extern u16 hwRead16(u32 mem);
int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
extern u32 hwRead32(u32 mem);
int hwConstRead32(u32 x86reg, u32 mem);
u64 hwRead64(u32 mem);
void hwConstRead64(u32 mem, int mmreg);
void hwRead128(u32 mem, u64 *out);
void hwConstRead128(u32 mem, int xmmreg);
extern u64 hwRead64(u32 mem);
extern void hwRead128(u32 mem, u64 *out);
// hw write functions
void hwWrite8 (u32 mem, u8 value);
void hwConstWrite8 (u32 mem, int mmreg);
void hwWrite16(u32 mem, u16 value);
void hwConstWrite16(u32 mem, int mmreg);
void hwWrite32(u32 mem, u32 value);
void hwConstWrite32(u32 mem, int mmreg);
void hwWrite64(u32 mem, u64 value);
void hwConstWrite64(u32 mem, int mmreg);
void hwWrite128(u32 mem, const u64 *value);
void hwConstWrite128(u32 mem, int xmmreg);
extern void hwWrite8 (u32 mem, u8 value);
extern void hwWrite16(u32 mem, u16 value);
extern void hwWrite32(u32 mem, u32 value);
extern void hwWrite64(u32 mem, u64 value);
extern void hwWrite128(u32 mem, const u64 *value);
void hwIntcIrq(int n);
void hwDmacIrq(int n);
@ -394,6 +376,18 @@ int hwMFIFOWrite(u32 addr, u8 *data, u32 size);
int hwDmacSrcChainWithStack(DMACh *dma, int id);
int hwDmacSrcChain(DMACh *dma, int id);
int hwConstRead8 (u32 x86reg, u32 mem, u32 sign);
int hwConstRead16(u32 x86reg, u32 mem, u32 sign);
int hwConstRead32(u32 x86reg, u32 mem);
void hwConstRead64(u32 mem, int mmreg);
void hwConstRead128(u32 mem, int xmmreg);
void hwConstWrite8 (u32 mem, int mmreg);
void hwConstWrite16(u32 mem, int mmreg);
void hwConstWrite32(u32 mem, int mmreg);
void hwConstWrite64(u32 mem, int mmreg);
void hwConstWrite128(u32 mem, int xmmreg);
#ifdef PCSX2_VIRTUAL_MEM
void iMemRead32Check();
#endif

View File

@ -430,7 +430,7 @@ int mtgsThreadObject::Callback()
{
Console::WriteLn("MTGS > Thread Started, Opening GS Plugin...");
memcpy_raz_( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
memcpy_aligned( m_gsMem, PS2MEM_GS, sizeof(m_gsMem) );
GSsetBaseMem( m_gsMem );
m_returncode = GSopen((void *)&pDsp, "PCSX2", 1);

View File

@ -230,12 +230,19 @@ extern u8 g_globalXMMSaved;
void _memset16_unaligned( void* dest, u16 data, size_t size );
#if defined(_WIN32) && !defined(__x86_64__)
// faster memcpy
extern void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes);
extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t qwc);
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t n);
// The new simplified memcpy_amd_ is now faster than memcpy_raz_.
// memcpy_amd_ also does mmx register saving, negating the need for freezeregs (code cleanup!)
// Additionally, using one single memcpy implementation keeps the code cache cleaner.
//extern void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes);
//extern void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes);
//extern void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes);
extern void __fastcall memcpy_amd_(void *dest, const void *src, size_t bytes);
# include "windows/memzero.h"
# define memcpy_fast memcpy_amd_
# define memcpy_aligned memcpy_amd_
#else
@ -243,6 +250,10 @@ void _memset16_unaligned( void* dest, u16 data, size_t size );
#define memcpy_fast memcpy
#define memcpy_raz_ memcpy
#define memcpy_raz_u memcpy
#define memcpy_aligned memcpy
#define memcpy_raz_u memcpy
#include "Linux/memzero.h"
#endif
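
With the defines above, both memcpy_fast and memcpy_aligned resolve to memcpy_amd_ on 32-bit Windows and to plain memcpy elsewhere, so the split mostly documents intent at the call site. A small hedged sketch of that intent (functions and buffer names are illustrative, not from this commit; assumes the defines above are in scope):

#include <cstddef>

void SendGSPacket(void* ringbuf, const void* src, size_t bytes)
{
	// Both buffers come from 16-byte aligned allocations, so the aligned
	// flavor records that guarantee (both map to memcpy_amd_ on Win32).
	memcpy_aligned(ringbuf, src, bytes);
}

void CopyIsoData(void* dst, const void* src, size_t bytes)
{
	// Alignment unknown here: use the general-purpose version. No
	// FreezeMMXRegs bracket is needed anymore, since the MMX state is
	// preserved inside memcpy_amd_.
	memcpy_fast(dst, src, bytes);
}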

View File

@ -226,13 +226,11 @@ extern void mfifoGIFtransfer(int);
#define gif ((DMACh*)&PS2MEM_HW[0xA000])
void dmaSPR0() { // fromSPR
int qwc = spr0->qwc;
FreezeMMXRegs(1);
SPR_LOG("dmaSPR0 chcr = %lx, madr = %lx, qwc = %lx, sadr = %lx\n",
spr0->chcr, spr0->madr, spr0->qwc, spr0->sadr);
_dmaSPR0();
FreezeMMXRegs(0);
if ((psHu32(DMAC_CTRL) & 0xC) == 0xC) { // GIF MFIFO
if((spr0->madr & ~psHu32(DMAC_RBSR)) != psHu32(DMAC_RBOR)) SysPrintf("GIF MFIFO Write outside MFIFO area\n");
spr0->madr = psHu32(DMAC_RBOR) + (spr0->madr & psHu32(DMAC_RBSR));
@ -308,7 +306,6 @@ void _SPR1interleave() {
void dmaSPR1() { // toSPR
FreezeMMXRegs(1);
#ifdef SPR_LOG
SPR_LOG("dmaSPR1 chcr = 0x%x, madr = 0x%x, qwc = 0x%x\n"
" tadr = 0x%x, sadr = 0x%x\n",
@ -325,7 +322,6 @@ void dmaSPR1() { // toSPR
// Transfer Dn_QWC from Dn_MADR to SPR1
SPR1chain();
CPU_INT(9, cycles);
FreezeMMXRegs(0);
return;
} else if ((spr1->chcr & 0xc) == 0x4){
int cycles = 0;
@ -338,7 +334,6 @@ void dmaSPR1() { // toSPR
// Transfer Dn_QWC from Dn_MADR to SPR1
SPR1chain();
CPU_INT(9, cycles);
FreezeMMXRegs(0);
return;
}
// Chain Mode
@ -382,7 +377,6 @@ void dmaSPR1() { // toSPR
} else { // Interleave Mode
_SPR1interleave();
}
FreezeMMXRegs(0);
}

View File

@ -574,9 +574,7 @@ static void VIFunpack(u32 *data, vifCode *v, int size, const unsigned int VIFdma
// v4-32
if(vifRegs->mode == 0 && !(vifRegs->code & 0x10000000) && vif->usn == 0){
vifRegs->num -= size>>4;
FreezeMMXRegs(1);
memcpy_fast((u8*)dest, cdata, size);
FreezeMMXRegs(0);
size = 0;
//unpacktotal += GetCPUTick()-basetick;
return;
@ -814,9 +812,7 @@ static __forceinline void _vif0mpgTransfer(u32 addr, u32 *data, int size) {
fclose(f);
}*/
if (memcmp(VU0.Micro + addr, data, size << 2)) {
FreezeMMXRegs(1);
memcpy_fast(VU0.Micro + addr, data, size << 2);
FreezeMMXRegs(0);
CpuVU0->Clear(addr, size);
}
}
@ -1490,9 +1486,7 @@ static __forceinline void _vif1mpgTransfer(u32 addr, u32 *data, int size) {
}*/
assert( VU1.Micro > 0 );
if (memcmp(VU1.Micro + addr, data, size << 2)) {
FreezeMMXRegs(1);
memcpy_fast(VU1.Micro + addr, data, size << 2);
FreezeMMXRegs(0);
CpuVU1->Clear(addr, size);
}
}
@ -1644,7 +1638,7 @@ static int Vif1TransDirectHL(u32 *data){
{
//unaligned copy. VIF handling is -very- messy, so I'll use this code til I fix it :)
const uint count = mtgsThread->PrepDataPacket( GIF_PATH_2, data, ret<<2 );
memcpy_raz_u( mtgsThread->GetDataPacketPtr(), data, count );
memcpy_fast( mtgsThread->GetDataPacketPtr(), data, count );
mtgsThread->SendDataPacket();
}
else {

View File

@ -49,6 +49,9 @@ namespace Threading
}
cpuinfo.LogicalCores = CPUs;
if( LogicalCoresPerPhysicalCPU > CPUs) // for 1-socket HTT-disabled machines
LogicalCoresPerPhysicalCPU = CPUs;
cpuinfo.PhysicalCores = ( CPUs / LogicalCoresPerPhysicalCPU ) * PhysicalCoresPerPhysicalCPU;
ptw32_smp_system = ( cpuinfo.LogicalCores > 1 ) ? TRUE : FALSE;
}
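
A quick sanity check of the core-count arithmetic in this hunk, using hypothetical numbers for a single-socket quad-core whose cpuid data still advertises HT even though it is disabled; without the new clamp the integer division would report zero physical cores.

#include <cstdio>

int main()
{
	// Hypothetical detection results (not from the commit): 4 logical CPUs
	// visible to the OS, but cpuid reports 8 logical / 4 physical per package.
	int CPUs = 4;
	int LogicalCoresPerPhysicalCPU  = 8;
	int PhysicalCoresPerPhysicalCPU = 4;

	if (LogicalCoresPerPhysicalCPU > CPUs)   // the clamp added above
		LogicalCoresPerPhysicalCPU = CPUs;   // 8 -> 4

	// Without the clamp this would be (4 / 8) * 4 == 0 physical cores.
	int PhysicalCores = (CPUs / LogicalCoresPerPhysicalCPU) * PhysicalCoresPerPhysicalCPU;
	std::printf("PhysicalCores = %d\n", PhysicalCores); // prints 4
	return 0;
}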

View File

@ -88,35 +88,27 @@ void checkregs()
#endif
__declspec(align(16)) static u8 _xmm_backup[16*2];
PCSX2_ALIGNED16( static u8 _xmm_backup[16*2] );
PCSX2_ALIGNED16( static u8 _mmx_backup[8*4] );
// this one checks for alignments too ...
__declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size_t bytes)
static __declspec(naked) void __fastcall _memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
{
// If src is aligned, use memcpy_raz instead:
__asm
{
test edx,0xf;
jz memcpy_raz_;
}
// MOVSRC = opcode used to read. I use the same code for the aligned version, with a different define :)
#define MOVSRC movups
#define MOVSRC movdqu
#define MOVDST movdqa
__asm
{
//Reads before writes, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup+0x00],xmm0;
movaps [_xmm_backup],xmm0;
//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;
//unrolled version also touches xmm1, save it :)
movaps [_xmm_backup+0x10],xmm1;
//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16
@ -124,34 +116,111 @@ __declspec(naked) void __fastcall memcpy_raz_u(void *dest, const void *src, size
_loop_8:
MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVSRC xmm1,[edx+0x10];
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
movaps [ecx+0x00],xmm0; //then write :p
movaps [ecx+0x10],xmm1;
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x20-128];
MOVSRC xmm1,[edx+0x30-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
movaps [ecx+0x20-128],xmm0;
movaps [ecx+0x30-128],xmm1;
MOVSRC xmm0,[edx+0x40-128];
MOVSRC xmm1,[edx+0x50-128];
movaps [ecx+0x40-128],xmm0;
movaps [ecx+0x50-128],xmm1;
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;
MOVSRC xmm0,[edx+0x60-128];
MOVSRC xmm1,[edx+0x70-128];
movaps [ecx+0x60-128],xmm0;
movaps [ecx+0x70-128],xmm1;
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;
//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;
//restore xmm1 :)
movaps xmm1,[_xmm_backup+0x10];
//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers
//i use negative relative addressing from the top of the buffers
//[top-current index]
_loop_1:
//prepare the regs for 'negative relative addressing'
add edx,eax;
add ecx,eax;
neg eax;
jz cleanup; //exit if nothing to do
_loop_1_inner:
MOVSRC xmm0,[edx+eax];
MOVDST [ecx+eax],xmm0;
add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;
//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}
static __declspec(naked) void __fastcall _memcpy_raz_udst(void *dest, const void *src, size_t bytes)
{
// MOVDST = opcode used to write. I use the same code for the aligned version, with a different define :)
#define MOVSRC movaps
#define MOVDST movups
__asm
{
//Reads before writes, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup],xmm0;
//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;
//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16
//128 byte unrolled loop
_loop_8:
MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x20-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x40-128];
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;
MOVSRC xmm0,[edx+0x60-128];
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;
//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;
//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers
@ -168,22 +237,24 @@ _loop_1:
_loop_1_inner:
MOVSRC xmm0,[edx+eax];
movaps [ecx+eax],xmm0;
add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;
//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup+0x00];
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}
// Custom memcpy, only for 16 byte aligned stuff (used for mtgs)
// This function is optimized for medium-small transfer sizes (<2048, >=128). No prefetching is
// used since the reads are linear and the cache logic can predict em :)
// *OBSOLETE* -- memcpy_amd_ has been optimized and is now faster.
__declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_t bytes)
{
// Code Implementation Notes:
@ -191,21 +262,19 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
// MOVSRC = opcode used to read. I use the same code for the unaligned version, with a different define :)
#define MOVSRC movaps
#define MOVDST movaps
__asm
{
//Reads before writes, to avoid stalls
mov eax,[esp+4];
//Make sure to save xmm0, it must be preserved ...
movaps [_xmm_backup+0x00],xmm0;
movaps [_xmm_backup],xmm0;
//if >=128 bytes use 128 byte unrolled loop
//i use cmp ..,127 + jna because 127 is encodable using the simm8 form
cmp eax,127;
jna _loop_1;
//unrolled version also touches xmm1, save it :)
movaps [_xmm_backup+0x10],xmm1;
//since this is a common branch target it could be good to align it -- no idea if it has any effect :p
align 16
@ -213,35 +282,32 @@ __declspec(naked) void __fastcall memcpy_raz_(void *dest, const void *src, size_
_loop_8:
MOVSRC xmm0,[edx+0x00]; //read first to avoid read-after-write stalls
MOVSRC xmm1,[edx+0x10];
MOVDST [ecx+0x00],xmm0; //then write :p
MOVSRC xmm0,[edx+0x10];
MOVDST [ecx+0x10],xmm0;
sub edx,-128; //edx won't be used for a while, so update it here. sub/-128 for simm8 encoding
movaps [ecx+0x00],xmm0; //then write :p
movaps [ecx+0x10],xmm1;
sub ecx,-128; //ecx won't be used for a while, so update it here. sub/-128 for simm8 encoding
MOVSRC xmm0,[edx+0x20-128];
MOVSRC xmm1,[edx+0x30-128];
MOVDST [ecx+0x20-128],xmm0;
MOVSRC xmm0,[edx+0x30-128];
MOVDST [ecx+0x30-128],xmm0;
add eax,-128; //eax won't be used for a while, so update it here. add/-128 for simm8 encoding
movaps [ecx+0x20-128],xmm0;
movaps [ecx+0x30-128],xmm1;
MOVSRC xmm0,[edx+0x40-128];
MOVSRC xmm1,[edx+0x50-128];
movaps [ecx+0x40-128],xmm0;
movaps [ecx+0x50-128],xmm1;
MOVDST [ecx+0x40-128],xmm0;
MOVSRC xmm0,[edx+0x50-128];
MOVDST [ecx+0x50-128],xmm0;
MOVSRC xmm0,[edx+0x60-128];
MOVSRC xmm1,[edx+0x70-128];
movaps [ecx+0x60-128],xmm0;
movaps [ecx+0x70-128],xmm1;
MOVDST [ecx+0x60-128],xmm0;
MOVSRC xmm0,[edx+0x70-128];
MOVDST [ecx+0x70-128],xmm0;
//127~ja, 127 is encodable as simm8 :)
cmp eax,127;
ja _loop_8;
//restore xmm1 :)
movaps xmm1,[_xmm_backup+0x10];
//direct copy for 0~7 qwords
//in order to avoid the inc/dec of all 3 registers
//i use negative relative addressing from the top of the buffers
@ -256,7 +322,7 @@ _loop_1:
_loop_1_inner:
MOVSRC xmm0,[edx+eax];
movaps [ecx+eax],xmm0;
MOVDST [ecx+eax],xmm0;
add eax,16; //while the offset is still negative we have data to copy
js _loop_1_inner;
@ -264,44 +330,64 @@ _loop_1_inner:
//done !
cleanup:
//restore xmm and exit ~)
movaps xmm0,[_xmm_backup+0x00];
movaps xmm0,[_xmm_backup];
ret 4;
}
#undef MOVSRC
#undef MOVDST
}
#undef MOVSRC
// This memcpy routine is for use in situations where the source buffer's alignment is indeterminate.
__forceinline void __fastcall memcpy_raz_usrc(void *dest, const void *src, size_t bytes)
{
if( ((uptr)src & 0xf) == 0 )
memcpy_raz_( dest, src, bytes );
else
_memcpy_raz_usrc( dest, src, bytes );
}
// This memcpy routine is for use in situations where the destination buffer's alignment is indeterminate.
__forceinline void __fastcall memcpy_raz_udst(void *dest, const void *src, size_t bytes)
{
if( ((uptr)dest & 0xf) == 0 )
memcpy_raz_( dest, src, bytes );
else
_memcpy_raz_udst( dest, src, bytes );
}
//////////////////////////////////////////////////////////////////////////
// Fast memcpy as coded by AMD.
// This function clobbers all MMX registers, and is generally not optimal for short memory
// copies due to the amount of overhead required to test for alignments, copy length,
// and other ABI overhead.
void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
// Fast memcpy as coded by AMD, and then improved by air.
//
// This routine preserves mmx registers! It's the complete real deal!
__declspec(naked) void __fastcall memcpy_amd_(void *dest, const void *src, size_t n)
{
__asm {
__asm
{
push edi
push esi
mov edi, ecx ; destination
mov esi, edx ; source
mov ecx, [n] ; number of bytes to copy
mov ebx, ecx ; keep a copy of count
mov ecx, [esp+12] ; number of bytes to copy
mov eax, ecx ; keep a copy of count
cld
cmp ecx, TINY_BLOCK_COPY
cmp eax, TINY_BLOCK_COPY
jb $memcpy_ic_3 ; tiny? skip mmx copy
cmp ecx, 32*1024 ; don't align between 32k-64k because
cmp eax, 32*1024 ; don't align between 32k-64k because
jbe $memcpy_do_align ; it appears to be slower
cmp ecx, 64*1024
cmp eax, 64*1024
jbe $memcpy_align_done
$memcpy_do_align:
mov ecx, 8 ; a trick that's faster than rep movsb...
sub ecx, edi ; align destination to qword
and ecx, 111b ; get the low bits
sub ebx, ecx ; update copy count
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_align_done
jmp ecx ; jump to array of movsb's
mov eax, 8 ; a trick that's faster than rep movsb...
sub eax, edi ; align destination to qword
and eax, 111b ; get the low bits
sub ecx, eax ; update copy count
neg eax ; set up to jump into the array
add eax, offset $memcpy_align_done
jmp eax ; jump to array of movsb's
align 4
movsb
@ -314,13 +400,18 @@ align 4
movsb
$memcpy_align_done: ; destination is dword aligned
mov ecx, ebx ; number of bytes left to copy
shr ecx, 6 ; get 64-byte block count
mov eax, ecx ; number of bytes left to copy
shr eax, 6 ; get 64-byte block count
jz $memcpy_ic_2 ; finish the last few bytes
cmp ecx, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
cmp eax, IN_CACHE_COPY/64 ; too big 4 cache? use uncached copy
jae $memcpy_uc_test
movq [_mmx_backup+0x00],mm0
movq [_mmx_backup+0x08],mm1
movq [_mmx_backup+0x10],mm2
movq [_mmx_backup+0x18],mm3
// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
@ -348,30 +439,39 @@ $memcpy_ic_1: ; 64-byte block copies, in-cache copy
add esi, 64 ; update source pointer
add edi, 64 ; update destination pointer
dec ecx ; count down
dec eax ; count down
jnz $memcpy_ic_1 ; last 64-byte block?
movq mm0,[_mmx_backup+0x00]
movq mm1,[_mmx_backup+0x08]
movq mm2,[_mmx_backup+0x10]
movq mm3,[_mmx_backup+0x18]
$memcpy_ic_2:
mov ecx, ebx ; has valid low 6 bits of the byte count
mov eax, ecx ; has valid low 6 bits of the byte count
$memcpy_ic_3:
shr ecx, 2 ; dword count
and ecx, 1111b ; only look at the "remainder" bits
neg ecx ; set up to jump into the array
add ecx, offset $memcpy_last_few
jmp ecx ; jump to array of movsd's
shr eax, 2 ; dword count
and eax, 1111b ; only look at the "remainder" bits
neg eax ; set up to jump into the array
add eax, offset $memcpy_last_few
jmp eax ; jump to array of movsd's
$memcpy_uc_test:
cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
/*cmp ecx, UNCACHED_COPY/64 ; big enough? use block prefetch copy
jae $memcpy_bp_1
$memcpy_64_test:
or ecx, ecx ; tail end of block prefetch will jump here
$memcpy_64_test:*/
or eax, eax ; tail end of block prefetch will jump here
jz $memcpy_ic_2 ; no more 64-byte blocks left
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
movq [_mmx_backup+0x00],mm0
movq [_mmx_backup+0x08],mm1
movq [_mmx_backup+0x10],mm2
align 16
$memcpy_uc_1: ; 64-byte blocks, uncached copy
@ -394,17 +494,25 @@ $memcpy_uc_1: ; 64-byte blocks, uncached copy
movq mm1,[esi-8]
movntq [edi-24], mm2
movntq [edi-16], mm0
dec ecx
dec eax
movntq [edi-8], mm1
jnz $memcpy_uc_1 ; last 64-byte block?
jmp $memcpy_ic_2 ; almost done
movq mm0,[_mmx_backup+0x00]
movq mm1,[_mmx_backup+0x08]
movq mm2,[_mmx_backup+0x10]
jmp $memcpy_ic_2 ; almost done (not needed because large copy below was removed)
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
// Note: Pcsx2 rarely invokes large copies, so this mode has been disabled to
// help keep the code cache footprint of memcpy_fast to a minimum.
/*
$memcpy_bp_1: ; large blocks, block prefetch copy
cmp ecx, CACHEBLOCK ; big enough to run another prefetch loop?
@ -447,6 +555,7 @@ $memcpy_bp_3:
jnz $memcpy_bp_3 ; keep copying
sub ecx, CACHEBLOCK ; update the 64-byte block count
jmp $memcpy_bp_1 ; keep processing chunks
*/
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
@ -469,8 +578,8 @@ align 4
movsd
$memcpy_last_few: ; dword aligned from before movsd's
mov ecx, ebx ; has valid low 2 bits of the byte count
and ecx, 11b ; the last few cows must come home
mov eax, ecx ; has valid low 2 bits of the byte count
and eax, 11b ; the last few cows must come home
jz $memcpy_final ; no more, let's leave
rep movsb ; the last 1, 2, or 3 bytes
@ -479,10 +588,14 @@ $memcpy_final:
sfence ; flush the write buffer
//mov eax, [dest] ; ret value = destination pointer
pop esi
pop edi
ret 4
}
}
// mmx memcpy implementation, size has to be a multiple of 8
// mmx mem-compare implementation, size has to be a multiple of 8
// returns 0 if equal, nonzero value if not equal
// ~10 times faster than standard memcmp
// (zerofrog)
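
The uncached path above leans on MOVNTQ streaming stores plus a trailing sfence. Purely as an illustration of that technique (not the routine in this commit), here is a standalone SSE2 sketch; it assumes 16-byte-aligned buffers and a byte count that is a multiple of 64.

#include <emmintrin.h>
#include <cstddef>

void stream_copy(void* dest, const void* src, size_t bytes)
{
	__m128i*       d = static_cast<__m128i*>(dest);
	const __m128i* s = static_cast<const __m128i*>(src);

	for (size_t i = 0; i < bytes / 16; i += 4)
	{
		// Read a whole 64-byte block first...
		__m128i a = _mm_load_si128(s + i + 0);
		__m128i b = _mm_load_si128(s + i + 1);
		__m128i c = _mm_load_si128(s + i + 2);
		__m128i e = _mm_load_si128(s + i + 3);
		// ...then write it with non-temporal stores that bypass the cache.
		_mm_stream_si128(d + i + 0, a);
		_mm_stream_si128(d + i + 1, b);
		_mm_stream_si128(d + i + 2, c);
		_mm_stream_si128(d + i + 3, e);
	}
	_mm_sfence(); // flush the write-combining buffers, as memcpy_amd_ does before returning
}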

View File

@ -1977,12 +1977,7 @@ void VU1XGKICK_MTGSTransfer(u32 *pMem, u32 addr)
//if( size > 0 )
{
u8* pmem = mtgsThread->GetDataPacketPtr();
//FreezeMMXRegs(1);
//memcpy_fast(pmem, (u8*)pMem+addr, size);
//FreezeMMXRegs(0);
// we can use the faster memcpy_raz_ here (src/dest are guaranteed to be aligned)
memcpy_raz_(pmem, (u8*)pMem+addr, size);
memcpy_aligned(pmem, (u8*)pMem+addr, size);
mtgsThread->SendDataPacket();
}
}

View File

@ -880,9 +880,7 @@ static VuFunctionHeader* SuperVURecompileProgram(u32 startpc, int vuindex)
#ifdef SUPERVU_CACHING
//memxor_mmx(r.checksum, &VU->Micro[r.start], r.size);
r.pmem = malloc(r.size);
FreezeMMXRegs(1);
memcpy_fast(r.pmem, &VU->Micro[r.start], r.size);
FreezeMMXRegs(0);
#endif
s_pFnHeader->ranges.push_back(r);
}