Attempted generated sse memcpy using movaps.

Seems slower than memcpy_amd_ so didn't enable it.

I tried two different versions: one generates 0x400 different functions, and the other generates a single function and jumps to the correct entry point.
The latter seems faster, but it is still slower than memcpy_amd_...
(only tested the title-screen of GoW though...)

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3465 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
cottonvibes 2010-07-11 15:40:12 +00:00
parent f0f0cef2d5
commit ee5192abb5
5 changed files with 127 additions and 13 deletions

View File

@ -227,6 +227,10 @@
RelativePath="..\..\src\Utilities\x86\MemcpyFast.cpp"
>
</File>
<File
RelativePath="..\..\src\Utilities\x86\MemcpyVibes.cpp"
>
</File>
<File
RelativePath="..\..\src\Utilities\PathUtils.cpp"
>

View File

@ -36,7 +36,13 @@
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
void _memset16_unaligned( void* dest, u16 data, size_t size );
#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
// MemcpyVibes.cpp functions
// memcpy_vibes dispatches through a table of run-time generated SSE copy routines that is
// indexed by 'size' (the table holds 0x400+1 entries, one per supported quadword count).
// gen_memcpy_vibes emits those routines and fills the table, so it has to run before the
// first memcpy_vibes call.
extern void __fastcall memcpy_vibes(void * dest, void * src, int size);
extern void gen_memcpy_vibes();
#define memcpy_fast memcpy_amd_ // Fast memcpy
#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses
#define memcpy_const memcpy_amd_ // Memcpy with constant size
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less
// Memcpy in aligned qwc increments.
// 'z' is a quadword count and is scaled to bytes (x16). Every argument is parenthesized so
// that an expression such as memcpy_qwc(d, s, a + b) scales the whole sum, not just 'b'.
#define memcpy_qwc(x,y,z) memcpy_amd_((x), (y), (z)*16)

View File

@ -0,0 +1,104 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
*
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
* PURPOSE. See the GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along with PCSX2.
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "PrecompiledHeader.h"
#include "x86emitter/x86emitter.h"
using namespace x86Emitter;
// Max Number of qwc supported
// (also fixes the size of the dispatch table below: valid indices are 0.._maxSize inclusive)
#define _maxSize 0x400
// Signature of a generated copy routine. memcpy_vibes calls it as (dest, src).
typedef void (__fastcall *_memCpyCall)(void*, void*);
// Dispatch table: entry [n] is the routine (or entry point) used for a copy of n qwc.
// It is filled in by gen_memcpy_vibes() and read by memcpy_vibes().
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];
#if 1
// This version creates one function with a lot of movaps
// It jumps to the correct movaps entry-point while adding
// the proper offset for adjustment...
// Backing storage for the generated copy routine.
static __pagealigned u8 _memCpyExec[__pagesize*16];

// Emits a single straight-line routine made of _maxSize MOVAPS load/store pairs and fills
// _memcpy_vibes[] with entry points into it: entry [i] is the address from which exactly
// 'i' pairs remain before the final RET, and entry [0] points at the RET itself.
//
// The routine loads through edx and stores through ecx (memcpy_vibes passes dest first and
// src second under __fastcall). Displacements are kept in the range -128..112; each time
// the displacement would reach 128 both registers are advanced by 256 and the displacement
// restarts at -128. memcpy_vibes pre-biases both pointers by ((size & 0xf) - 7) << 4 so
// that the displacement found at the chosen entry point lands on the start of the buffers.
void gen_memcpy_vibes() {
	// NOTE(review): the trailing bool is presumably the "executable" flag - confirm against HostSys.
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
	// Pre-fill with 0xCC (the x86 INT3 opcode) so a stray jump into unused space traps.
	memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
	xSetPtr(_memCpyExec);

	// Displacement of the very first pair; it is the negation of the bias that
	// memcpy_vibes applies for size == _maxSize.
	int off =-(((_maxSize & 0xf) - 7) << 4);
	for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {
		if (off >= 128) {
			// Re-base both pointers so the displacement stays within -128..112.
			off = -128;
			xADD(edx, 256);
			xADD(ecx, 256);
		}

		// The entry point is recorded *after* the re-base above. A caller that enters here
		// has already biased its pointers for displacement 'off'; if the entry point also
		// covered the two ADDs, every size with (size & 0xf) == 15 would start copying 256
		// bytes past the intended addresses. Execution falling through from the previous
		// pair still runs the ADDs, which is what that path needs.
		_memcpy_vibes[i] = (_memCpyCall)xGetPtr();

		// Rotate through xmm0..xmm7 so consecutive pairs use different registers.
		const xRegisterSSE xmm_t(x);
		xMOVAPS(xmm_t, ptr32[edx+off]);
		xMOVAPS(ptr32[ecx+off], xmm_t);
	}

	// A zero-length copy jumps straight to the RET.
	_memcpy_vibes[0] = (_memCpyCall)xGetPtr();
	xRET();

	// The emitted code must fit inside the static buffer.
	pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));

	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}
// Copies through the generated routine by jumping to the table entry for 'size'.
// 'size' is used directly as an index into _memcpy_vibes[], so it must lie within
// 0.._maxSize; no range check is performed here.
// Both pointers are shifted by the same bias, derived from the low four bits of 'size',
// before the call; the generated code addresses memory relative to the shifted pointers.
void __fastcall memcpy_vibes(void * dest, void * src, int size) {
	const int bias = ((size & 0xf) - 7) << 4;
	void* const biasedDest = (void*)((uptr)dest + bias);
	void* const biasedSrc  = (void*)((uptr)src + bias);
	const _memCpyCall entry = _memcpy_vibes[size];
	entry(biasedDest, biasedSrc);
}
#else
// This version creates '_maxSize' number of different functions,
// and calls the appropriate one...
// Backing storage for all of the generated copy routines.
static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];

// Emits _maxSize+1 independent routines, one per quadword count, and stores the address
// of routine 'n' in _memcpy_vibes[n]. Routine 'n' consists of 'n' MOVAPS load/store pairs
// (load via edx, store via ecx) followed by a RET.
void gen_memcpy_vibes() {
	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
	memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
	xSetPtr(_memCpyExec);

	for (int qwc = 0; qwc <= _maxSize; ++qwc)
	{
		// NOTE(review): presumably aligns the emit pointer before returning it - confirm in x86emitter.
		_memcpy_vibes[qwc] = (_memCpyCall)xGetAlignedCallTarget();

		// Every routine starts at displacement 0 from the caller's pointers.
		int disp = 0;
		for (int n = 0; n < qwc; ++n)
		{
			if (disp >= 128) {
				// Advance both pointers by 256 and restart the displacement at -128.
				disp = -128;
				xADD(edx, 256);
				xADD(ecx, 256);
			}

			// Cycle through xmm0..xmm7.
			const xRegisterSSE xmm_t(n & 7);
			xMOVAPS(xmm_t, ptr32[edx+disp]);
			xMOVAPS(ptr32[ecx+disp], xmm_t);

			disp += 16;
		}
		xRET();

		// Checked after every routine: the emitted code must stay inside the buffer.
		pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
	}

	HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
}
// Looks up the generated routine for 'size' and calls it with the unmodified pointers.
// 'size' indexes _memcpy_vibes[] directly and is not range-checked here.
void __fastcall memcpy_vibes(void * dest, void * src, int size) {
	const _memCpyCall copier = _memcpy_vibes[size];
	copier(dest, src);
}
#endif

View File

@ -106,6 +106,7 @@ _f void mVUinit(VURegs* vuRegsPtr, int vuIndex) {
// Allocates rec-cache and calls mVUreset()
mVUresizeCache(mVU, mVU->cacheSize + mVUcacheSafeZone);
//if (vuIndex) gen_memcpy_vibes();
}
// Resets Rec Data

View File

@ -1109,17 +1109,16 @@ void __fastcall mVU_XGKICK_(u32 addr) {
}
size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff);
pDest = GetMTGS().GetDataPacketPtr();
if (size > diff) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
//DevCon.WriteLn("XGkick Wrap!");
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
size -= diff;
pDest += diff*16;
memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
memcpy_qwc(pDest, microVU1.regs->Mem, size);
}
else {
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
}
GetMTGS().SendDataPacket();
if(GSTransferStatus.PTH1 == STOPPED_MODE)
@ -1141,14 +1140,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
// fixme: one of these days the following *16's will get cleaned up when we introduce
// a special qwc/simd16 optimized version of memcpy_aligned. :)
//DevCon.Status("XGkick Wrap!");
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
Path1WritePos += size;
size -= diff;
pDest += diff*16;
memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
memcpy_qwc(pDest, microVU1.regs->Mem, size);
}
else {
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
Path1WritePos += size;
}
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);