mirror of https://github.com/PCSX2/pcsx2.git
Attempted generated sse memcpy using movaps.
Seems slower than memcpy_amd_ so didn't enable it. I tried two different versions, one generates 0x400 different functions, and the other generates 1 function and jumps to the correct entry point. The latter seems faster, but still slower than memcpy_amd_... (only tested the title-screen of GoW though...) git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3465 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
f0f0cef2d5
commit
ee5192abb5
|
@ -227,6 +227,10 @@
|
|||
RelativePath="..\..\src\Utilities\x86\MemcpyFast.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\..\src\Utilities\x86\MemcpyVibes.cpp"
|
||||
>
|
||||
</File>
|
||||
<File
|
||||
RelativePath="..\..\src\Utilities\PathUtils.cpp"
|
||||
>
|
||||
|
|
|
@ -36,7 +36,13 @@
|
|||
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
|
||||
void _memset16_unaligned( void* dest, u16 data, size_t size );
|
||||
|
||||
#define memcpy_fast memcpy_amd_ // Fast memcpy
|
||||
#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses
|
||||
#define memcpy_const memcpy_amd_ // Memcpy with constant size
|
||||
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
|
||||
// MemcpyVibes.cpp functions
|
||||
extern void __fastcall memcpy_vibes(void * dest, void * src, int size);
|
||||
extern void gen_memcpy_vibes();
|
||||
|
||||
#define memcpy_fast memcpy_amd_ // Fast memcpy
|
||||
#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses
|
||||
#define memcpy_const memcpy_amd_ // Memcpy with constant size
|
||||
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
|
||||
#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less
|
||||
// Memcpy in aligned qwc increments: 'z' is a quadword count, scaled here to bytes (16 per qwc).
// Every argument is parenthesized so that an expression such as `n + 1` is scaled as a whole.
#define memcpy_qwc(x,y,z) memcpy_amd_((x), (y), (z)*16)
|
||||
|
|
|
@ -0,0 +1,104 @@
|
|||
/* PCSX2 - PS2 Emulator for PCs
|
||||
* Copyright (C) 2002-2010 PCSX2 Dev Team
|
||||
*
|
||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||
* ation, either version 3 of the License, or (at your option) any later version.
|
||||
*
|
||||
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||
* PURPOSE. See the GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
#include "PrecompiledHeader.h"
|
||||
#include "x86emitter/x86emitter.h"
|
||||
using namespace x86Emitter;
|
||||
|
||||
// Maximum number of quadwords (qwc) a single generated copy supports; also sizes the entry table below.
|
||||
#define _maxSize 0x400
|
||||
|
||||
typedef void (__fastcall *_memCpyCall)(void*, void*); // generated copier; invoked as (dest, src)
|
||||
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1]; // entry points indexed by qwc, 0.._maxSize inclusive; filled by gen_memcpy_vibes()
|
||||
|
||||
#if 1
|
||||
// This version emits one long function made of consecutive movaps load/store pairs.
|
||||
// memcpy_vibes() jumps to the entry point that leaves the wanted number of pairs,
|
||||
// after biasing both pointers by an offset that compensates for that entry's displacement...
|
||||
|
||||
static __pagealigned u8 _memCpyExec[__pagesize*16]; // backing store for the generated code
|
||||
|
||||
void gen_memcpy_vibes() {
|
||||
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
|
||||
memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
|
||||
xSetPtr(_memCpyExec);
|
||||
|
||||
int off =-(((_maxSize & 0xf) - 7) << 4);
|
||||
for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {
|
||||
|
||||
_memcpy_vibes[i] = (_memCpyCall)xGetPtr();
|
||||
|
||||
if (off >= 128) {
|
||||
off = -128;
|
||||
xADD(edx, 256);
|
||||
xADD(ecx, 256);
|
||||
}
|
||||
const xRegisterSSE xmm_t(x);
|
||||
xMOVAPS(xmm_t, ptr32[edx+off]);
|
||||
xMOVAPS(ptr32[ecx+off], xmm_t);
|
||||
}
|
||||
|
||||
_memcpy_vibes[0] = (_memCpyCall)xGetPtr();
|
||||
|
||||
xRET();
|
||||
pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
|
||||
|
||||
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
|
||||
}
|
||||
|
||||
// Copies 'size' quadwords (16 bytes each) from src to dest through the generated movaps run.
//
//  dest, src - buffers to copy between; the generated code uses movaps, which requires
//              16-byte aligned addresses.
//  size      - number of quadwords to copy; must be within 0.._maxSize.
//
// Both pointers are biased before the call so that the displacement baked into the chosen
// entry point (see gen_memcpy_vibes) lands on the start of the caller's buffers.
void __fastcall memcpy_vibes(void * dest, void * src, int size) {
	// The entry table only covers 0.._maxSize; any other index would call through a bogus pointer.
	pxAssert((unsigned int)size <= (unsigned int)_maxSize);

	// Multiply rather than '<< 4': the operand is negative whenever (size & 0xf) < 7, and
	// left-shifting a negative value is not well defined.
	int offset = ((size & 0xf) - 7) * 16;
	_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
}
|
||||
|
||||
#else
|
||||
// This version emits a separate function for every quadword count in 0.._maxSize,
|
||||
// and memcpy_vibes() calls the one matching the requested size...
|
||||
|
||||
static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2]; // backing store for all generated functions
|
||||
|
||||
void gen_memcpy_vibes() {
|
||||
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
|
||||
memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
|
||||
xSetPtr(_memCpyExec);
|
||||
|
||||
for (int i = 0; i < _maxSize+1; i++)
|
||||
{
|
||||
int off = 0;
|
||||
_memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();
|
||||
|
||||
for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
|
||||
if (off >= 128) {
|
||||
off = -128;
|
||||
xADD(edx, 256);
|
||||
xADD(ecx, 256);
|
||||
}
|
||||
const xRegisterSSE xmm_t(x);
|
||||
xMOVAPS(xmm_t, ptr32[edx+off]);
|
||||
xMOVAPS(ptr32[ecx+off], xmm_t);
|
||||
}
|
||||
|
||||
xRET();
|
||||
pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
|
||||
}
|
||||
|
||||
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
|
||||
}
|
||||
|
||||
// Copies 'size' quadwords from src to dest by calling the routine generated for exactly
// that count. 'size' indexes _memcpy_vibes[] directly and is not range-checked here.
void __fastcall memcpy_vibes(void * dest, void * src, int size) {
	const _memCpyCall copier = _memcpy_vibes[size];
	copier(dest, src);
}
|
||||
|
||||
#endif
|
|
@ -106,6 +106,7 @@ _f void mVUinit(VURegs* vuRegsPtr, int vuIndex) {
|
|||
|
||||
// Allocates rec-cache and calls mVUreset()
|
||||
mVUresizeCache(mVU, mVU->cacheSize + mVUcacheSafeZone);
|
||||
//if (vuIndex) gen_memcpy_vibes();
|
||||
}
|
||||
|
||||
// Resets Rec Data
|
||||
|
|
|
@ -1109,17 +1109,16 @@ void __fastcall mVU_XGKICK_(u32 addr) {
|
|||
}
|
||||
size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff);
|
||||
pDest = GetMTGS().GetDataPacketPtr();
|
||||
|
||||
if (size > diff) {
|
||||
// fixme: one of these days the following *16's will get cleaned up when we introduce
|
||||
// a special qwc/simd16 optimized version of memcpy_aligned. :)
|
||||
//DevCon.Status("XGkick Wrap!");
|
||||
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
|
||||
//DevCon.WriteLn("XGkick Wrap!");
|
||||
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
|
||||
size -= diff;
|
||||
pDest += diff*16;
|
||||
memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
|
||||
memcpy_qwc(pDest, microVU1.regs->Mem, size);
|
||||
}
|
||||
else {
|
||||
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
|
||||
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
|
||||
}
|
||||
GetMTGS().SendDataPacket();
|
||||
if(GSTransferStatus.PTH1 == STOPPED_MODE)
|
||||
|
@ -1141,14 +1140,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
|
|||
// fixme: one of these days the following *16's will get cleaned up when we introduce
|
||||
// a special qwc/simd16 optimized version of memcpy_aligned. :)
|
||||
//DevCon.Status("XGkick Wrap!");
|
||||
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
|
||||
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
|
||||
Path1WritePos += size;
|
||||
size -= diff;
|
||||
pDest += diff*16;
|
||||
memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
|
||||
memcpy_qwc(pDest, microVU1.regs->Mem, size);
|
||||
}
|
||||
else {
|
||||
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
|
||||
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
|
||||
Path1WritePos += size;
|
||||
}
|
||||
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);
|
||||
|
|
Loading…
Reference in New Issue