mirror of https://github.com/PCSX2/pcsx2.git
Attempted generated sse memcpy using movaps.
Seems slower than memcpy_amd_, so I didn't enable it. I tried two different versions: one generates 0x400 different functions, and the other generates 1 function and jumps to the correct entry point. The latter seems faster, but is still slower than memcpy_amd_... (only tested on the title screen of GoW, though...) git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3465 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
f0f0cef2d5
commit
ee5192abb5
|
@ -227,6 +227,10 @@
|
||||||
RelativePath="..\..\src\Utilities\x86\MemcpyFast.cpp"
|
RelativePath="..\..\src\Utilities\x86\MemcpyFast.cpp"
|
||||||
>
|
>
|
||||||
</File>
|
</File>
|
||||||
|
<File
|
||||||
|
RelativePath="..\..\src\Utilities\x86\MemcpyVibes.cpp"
|
||||||
|
>
|
||||||
|
</File>
|
||||||
<File
|
<File
|
||||||
RelativePath="..\..\src\Utilities\PathUtils.cpp"
|
RelativePath="..\..\src\Utilities\PathUtils.cpp"
|
||||||
>
|
>
|
||||||
|
|
|
@ -36,7 +36,13 @@
|
||||||
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
|
// Only used in the Windows version of memzero.h. But it's in Misc.cpp for some reason.
|
||||||
void _memset16_unaligned( void* dest, u16 data, size_t size );
|
void _memset16_unaligned( void* dest, u16 data, size_t size );
|
||||||
|
|
||||||
#define memcpy_fast memcpy_amd_ // Fast memcpy
|
// MemcpyVibes.cpp functions
|
||||||
#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses
|
extern void __fastcall memcpy_vibes(void * dest, void * src, int size);
|
||||||
#define memcpy_const memcpy_amd_ // Memcpy with constant size
|
extern void gen_memcpy_vibes();
|
||||||
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
|
|
||||||
|
#define memcpy_fast memcpy_amd_ // Fast memcpy
|
||||||
|
#define memcpy_aligned memcpy_amd_ // Memcpy with 16-byte Aligned addresses
|
||||||
|
#define memcpy_const memcpy_amd_ // Memcpy with constant size
|
||||||
|
#define memcpy_constA memcpy_amd_ // Memcpy with constant size and 16-byte aligned
|
||||||
|
#define memcpy_qwc_ memcpy_vibes // Memcpy in aligned qwc increments, with 0x400 qwc or less
|
||||||
|
#define memcpy_qwc(x,y,z) memcpy_amd_(x, y, z*16) // Memcpy in aligned qwc increments
|
||||||
|
|
|
@ -0,0 +1,104 @@
|
||||||
|
/* PCSX2 - PS2 Emulator for PCs
|
||||||
|
* Copyright (C) 2002-2010 PCSX2 Dev Team
|
||||||
|
*
|
||||||
|
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||||
|
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||||
|
* ation, either version 3 of the License, or (at your option) any later version.
|
||||||
|
*
|
||||||
|
* PCSX2 is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
|
||||||
|
* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
|
||||||
|
* PURPOSE. See the GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License along with PCSX2.
|
||||||
|
* If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "PrecompiledHeader.h"
|
||||||
|
#include "x86emitter/x86emitter.h"
|
||||||
|
using namespace x86Emitter;
|
||||||
|
|
||||||
|
// Maximum number of quadwords (qwc, 16 bytes each) supported by a single copy
|
||||||
|
#define _maxSize 0x400
|
||||||
|
|
||||||
|
typedef void (__fastcall *_memCpyCall)(void*, void*);
|
||||||
|
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];
|
||||||
|
|
||||||
|
#if 1
|
||||||
|
// This version creates one function with a lot of movaps
|
||||||
|
// It jumps to the correct movaps entry-point while adding
|
||||||
|
// the proper offset for adjustment...
|
||||||
|
|
||||||
|
static __pagealigned u8 _memCpyExec[__pagesize*16];
|
||||||
|
|
||||||
|
void gen_memcpy_vibes() {
|
||||||
|
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
|
||||||
|
memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
|
||||||
|
xSetPtr(_memCpyExec);
|
||||||
|
|
||||||
|
int off =-(((_maxSize & 0xf) - 7) << 4);
|
||||||
|
for (int i = _maxSize, x = 0; i > 0; i--, x=(x+1)&7, off+=16) {
|
||||||
|
|
||||||
|
_memcpy_vibes[i] = (_memCpyCall)xGetPtr();
|
||||||
|
|
||||||
|
if (off >= 128) {
|
||||||
|
off = -128;
|
||||||
|
xADD(edx, 256);
|
||||||
|
xADD(ecx, 256);
|
||||||
|
}
|
||||||
|
const xRegisterSSE xmm_t(x);
|
||||||
|
xMOVAPS(xmm_t, ptr32[edx+off]);
|
||||||
|
xMOVAPS(ptr32[ecx+off], xmm_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
_memcpy_vibes[0] = (_memCpyCall)xGetPtr();
|
||||||
|
|
||||||
|
xRET();
|
||||||
|
pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
|
||||||
|
|
||||||
|
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies 'size' quadwords (16 bytes each) from 'src' to 'dest' by jumping into the
// routine produced by gen_memcpy_vibes().
//
//   dest, src - buffers accessed with MOVAPS, so both must be 16-byte aligned.
//   size      - number of quadwords to copy; must be within [0, _maxSize].
//
// The generated pairs address memory as [reg + disp], where 'disp' depends on the
// entry point.  Both pointers are therefore biased by -disp before the call so the
// first pair touches dest[0] / src[0].
//
// The bias is ((size & 0xf) - 7) * 16 for every residue except 0xf: that entry
// point sits in front of the 'add edx/ecx, 256' wrap-around instructions, so its
// bias has to be -128 rather than +128 (the plain formula would make the copy
// start 256 bytes past both buffers).  (((size + 1) & 0xf) - 8) yields -7..7 for
// residues 0..14 and -8 for residue 15, which covers both cases.  This mapping
// assumes (_maxSize & 0xf) != 0xf, which holds for the current value of 0x400.
void __fastcall memcpy_vibes(void * dest, void * src, int size) {
	// Guard the entry-table index (development builds only).
	pxAssert((size >= 0) && (size <= _maxSize));

	// Multiply instead of shifting: the term can be negative, and left-shifting a
	// negative value is undefined behavior before C++20.
	const int offset = (((size + 1) & 0xf) - 8) * 16;

	_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
// This version creates '_maxSize' number of different functions,
|
||||||
|
// and calls the appropriate one...
|
||||||
|
|
||||||
|
static __pagealigned u8 _memCpyExec[__pagesize*_maxSize*2];
|
||||||
|
|
||||||
|
void gen_memcpy_vibes() {
|
||||||
|
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadWrite, false);
|
||||||
|
memset (_memCpyExec, 0xcc, sizeof(_memCpyExec));
|
||||||
|
xSetPtr(_memCpyExec);
|
||||||
|
|
||||||
|
for (int i = 0; i < _maxSize+1; i++)
|
||||||
|
{
|
||||||
|
int off = 0;
|
||||||
|
_memcpy_vibes[i] = (_memCpyCall)xGetAlignedCallTarget();
|
||||||
|
|
||||||
|
for (int j = 0, x = 0; j < i; j++, x=(x+1)&7, off+=16) {
|
||||||
|
if (off >= 128) {
|
||||||
|
off = -128;
|
||||||
|
xADD(edx, 256);
|
||||||
|
xADD(ecx, 256);
|
||||||
|
}
|
||||||
|
const xRegisterSSE xmm_t(x);
|
||||||
|
xMOVAPS(xmm_t, ptr32[edx+off]);
|
||||||
|
xMOVAPS(ptr32[ecx+off], xmm_t);
|
||||||
|
}
|
||||||
|
|
||||||
|
xRET();
|
||||||
|
pxAssert(((uptr)xGetPtr() - (uptr)_memCpyExec) < sizeof(_memCpyExec));
|
||||||
|
}
|
||||||
|
|
||||||
|
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Copies 'size' quadwords from 'src' to 'dest' by dispatching to the function
// generated for exactly that count.  'size' indexes _memcpy_vibes[] directly and
// is not range-checked here.
void __fastcall memcpy_vibes(void * dest, void * src, int size) {
	const _memCpyCall copyFn = _memcpy_vibes[size];
	copyFn(dest, src);
}
|
||||||
|
|
||||||
|
#endif
|
|
@ -106,6 +106,7 @@ _f void mVUinit(VURegs* vuRegsPtr, int vuIndex) {
|
||||||
|
|
||||||
// Allocates rec-cache and calls mVUreset()
|
// Allocates rec-cache and calls mVUreset()
|
||||||
mVUresizeCache(mVU, mVU->cacheSize + mVUcacheSafeZone);
|
mVUresizeCache(mVU, mVU->cacheSize + mVUcacheSafeZone);
|
||||||
|
//if (vuIndex) gen_memcpy_vibes();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Resets Rec Data
|
// Resets Rec Data
|
||||||
|
|
|
@ -1109,17 +1109,16 @@ void __fastcall mVU_XGKICK_(u32 addr) {
|
||||||
}
|
}
|
||||||
size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff);
|
size = GetMTGS().PrepDataPacket(GIF_PATH_1, data, diff);
|
||||||
pDest = GetMTGS().GetDataPacketPtr();
|
pDest = GetMTGS().GetDataPacketPtr();
|
||||||
|
|
||||||
if (size > diff) {
|
if (size > diff) {
|
||||||
// fixme: one of these days the following *16's will get cleaned up when we introduce
|
//DevCon.WriteLn("XGkick Wrap!");
|
||||||
// a special qwc/simd16 optimized version of memcpy_aligned. :)
|
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
|
||||||
//DevCon.Status("XGkick Wrap!");
|
|
||||||
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
|
|
||||||
size -= diff;
|
size -= diff;
|
||||||
pDest += diff*16;
|
pDest += diff*16;
|
||||||
memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
|
memcpy_qwc(pDest, microVU1.regs->Mem, size);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
|
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
|
||||||
}
|
}
|
||||||
GetMTGS().SendDataPacket();
|
GetMTGS().SendDataPacket();
|
||||||
if(GSTransferStatus.PTH1 == STOPPED_MODE)
|
if(GSTransferStatus.PTH1 == STOPPED_MODE)
|
||||||
|
@ -1141,14 +1140,14 @@ void __fastcall mVU_XGKICK_(u32 addr) {
|
||||||
// fixme: one of these days the following *16's will get cleaned up when we introduce
|
// fixme: one of these days the following *16's will get cleaned up when we introduce
|
||||||
// a special qwc/simd16 optimized version of memcpy_aligned. :)
|
// a special qwc/simd16 optimized version of memcpy_aligned. :)
|
||||||
//DevCon.Status("XGkick Wrap!");
|
//DevCon.Status("XGkick Wrap!");
|
||||||
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), diff*16);
|
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff);
|
||||||
Path1WritePos += size;
|
Path1WritePos += size;
|
||||||
size -= diff;
|
size -= diff;
|
||||||
pDest += diff*16;
|
pDest += diff*16;
|
||||||
memcpy_aligned(pDest, microVU1.regs->Mem, size*16);
|
memcpy_qwc(pDest, microVU1.regs->Mem, size);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
memcpy_aligned(pDest, microVU1.regs->Mem + (addr*16), size*16);
|
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size);
|
||||||
Path1WritePos += size;
|
Path1WritePos += size;
|
||||||
}
|
}
|
||||||
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);
|
//if(!gifRegs->stat.P1Q) CPU_INT(28, 128);
|
||||||
|
|
Loading…
Reference in New Issue