Added a third alternative for memcpy_vibes. This one uses SSE intrinsics and is able to inline fully (no call/ret overhead).

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3468 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2010-07-12 04:13:47 +00:00
parent 742b9c5535
commit 383c58ba3e
3 changed files with 61 additions and 8 deletions

View File

@ -37,7 +37,7 @@
void _memset16_unaligned( void* dest, u16 data, size_t size ); void _memset16_unaligned( void* dest, u16 data, size_t size );
// MemcpyVibes.cpp functions // MemcpyVibes.cpp functions
extern void __fastcall memcpy_vibes(void * dest, void * src, int size); extern void memcpy_vibes(void * dest, const void * src, int size);
extern void gen_memcpy_vibes(); extern void gen_memcpy_vibes();
#define memcpy_fast memcpy_amd_ // Fast memcpy #define memcpy_fast memcpy_amd_ // Fast memcpy

View File

@ -15,6 +15,8 @@
#include "PrecompiledHeader.h" #include "PrecompiledHeader.h"
#include "x86emitter/x86emitter.h" #include "x86emitter/x86emitter.h"
#include <xmmintrin.h>
using namespace x86Emitter; using namespace x86Emitter;
// Max Number of qwc supported // Max Number of qwc supported
@ -23,6 +25,56 @@ using namespace x86Emitter;
typedef void (__fastcall *_memCpyCall)(void*, void*); typedef void (__fastcall *_memCpyCall)(void*, void*);
__aligned16 _memCpyCall _memcpy_vibes[_maxSize+1]; __aligned16 _memCpyCall _memcpy_vibes[_maxSize+1];
#if 1
// This version uses SSE intrinsics to perform an inline copy.  MSVC disasm shows pretty
// decent code generation on the whole, but it hasn't been benchmarked at all yet --air
//
// Copies 'size' quadwords (16 bytes each) from src to dest with aligned SSE loads and
// stores, so both pointers must be 16-byte aligned.  A size of 0 copies nothing;
// negative sizes are not handled.
//
// The copy is a Duff's device: the switch jumps into the middle of an unrolled run of
// 16 moves so the (size % 16) leftover quadwords are copied first, and the do/while
// then copies the remaining whole groups of 16.
__forceinline void memcpy_vibes(void * dest, const void * src, int size) {
	typedef float qword_t[4];

	qword_t*       destxmm = static_cast<qword_t*>(dest);
	const qword_t* srcxmm  = static_cast<const qword_t*>(src);

	size_t    count = size & ~15;	// quadwords copied by the full 16-move passes
	const int extra = size & 15;	// leftover quadwords copied by the partial first pass

	// The unrolled moves address slots [-8..7] and 'case N' enters at slot [8 - N].
	// Bias both pointers by (extra - 8) so that slot [8 - extra] is element 0 of the
	// buffers.  The bias is signed: it is negative whenever extra < 8.
	destxmm += extra - 8;
	srcxmm  += extra - 8;

	switch (extra) {
		do {
			// Step to the next group of 16; slots [-8..7] then cover exactly that group.
			destxmm += 16, srcxmm += 16, count -= 16;
			_mm_store_ps(&destxmm[-8][0], _mm_load_ps(&srcxmm[-8][0]));
			// Every case below intentionally falls through to the next one.
			case 15:
				_mm_store_ps(&destxmm[-7][0], _mm_load_ps(&srcxmm[-7][0]));
			case 14:
				_mm_store_ps(&destxmm[-6][0], _mm_load_ps(&srcxmm[-6][0]));
			case 13:
				_mm_store_ps(&destxmm[-5][0], _mm_load_ps(&srcxmm[-5][0]));
			case 12:
				_mm_store_ps(&destxmm[-4][0], _mm_load_ps(&srcxmm[-4][0]));
			case 11:
				_mm_store_ps(&destxmm[-3][0], _mm_load_ps(&srcxmm[-3][0]));
			case 10:
				_mm_store_ps(&destxmm[-2][0], _mm_load_ps(&srcxmm[-2][0]));
			case 9:
				_mm_store_ps(&destxmm[-1][0], _mm_load_ps(&srcxmm[-1][0]));
			case 8:
				_mm_store_ps(&destxmm[ 0][0], _mm_load_ps(&srcxmm[ 0][0]));
			case 7:
				_mm_store_ps(&destxmm[ 1][0], _mm_load_ps(&srcxmm[ 1][0]));
			case 6:
				_mm_store_ps(&destxmm[ 2][0], _mm_load_ps(&srcxmm[ 2][0]));
			case 5:
				_mm_store_ps(&destxmm[ 3][0], _mm_load_ps(&srcxmm[ 3][0]));
			case 4:
				_mm_store_ps(&destxmm[ 4][0], _mm_load_ps(&srcxmm[ 4][0]));
			case 3:
				_mm_store_ps(&destxmm[ 5][0], _mm_load_ps(&srcxmm[ 5][0]));
			case 2:
				_mm_store_ps(&destxmm[ 6][0], _mm_load_ps(&srcxmm[ 6][0]));
			case 1:
				_mm_store_ps(&destxmm[ 7][0], _mm_load_ps(&srcxmm[ 7][0]));
			case 0:
				;	// nothing left over; go straight to the loop test
		} while (count);
	}
}
#else
#if 1 #if 1
// This version creates one function with a lot of movaps // This version creates one function with a lot of movaps
// It jumps to the correct movaps entry-point while adding // It jumps to the correct movaps entry-point while adding
@ -58,12 +110,13 @@ void gen_memcpy_vibes() {
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true); HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
} }
// NOTE: this looks like a side-by-side diff rendering; each line below carries the old
// text followed by the new text of the same source line.
// memcpy_vibes, single-generated-function variant: biases dest and src by
// ((size & 0xf) - 7) * 16 bytes, then calls the _memcpy_vibes[size] table entry with the
// biased pointers.  The change shown swaps __fastcall for __forceinline and makes src
// a const void*.
void __fastcall memcpy_vibes(void * dest, void * src, int size) { __forceinline void memcpy_vibes(void * dest, const void * src, int size) {
int offset = ((size & 0xf) - 7) << 4; int offset = ((size & 0xf) - 7) << 4;
_memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset)); _memcpy_vibes[size]((void*)((uptr)dest + offset), (void*)((uptr)src + offset));
} }
#else #else
// This version creates '_maxSize' number of different functions, // This version creates '_maxSize' number of different functions,
// and calls the appropriate one... // and calls the appropriate one...
@ -97,8 +150,9 @@ void gen_memcpy_vibes() {
HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true); HostSys::MemProtectStatic(_memCpyExec, Protect_ReadOnly, true);
} }
// NOTE: this looks like a side-by-side diff rendering; each line below carries the old
// text followed by the new text of the same source line.
// memcpy_vibes, one-generated-function-per-size variant: forwards dest and src unchanged
// to the _memcpy_vibes[size] table entry.
// NOTE(review): the new signature takes a const void* src but passes it without a cast to
// an entry typed (void*, void*); this branch appears to be preprocessed out -- confirm it
// still compiles before enabling it.
void __fastcall memcpy_vibes(void * dest, void * src, int size) { __forceinline void memcpy_vibes(void * dest, const void * src, int size) {
_memcpy_vibes[size](dest, src); _memcpy_vibes[size](dest, src);
} }
#endif #endif
#endif

View File

@ -1101,7 +1101,6 @@ void __fastcall mVU_XGKICK_(u32 addr) {
if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false) if(gifRegs->stat.APATH <= GIF_APATH1 || (gifRegs->stat.APATH == GIF_APATH3 && gifRegs->stat.IP3 == true) && SIGNAL_IMR_Pending == false)
{ {
if(Path1WritePos != 0) if(Path1WritePos != 0)
{ {
//Flush any pending transfers so things dont go up in the wrong order //Flush any pending transfers so things dont go up in the wrong order
@ -1112,13 +1111,13 @@ void __fastcall mVU_XGKICK_(u32 addr) {
if (size > diff) { if (size > diff) {
//DevCon.WriteLn("XGkick Wrap!"); //DevCon.WriteLn("XGkick Wrap!");
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), diff); memcpy_qwc_(pDest, microVU1.regs->Mem + (addr*16), diff);
size -= diff; size -= diff;
pDest += diff*16; pDest += diff*16;
memcpy_qwc(pDest, microVU1.regs->Mem, size); memcpy_qwc_(pDest, microVU1.regs->Mem, size);
} }
else { else {
memcpy_qwc(pDest, microVU1.regs->Mem + (addr*16), size); memcpy_qwc_(pDest, microVU1.regs->Mem + (addr*16), size);
} }
GetMTGS().SendDataPacket(); GetMTGS().SendDataPacket();
if(GSTransferStatus.PTH1 == STOPPED_MODE) if(GSTransferStatus.PTH1 == STOPPED_MODE)