mirror of https://github.com/PCSX2/pcsx2.git
GregMiscellaneous: zzogl-pg:
* Finish GAS removal. I used SSE2 instead of MMX because it was easier to write (and is faster anyway). The code seems equivalent between the GAS and intrinsics versions, but the control flow will need thorough testing (and some explanation). Note: I also added a basic C version for reference/debug (not tested yet). git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3837 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
8036106b07
commit
8435ad3f01
|
@ -32,6 +32,9 @@
|
||||||
#include "targets.h"
|
#include "targets.h"
|
||||||
#include "GLWin.h"
|
#include "GLWin.h"
|
||||||
#include "ZZoglShaders.h"
|
#include "ZZoglShaders.h"
|
||||||
|
#ifdef ZEROGS_SSE2
|
||||||
|
#include <emmintrin.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
//----------------------- Defines
|
//----------------------- Defines
|
||||||
|
|
||||||
|
@ -851,6 +854,55 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp)
|
||||||
|
|
||||||
bool bRet = false;
|
bool bRet = false;
|
||||||
|
|
||||||
|
// FIXME code generated by intrinsics is the same as the linux asm.
|
||||||
|
// However there is no "cmp %%esi, 0x90" equivalent in the windows asm !!!
|
||||||
|
// So the control flow must be checked.
|
||||||
|
#define TEST_THIS
|
||||||
|
#ifdef TEST_THIS
|
||||||
|
while(entries != 0) {
|
||||||
|
#ifdef ZEROGS_SSE2
|
||||||
|
__m128i result = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src), _mm_load_si128((__m128i*)dst));
|
||||||
|
|
||||||
|
__m128i result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+1), _mm_load_si128((__m128i*)dst+1));
|
||||||
|
result = _mm_and_si128(result, result_tmp);
|
||||||
|
|
||||||
|
result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+2), _mm_load_si128((__m128i*)dst+2));
|
||||||
|
result = _mm_and_si128(result, result_tmp);
|
||||||
|
|
||||||
|
result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+3), _mm_load_si128((__m128i*)dst+3));
|
||||||
|
result = _mm_and_si128(result, result_tmp);
|
||||||
|
|
||||||
|
u32 result_int = _mm_movemask_epi8(result);
|
||||||
|
if (result_int != 0xFF) {
|
||||||
|
bRet = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
// I see no point in keeping an MMX version. The SSE2 version is probably faster.
|
||||||
|
// Keep a slow portable C version for reference/debug
|
||||||
|
for (int i=0; i < 16 ; i++) {
|
||||||
|
if (*((u32*)src+i) != *((u32*)dst+i)) {
|
||||||
|
bRet = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
if (entries & 0x10) {
|
||||||
|
src -= 56; // go back and down one column
|
||||||
|
}
|
||||||
|
|
||||||
|
src += 32; // go to the right block
|
||||||
|
|
||||||
|
if (entries == 0x90) {
|
||||||
|
src += 32; // skip whole block
|
||||||
|
}
|
||||||
|
|
||||||
|
dst += 8;
|
||||||
|
entries -= 16;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
|
||||||
// do a fast test with MMX
|
// do a fast test with MMX
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
int storeebx;
|
int storeebx;
|
||||||
|
@ -977,6 +1029,7 @@ Return:
|
||||||
".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "S"(entries) : "eax", "memory");
|
".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "S"(entries) : "eax", "memory");
|
||||||
|
|
||||||
#endif // _WIN32
|
#endif // _WIN32
|
||||||
|
#endif
|
||||||
return bRet;
|
return bRet;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue