GregMiscellaneous: zzogl-pg:

* Finish GAS removal. I used SSE2 instead of MMX because it was easier to write (and it is faster anyway).
The code generated by the intrinsics appears equivalent to the GAS version, but the control flow still needs thorough testing (and some explanation).

Note: I also added a basic C version for reference/debugging (not tested yet).


git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3837 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut@gmail.com 2010-09-25 14:01:36 +00:00
parent 8036106b07
commit 8435ad3f01
1 changed files with 53 additions and 0 deletions

View File

@ -32,6 +32,9 @@
#include "targets.h"
#include "GLWin.h"
#include "ZZoglShaders.h"
#ifdef ZEROGS_SSE2
#include <emmintrin.h>
#endif
//----------------------- Defines
@ -851,6 +854,55 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp)
bool bRet = false;
// FIXME code generated by intrinsics is the same as the linux asm.
// However there is no "cmp %%esi, 0x90" equivalent in the windows asm !!!
// So the control flow must be checked.
#define TEST_THIS
#ifdef TEST_THIS
// Compare src against dst one 64-byte group (16 u32 values) at a time.
// Stop and report "dirty" (bRet = true) at the first group that differs.
while (entries != 0) {
#ifdef ZEROGS_SSE2
	// Four 128-bit compares cover the 64 bytes of the current group; the
	// per-lane equality masks are AND-ed so that a single mismatching u32
	// clears its lane in the combined result.
	// NOTE(review): _mm_load_si128 requires 16-byte aligned addresses --
	// confirm that src and dst are always aligned that way.
	__m128i result = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src), _mm_load_si128((__m128i*)dst));
	__m128i result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+1), _mm_load_si128((__m128i*)dst+1));
	result = _mm_and_si128(result, result_tmp);

	result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+2), _mm_load_si128((__m128i*)dst+2));
	result = _mm_and_si128(result, result_tmp);

	result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+3), _mm_load_si128((__m128i*)dst+3));
	result = _mm_and_si128(result, result_tmp);

	// _mm_movemask_epi8 gathers the top bit of each of the 16 bytes into the
	// low 16 bits of the result, so "every byte equal" is 0xFFFF, not 0xFF.
	u32 result_int = _mm_movemask_epi8(result);
	if (result_int != 0xFFFF) {
		bRet = true;
		break;
	}
#else
	// There is no point in keeping an MMX version; the SSE2 version is probably faster.
	// Keep a slow portable C version for reference/debug.
	for (int i = 0; i < 16; i++) {
		if (*((u32*)src+i) != *((u32*)dst+i)) {
			bRet = true;
			break;
		}
	}
	// The break above only leaves the for loop. Leave the while loop as well,
	// exactly like the SSE2 path, instead of stepping further through memory.
	if (bRet) {
		break;
	}
#endif

	// Advance src to the next group to compare. The steps below are in units
	// of src's pointee type, which is declared outside this fragment.
	if (entries & 0x10) {
		src -= 56; // go back and down one column
	}

	src += 32; // go to the right block

	if (entries == 0x90) {
		src += 32; // skip whole block
	}

	dst += 8;
	entries -= 16;
}
#else
// do a fast test with MMX
#ifdef _MSC_VER
int storeebx;
@ -977,6 +1029,7 @@ Return:
".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "S"(entries) : "eax", "memory");
#endif // _WIN32
#endif
return bRet;
}