From 8435ad3f01d6f3d32f95e1fdbe3e83df7ad7595d Mon Sep 17 00:00:00 2001 From: "gregory.hainaut@gmail.com" Date: Sat, 25 Sep 2010 14:01:36 +0000 Subject: [PATCH] GregMiscellaneous: zzogl-pg: * Finish GAS removal. I use SSE2 instead of MMX because it was easier to write (faster anyway). Code seems equivalent between GAS & intrinsics but control flow will need thorough testing (and some explanation). Note: I also added a basic C version for reference/debug (not tested yet) git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3837 96395faa-99c1-11dd-bbfe-3dabce05a288 --- plugins/zzogl-pg/opengl/zerogs.cpp | 53 ++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/plugins/zzogl-pg/opengl/zerogs.cpp b/plugins/zzogl-pg/opengl/zerogs.cpp index 88bda55557..081955bb70 100644 --- a/plugins/zzogl-pg/opengl/zerogs.cpp +++ b/plugins/zzogl-pg/opengl/zerogs.cpp @@ -32,6 +32,9 @@ #include "targets.h" #include "GLWin.h" #include "ZZoglShaders.h" +#ifdef ZEROGS_SSE2 +#include <emmintrin.h> +#endif //----------------------- Defines @@ -851,6 +854,55 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp) bool bRet = false; + // FIXME: the code generated by the intrinsics is the same as the Linux asm. + // However, there is no "cmp %%esi, 0x90" equivalent in the Windows asm!
+ // So the control flow must be checked +#define TEST_THIS +#ifdef TEST_THIS + while(entries != 0) { +#ifdef ZEROGS_SSE2 + __m128i result = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src), _mm_load_si128((__m128i*)dst)); + + __m128i result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+1), _mm_load_si128((__m128i*)dst+1)); + result = _mm_and_si128(result, result_tmp); + + result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+2), _mm_load_si128((__m128i*)dst+2)); + result = _mm_and_si128(result, result_tmp); + + result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+3), _mm_load_si128((__m128i*)dst+3)); + result = _mm_and_si128(result, result_tmp); + + u32 result_int = _mm_movemask_epi8(result); + if (result_int != 0xFF) { + bRet = true; + break; + } +#else + // I see no point in keeping an MMX version. The SSE2 version is probably faster. + // Keep a slow portable C version for reference/debug + for (int i=0; i < 16 ; i++) { + if (*((u32*)src+i) != *((u32*)dst+i)) { + bRet = true; + break; + } + } +#endif + + if (entries & 0x10) { + src -= 56; // go back and down one column + } + + src += 32; // go to the right block + + if (entries == 0x90) { + src += 32; // skip whole block + } + + dst += 8; + entries -= 16; + } +#else + // do a fast test with MMX #ifdef _MSC_VER int storeebx; @@ -977,6 +1029,7 @@ Return: ".att_syntax\n" : "=m"(bRet) : "c"(dst), "d"(src), "S"(entries) : "eax", "memory"); #endif // _WIN32 +#endif return bRet; }