diff --git a/plugins/zzogl-pg/opengl/GS.h b/plugins/zzogl-pg/opengl/GS.h index 1614ccbf74..2602db26ed 100644 --- a/plugins/zzogl-pg/opengl/GS.h +++ b/plugins/zzogl-pg/opengl/GS.h @@ -167,6 +167,7 @@ inline bool PSMT_IS16Z(int psm) {return ((psm & 0x32) == 0x32);} // Check to see if it is 32 bits. According to code comments, anyways. // I'll have to look closer at it, because it'd seem like it'd return true for 24 bits. +// Note: the function only works for clut format. Clut PSM is 4 bits only. The possible values are PSMCT32, PSMCT16, PSMCT16S inline bool PSMT_IS32BIT(int psm) {return !!(psm <= 1);} // When color format is RGB24 (PSMCT24) or RGBA16 (PSMCT16 & 16S) alpha value expanded, based on @@ -544,9 +545,7 @@ typedef struct extern GSinternal gs; -// Note the function is used in a template parameter so it must be declared extern -// Note2: In this case extern is not compatible with __forceinline so just inline it... -extern inline u16 RGBA32to16(u32 c) +static __forceinline u16 RGBA32to16(u32 c) { return (u16)((((c) & 0x000000f8) >> 3) | (((c) & 0x0000f800) >> 6) | diff --git a/plugins/zzogl-pg/opengl/zerogs.cpp b/plugins/zzogl-pg/opengl/zerogs.cpp index 081955bb70..a97bd91c2c 100644 --- a/plugins/zzogl-pg/opengl/zerogs.cpp +++ b/plugins/zzogl-pg/opengl/zerogs.cpp @@ -841,7 +841,7 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp) if (cpsm > 1 || csm) { // Mana Khemia triggers this. 
- //ZZLog::Error_Log("16 bit clut not supported."); + //ZZLog::Error_Log("16 bit clut not supported."); return true; } @@ -861,30 +861,41 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp) #ifdef TEST_THIS while(entries != 0) { #ifdef ZEROGS_SSE2 - __m128i result = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src), _mm_load_si128((__m128i*)dst)); + // Note: local memory data is swizzled + __m128i src_0 = _mm_load_si128((__m128i*)src); // 9 8 1 0 + __m128i src_1 = _mm_load_si128((__m128i*)src+1); // 11 10 3 2 + __m128i src_2 = _mm_load_si128((__m128i*)src+2); // 13 12 5 4 + __m128i src_3 = _mm_load_si128((__m128i*)src+3); // 15 14 7 6 - __m128i result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+1), _mm_load_si128((__m128i*)dst+1)); + __m128i dst_0 = _mm_load_si128((__m128i*)dst); + __m128i dst_1 = _mm_load_si128((__m128i*)dst+1); + __m128i dst_2 = _mm_load_si128((__m128i*)dst+2); + __m128i dst_3 = _mm_load_si128((__m128i*)dst+3); + + __m128i result = _mm_cmpeq_epi32(_mm_unpacklo_epi64(src_0, src_1), dst_0); + + __m128i result_tmp = _mm_cmpeq_epi32(_mm_unpacklo_epi64(src_2, src_3), dst_1); result = _mm_and_si128(result, result_tmp); - result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+2), _mm_load_si128((__m128i*)dst+2)); + result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(src_0, src_1), dst_2); result = _mm_and_si128(result, result_tmp); - result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+3), _mm_load_si128((__m128i*)dst+3)); + result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(src_2, src_3), dst_3); result = _mm_and_si128(result, result_tmp); u32 result_int = _mm_movemask_epi8(result); - if (result_int != 0xFF) { + if (result_int != 0xFFFF) { bRet = true; break; } #else // I see no point to keep an mmx version. SSE2 versions is probably faster. 
 // Keep a slow portable C version for reference/debug - for (int i=0; i < 16 ; i++) { - if (*((u32*)src+i) != *((u32*)dst+i)) { - bRet = true; - break; - } + // Note: local memory data is swizzled + if (dst[0] != src[0] || dst[1] != src[2] || dst[2] != src[4] || dst[3] != src[6] + || dst[4] != src[1] || dst[5] != src[3] || dst[6] != src[5] || dst[7] != src[7]) { + bRet = true; + break; } #endif