zzogl-pg:

* Rework the IsDirty intrinsic. I missed the swizzle handling on the first try :p


git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3918 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
gregory.hainaut@gmail.com 2010-10-13 17:00:14 +00:00
parent 33214df091
commit 8c58c8ee3c
2 changed files with 24 additions and 14 deletions

View File

@ -167,6 +167,7 @@ inline bool PSMT_IS16Z(int psm) {return ((psm & 0x32) == 0x32);}
// Reports whether a pixel storage mode is treated as 32 bits: true when psm <= 1.
// Caveat kept from the earlier notes: this needs a closer look, because the test
// seems like it would also return true for a 24-bit format.
// Note: the function only works for CLUT formats. The CLUT PSM is 4 bits only;
// the possible values are PSMCT32, PSMCT16 and PSMCT16S.
inline bool PSMT_IS32BIT(int psm)
{
    return psm <= 1;
}
// When color format is RGB24 (PSMCT24) or RGBA16 (PSMCT16 & 16S) alpha value expanded, based on
@ -544,9 +545,7 @@ typedef struct
extern GSinternal gs;
// Note the function is used in a template parameter so it must be declared extern
// Note2: In this case extern is not compatible with __forceinline so just inline it...
extern inline u16 RGBA32to16(u32 c)
static __forceinline u16 RGBA32to16(u32 c)
{
return (u16)((((c) & 0x000000f8) >> 3) |
(((c) & 0x0000f800) >> 6) |

View File

@ -861,31 +861,42 @@ bool IsDirty(u32 highdword, u32 psm, int cld, int cbp)
#ifdef TEST_THIS
while(entries != 0) {
#ifdef ZEROGS_SSE2
__m128i result = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src), _mm_load_si128((__m128i*)dst));
// Note: local memory data is swizzled
__m128i src_0 = _mm_load_si128((__m128i*)src); // 9 8 1 0
__m128i src_1 = _mm_load_si128((__m128i*)src+1); // 11 10 3 2
__m128i src_2 = _mm_load_si128((__m128i*)src+2); // 13 12 5 4
__m128i src_3 = _mm_load_si128((__m128i*)src+3); // 15 14 7 6
__m128i result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+1), _mm_load_si128((__m128i*)dst+1));
__m128i dst_0 = _mm_load_si128((__m128i*)dst);
__m128i dst_1 = _mm_load_si128((__m128i*)dst+1);
__m128i dst_2 = _mm_load_si128((__m128i*)dst+2);
__m128i dst_3 = _mm_load_si128((__m128i*)dst+3);
__m128i result = _mm_cmpeq_epi32(_mm_unpacklo_epi64(src_0, src_1), dst_0);
__m128i result_tmp = _mm_cmpeq_epi32(_mm_unpacklo_epi64(src_2, src_3), dst_1);
result = _mm_and_si128(result, result_tmp);
result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+2), _mm_load_si128((__m128i*)dst+2));
result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(src_0, src_1), dst_2);
result = _mm_and_si128(result, result_tmp);
result_tmp = _mm_cmpeq_epi32(_mm_load_si128((__m128i*)src+3), _mm_load_si128((__m128i*)dst+3));
result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(src_2, src_3), dst_3);
result = _mm_and_si128(result, result_tmp);
u32 result_int = _mm_movemask_epi8(result);
if (result_int != 0xFF) {
if (result_int != 0xFFFF) {
bRet = true;
break;
}
#else
// I see no point in keeping an MMX version; the SSE2 version is probably faster.
// Keep a slow portable C version for reference/debug
for (int i=0; i < 16 ; i++) {
if (*((u32*)src+i) != *((u32*)dst+i)) {
// Note: local memory data is swizzled
if (dst[0] != src[0] || dst[1] != src[2] || dst[2] != src[4] || dst[3] != src[6]
|| dst[4] != src[1] || dst[5] != src[3] || dst[6] != src[5] || dst[7] != src[7]) {
bRet = true;
break;
}
}
#endif
if (entries & 0x10) {