zzogl-pg: fix the crash in the MSVC SSE intrinsics code path, and prevent the compiler from using MMX registers by using the SSE2 version of the 64-bit loads (_mm_loadl_epi64 instead of _mm_movpi64_epi64).

git-svn-id: http://pcsx2.googlecode.com/svn/trunk@3871 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
Jake.Stine 2010-10-04 18:03:44 +00:00
parent 02d5ed7f45
commit 115b431af5
1 changed file with 11 additions and 24 deletions

View File

@ -3127,15 +3127,15 @@ __forceinline void update_4pixels_sse2(u32* src, Tdst* basepage, u32 i_msk, u32
// p0 p0 p0 p0 p1 p1 p1 p1 // p0 p0 p0 p0 p1 p1 p1 p1
// p2 p2 p2 p2 p3 p3 p3 p3 // p2 p2 p2 p2 p3 p3 p3 p3
base_ptr = &src[(((j<<6)+INDEX)<<2)]; base_ptr = &src[(((j<<6)+INDEX)<<2)];
__m128i pixel_low = _mm_movpi64_epi64(*(__m64*)(base_ptr+3)); __m128i pixel_low = _mm_loadl_epi64((__m128i*)(base_ptr+3));
__m128i pixel_high = _mm_movpi64_epi64(*(__m64*)(base_ptr+11)); __m128i pixel_high = _mm_loadl_epi64((__m128i*)(base_ptr+11));
pixels = _mm_unpacklo_epi64(pixel_low, pixel_high); pixels = _mm_unpacklo_epi64(pixel_low, pixel_high);
} else if(AA.x ==1) { } else if(AA.x ==1) {
// Note: pixels (32bits) are stored like that: // Note: pixels (32bits) are stored like that:
// p0 p0 p1 p1 p2 p2 p3 p3 // p0 p0 p1 p1 p2 p2 p3 p3
base_ptr = &src[(((j<<6)+INDEX)<<1)]; base_ptr = &src[(((j<<6)+INDEX)<<1)];
__m128i pixel_low = _mm_movpi64_epi64(*(__m64*)(base_ptr+1)); __m128i pixel_low = _mm_loadl_epi64((__m128i*)(base_ptr+1));
__m128i pixel_high = _mm_movpi64_epi64(*(__m64*)(base_ptr+5)); __m128i pixel_high = _mm_loadl_epi64((__m128i*)(base_ptr+5));
pixels = _mm_unpacklo_epi64(pixel_low, pixel_high); pixels = _mm_unpacklo_epi64(pixel_low, pixel_high);
} else { } else {
base_ptr = &src[((j<<6)+INDEX)]; base_ptr = &src[((j<<6)+INDEX)];
@ -3192,9 +3192,6 @@ __forceinline void update_4pixels_sse2(u32* src, Tdst* basepage, u32 i_msk, u32
// The MS compiler complains about the missing of the emms clear function... // The MS compiler complains about the missing of the emms clear function...
// My guess, it uses the mmx register for the 64 bits transfer. Newer version // My guess, it uses the mmx register for the 64 bits transfer. Newer version
// of the compiler probably generates better code. -- Gregory // of the compiler probably generates better code. -- Gregory
#ifdef _WIN32
_mm_empty();
#endif
} }
template <u32 size, u32 pageTable[size][64], bool do_conversion, bool texture_16b, u32 INDEX> template <u32 size, u32 pageTable[size][64], bool do_conversion, bool texture_16b, u32 INDEX>
@ -3214,8 +3211,8 @@ __forceinline void update_4pixels_sse2_bis(u32* src, u32* basepage, u32 i_msk, u
// ... // ...
// p2 p2 p2 p2 p3 p3 p3 p3 p6 p6 p6 p6 p7 p7 p7 p7 // p2 p2 p2 p2 p3 p3 p3 p3 p6 p6 p6 p6 p7 p7 p7 p7
base_ptr = &src[(((j<<6)+INDEX)<<2)]; base_ptr = &src[(((j<<6)+INDEX)<<2)];
__m128i pixel_0_low = _mm_movpi64_epi64(*(__m64*)(base_ptr + 3)); __m128i pixel_0_low = _mm_loadl_epi64((__m128i*)(base_ptr + 3));
__m128i pixel_0_high = _mm_movpi64_epi64(*(__m64*)(base_ptr + 3 + src_pitch)); __m128i pixel_0_high = _mm_loadl_epi64((__m128i*)(base_ptr + 3 + src_pitch));
pixels_0 = _mm_unpacklo_epi64(pixel_0_low, pixel_0_high); pixels_0 = _mm_unpacklo_epi64(pixel_0_low, pixel_0_high);
} else if(AA.x ==1) { } else if(AA.x ==1) {
// Note: pixels (32bits) are stored like that: // Note: pixels (32bits) are stored like that:
@ -3223,8 +3220,8 @@ __forceinline void update_4pixels_sse2_bis(u32* src, u32* basepage, u32 i_msk, u
// ... // ...
// p2 p2 p3 p3 p6 p6 p7 p7 // p2 p2 p3 p3 p6 p6 p7 p7
base_ptr = &src[(((j<<6)+INDEX)<<1)]; base_ptr = &src[(((j<<6)+INDEX)<<1)];
__m128i pixel_0_low = _mm_movpi64_epi64(*(__m64*)(base_ptr + 1)); __m128i pixel_0_low = _mm_loadl_epi64((__m128i*)(base_ptr + 1));
__m128i pixel_0_high = _mm_movpi64_epi64(*(__m64*)(base_ptr + 1 + src_pitch)); __m128i pixel_0_high = _mm_loadl_epi64((__m128i*)(base_ptr + 1 + src_pitch));
pixels_0 = _mm_unpacklo_epi64(pixel_0_low, pixel_0_high); pixels_0 = _mm_unpacklo_epi64(pixel_0_low, pixel_0_high);
} else { } else {
// Note: pixels (32bits) are stored like that: // Note: pixels (32bits) are stored like that:
@ -3232,10 +3229,8 @@ __forceinline void update_4pixels_sse2_bis(u32* src, u32* basepage, u32 i_msk, u
// ... // ...
// p2 p3 p6 p7 // p2 p3 p6 p7
base_ptr = &src[((j<<6)+INDEX)]; base_ptr = &src[((j<<6)+INDEX)];
__m128i pixel_0_low = _mm_movpi64_epi64(*(__m64*)base_ptr); __m128i pixel_0_low = _mm_loadl_epi64((__m128i*)base_ptr);
__m128i pixel_0_high = _mm_loadl_epi64((__m128i*)(base_ptr + src_pitch));
// MSVC currently crashes about here.
__m128i pixel_0_high = _mm_movpi64_epi64(*(__m64*)(base_ptr + src_pitch));
pixels_0 = _mm_unpacklo_epi64(pixel_0_low, pixel_0_high); pixels_0 = _mm_unpacklo_epi64(pixel_0_low, pixel_0_high);
} }
@ -3329,9 +3324,6 @@ __forceinline void update_4pixels_sse2_bis(u32* src, u32* basepage, u32 i_msk, u
// The MS compiler complains about the missing of the emms clear function... // The MS compiler complains about the missing of the emms clear function...
// My guess, it uses the mmx register for the 64 bits transfer. Newer version // My guess, it uses the mmx register for the 64 bits transfer. Newer version
// of the compiler probably generates better code. -- Gregory // of the compiler probably generates better code. -- Gregory
#ifdef _WIN32
_mm_empty();
#endif
} }
template <u32 size, u32 pageTable[size][64], typename Tdst, bool do_conversion, bool texture_16b> template <u32 size, u32 pageTable[size][64], typename Tdst, bool do_conversion, bool texture_16b>
@ -3399,11 +3391,6 @@ void Resolve_32b(const void* psrc, int fbp, int fbw, int fbh, u32 fbm)
for(int j = fbw_div-1; j >= 0; --j) { for(int j = fbw_div-1; j >= 0; --j) {
// for(u32 j = 0 ; j < fbw_div; ++j) { // for(u32 j = 0 ; j < fbw_div; ++j) {
// Workaround until we work out why update_4pixels_sse2_bis is crashing Windows.
#ifndef _WIN32
#define DO_8_PIX
#endif
#ifdef DO_8_PIX #ifdef DO_8_PIX
u32* basepage = (u32*)pPageOffset + (i_div + j) * 2048; u32* basepage = (u32*)pPageOffset + (i_div + j) * 2048;
update_4pixels_sse2_bis<size, pageTable, do_conversion, texture_16b, 0>(src, basepage, i_msk, j, pix_mask, raw_size); update_4pixels_sse2_bis<size, pageTable, do_conversion, texture_16b, 0>(src, basepage, i_msk, j, pix_mask, raw_size);