GregMiscellaneous:zzogl-pg:

* fix sse2 code for 16bits cluts. * Fix debug build. git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3952 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-10-21 11:13:49 +00:00 · 2010-10-21 11:13:49 +00:00 · d886dbfadb
parent 0fc2e87809
commit d886dbfadb
4 changed files with 793 additions and 789 deletions
--- a/plugins/zzogl-pg/opengl/GLWinX11.cpp
+++ b/plugins/zzogl-pg/opengl/GLWinX11.cpp
@@ -136,7 +136,7 @@ void GLWindow::GetWindowSize()
    // update the gl buffer size
    ChangeWindowSize(width, height);

-    ZZLog::Error_Log("Resolution %dx%d. Depth %d bpp. Position (%d,%d)", width, height, depth, conf.x, conf.y);
+    ZZLog::Dev_Log("Resolution %dx%d. Depth %d bpp. Position (%d,%d)", width, height, depth, conf.x, conf.y);
 }

 void GLWindow::GetGLXVersion()
--- a/plugins/zzogl-pg/opengl/NewRegs.h
+++ b/plugins/zzogl-pg/opengl/NewRegs.h
--- a/plugins/zzogl-pg/opengl/ZZClut.cpp
+++ b/plugins/zzogl-pg/opengl/ZZClut.cpp
@@ -173,10 +173,10 @@ __forceinline void GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2(u32* vm, u32* clut
    }

    // Unsizzle the data
-    __m128i row_0 = _mm_unpacklo_epi32(vm_0, vm_1); // 3 2 1 0
-    __m128i row_1 = _mm_unpacklo_epi32(vm_2, vm_3); // 7 6 5 4
-    __m128i row_2 = _mm_unpackhi_epi32(vm_0, vm_1); // 11 10 9 8
-    __m128i row_3 = _mm_unpackhi_epi32(vm_2, vm_3); // 15 14 13 12
+    __m128i row_0 = _mm_unpacklo_epi64(vm_0, vm_1); // 3 2 1 0
+    __m128i row_1 = _mm_unpacklo_epi64(vm_2, vm_3); // 7 6 5 4
+    __m128i row_2 = _mm_unpackhi_epi64(vm_0, vm_1); // 11 10 9 8
+    __m128i row_3 = _mm_unpackhi_epi64(vm_2, vm_3); // 15 14 13 12

    // load old data & remove useless part
    if(CSA_0_15) {
@@ -241,6 +241,8 @@ __forceinline void GSMem_to_ClutBuffer__T16_I8_CSM1_sse2(u32* vm, u32 csa)
        clut = GetClutBufferAddress<u32>(0); // Keep aligned version for sse2

        GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2<false,true>(vm, clut);
+        clut += 16;
+        vm += 16; // go down one column
    } else if(csa_right != 0) {
        // go back to the base before processing left clut column
        clut = GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
@@ -512,7 +514,7 @@ __forceinline void ClutBuffer_to_Array<u16>(u16* dst, u32 csa, u32 clutsize)

    while (clutsize_right > 0)
    {
-#ifdef ZEROGS_SSE4
+#ifdef ZEROGS_SSE2
        // only lower 16 bits of dword are valid
        __m128i clut_0 = _mm_load_si128((__m128i*)clut);
        __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
@@ -2014,7 +2014,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 			Build_Clut_Texture<u16>(tex0.psm, targ->height, (u16*)targ->clut, psrc, (u16*)ptexdata);
 		}

-        assert(targ->clut.size() > 0);
+        assert(targ->clutsize > 0);
 	}
 	else
 	{
@@ -2027,7 +2027,7 @@ CMemoryTarget* CMemoryTargetMngr::GetMemoryTarget(const tex0Info& tex0, int forc
 			u16* dst = (u16*)ptexdata;
 			u16* src = (u16*)(MemoryAddress(targ->realy));

-#if defined(ZEROGS_SSE2)
+#ifdef ZEROGS_SSE2
 			assert(((u32)(uptr)dst) % 16 == 0);
            // FIXME Uncomment to test intrinsic versions (instead of asm)
            // perf improvement vs asm:
@@ -2830,6 +2830,7 @@ inline void Resolve_32_Bit(const void* psrc, int fbp, int fbw, int fbh, const in

 static const __aligned16 unsigned int pixel_5b_mask[4] = {0x0000001F, 0x0000001F, 0x0000001F, 0x0000001F};

+#ifdef ZEROGS_SSE2
 // The function process 2*2 pixels in 32bits. And 2*4 pixels in 16bits
 template <u32 psm, u32 size, u32 pageTable[size][64], bool null_second_line, u32 INDEX>
 __forceinline void update_8pixels_sse2(u32* src, u32* basepage, u32 i_msk, u32 j, u32 pix_mask, u32 src_pitch)
@@ -3141,6 +3142,7 @@ void Resolve_32_Bit_sse2(const void* psrc, int fbp, int fbw, int fbh, u32 fbm)
 #endif
 #endif
 }
+#endif

 void _Resolve(const void* psrc, int fbp, int fbw, int fbh, int psm, u32 fbm, bool mode = true)
 {