GregMiscellaneous: zzogl-pg:

* Port more ASM to intrinsics. Note: the intrinsic version uses non-cacheable (cache-bypassing) stores instead, to reduce cache pollution. git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3825 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-09-23 18:00:38 +00:00 · 2010-09-23 18:00:38 +00:00 · 3a1ef55cb1
parent 79c4b1825d
commit 3a1ef55cb1
1 changed files with 50 additions and 0 deletions
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
@ -2246,7 +2246,57 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 #if defined(ZEROGS_SSE2)
 			assert(((u32)(uptr)dst) % 16 == 0);
            // FIXME Uncomment to test intrinsic versions (instead of asm)
            // perf improvement vs asm:
            // 1/ gcc updates both pointer with 1 addition
            // 2/ Bypass the cache for the store
 #define NEW_INTRINSIC_VERSION
 #ifdef NEW_INTRINSIC_VERSION
            // All-zero register, interleaved with the source data so that each
            // 16-bit value is zero-extended to 32 bits. _mm_setzero_si128() is
            // used instead of XOR-ing an uninitialized variable with itself,
            // which reads an indeterminate value (undefined behavior).
            const __m128i zero_128 = _mm_setzero_si128();
            // NOTE: possible future performance improvement:
            // SSE4.1 supports an uncacheable 128-bit load. Maybe it can
            // avoid some cache pollution.
            // NOTE2: multiple _n variables are created to mimic the previous ASM
            // behavior, but it is not certain there are real gains.
            //
            // The loop runs exactly targ->height * GPU_TEXWIDTH / 16 times, i.e. the
            // same count that is handed to SSE2_UnswizzleZ16Target in the ASM path.
            // The former `i >= 0` condition executed one additional iteration and
            // therefore loaded/stored one extra batch beyond that count.
            // NOTE(review): the src/dst offsets below are in elements of the pointer
            // type, which is declared outside this section — presumably 16-bit; confirm.
            for (int i = targ->height * GPU_TEXWIDTH / 16; i > 0; --i)
            {
                // Convert 16 bits pixels to 32 bits (zero extended).
                // Batch 64 bytes (32 pixels) at once.
                const __m128i pixels_1 = _mm_load_si128((__m128i*)src);
                const __m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
                const __m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
                const __m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));

                const __m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
                const __m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
                const __m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
                const __m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);

                // Note: bypass cache (streaming store; dst must be 16-byte aligned)
                _mm_stream_si128((__m128i*)dst, pix_low_1);
                _mm_stream_si128((__m128i*)(dst+8), pix_high_1);
                _mm_stream_si128((__m128i*)(dst+16), pix_low_2);
                _mm_stream_si128((__m128i*)(dst+24), pix_high_2);

                const __m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
                const __m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
                const __m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
                const __m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);

                // Note: bypass cache
                _mm_stream_si128((__m128i*)(dst+32), pix_low_3);
                _mm_stream_si128((__m128i*)(dst+40), pix_high_3);
                _mm_stream_si128((__m128i*)(dst+48), pix_low_4);
                _mm_stream_si128((__m128i*)(dst+56), pix_high_4);

                // Advance to the next batch: the output advances twice as far
                // as the input because every value doubles in width.
                src += 32;
                dst += 64;
            }
 #else
 			SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
 #endif
 #else // ZEROGS_SSE2
 			for (int i = 0; i < targ->height; ++i)