diff --git a/plugins/zzogl-pg/opengl/targets.cpp b/plugins/zzogl-pg/opengl/targets.cpp
index abe2bbe0ad..4bb70c7117 100644
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
@@ -2246,7 +2246,57 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 
 #if defined(ZEROGS_SSE2)
 			assert(((u32)(uptr)dst) % 16 == 0);
+            // FIXME Uncomment to test intrinsic versions (instead of asm)
+            // perf improvement vs asm:
+            // 1/ gcc updates both pointer with 1 addition
+            // 2/ Bypass the cache for the store
+#define NEW_INTRINSIC_VERSION
+#ifdef NEW_INTRINSIC_VERSION
+
+            __m128i zero_128;
+            zero_128 = _mm_xor_si128(zero_128, zero_128);
+            // NOTE: future performance improvement
+            // SSE4.1 support uncacheable load 128bits. Maybe it can
+            // avoid some cache pollution
+            // NOTE2: I create multiple _n variable to mimic the previous ASM behavior
+            // but I'm not sure there are real gains.
+			for (int i = targ->height * GPU_TEXWIDTH/16 ; i >=0 ; --i)
+            {
+                // Convert 16 bits pixels to 32bits (zero extended)
+                // Batch 64 bytes (32 pixels) at once.
+                __m128i pixels_1 = _mm_load_si128((__m128i*)src);
+                __m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
+                __m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
+                __m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
+
+                __m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
+                __m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
+                __m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
+                __m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
+
+                // Note: bypass cache
+                _mm_stream_si128((__m128i*)dst, pix_low_1);
+                _mm_stream_si128((__m128i*)(dst+8), pix_high_1);
+                _mm_stream_si128((__m128i*)(dst+16), pix_low_2);
+                _mm_stream_si128((__m128i*)(dst+24), pix_high_2);
+
+                __m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
+                __m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
+                __m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
+                __m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
+
+                // Note: bypass cache
+                _mm_stream_si128((__m128i*)(dst+32), pix_low_3);
+                _mm_stream_si128((__m128i*)(dst+40), pix_high_3);
+                _mm_stream_si128((__m128i*)(dst+48), pix_low_4);
+                _mm_stream_si128((__m128i*)(dst+56), pix_high_4);
+
+                src += 32;
+                dst += 64;
+            }
+#else
 			SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
+#endif
 #else // ZEROGS_SSE2
 
 			for (int i = 0; i < targ->height; ++i)