diff --git a/plugins/zzogl-pg/opengl/targets.cpp b/plugins/zzogl-pg/opengl/targets.cpp
index abe2bbe0ad..4bb70c7117 100644
--- a/plugins/zzogl-pg/opengl/targets.cpp
+++ b/plugins/zzogl-pg/opengl/targets.cpp
@@ -2246,7 +2246,57 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
 #if defined(ZEROGS_SSE2)
 		assert(((u32)(uptr)dst) % 16 == 0);
+		// FIXME: the #define below enables the intrinsic version; comment it out to use the asm routine.
+		// Expected perf improvements vs asm:
+		// 1/ gcc updates both pointers with a single addition
+		// 2/ the stores bypass the cache
+#define NEW_INTRINSIC_VERSION
+#ifdef NEW_INTRINSIC_VERSION
+
+		// All-zero register interleaved with the pixels; setzero avoids reading an uninitialized variable.
+		const __m128i zero_128 = _mm_setzero_si128();
+		// NOTE: future performance improvement: SSE4.1 supports uncacheable 128-bit
+		// loads, which might avoid some cache pollution.
+		// NOTE2: the multiple _n variables mimic the previous ASM behavior,
+		// but I'm not sure there are real gains.
+		// Run exactly the batch count handed to the asm routine; the former "i >= 0" test ran one extra batch.
+		for (int i = targ->height * GPU_TEXWIDTH / 16; i > 0; --i)
+		{
+			// Convert 16 bits pixels to 32bits (zero extended)
+			// Batch 64 bytes (32 pixels) at once.
+			__m128i pixels_1 = _mm_load_si128((__m128i*)src);
+			__m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
+			__m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
+			__m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
+
+			__m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
+			__m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
+			__m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
+			__m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
+
+			// Note: bypass cache
+			_mm_stream_si128((__m128i*)dst, pix_low_1);
+			_mm_stream_si128((__m128i*)(dst+8), pix_high_1);
+			_mm_stream_si128((__m128i*)(dst+16), pix_low_2);
+			_mm_stream_si128((__m128i*)(dst+24), pix_high_2);
+
+			__m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
+			__m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
+			__m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
+			__m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
+
+			// Note: bypass cache
+			_mm_stream_si128((__m128i*)(dst+32), pix_low_3);
+			_mm_stream_si128((__m128i*)(dst+40), pix_high_3);
+			_mm_stream_si128((__m128i*)(dst+48), pix_low_4);
+			_mm_stream_si128((__m128i*)(dst+56), pix_high_4);
+
+			src += 32;
+			dst += 64;
+		}
+#else
 		SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
+#endif
 #else // ZEROGS_SSE2
 		for (int i = 0; i < targ->height; ++i)