GregMiscellaneous: zzogl-pg:

* Port more ASM to intrinsics. Note: a non-cacheable (streaming) store is used instead, to reduce cache pollution


git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3825 96395faa-99c1-11dd-bbfe-3dabce05a288
gregory.hainaut@gmail.com 2010-09-23 18:00:38 +00:00
parent 79c4b1825d
commit 3a1ef55cb1
1 changed file with 50 additions and 0 deletions
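The non-cacheable store mentioned in the commit message, as a minimal standalone sketch (hypothetical helper name and parameters, not part of this commit): a streaming store writes around the cache, so copying data that will not be re-read soon does not evict useful cache lines.

#include <emmintrin.h> // SSE2: _mm_load_si128, _mm_store_si128, _mm_stream_si128, _mm_sfence

// Hypothetical helper (illustration only): copy 16-byte-aligned blocks either
// through the cache or around it with a non-temporal (streaming) store.
static void copy_blocks(__m128i* dst, const __m128i* src, int count, bool bypass_cache)
{
    for (int i = 0; i < count; ++i)
    {
        __m128i v = _mm_load_si128(src + i);  // aligned 128-bit load
        if (bypass_cache)
            _mm_stream_si128(dst + i, v);     // non-temporal store: skips the cache
        else
            _mm_store_si128(dst + i, v);      // regular cached store
    }
    if (bypass_cache)
        _mm_sfence();                         // order the streaming stores before later reads/writes
}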


@@ -2246,7 +2246,57 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
#if defined(ZEROGS_SSE2)
assert(((u32)(uptr)dst) % 16 == 0);
// FIXME: toggle the define below to switch between the intrinsic version and the asm one.
// Perf improvement vs asm:
// 1/ gcc updates both pointers with a single addition
// 2/ the store bypasses the cache
#define NEW_INTRINSIC_VERSION
#ifdef NEW_INTRINSIC_VERSION
__m128i zero_128 = _mm_setzero_si128(); // all-zero vector (compiles to pxor reg,reg)
// NOTE: possible future improvement: SSE4.1 adds a non-cacheable 128-bit load
// (_mm_stream_load_si128), which might avoid some cache pollution on the
// source side (see the sketch after the diff).
// NOTE2: the separate pix_*_n variables mimic the previous ASM behavior,
// but it is not clear there is a real gain.
for (int i = targ->height * GPU_TEXWIDTH / 16; i >= 0; --i)
{
    // Convert 16-bit pixels to 32-bit (zero-extended).
    // Process 64 bytes (32 pixels) per iteration.
    __m128i pixels_1 = _mm_load_si128((__m128i*)src);
    __m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
    __m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
    __m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
    __m128i pix_low_1  = _mm_unpacklo_epi16(pixels_1, zero_128);
    __m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
    __m128i pix_low_2  = _mm_unpacklo_epi16(pixels_2, zero_128);
    __m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
    // Note: non-temporal stores bypass the cache
    _mm_stream_si128((__m128i*)dst, pix_low_1);
    _mm_stream_si128((__m128i*)(dst+8), pix_high_1);
    _mm_stream_si128((__m128i*)(dst+16), pix_low_2);
    _mm_stream_si128((__m128i*)(dst+24), pix_high_2);
    __m128i pix_low_3  = _mm_unpacklo_epi16(pixels_3, zero_128);
    __m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
    __m128i pix_low_4  = _mm_unpacklo_epi16(pixels_4, zero_128);
    __m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
    // Note: non-temporal stores bypass the cache
    _mm_stream_si128((__m128i*)(dst+32), pix_low_3);
    _mm_stream_si128((__m128i*)(dst+40), pix_high_3);
    _mm_stream_si128((__m128i*)(dst+48), pix_low_4);
    _mm_stream_si128((__m128i*)(dst+56), pix_high_4);
    src += 32;
    dst += 64;
}
#else
SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
#endif
#else // ZEROGS_SSE2
for (int i = 0; i < targ->height; ++i)
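
For the SSE4.1 note in the hunk above, a hedged sketch of the non-cacheable load it refers to (assumed intrinsic _mm_stream_load_si128; names are illustrative, not part of this commit). MOVNTDQA only truly bypasses the cache on write-combining memory; on ordinary write-back memory most CPUs treat it like a plain load, so any gain would need measuring.

#include <smmintrin.h> // SSE4.1: _mm_stream_load_si128

// Hypothetical replacement for the _mm_load_si128 calls in the loop above.
// The source pointer must be 16-byte aligned, exactly as for _mm_load_si128.
static inline __m128i load_pixels_nt(const void* src)
{
    // Cast away const: older headers declare _mm_stream_load_si128 with a
    // non-const pointer even though the load does not write.
    return _mm_stream_load_si128((__m128i*)src); // non-temporal (MOVNTDQA) load
}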