mirror of https://github.com/PCSX2/pcsx2.git
GregMiscellaneous: zzogl-pg:
* Port more ASM to intrinsics. Note: the intrinsic version uses non-temporal (cache-bypassing) stores to reduce cache pollution. git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3825 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
79c4b1825d
commit
3a1ef55cb1
|
@ -2246,7 +2246,57 @@ ZeroGS::CMemoryTarget* ZeroGS::CMemoryTargetMngr::GetMemoryTarget(const tex0Info
|
|||
|
||||
#if defined(ZEROGS_SSE2)
|
||||
assert(((u32)(uptr)dst) % 16 == 0);
|
||||
// FIXME Uncomment to test intrinsic versions (instead of asm)
|
||||
// perf improvement vs asm:
|
||||
// 1/ gcc updates both pointer with 1 addition
|
||||
// 2/ Bypass the cache for the store
|
||||
#define NEW_INTRINSIC_VERSION
|
||||
#ifdef NEW_INTRINSIC_VERSION
|
||||
|
||||
// All-zero register used as the upper half when widening 16-bit pixels to 32 bits.
// Use _mm_setzero_si128() rather than XOR-ing an uninitialized variable with
// itself: the XOR form reads an indeterminate value and triggers
// uninitialized-use warnings, while this form yields the same zeroed register.
__m128i zero_128 = _mm_setzero_si128();
|
||||
// NOTE: future performance improvement
|
||||
// SSE4.1 support uncacheable load 128bits. Maybe it can
|
||||
// avoid some cache pollution
|
||||
// NOTE2: I create multiple _n variable to mimic the previous ASM behavior
|
||||
// but I'm not sure there are real gains.
|
||||
// Run exactly (targ->height * GPU_TEXWIDTH/16) iterations, the same count that is
// handed to SSE2_UnswizzleZ16Target in the non-intrinsic path. The previous
// condition (i >= 0) executed one extra iteration, reading 64 bytes and
// streaming 128 bytes past the end of the intended range.
// NOTE(review): each iteration consumes 32 source pixels; confirm that the /16
// divisor matches the per-iteration batch size of the ASM routine.
for (int i = targ->height * GPU_TEXWIDTH/16 ; i > 0 ; --i)
|
||||
{
|
||||
// Convert 16 bits pixels to 32bits (zero extended)
|
||||
// Batch 64 bytes (32 pixels) at once.
|
||||
__m128i pixels_1 = _mm_load_si128((__m128i*)src);
|
||||
__m128i pixels_2 = _mm_load_si128((__m128i*)(src+8));
|
||||
__m128i pixels_3 = _mm_load_si128((__m128i*)(src+16));
|
||||
__m128i pixels_4 = _mm_load_si128((__m128i*)(src+24));
|
||||
|
||||
__m128i pix_low_1 = _mm_unpacklo_epi16(pixels_1, zero_128);
|
||||
__m128i pix_high_1 = _mm_unpackhi_epi16(pixels_1, zero_128);
|
||||
__m128i pix_low_2 = _mm_unpacklo_epi16(pixels_2, zero_128);
|
||||
__m128i pix_high_2 = _mm_unpackhi_epi16(pixels_2, zero_128);
|
||||
|
||||
// Note: bypass cache
|
||||
_mm_stream_si128((__m128i*)dst, pix_low_1);
|
||||
_mm_stream_si128((__m128i*)(dst+8), pix_high_1);
|
||||
_mm_stream_si128((__m128i*)(dst+16), pix_low_2);
|
||||
_mm_stream_si128((__m128i*)(dst+24), pix_high_2);
|
||||
|
||||
__m128i pix_low_3 = _mm_unpacklo_epi16(pixels_3, zero_128);
|
||||
__m128i pix_high_3 = _mm_unpackhi_epi16(pixels_3, zero_128);
|
||||
__m128i pix_low_4 = _mm_unpacklo_epi16(pixels_4, zero_128);
|
||||
__m128i pix_high_4 = _mm_unpackhi_epi16(pixels_4, zero_128);
|
||||
|
||||
// Note: bypass cache
|
||||
_mm_stream_si128((__m128i*)(dst+32), pix_low_3);
|
||||
_mm_stream_si128((__m128i*)(dst+40), pix_high_3);
|
||||
_mm_stream_si128((__m128i*)(dst+48), pix_low_4);
|
||||
_mm_stream_si128((__m128i*)(dst+56), pix_high_4);
|
||||
|
||||
src += 32;
|
||||
dst += 64;
|
||||
}
|
||||
#else
|
||||
SSE2_UnswizzleZ16Target(dst, src, targ->height * GPU_TEXWIDTH / 16);
|
||||
#endif
|
||||
#else // ZEROGS_SSE2
|
||||
|
||||
for (int i = 0; i < targ->height; ++i)
|
||||
|
|
Loading…
Reference in New Issue