diff --git a/plugins/zzogl-pg/opengl/Mem.cpp b/plugins/zzogl-pg/opengl/Mem.cpp
index c6eb6f1601..b4f9812604 100644
--- a/plugins/zzogl-pg/opengl/Mem.cpp
+++ b/plugins/zzogl-pg/opengl/Mem.cpp
@@ -24,6 +24,9 @@
 #include "Mem_Transmit.h"
 #include "Mem_Swizzle.h"
 
+#ifdef ZEROGS_SSE2
+#include <emmintrin.h>
+#endif
 
 BLOCK m_Blocks[0x40]; // do so blocks are indexable
 
@@ -129,6 +132,12 @@ static __forceinline const T* TransferAligningToBlocks(TransferData data, Transf
         u8 *temp = pstart + fun.gp(tempj, tempY, gs.dstbuf.bw) * data.blockbits / 8;
         swizzle(temp, (u8*)pbuf, TransPitch(pitch, data.transfersize), 0xffffffff);
     }
+#ifdef ZEROGS_SSE2
+    // Note: the swizzle functions use non-temporal move (_mm_stream) instructions.
+    // The store fence ensures the previous stores have completed before new ones execute.
+    _mm_sfence();
+
+#endif
 
     /* transfer the rest */
     if (alignedPt.x < gs.imageEndX)
diff --git a/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp b/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp
index e97ba18da9..f16da7f64a 100644
--- a/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp
+++ b/plugins/zzogl-pg/opengl/Mem_Swizzle.cpp
@@ -24,12 +24,17 @@
 #include <emmintrin.h>
 #endif
 
+// WARNING: an sfence instruction must be issued after calling the SwizzleBlock SSE2 functions
+
 // Current port of the ASM function to intrinsic
 #define INTRINSIC_PORT_32
 #define INTRINSIC_PORT_16
 #define INTRINSIC_PORT_8
 #define INTRINSIC_PORT_4
 #ifdef ZEROGS_SSE2
+static const __aligned16 u32 mask_24b_H[4] = {0x0000FFFF, 0xFF000000, 0x0000FFFF, 0xFF000000};
+static const __aligned16 u32 mask_24b_L[4] = {0x00000000, 0x00FFFFFF, 0x00000000, 0x00FFFFFF};
+
 template
 __forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch, u32 WriteMask)
 {
@@ -125,8 +130,6 @@ __forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch, u32 WriteM
             src += 2*pitch;
         }
     }
-    // FIXME normally you must use a sfence but it would impact perf to do here
-    // the function is in a loop and it would have a better place after the loop...
 }
 
 template
@@ -173,8 +176,6 @@ __forceinline void SwizzleBlock16_sse2_I(u8 *dst, u8 *src, int pitch)
         dst += 64;
         src += 2*pitch;
     }
-    // FIXME normally you must use a sfence but it would impact perf to do here
-    // the function is in a loop and it would have a better place after the loop...
 }
 
 // Template the code to improve reuse of code
@@ -256,9 +257,6 @@ __forceinline void SwizzleBlock8_sse2_I(u8 *dst, u8 *src, int pitch)
     dst += 64;
     src += 4*pitch;
     SwizzleColumn8_sse2_I(dst, src, pitch);
-
-    // FIXME normally you must use a sfence but it would impact perf to do here
-    // the function is in a loop and it would have a better place after the loop...
 }
 
 // Template the code to improve reuse of code
@@ -372,14 +370,105 @@ __forceinline void SwizzleBlock4_sse2_I(u8 *dst, u8 *src, int pitch)
     dst += 64;
     src += 4*pitch;
    SwizzleColumn4_sse2_I(dst, src, pitch);
-
-    // FIXME normally you must use a sfence but it would impact perf to do here
-    // the function is in a loop and it would have a better place after the loop...
 }
-#endif
+
+template <bool FOUR_BIT, bool UPPER>
+__forceinline void SwizzleBlock8H_4H(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+{
+    __m128i zero_128 = _mm_setzero_si128();
+    __m128i src_0;
+    __m128i src_1;
+    __m128i src_2;
+    __m128i src_3;
+    __m128i src_0_init_H;
+    __m128i src_0_init_L;
+    __m128i src_2_init_H;
+    __m128i src_2_init_L;
+    __m128i src_0_init;
+    __m128i src_2_init;
+
+    __m128i upper_mask = _mm_cvtsi32_si128(0xF0F0F0F0);
+    // Build the write_mask (transform a u32 into 4 packed u32s)
+    __m128i write_mask;
+    if (FOUR_BIT) {
+        if (UPPER) write_mask = _mm_cvtsi32_si128(0xF0000000);
+        else write_mask = _mm_cvtsi32_si128(0x0F000000);
+    } else {
+        write_mask = _mm_cvtsi32_si128(0xFF000000);
+    }
+    write_mask = _mm_shuffle_epi32(write_mask, 0);
+
+    for (int i=3 ; i >= 0 ; --i) {
+        if (FOUR_BIT) {
+            src_0_init = _mm_cvtsi32_si128(*(u32*)src);
+            src_2_init = _mm_cvtsi32_si128(*(u32*)(src + pitch));
+        } else {
+            src_0_init = _mm_loadl_epi64((__m128i*)src);
+            src_2_init = _mm_loadl_epi64((__m128i*)(src + pitch));
+        }
+
+        // Convert to 8 bits
+        if (FOUR_BIT) {
+            src_0_init_H = _mm_and_si128(upper_mask, src_0_init);
+            src_0_init_L = _mm_andnot_si128(upper_mask, src_0_init);
+            src_2_init_H = _mm_and_si128(upper_mask, src_2_init);
+            src_2_init_L = _mm_andnot_si128(upper_mask, src_2_init);
+
+            if (UPPER) {
+                src_0_init_L = _mm_slli_epi32(src_0_init_L, 4);
+                src_2_init_L = _mm_slli_epi32(src_2_init_L, 4);
+            } else {
+                src_0_init_H = _mm_srli_epi32(src_0_init_H, 4);
+                src_2_init_H = _mm_srli_epi32(src_2_init_H, 4);
+            }
+
+            // Repack the src to keep the HByte order
+            src_0_init = _mm_unpacklo_epi8(src_0_init_L, src_0_init_H);
+            src_2_init = _mm_unpacklo_epi8(src_2_init_L, src_2_init_H);
+        }
+
+        // transform to 16 bits (add 0 in low bits)
+        src_0_init = _mm_unpacklo_epi8(zero_128, src_0_init);
+        src_2_init = _mm_unpacklo_epi8(zero_128, src_2_init);
+
+        // transform to 32 bits (add 0 in low bits)
+        src_0 = _mm_unpacklo_epi16(zero_128, src_0_init);
+        src_1 = _mm_unpackhi_epi16(zero_128, src_0_init);
+        src_2 = _mm_unpacklo_epi16(zero_128, src_2_init);
+        src_3 = _mm_unpackhi_epi16(zero_128, src_2_init);
+
+        // Reorder the data (same as the 32-bit format)
+        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
+        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
+        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
+        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);
+
+        // Load the previous value and apply the ~write_mask
+        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
+        dst_0 = _mm_or_si128(dst_0, old_dst_0);
+
+        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
+        dst_1 = _mm_or_si128(dst_1, old_dst_1);
+
+        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
+        dst_2 = _mm_or_si128(dst_2, old_dst_2);
+
+        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
+        dst_3 = _mm_or_si128(dst_3, old_dst_3);
+
+        // store
+        _mm_stream_si128((__m128i*)dst, dst_0);
+        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
+        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
+        _mm_stream_si128(((__m128i*)dst)+3, dst_3);
+
+        // update the pointers
+        dst += 64;
+        src += 2*pitch;
+    }
+}
 
 // special swizzle macros - which I converted to functions.
 
-#ifdef ZEROGS_SSE2
-
 __forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
 {
@@ -390,6 +479,88 @@ __forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
 #endif
 }
 
+__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+{
+    __m128i mask_H = _mm_load_si128((__m128i*)mask_24b_H);
+    __m128i mask_L = _mm_load_si128((__m128i*)mask_24b_L);
+    // Build the write_mask (transform a u32 into 4 packed u32s)
+    __m128i write_mask = _mm_cvtsi32_si128(0x00FFFFFF);
+    write_mask = _mm_shuffle_epi32(write_mask, 0);
+
+    for (int i=3 ; i >= 0 ; --i) {
+        // Note: src can be out of bounds of GS memory (but there is some spare allocation
+        // to avoid a tricky corner case)
+        __m128i src_0 = _mm_loadu_si128((__m128i*)src);
+        __m128i src_1 = _mm_loadu_si128((__m128i*)(src+12));
+        __m128i src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
+        __m128i src_3 = _mm_loadu_si128((__m128i*)(src+pitch+12));
+
+        // transform 24-bit values to 32-bit ones
+        // 1/ Align the data a little
+        src_0 = _mm_slli_si128(src_0, 2);
+        src_0 = _mm_shufflelo_epi16(src_0, 0x39);
+
+        src_1 = _mm_slli_si128(src_1, 2);
+        src_1 = _mm_shufflelo_epi16(src_1, 0x39);
+
+        src_2 = _mm_slli_si128(src_2, 2);
+        src_2 = _mm_shufflelo_epi16(src_2, 0x39);
+
+        src_3 = _mm_slli_si128(src_3, 2);
+        src_3 = _mm_shufflelo_epi16(src_3, 0x39);
+
+        // 2/ Filter the 24-bit pixels & do the conversion
+        __m128i src_0_H = _mm_and_si128(src_0, mask_H);
+        __m128i src_0_L = _mm_and_si128(src_0, mask_L);
+        src_0_H = _mm_slli_si128(src_0_H, 1);
+        src_0 = _mm_or_si128(src_0_H, src_0_L);
+
+        __m128i src_1_H = _mm_and_si128(src_1, mask_H);
+        __m128i src_1_L = _mm_and_si128(src_1, mask_L);
+        src_1_H = _mm_slli_si128(src_1_H, 1);
+        src_1 = _mm_or_si128(src_1_H, src_1_L);
+
+        __m128i src_2_H = _mm_and_si128(src_2, mask_H);
+        __m128i src_2_L = _mm_and_si128(src_2, mask_L);
+        src_2_H = _mm_slli_si128(src_2_H, 1);
+        src_2 = _mm_or_si128(src_2_H, src_2_L);
+
+        __m128i src_3_H = _mm_and_si128(src_3, mask_H);
+        __m128i src_3_L = _mm_and_si128(src_3, mask_L);
+        src_3_H = _mm_slli_si128(src_3_H, 1);
+        src_3 = _mm_or_si128(src_3_H, src_3_L);
+
+        // Reorder the data (same as the 32-bit format)
+        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
+        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
+        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
+        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);
+
+        // Load the previous value and apply the ~write_mask
+        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
+        dst_0 = _mm_or_si128(dst_0, old_dst_0);
+
+        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
+        dst_1 = _mm_or_si128(dst_1, old_dst_1);
+
+        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
+        dst_2 = _mm_or_si128(dst_2, old_dst_2);
+
+        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
+        dst_3 = _mm_or_si128(dst_3, old_dst_3);
+
+        // store
+        _mm_stream_si128((__m128i*)dst, dst_0);
+        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
+        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
+        _mm_stream_si128(((__m128i*)dst)+3, dst_3);
+
+        // update the pointers
+        dst += 64;
+        src += 2*pitch;
+    }
+}
+
 __forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch, u32 WriteMask)
 {
 #ifdef INTRINSIC_PORT_16
@@ -453,6 +624,21 @@ __forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
 #endif
 }
 
+__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+{
+    SwizzleBlock8H_4H<false, false>(dst, src, pitch, WriteMask);
+}
+
+__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+{
+    SwizzleBlock8H_4H<true, true>(dst, src, pitch, WriteMask);
+}
+
+__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch, u32 WriteMask)
+{
+    SwizzleBlock8H_4H<true, false>(dst, src, pitch, WriteMask);
+}
+
 #else
 
 __forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
@@ -566,13 +752,14 @@ __forceinline void __fastcall SwizzleBlock4_c(u8* dst, u8* src, int srcpitch, u3
     }
 }
 
-#endif
 
 __forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask)
 {
     u8* pnewsrc = src;
     u32* pblock = tempblock;
 
-    for (int by = 0; by < 7; ++by, pblock += 8, pnewsrc += pitch - 24)
+    // Note: src can be out of bounds of GS memory (but there is some spare allocation
+    // to avoid a tricky corner case)
+    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch - 24)
     {
         for (int bx = 0; bx < 8; ++bx, pnewsrc += 3)
         {
@@ -580,19 +767,6 @@ __forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask)
         }
     }
 
-    for (int bx = 0; bx < 7; ++bx, pnewsrc += 3)
-    {
-        /* might be 1 byte out of bounds of GS memory */
-        pblock[bx] = *(u32*)pnewsrc;
-    }
-
-    /* do 3 bytes for the last copy */
-    *((u8*)pblock + 28) = pnewsrc[0];
-
-    *((u8*)pblock + 29) = pnewsrc[1];
-
-    *((u8*)pblock + 30) = pnewsrc[2];
-
     SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x00ffffff);
 }
 
@@ -659,4 +833,4 @@ __forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch, u32 WriteMask)
 
     SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000);
 }
-
+#endif
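
For background on the fencing change above: _mm_stream_si128 issues a non-temporal, write-combining store that is only weakly ordered, so a single _mm_sfence() is needed once all streaming stores are done to guarantee they are globally visible and ordered before whatever consumes the buffer next. That is why the per-function FIXME comments are removed and one fence is placed after the swizzle loop in TransferAligningToBlocks. A minimal, self-contained sketch of the pattern (the stream_copy helper below is illustrative only and not part of the plugin):

#include <cstdint>
#include <cstddef>
#include <xmmintrin.h>   // SSE: _mm_sfence
#include <emmintrin.h>   // SSE2: _mm_loadu_si128, _mm_stream_si128

// Copy 16-byte blocks with non-temporal stores, then fence once after the loop.
// dst must be 16-byte aligned for _mm_stream_si128.
static void stream_copy(uint8_t* dst, const uint8_t* src, size_t blocks)
{
    for (size_t i = 0; i < blocks; ++i) {
        __m128i v = _mm_loadu_si128((const __m128i*)(src + 16 * i));
        _mm_stream_si128((__m128i*)(dst + 16 * i), v);   // weakly-ordered streaming store
    }
    _mm_sfence();   // make all streaming stores visible before dst is read or reused
}

Fencing once after the whole transfer, rather than inside each SwizzleBlock call, is exactly the performance concern the removed FIXME comments were pointing at.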
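For context on the new SwizzleBlock8H_4H template: the 8H, 4HH and 4HL formats store each texel in the high byte, high nibble or low nibble of the top byte of a 32-bit GS word, which is why the function expands the source data with zero-unpacks and then merges it into the destination under the write masks 0xFF000000, 0xF0000000 and 0x0F000000. A simplified scalar model of that placement (illustrative only; it ignores the PS2 block/column swizzle reordering the real code performs):

#include <cstdint>

// 8H: the 8-bit texel becomes bits 24..31 of the destination word; the other bits are preserved.
static void store_8h(uint32_t* dst, const uint8_t* src, int count)
{
    for (int i = 0; i < count; ++i)
        dst[i] = (dst[i] & 0x00FFFFFFu) | ((uint32_t)src[i] << 24);
}

// 4HH / 4HL: the 4-bit texel becomes bits 28..31 (HH) or 24..27 (HL) of the word.
static void store_4h(uint32_t* dst, const uint8_t* nibbles, int count, bool upper)
{
    for (int i = 0; i < count; ++i) {
        uint32_t t = nibbles[i] & 0x0Fu;
        if (upper) dst[i] = (dst[i] & 0x0FFFFFFFu) | (t << 28);
        else       dst[i] = (dst[i] & 0xF0FFFFFFu) | (t << 24);
    }
}

The read-modify-write done with _mm_load_si128 / _mm_andnot_si128 / _mm_or_si128 in the SSE2 version is the vector form of the same masking.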
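Similarly, the new SSE2 SwizzleBlock24 (and the simplified 8-row loop in the C fallback) handle the 24-bit format: each texel occupies three source bytes but only the low 24 bits of the destination word, so the byte already stored in GS memory must survive the write, hence the 0x00FFFFFF write mask. A scalar sketch of that conversion, again ignoring the block reordering:

#include <cstdint>

// 24-bit upload: pack three source bytes into bits 0..23, keep the existing high byte.
static void store_24(uint32_t* dst, const uint8_t* src, int count)
{
    for (int i = 0; i < count; ++i) {
        uint32_t rgb = (uint32_t)src[3*i]
                     | ((uint32_t)src[3*i + 1] << 8)
                     | ((uint32_t)src[3*i + 2] << 16);
        dst[i] = (dst[i] & 0xFF000000u) | rgb;
    }
}

Reading tightly packed 3-byte texels is also why both versions can touch memory slightly past the block and rely on the spare allocation mentioned in the comments.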