mirror of https://github.com/PCSX2/pcsx2.git
GregMiscellaneous: zzogl-pg:
* For 24-bit textures: there is some spare room in GS memory, so do not bother avoiding overflow in the copy.
* Port some swizzle functions to SSE2, which avoids copying through a temporary buffer.

git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3984 96395faa-99c1-11dd-bbfe-3dabce05a288
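What the SSE2 port boils down to: each SwizzleBlock*_sse2_I variant writes its 64-byte block straight into GS memory with non-temporal (_mm_stream_si128) stores, and the caller issues a single _mm_sfence() once the whole transfer loop is done, instead of swizzling into a temporary buffer and copying it afterwards. The snippet below is a minimal sketch of that store/fence discipline only; stream_block_64 and stream_blocks are hypothetical stand-ins (the pixel reordering itself is omitted) and are not code from this commit.

#include <emmintrin.h>   // SSE2 intrinsics
#include <cstdint>

typedef uint8_t  u8;
typedef uint32_t u32;

// Hypothetical stand-in for one per-block swizzle call: write one 64-byte
// destination block with non-temporal stores. Deliberately no sfence here,
// just like the per-block functions in this commit.
static inline void stream_block_64(u8 *dst, const u8 *src, int pitch)
{
    for (int i = 0; i < 4; ++i) {
        __m128i row = _mm_loadu_si128((const __m128i *)(src + i * pitch));
        _mm_stream_si128((__m128i *)(dst + i * 16), row);   // bypasses the cache
    }
}

// Hypothetical caller, mirroring the _mm_sfence() added after the block loop
// in TransferAligningToBlocks: many streamed blocks, then one fence for all.
void stream_blocks(u8 *dst, const u8 *src, int pitch, int nblocks)
{
    for (int b = 0; b < nblocks; ++b)
        stream_block_64(dst + b * 64, src + b * 16, pitch);

    // Make all the non-temporal stores above globally visible before any
    // later store; doing this once per transfer is cheaper than once per block.
    _mm_sfence();
}

For the 24-bit path, the new SwizzleBlock24 reads each row with two unaligned 16-byte loads (at src and src+12), so the last rows can touch a few bytes past the 24 bytes they actually need; the commit relies on spare room at the end of the GS memory allocation rather than special-casing that tail, which appears to be what the by < 7 loop and the byte-by-byte tail copy in the C version handled before.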
parent 9d2f81d142
commit 28b01737a4
@@ -24,6 +24,9 @@
#include "Mem_Transmit.h"
#include "Mem_Swizzle.h"
#ifdef ZEROGS_SSE2
#include <emmintrin.h>
#endif

BLOCK m_Blocks[0x40]; // do so blocks are indexable

@@ -129,6 +132,12 @@ static __forceinline const T* TransferAligningToBlocks(TransferData data, Transf
            u8 *temp = pstart + fun.gp(tempj, tempY, gs.dstbuf.bw) * data.blockbits / 8;
            swizzle(temp, (u8*)pbuf, TransPitch(pitch, data.transfersize), 0xffffffff);
        }
#ifdef ZEROGS_SSE2
        // Note: the swizzle functions use non-temporal move (_mm_stream) instructions.
        // The store fence ensures that the previous stores have finished before any new one executes.
        _mm_sfence();

#endif

        /* transfer the rest */
        if (alignedPt.x < gs.imageEndX)

@@ -24,12 +24,17 @@
#include <emmintrin.h>
#endif

// WARNING: an sfence instruction must be called after the SwizzleBlock SSE2 functions

// Current port of the ASM functions to intrinsics
#define INTRINSIC_PORT_32
#define INTRINSIC_PORT_16
#define INTRINSIC_PORT_8
#define INTRINSIC_PORT_4
#ifdef ZEROGS_SSE2
static const __aligned16 u32 mask_24b_H[4] = {0x0000FFFF, 0xFF000000, 0x0000FFFF, 0xFF000000};
static const __aligned16 u32 mask_24b_L[4] = {0x00000000, 0x00FFFFFF, 0x00000000, 0x00FFFFFF};

template<bool aligned>
__forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{

@@ -125,8 +130,6 @@ __forceinline void SwizzleBlock32_sse2_I(u8 *dst, u8 *src, int pitch, u32 WriteM
        src += 2*pitch;
    }
}
// FIXME: normally an sfence must be issued, but doing it here would hurt performance;
// the function is called in a loop, so a better place is after the loop...
}

template<bool aligned>

@@ -173,8 +176,6 @@ __forceinline void SwizzleBlock16_sse2_I(u8 *dst, u8 *src, int pitch)
        dst += 64;
        src += 2*pitch;
    }
    // FIXME: normally an sfence must be issued, but doing it here would hurt performance;
    // the function is called in a loop, so a better place is after the loop...
}

// Template the code to improve code reuse

@@ -256,9 +257,6 @@ __forceinline void SwizzleBlock8_sse2_I(u8 *dst, u8 *src, int pitch)
    dst += 64;
    src += 4*pitch;
    SwizzleColumn8_sse2_I<aligned, 3>(dst, src, pitch);

    // FIXME: normally an sfence must be issued, but doing it here would hurt performance;
    // the function is called in a loop, so a better place is after the loop...
}

// Template the code to improve code reuse

@@ -372,14 +370,105 @@ __forceinline void SwizzleBlock4_sse2_I(u8 *dst, u8 *src, int pitch)
    dst += 64;
    src += 4*pitch;
    SwizzleColumn4_sse2_I<aligned, 3>(dst, src, pitch);

    // FIXME: normally an sfence must be issued, but doing it here would hurt performance;
    // the function is called in a loop, so a better place is after the loop...
}
#endif

template<bool FOUR_BIT, bool UPPER>
__forceinline void SwizzleBlock8H_4H(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    __m128i zero_128 = _mm_setzero_si128();
    __m128i src_0;
    __m128i src_1;
    __m128i src_2;
    __m128i src_3;
    __m128i src_0_init_H;
    __m128i src_0_init_L;
    __m128i src_2_init_H;
    __m128i src_2_init_L;
    __m128i src_0_init;
    __m128i src_2_init;

    __m128i upper_mask = _mm_cvtsi32_si128(0xF0F0F0F0);
    // Build the write_mask (broadcast a u32 into 4 packed u32s)
    __m128i write_mask;
    if (FOUR_BIT) {
        if (UPPER) write_mask = _mm_cvtsi32_si128(0xF0000000);
        else write_mask = _mm_cvtsi32_si128(0x0F000000);
    } else {
        write_mask = _mm_cvtsi32_si128(0xFF000000);
    }
    write_mask = _mm_shuffle_epi32(write_mask, 0);

    for (int i=3 ; i >= 0 ; --i) {
        if (FOUR_BIT) {
            src_0_init = _mm_cvtsi32_si128(*(u32*)src);
            src_2_init = _mm_cvtsi32_si128(*(u32*)(src + pitch));
        } else {
            src_0_init = _mm_loadl_epi64((__m128i*)src);
            src_2_init = _mm_loadl_epi64((__m128i*)(src + pitch));
        }

        // Convert to 8 bits
        if (FOUR_BIT) {
            src_0_init_H = _mm_and_si128(upper_mask, src_0_init);
            src_0_init_L = _mm_andnot_si128(upper_mask, src_0_init);
            src_2_init_H = _mm_and_si128(upper_mask, src_2_init);
            src_2_init_L = _mm_andnot_si128(upper_mask, src_2_init);

            if (UPPER) {
                src_0_init_L = _mm_slli_epi32(src_0_init_L, 4);
                src_2_init_L = _mm_slli_epi32(src_2_init_L, 4);
            } else {
                src_0_init_H = _mm_srli_epi32(src_0_init_H, 4);
                src_2_init_H = _mm_srli_epi32(src_2_init_H, 4);
            }

            // Repack the src to keep the high-byte order
            src_0_init = _mm_unpacklo_epi8(src_0_init_L, src_0_init_H);
            src_2_init = _mm_unpacklo_epi8(src_2_init_L, src_2_init_H);
        }

        // transform to 16 bits (add 0 in the low bits)
        src_0_init = _mm_unpacklo_epi8(zero_128, src_0_init);
        src_2_init = _mm_unpacklo_epi8(zero_128, src_2_init);

        // transform to 32 bits (add 0 in the low bits)
        src_0 = _mm_unpacklo_epi16(zero_128, src_0_init);
        src_1 = _mm_unpackhi_epi16(zero_128, src_0_init);
        src_2 = _mm_unpacklo_epi16(zero_128, src_2_init);
        src_3 = _mm_unpackhi_epi16(zero_128, src_2_init);

        // Reorder the data (same as the 32-bit format)
        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);

        // Load the previous value and apply the ~write_mask
        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
        dst_0 = _mm_or_si128(dst_0, old_dst_0);

        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
        dst_1 = _mm_or_si128(dst_1, old_dst_1);

        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
        dst_2 = _mm_or_si128(dst_2, old_dst_2);

        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
        dst_3 = _mm_or_si128(dst_3, old_dst_3);

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

// special swizzle macros - which I converted to functions.
#ifdef ZEROGS_SSE2

__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{

@@ -390,6 +479,88 @@ __forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)
#endif
}

__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    __m128i mask_H = _mm_load_si128((__m128i*)mask_24b_H);
    __m128i mask_L = _mm_load_si128((__m128i*)mask_24b_L);
    // Build the write_mask (broadcast a u32 into 4 packed u32s)
    __m128i write_mask = _mm_cvtsi32_si128(0x00FFFFFF);
    write_mask = _mm_shuffle_epi32(write_mask, 0);

    for (int i=3 ; i >= 0 ; --i) {
        // Note: src can read out of bounds of GS memory (but there is some spare allocation
        // to avoid a tricky corner case)
        __m128i src_0 = _mm_loadu_si128((__m128i*)src);
        __m128i src_1 = _mm_loadu_si128((__m128i*)(src+12));
        __m128i src_2 = _mm_loadu_si128((__m128i*)(src+pitch));
        __m128i src_3 = _mm_loadu_si128((__m128i*)(src+pitch+12));

        // transform 24-bit values to 32-bit ones
        // 1/ Align the data a little
        src_0 = _mm_slli_si128(src_0, 2);
        src_0 = _mm_shufflelo_epi16(src_0, 0x39);

        src_1 = _mm_slli_si128(src_1, 2);
        src_1 = _mm_shufflelo_epi16(src_1, 0x39);

        src_2 = _mm_slli_si128(src_2, 2);
        src_2 = _mm_shufflelo_epi16(src_2, 0x39);

        src_3 = _mm_slli_si128(src_3, 2);
        src_3 = _mm_shufflelo_epi16(src_3, 0x39);

        // 2/ Filter the 24-bit pixels & do the conversion
        __m128i src_0_H = _mm_and_si128(src_0, mask_H);
        __m128i src_0_L = _mm_and_si128(src_0, mask_L);
        src_0_H = _mm_slli_si128(src_0_H, 1);
        src_0 = _mm_or_si128(src_0_H, src_0_L);

        __m128i src_1_H = _mm_and_si128(src_1, mask_H);
        __m128i src_1_L = _mm_and_si128(src_1, mask_L);
        src_1_H = _mm_slli_si128(src_1_H, 1);
        src_1 = _mm_or_si128(src_1_H, src_1_L);

        __m128i src_2_H = _mm_and_si128(src_2, mask_H);
        __m128i src_2_L = _mm_and_si128(src_2, mask_L);
        src_2_H = _mm_slli_si128(src_2_H, 1);
        src_2 = _mm_or_si128(src_2_H, src_2_L);

        __m128i src_3_H = _mm_and_si128(src_3, mask_H);
        __m128i src_3_L = _mm_and_si128(src_3, mask_L);
        src_3_H = _mm_slli_si128(src_3_H, 1);
        src_3 = _mm_or_si128(src_3_H, src_3_L);

        // Reorder the data (same as the 32-bit format)
        __m128i dst_0 = _mm_unpacklo_epi64(src_0, src_2);
        __m128i dst_1 = _mm_unpackhi_epi64(src_0, src_2);
        __m128i dst_2 = _mm_unpacklo_epi64(src_1, src_3);
        __m128i dst_3 = _mm_unpackhi_epi64(src_1, src_3);

        // Load the previous value and apply the ~write_mask
        __m128i old_dst_0 = _mm_andnot_si128(write_mask, _mm_load_si128((__m128i*)dst));
        dst_0 = _mm_or_si128(dst_0, old_dst_0);

        __m128i old_dst_1 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+1));
        dst_1 = _mm_or_si128(dst_1, old_dst_1);

        __m128i old_dst_2 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+2));
        dst_2 = _mm_or_si128(dst_2, old_dst_2);

        __m128i old_dst_3 = _mm_andnot_si128(write_mask, _mm_load_si128(((__m128i*)dst)+3));
        dst_3 = _mm_or_si128(dst_3, old_dst_3);

        // store
        _mm_stream_si128((__m128i*)dst, dst_0);
        _mm_stream_si128(((__m128i*)dst)+1, dst_1);
        _mm_stream_si128(((__m128i*)dst)+2, dst_2);
        _mm_stream_si128(((__m128i*)dst)+3, dst_3);

        // update the pointers
        dst += 64;
        src += 2*pitch;
    }
}

__forceinline void SwizzleBlock16(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
#ifdef INTRINSIC_PORT_16

@@ -453,6 +624,21 @@ __forceinline void SwizzleBlock4u(u8 *dst, u8 *src, int pitch, u32 WriteMask)
#endif
}

__forceinline void SwizzleBlock8H(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock8H_4H<false, false>(dst, src, pitch, WriteMask);
}

__forceinline void SwizzleBlock4HH(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock8H_4H<true, true>(dst, src, pitch, WriteMask);
}

__forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    SwizzleBlock8H_4H<true, false>(dst, src, pitch, WriteMask);
}

#else

__forceinline void SwizzleBlock32(u8 *dst, u8 *src, int pitch, u32 WriteMask)

@@ -566,13 +752,14 @@ __forceinline void __fastcall SwizzleBlock4_c(u8* dst, u8* src, int srcpitch, u3
    }
}

#endif
__forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask)
{
    u8* pnewsrc = src;
    u32* pblock = tempblock;

    for (int by = 0; by < 7; ++by, pblock += 8, pnewsrc += pitch - 24)
    // Note: src can read out of bounds of GS memory (but there is some spare allocation
    // to avoid a tricky corner case)
    for (int by = 0; by < 8; ++by, pblock += 8, pnewsrc += pitch - 24)
    {
        for (int bx = 0; bx < 8; ++bx, pnewsrc += 3)
        {

@@ -580,19 +767,6 @@ __forceinline void SwizzleBlock24(u8 *dst, u8 *src, int pitch, u32 WriteMask)
        }
    }

    for (int bx = 0; bx < 7; ++bx, pnewsrc += 3)
    {
        /* might be 1 byte out of bounds of GS memory */
        pblock[bx] = *(u32*)pnewsrc;
    }

    /* do 3 bytes for the last copy */
    *((u8*)pblock + 28) = pnewsrc[0];
    *((u8*)pblock + 29) = pnewsrc[1];
    *((u8*)pblock + 30) = pnewsrc[2];

    SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x00ffffff);
}

@@ -659,4 +833,4 @@ __forceinline void SwizzleBlock4HL(u8 *dst, u8 *src, int pitch, u32 WriteMask)
    SwizzleBlock32((u8*)dst, (u8*)tempblock, 32, 0x0f000000);
}

#endif