mirror of https://github.com/PCSX2/pcsx2.git
GregMiscellaneous: zzogl-pg:
* ASM port round 3 + add some comment to understand the algorithm. git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3829 96395faa-99c1-11dd-bbfe-3dabce05a288
This commit is contained in:
parent
3a1ef55cb1
commit
9895bb6b45
|
@ -651,6 +651,115 @@ static const __aligned16 int s_clut16mask[8] = { 0xffff0000, 0xffff0000, 0xffff0
|
||||||
|
|
||||||
extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
|
extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
|
||||||
{
|
{
|
||||||
|
#define YET_ANOTHER_INTRINSIC
|
||||||
|
#ifdef YET_ANOTHER_INTRINSIC
|
||||||
|
__m128i vm0 = _mm_load_si128((__m128i*)vm);
|
||||||
|
__m128i vm1 = _mm_load_si128((__m128i*)vm+1);
|
||||||
|
__m128i vm2 = _mm_load_si128((__m128i*)vm+2);
|
||||||
|
__m128i vm3 = _mm_load_si128((__m128i*)vm+3);
|
||||||
|
|
||||||
|
// rearrange 16bits words
|
||||||
|
vm0 = _mm_shufflehi_epi16(vm0, 0x88);
|
||||||
|
vm0 = _mm_shufflelo_epi16(vm0, 0x88); // 6 4 6 4 2 0 2 0
|
||||||
|
vm1 = _mm_shufflehi_epi16(vm1, 0x88);
|
||||||
|
vm1 = _mm_shufflelo_epi16(vm1, 0x88); // 14 12 14 12 10 8 10 8
|
||||||
|
|
||||||
|
vm0 = (__m128i)_mm_shuffle_ps((__m128)vm0, (__m128)vm1, 0x88); // 14 12 10 8 6 4 2 0
|
||||||
|
vm0 = _mm_shuffle_epi32(vm0, 0xD8); // 14 12 6 4 10 8 2 0
|
||||||
|
|
||||||
|
// *** Same jobs for vm2 and vm3
|
||||||
|
vm2 = _mm_shufflehi_epi16(vm2, 0x88);
|
||||||
|
vm2 = _mm_shufflelo_epi16(vm2, 0x88);
|
||||||
|
vm3 = _mm_shufflehi_epi16(vm3, 0x88);
|
||||||
|
vm3 = _mm_shufflelo_epi16(vm3, 0x88);
|
||||||
|
|
||||||
|
vm2 = (__m128i)_mm_shuffle_ps((__m128)vm2, (__m128)vm3, 0x88);
|
||||||
|
vm2 = _mm_shuffle_epi32(vm2, 0xD8);
|
||||||
|
|
||||||
|
// Create a zero regiters
|
||||||
|
__m128i zero_128;
|
||||||
|
zero_128 = _mm_xor_si128(zero_128, zero_128);
|
||||||
|
|
||||||
|
if ((u32)vm & 0x0F) {
|
||||||
|
// Unaligned write.
|
||||||
|
|
||||||
|
u16* clut_word_ptr = (u16*)clut;
|
||||||
|
__m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask2);
|
||||||
|
|
||||||
|
// Load previous data and clear high 16 bits of double words
|
||||||
|
__m128i clut_0 = _mm_load_si128((__m128i*)(clut_word_ptr-1)); // 6 5 4 3 2 1 0 x
|
||||||
|
__m128i clut_2 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+2); // 22 21 20 19 18 17 16 15
|
||||||
|
clut_0 = _mm_and_si128(clut_0, clut_mask); // - 5 - 3 - 1 - x
|
||||||
|
clut_2 = _mm_and_si128(clut_2, clut_mask); // - 21 - 19 - 17 - 15
|
||||||
|
|
||||||
|
// Convert 16bits to 32 bits vm0 (zero entended)
|
||||||
|
__m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128); // - 10 - 8 - 2 - 0
|
||||||
|
__m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12 - 6 - 4
|
||||||
|
|
||||||
|
// shift the value to aligned it with clut
|
||||||
|
vm0_low = _mm_slli_epi32(vm0_low, 16); // 10 - 8 - 2 - 0 -
|
||||||
|
vm0_high = _mm_slli_epi32(vm0_high, 16); // 14 - 12 - 6 - 4 -
|
||||||
|
|
||||||
|
// Interlace old and new data
|
||||||
|
clut_0 = _mm_or_si128(clut_0, vm0_low); // 10 5 8 3 2 1 0 x
|
||||||
|
clut_2 = _mm_or_si128(clut_2, vm0_high); // 14 21 12 19 6 17 4 15
|
||||||
|
|
||||||
|
// Save the result
|
||||||
|
_mm_store_si128((__m128i*)(clut_word_ptr-1), clut_0);
|
||||||
|
_mm_store_si128((__m128i*)(clut_word_ptr-1)+2, clut_2);
|
||||||
|
|
||||||
|
// *** Same jobs for clut_1 and clut_3
|
||||||
|
__m128i clut_1 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+1);
|
||||||
|
__m128i clut_3 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+3);
|
||||||
|
clut_1 = _mm_and_si128(clut_1, clut_mask);
|
||||||
|
clut_3 = _mm_and_si128(clut_3, clut_mask);
|
||||||
|
|
||||||
|
__m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
|
||||||
|
__m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
|
||||||
|
vm2_low = _mm_slli_epi32(vm2_low, 16);
|
||||||
|
vm2_high = _mm_slli_epi32(vm2_high, 16);
|
||||||
|
|
||||||
|
clut_1 = _mm_or_si128(clut_1, vm2_low);
|
||||||
|
clut_3 = _mm_or_si128(clut_3, vm2_high);
|
||||||
|
|
||||||
|
_mm_store_si128((__m128i*)(clut_word_ptr-1)+1, clut_1);
|
||||||
|
_mm_store_si128((__m128i*)(clut_word_ptr-1)+3, clut_3);
|
||||||
|
} else {
|
||||||
|
// Standard write
|
||||||
|
|
||||||
|
__m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask);
|
||||||
|
|
||||||
|
// Load previous data and clear low 16 bits of double words
|
||||||
|
__m128i clut_0 = _mm_and_si128(_mm_load_si128((__m128i*)clut), clut_mask); // 7 - 5 - 3 - 1 -
|
||||||
|
__m128i clut_2 = _mm_and_si128(_mm_load_si128((__m128i*)clut+2), clut_mask); // 23 - 21 - 19 - 17 -
|
||||||
|
|
||||||
|
// Convert 16bits to 32 bits vm0 (zero entended)
|
||||||
|
__m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128); // - 10 - 8 - 2 - 0
|
||||||
|
__m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12 - 6 - 4
|
||||||
|
|
||||||
|
// Interlace old and new data
|
||||||
|
clut_0 = _mm_or_si128(clut_0, vm0_low); // 7 10 5 8 3 2 1 0
|
||||||
|
clut_2 = _mm_or_si128(clut_2, vm0_high); // 23 14 21 12 19 6 17 4
|
||||||
|
|
||||||
|
// Save the result
|
||||||
|
_mm_store_si128((__m128i*)clut, clut_0);
|
||||||
|
_mm_store_si128((__m128i*)clut+2, clut_2);
|
||||||
|
|
||||||
|
// *** Same jobs for clut_1 and clut_3
|
||||||
|
__m128i clut_1 = _mm_and_si128(_mm_load_si128((__m128i*)clut+1), clut_mask);
|
||||||
|
__m128i clut_3 = _mm_and_si128(_mm_load_si128((__m128i*)clut+3), clut_mask);
|
||||||
|
|
||||||
|
__m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
|
||||||
|
__m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
|
||||||
|
|
||||||
|
clut_1 = _mm_or_si128(clut_1, vm2_low);
|
||||||
|
clut_3 = _mm_or_si128(clut_3, vm2_high);
|
||||||
|
|
||||||
|
_mm_store_si128((__m128i*)clut+1, clut_1);
|
||||||
|
_mm_store_si128((__m128i*)clut+3, clut_3);
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
__asm
|
__asm
|
||||||
{
|
{
|
||||||
|
@ -881,6 +990,7 @@ End:
|
||||||
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
|
: "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
|
||||||
);
|
);
|
||||||
#endif // _MSC_VER
|
#endif // _MSC_VER
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // ZEROGS_SSE2
|
#endif // ZEROGS_SSE2
|
||||||
|
|
Loading…
Reference in New Issue