GregMiscellaneous: zzogl-pg:

* ASM port round 3 + add some comment to understand the algorithm. git-svn-id: http://pcsx2.googlecode.com/svn/branches/GregMiscellaneous@3829 96395faa-99c1-11dd-bbfe-3dabce05a288
2010-09-24 15:38:47 +00:00 · 2010-09-24 15:38:47 +00:00 · 9895bb6b45
parent 3a1ef55cb1
commit 9895bb6b45
1 changed files with 110 additions and 0 deletions
--- a/plugins/zzogl-pg/opengl/x86.cpp
+++ b/plugins/zzogl-pg/opengl/x86.cpp
@ -651,6 +651,115 @@ static const __aligned16 int s_clut16mask[8] = { 0xffff0000, 0xffff0000, 0xffff0

 extern "C" void __fastcall WriteCLUT_T16_I4_CSM1_sse2(u32* vm, u32* clut)
 {
+#define YET_ANOTHER_INTRINSIC
+#ifdef YET_ANOTHER_INTRINSIC
+    __m128i vm0 = _mm_load_si128((__m128i*)vm);
+    __m128i vm1 = _mm_load_si128((__m128i*)vm+1);
+    __m128i vm2 = _mm_load_si128((__m128i*)vm+2);
+    __m128i vm3 = _mm_load_si128((__m128i*)vm+3);
+
+    // rearrange 16bits words
+    vm0 = _mm_shufflehi_epi16(vm0, 0x88);
+    vm0 = _mm_shufflelo_epi16(vm0, 0x88); // 6 4 6 4  2 0 2 0
+    vm1 = _mm_shufflehi_epi16(vm1, 0x88);
+    vm1 = _mm_shufflelo_epi16(vm1, 0x88); // 14 12 14 12  10 8 10 8
+
+    vm0 = (__m128i)_mm_shuffle_ps((__m128)vm0, (__m128)vm1, 0x88); // 14 12 10 8  6 4 2 0
+    vm0 = _mm_shuffle_epi32(vm0, 0xD8); // 14 12 6 4  10 8 2 0
+
+    // *** Same jobs for vm2 and vm3
+    vm2 = _mm_shufflehi_epi16(vm2, 0x88);
+    vm2 = _mm_shufflelo_epi16(vm2, 0x88);
+    vm3 = _mm_shufflehi_epi16(vm3, 0x88);
+    vm3 = _mm_shufflelo_epi16(vm3, 0x88);
+
+    vm2 = (__m128i)_mm_shuffle_ps((__m128)vm2, (__m128)vm3, 0x88);
+    vm2 = _mm_shuffle_epi32(vm2, 0xD8);
+
+    // Create a zero regiters
+    __m128i zero_128;
+    zero_128 = _mm_xor_si128(zero_128, zero_128);
+
+    if ((u32)vm & 0x0F) {
+        // Unaligned write.
+
+        u16* clut_word_ptr = (u16*)clut;
+        __m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask2);
+
+        // Load previous data and clear high 16 bits of double words
+        __m128i clut_0 = _mm_load_si128((__m128i*)(clut_word_ptr-1)); // 6 5 4 3  2 1 0 x
+        __m128i clut_2 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+2); // 22 21 20 19  18 17 16 15
+        clut_0 = _mm_and_si128(clut_0, clut_mask); // - 5 - 3  - 1 - x
+        clut_2 = _mm_and_si128(clut_2, clut_mask); // - 21 - 19  - 17 - 15
+
+        // Convert 16bits to 32 bits vm0 (zero entended)
+        __m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128); // - 10 - 8  - 2 - 0
+        __m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12  - 6  - 4
+
+        // shift the value to aligned it with clut
+        vm0_low = _mm_slli_epi32(vm0_low, 16); // 10 - 8 -  2 - 0 -
+        vm0_high = _mm_slli_epi32(vm0_high, 16); // 14 - 12 -  6 - 4 -
+
+        // Interlace old and new data
+        clut_0 = _mm_or_si128(clut_0, vm0_low); // 10 5 8 3  2 1 0 x
+        clut_2 = _mm_or_si128(clut_2, vm0_high); // 14 21 12 19  6 17 4 15
+
+        // Save the result
+        _mm_store_si128((__m128i*)(clut_word_ptr-1), clut_0);
+        _mm_store_si128((__m128i*)(clut_word_ptr-1)+2, clut_2);
+
+        // *** Same jobs for clut_1 and clut_3
+        __m128i clut_1 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+1);
+        __m128i clut_3 = _mm_load_si128((__m128i*)(clut_word_ptr-1)+3);
+        clut_1 = _mm_and_si128(clut_1, clut_mask);
+        clut_3 = _mm_and_si128(clut_3, clut_mask);
+
+        __m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
+        __m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
+        vm2_low = _mm_slli_epi32(vm2_low, 16);
+        vm2_high = _mm_slli_epi32(vm2_high, 16);
+
+        clut_1 = _mm_or_si128(clut_1, vm2_low);
+        clut_3 = _mm_or_si128(clut_3, vm2_high);
+
+        _mm_store_si128((__m128i*)(clut_word_ptr-1)+1, clut_1);
+        _mm_store_si128((__m128i*)(clut_word_ptr-1)+3, clut_3);
+    } else {
+        // Standard write
+
+        __m128i clut_mask = _mm_load_si128((__m128i*)s_clut16mask);
+
+        // Load previous data and clear low 16 bits of double words
+        __m128i clut_0 = _mm_and_si128(_mm_load_si128((__m128i*)clut), clut_mask); // 7 - 5 -  3 - 1 -
+        __m128i clut_2 = _mm_and_si128(_mm_load_si128((__m128i*)clut+2), clut_mask); // 23 - 21 -  19 - 17 -
+
+        //  Convert 16bits to 32 bits vm0 (zero entended)
+        __m128i vm0_low = _mm_unpacklo_epi16(vm0, zero_128); // - 10 - 8  - 2 - 0
+        __m128i vm0_high = _mm_unpackhi_epi16(vm0, zero_128); // - 14 - 12  - 6  - 4
+
+        // Interlace old and new data
+        clut_0 = _mm_or_si128(clut_0, vm0_low); // 7 10 5 8  3 2 1 0
+        clut_2 = _mm_or_si128(clut_2, vm0_high); // 23 14 21 12  19 6 17 4
+
+        // Save the result
+        _mm_store_si128((__m128i*)clut, clut_0);
+        _mm_store_si128((__m128i*)clut+2, clut_2);
+
+        // *** Same jobs for clut_1 and clut_3
+        __m128i clut_1 = _mm_and_si128(_mm_load_si128((__m128i*)clut+1), clut_mask);
+        __m128i clut_3 = _mm_and_si128(_mm_load_si128((__m128i*)clut+3), clut_mask);
+
+        __m128i vm2_low = _mm_unpacklo_epi16(vm2, zero_128);
+        __m128i vm2_high = _mm_unpackhi_epi16(vm2, zero_128);
+
+        clut_1 = _mm_or_si128(clut_1, vm2_low);
+        clut_3 = _mm_or_si128(clut_3, vm2_high);
+
+        _mm_store_si128((__m128i*)clut+1, clut_1);
+        _mm_store_si128((__m128i*)clut+3, clut_3);
+    }
+
+#else
 #if defined(_MSC_VER)
 	__asm
 	{
@ -881,6 +990,7 @@ End:
    : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
       );
 #endif // _MSC_VER
+#endif
 }

 #endif // ZEROGS_SSE2