pcsx2: Add IPU dither block SSE2 implementation

2019-10-07 17:50:06 +01:00 · 2019-10-07 17:50:06 +01:00 · c533a65764
parent f9b51e2656
commit c533a65764
1 changed files with 65 additions and 0 deletions
--- a/pcsx2/IPU/IPUdither.cpp
+++ b/pcsx2/IPU/IPUdither.cpp
@ -21,7 +21,15 @@
 #include "yuv2rgb.h"
 #include "mpeg2lib/Mpeg.h"

+void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte); // defined later in this file
+void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte); // SSE2 intrinsics version, defined later in this file
+
 __ri void ipu_dither(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
+{
+    // Every call is routed to the SSE2 implementation with the arguments
+    // passed through unchanged; ipu_dither_reference() is not called from
+    // this dispatcher.
+    return ipu_dither_sse2(rgb32, rgb16, dte);
+}
+
+__ri void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
 {
    if (dte) {
        // I'm guessing values are rounded down when clamping.
@ -55,3 +63,60 @@ __ri void ipu_dither(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int
        }
    }
 }
+
+// SSE2 version of the IPU dither step. Walks a 16x16 macroblock eight pixels
+// at a time and packs each 4-byte source pixel into one 16-bit value:
+//   bits 0-4   first channel byte  >> 3  (named r below)
+//   bits 5-9   second channel byte >> 3  (g)
+//   bits 10-14 third channel byte  >> 3  (b)
+//   bit  15    set when the fourth byte (a) equals 0x40
+//
+//   rgb32 - source macroblock, read with aligned 16-byte loads.
+//   rgb16 - destination macroblock, written with aligned 16-byte stores.
+//   dte   - non-zero: apply the 4x4 dither offsets (with saturation to
+//           0..255) before dropping the low 3 bits; zero: truncate only.
+//
+// NOTE(review): the aligned load/store intrinsics require every 8-pixel chunk
+// to be 16-byte aligned; the macroblock types are declared outside this file,
+// so confirm they carry that alignment.
+__ri void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
+{
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i alpha_on = _mm_set1_epi16(0x40);
+
+    // The dither offsets are split into a positive table and a negative table
+    // so they can be applied with unsigned saturating add/sub. Each table row
+    // is selected by (row & 3); each 32-bit lane is one pixel column, with the
+    // same offset in the three colour bytes and zero in the alpha byte.
+    const __m128i offsets_up[] = {
+        _mm_setr_epi32(0x00000000, 0x00000000, 0x00000000, 0x00010101),
+        _mm_setr_epi32(0x00020202, 0x00000000, 0x00030303, 0x00000000),
+        _mm_setr_epi32(0x00000000, 0x00010101, 0x00000000, 0x00000000),
+        _mm_setr_epi32(0x00030303, 0x00000000, 0x00020202, 0x00000000),
+    };
+    const __m128i offsets_down[] = {
+        _mm_setr_epi32(0x00040404, 0x00000000, 0x00030303, 0x00000000),
+        _mm_setr_epi32(0x00000000, 0x00020202, 0x00000000, 0x00010101),
+        _mm_setr_epi32(0x00030303, 0x00000000, 0x00040404, 0x00000000),
+        _mm_setr_epi32(0x00000000, 0x00010101, 0x00000000, 0x00020202),
+    };
+
+    for (int row = 0; row < 16; ++row) {
+        const __m128i up = offsets_up[row & 3];
+        const __m128i down = offsets_down[row & 3];
+
+        for (int half = 0; half < 2; ++half) {
+            const int col = half * 8;
+
+            // Two registers of four pixels each: columns col..col+3 and col+4..col+7.
+            __m128i px_lo = _mm_load_si128(reinterpret_cast<const __m128i *>(&rgb32.c[row][col]));
+            __m128i px_hi = _mm_load_si128(reinterpret_cast<const __m128i *>(&rgb32.c[row][col + 4]));
+
+            // The offset pattern repeats every four columns, so the same
+            // vectors serve both registers. Saturation performs the clamp.
+            if (dte) {
+                px_lo = _mm_subs_epu8(_mm_adds_epu8(px_lo, up), down);
+                px_hi = _mm_subs_epu8(_mm_adds_epu8(px_hi, up), down);
+            }
+
+            // Three rounds of byte interleaving transpose the eight pixels
+            // into per-channel runs: r0..r7 g0..g7 | b0..b7 a0..a7.
+            const __m128i px_0415 = _mm_unpacklo_epi8(px_lo, px_hi);
+            const __m128i px_2637 = _mm_unpackhi_epi8(px_lo, px_hi);
+            const __m128i px_0246 = _mm_unpacklo_epi8(px_0415, px_2637);
+            const __m128i px_1357 = _mm_unpackhi_epi8(px_0415, px_2637);
+            const __m128i red_green = _mm_unpacklo_epi8(px_0246, px_1357);
+            const __m128i blue_alpha = _mm_unpackhi_epi8(px_0246, px_1357);
+
+            // Zero-extend each channel to 16 bits, keep its top 5 bits and
+            // move it to its position in the packed pixel.
+            const __m128i red = _mm_srli_epi16(_mm_unpacklo_epi8(red_green, zero), 3);
+            const __m128i green = _mm_slli_epi16(_mm_srli_epi16(_mm_unpackhi_epi8(red_green, zero), 3), 5);
+            const __m128i blue = _mm_slli_epi16(_mm_srli_epi16(_mm_unpacklo_epi8(blue_alpha, zero), 3), 10);
+
+            // The compare yields 0xFFFF where alpha == 0x40; shifting left by
+            // 15 leaves only bit 15 set for those pixels.
+            const __m128i alpha = _mm_slli_epi16(_mm_cmpeq_epi16(_mm_unpackhi_epi8(blue_alpha, zero), alpha_on), 15);
+
+            __m128i packed = _mm_or_si128(red, green);
+            packed = _mm_or_si128(packed, blue);
+            packed = _mm_or_si128(packed, alpha);
+
+            _mm_store_si128(reinterpret_cast<__m128i *>(&rgb16.c[row][col]), packed);
+        }
+    }
+}