IPU: ARM64 compatibility

Stenzek 2024-03-21 17:05:22 +10:00 committed by Connor McLaughlin
parent 4e0e8cef54
commit 7d098674f2
5 changed files with 159 additions and 1 deletion

View File

@@ -1374,6 +1374,7 @@ __fi static bool mpeg2_slice()
//Cr bias - 8 * 8
//Cb bias - 8 * 8
#if defined(_M_X86)
__m128i zeroreg = _mm_setzero_si128();
for (uint i = 0; i < (256+64+64) / 32; ++i)
@@ -1388,6 +1389,24 @@ __fi static bool mpeg2_slice()
s += 32;
d += 32;
}
#elif defined(_M_ARM64)
uint8x16_t zeroreg = vmovq_n_u8(0);
for (uint i = 0; i < (256 + 64 + 64) / 32; ++i)
{
//*d++ = *s++;
uint8x16_t woot1 = vld1q_u8((uint8_t*)s);
uint8x16_t woot2 = vld1q_u8((uint8_t*)s + 16);
vst1q_u8((uint8_t*)d, vzip1q_u8(woot1, zeroreg));
vst1q_u8((uint8_t*)d + 16, vzip2q_u8(woot1, zeroreg));
vst1q_u8((uint8_t*)d + 32, vzip1q_u8(woot2, zeroreg));
vst1q_u8((uint8_t*)d + 48, vzip2q_u8(woot2, zeroreg));
s += 32;
d += 32;
}
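// Note: vzip1q_u8/vzip2q_u8 against the zero register interleave a zero byte
// after each source byte, zero-extending u8 to u16 for the 16-bit destination;
// this mirrors the unpack-against-zeroreg idiom the SSE2 branch uses.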
#else
#error Unsupported arch
#endif
}
}
else

View File

@@ -11,11 +11,18 @@
MULTI_ISA_UNSHARED_START
void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte);
#if defined(_M_X86)
void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte);
#endif
__ri void ipu_dither(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
{
#if defined(_M_X86)
ipu_dither_sse2(rgb32, rgb16, dte);
#else
ipu_dither_reference(rgb32, rgb16, dte);
#endif
}
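// Note that non-x86 builds (including ARM64) take the scalar
// ipu_dither_reference() path here; this commit adds no NEON dither variant.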
__ri void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
@@ -53,6 +60,8 @@ __ri void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &
}
}
#if defined(_M_X86)
__ri void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
{
const __m128i alpha_test = _mm_set1_epi16(0x40);
@@ -110,4 +119,6 @@ __ri void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16
}
}
#endif
MULTI_ISA_UNSHARED_END

View File

@@ -168,7 +168,7 @@ static constexpr VLC_ALIGNED16 MVtab MV_10[] = {
static constexpr DMVtab DMV_2[] = {
-	{0, 1}, {0, 1}, {1, 2}, {-1, 2}};
+	{0, 1}, {0, 1}, {1, 2}, {(s8)-1, 2}};
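// The explicit (s8)-1 cast is presumably needed because plain char is
// unsigned in the AArch64 ABI, so the bare -1 initializer triggers a
// narrowing diagnostic there.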
static constexpr VLC_ALIGNED16 CBPtab CBP_7[] = {

View File

@@ -42,6 +42,8 @@ void yuv2rgb_reference(void)
}
}
#if defined(_M_X86)
// Suikoden Tactics FMV speed results: Reference - ~72fps, SSE2 - ~120fps
// An AVX2 version is only slightly faster than an SSE2 version (+2-3fps)
// (or I'm a poor optimiser), though it might be worth attempting again
@@ -134,4 +136,121 @@ __ri void yuv2rgb_sse2()
}
}
#elif defined(_M_ARM64)
#if defined(_MSC_VER) && !defined(__clang__)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#define MULHI16(a, b) vshrq_n_s16(vqdmulhq_s16((a), (b)), 1)
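// MULHI16 stands in for SSE2's _mm_mulhi_epi16: vqdmulhq_s16 returns the high
// half of the doubled product, (2*a*b) >> 16, and the outer shift undoes the
// doubling. The two differ only when both operands are -32768 (vqdmulh
// saturates), a value the constant coefficients below should never take.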
__ri void yuv2rgb_neon()
{
const int8x16_t c_bias = vdupq_n_s8(s8(IPU_C_BIAS));
const uint8x16_t y_bias = vdupq_n_u8(IPU_Y_BIAS);
const int16x8_t y_mask = vdupq_n_s16(s16(0xFF00));
// Specifying round-to-nearest here, instead of the round-down used
// everywhere else, implies that this rounding is intentional.
const int16x8_t round_1bit = vdupq_n_s16(0x0001);
const int16x8_t y_coefficient = vdupq_n_s16(s16(IPU_Y_COEFF << 2));
const int16x8_t gcr_coefficient = vdupq_n_s16(s16(u16(IPU_GCR_COEFF) << 2));
const int16x8_t gcb_coefficient = vdupq_n_s16(s16(u16(IPU_GCB_COEFF) << 2));
const int16x8_t rcr_coefficient = vdupq_n_s16(s16(IPU_RCR_COEFF << 2));
const int16x8_t bcb_coefficient = vdupq_n_s16(s16(IPU_BCB_COEFF << 2));
// Alpha set to 0x80 here. The threshold stuff is done later.
const uint8x16_t alpha = vreinterpretq_u8_s8(c_bias);
for (int n = 0; n < 8; ++n)
{
// Load 8 bytes of chroma into the low vector half; the SSE2 version uses
// loadl_epi64 here because a full 16-byte load would need separate
// aligned/unaligned variants.
int8x16_t cb = vcombine_s8(vld1_s8(reinterpret_cast<s8*>(&decoder.mb8.Cb[n][0])), vdup_n_s8(0));
int8x16_t cr = vcombine_s8(vld1_s8(reinterpret_cast<s8*>(&decoder.mb8.Cr[n][0])), vdup_n_s8(0));
// (Cb - 128) << 8, (Cr - 128) << 8
cb = veorq_s8(cb, c_bias);
cr = veorq_s8(cr, c_bias);
cb = vzip1q_s8(vdupq_n_s8(0), cb);
cr = vzip1q_s8(vdupq_n_s8(0), cr);
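// XORing the unsigned chroma byte with the 0x80 bias flips its sign bit,
// which equals subtracting 128; zipping a zero byte below each value then
// produces the (Cb - 128) << 8 / (Cr - 128) << 8 form noted above.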
int16x8_t rc = MULHI16(vreinterpretq_s16_s8(cr), rcr_coefficient);
int16x8_t gc = vqaddq_s16(MULHI16(vreinterpretq_s16_s8(cr), gcr_coefficient), MULHI16(vreinterpretq_s16_s8(cb), gcb_coefficient));
int16x8_t bc = MULHI16(vreinterpretq_s16_s8(cb), bcb_coefficient);
for (int m = 0; m < 2; ++m)
{
uint8x16_t y = vld1q_u8(&decoder.mb8.Y[n * 2 + m][0]);
y = vqsubq_u8(y, y_bias);
// Y << 8 for pixels 0, 2, 4, 6, 8, 10, 12, 14
int16x8_t y_even = vshlq_n_s16(vreinterpretq_s16_u8(y), 8);
// Y << 8 for pixels 1, 3, 5, 7, 9, 11, 13, 15
int16x8_t y_odd = vandq_s16(vreinterpretq_s16_u8(y), y_mask);
// y_even = _mm_mulhi_epu16(y_even, y_coefficient);
// y_odd = _mm_mulhi_epu16(y_odd, y_coefficient);
uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_s16(y_even));
uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_s16(y_coefficient));
uint32x4_t ab3210 = vmull_u16(a3210, b3210);
uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_s16(y_even), vreinterpretq_u16_s16(y_coefficient));
y_even = vreinterpretq_s16_u16(vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)));
a3210 = vget_low_u16(vreinterpretq_u16_s16(y_odd));
b3210 = vget_low_u16(vreinterpretq_u16_s16(y_coefficient));
ab3210 = vmull_u16(a3210, b3210);
ab7654 = vmull_high_u16(vreinterpretq_u16_s16(y_odd), vreinterpretq_u16_s16(y_coefficient));
y_odd = vreinterpretq_s16_u16(vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)));
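// The y_even and y_odd blocks above emulate the commented _mm_mulhi_epu16:
// the widening vmull_u16/vmull_high_u16 pair forms full 32-bit products, and
// vuzp2q_u16 keeps the odd (upper) u16 half of each product, i.e. the
// unsigned high-half multiply.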
int16x8_t r_even = vqaddq_s16(rc, y_even);
int16x8_t r_odd = vqaddq_s16(rc, y_odd);
int16x8_t g_even = vqaddq_s16(gc, y_even);
int16x8_t g_odd = vqaddq_s16(gc, y_odd);
int16x8_t b_even = vqaddq_s16(bc, y_even);
int16x8_t b_odd = vqaddq_s16(bc, y_odd);
// round
r_even = vshrq_n_s16(vaddq_s16(r_even, round_1bit), 1);
r_odd = vshrq_n_s16(vaddq_s16(r_odd, round_1bit), 1);
g_even = vshrq_n_s16(vaddq_s16(g_even, round_1bit), 1);
g_odd = vshrq_n_s16(vaddq_s16(g_odd, round_1bit), 1);
b_even = vshrq_n_s16(vaddq_s16(b_even, round_1bit), 1);
b_odd = vshrq_n_s16(vaddq_s16(b_odd, round_1bit), 1);
// combine even and odd bytes in original order
uint8x16_t r = vcombine_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x16_t g = vcombine_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x16_t b = vcombine_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
r = vzip1q_u8(r, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(r), 1)));
g = vzip1q_u8(g, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(g), 1)));
b = vzip1q_u8(b, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(b), 1)));
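// Each of r/g/b holds the even pixels in its low 8 bytes and the odd pixels
// in its high 8 bytes; duplicating the high half and zipping it against the
// low half restores the original pixel order.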
// Create RGBA quads (we could generate A here, but we don't)
uint8x16_t rg_l = vzip1q_u8(r, g);
uint8x16_t ba_l = vzip1q_u8(b, alpha);
uint16x8_t rgba_ll = vzip1q_u16(vreinterpretq_u16_u8(rg_l), vreinterpretq_u16_u8(ba_l));
uint16x8_t rgba_lh = vzip2q_u16(vreinterpretq_u16_u8(rg_l), vreinterpretq_u16_u8(ba_l));
uint8x16_t rg_h = vzip2q_u8(r, g);
uint8x16_t ba_h = vzip2q_u8(b, alpha);
uint16x8_t rgba_hl = vzip1q_u16(vreinterpretq_u16_u8(rg_h), vreinterpretq_u16_u8(ba_h));
uint16x8_t rgba_hh = vzip2q_u16(vreinterpretq_u16_u8(rg_h), vreinterpretq_u16_u8(ba_h));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][0]), vreinterpretq_u8_u16(rgba_ll));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][4]), vreinterpretq_u8_u16(rgba_lh));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][8]), vreinterpretq_u8_u16(rgba_hl));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][12]), vreinterpretq_u8_u16(rgba_hh));
}
}
}
#undef MULHI16
#endif
MULTI_ISA_UNSHARED_END

View File

@@ -7,5 +7,14 @@
MULTI_ISA_DEF(extern void yuv2rgb_reference();)
#if defined(_M_X86)
#define yuv2rgb yuv2rgb_sse2
MULTI_ISA_DEF(extern void yuv2rgb_sse2();)
#elif defined(_M_ARM64)
#define yuv2rgb yuv2rgb_neon
MULTI_ISA_DEF(extern void yuv2rgb_neon();)
#endif
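// With this mapping, callers simply invoke yuv2rgb() and the preprocessor
// selects the ISA-specific symbol: yuv2rgb_sse2 on x86, yuv2rgb_neon on ARM64.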