ipu: Convert yuv2rgb sse2 inline assembly to intrinsics

It's more portable.

Use _mm_shuffle_epi32 instead of _mm_movehl_ps - it stays in the integer
domain, which I think avoids data bypass delays between the integer and
float domains on older processors, and Agner Fog's instruction tables
indicate it has the same latency and occasionally higher throughput
(depending on the CPU).
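
(For illustration only, not part of the commit: a minimal sketch of the
equivalence. movhlps is _mm_movehl_ps and needs casts to use on integer
data; pshufd is _mm_shuffle_epi32 and stays in the integer domain.)

    #include <emmintrin.h> // SSE2

    // Broadcast the high 64 bits of an integer vector into both halves.
    // The movhlps version crosses into the float domain, which can cost a
    // bypass delay after integer ops like pmulhw/packuswb on some CPUs.
    __m128i high_qword_movehl(__m128i v)
    {
        return _mm_castps_si128(
            _mm_movehl_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(v)));
    }

    // The pshufd version copies dwords 3:2 into lanes 1:0 (and 3:2),
    // producing the same result without leaving the integer domain.
    __m128i high_qword_shuffle(__m128i v)
    {
        return _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 2, 3, 2));
    }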

Also swap the order of _mm_xor_si128 and _mm_unpacklo_epi8 so that the same
constant can be used for both the chroma (C) bias and the alpha value.
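
(Again for illustration, a sketch under my reading of the diff - not the
verbatim code. XORing a byte with 0x80 flips its sign bit, i.e. computes
Cb - 128 mod 256, so the bias can be applied while the data is still 8-bit:)

    #include <emmintrin.h>

    __m128i bias_then_widen(__m128i cb, __m128i c_bias /* _mm_set1_epi8(0x80) */)
    {
        cb = _mm_xor_si128(cb, c_bias); // Cb ^ 0x80 == Cb - 128 (mod 256)
        // Unpacking against zero puts each byte in the high half of a word,
        // yielding the signed value (Cb - 128) << 8.
        return _mm_unpacklo_epi8(_mm_setzero_si128(), cb);
    }

With the xor done first, the same all-0x80 byte vector also serves as the
alpha source when packing RGBA, so one constant fewer is needed.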
Jonathan Li 2016-05-12 18:27:05 +01:00
parent 571432a7aa
commit eaa4abea45
2 changed files with 75 additions and 341 deletions

pcsx2/IPU/yuv2rgb.cpp

@@ -1,5 +1,5 @@
 /* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2010 PCSX2 Dev Team
+ * Copyright (C) 2002-2016 PCSX2 Dev Team
  *
  * PCSX2 is free software: you can redistribute it and/or modify it under the terms
  * of the GNU Lesser General Public License as published by the Free Software Found-
@@ -58,357 +58,94 @@ void yuv2rgb_reference(void)
     }
 }
 
-#if defined(_M_X86_32)
-
-// TODO OSX optimize me
-#if defined(__clang__) && !defined(__linux__)
-void yuv2rgb_sse2() {
-    yuv2rgb_reference();
-}
-#else
-
 // Everything below is bit accurate to the IPU specification (except maybe rounding).
 // Know the specification before you touch it.
-#define SSE_BYTES(x) {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x}
-#define SSE_WORDS(x) {x, x, x, x, x, x, x, x}
-#define SSE_COEFFICIENTS(x) SSE_WORDS((s16)((x)<<2))
-
-struct SSE2_Tables
+
+// Suikoden Tactics FMV speed results: Reference - ~72fps, SSE2 - ~120fps
+// An AVX2 version is only slightly faster than an SSE2 version (+2-3fps)
+// (or I'm a poor optimiser), though it might be worth attempting again
+// once we've ported to 64 bits (the extra registers should help).
+__ri void yuv2rgb_sse2()
 {
-    u16 C_bias[8];          // offset -64
-    u8 Y_bias[16];          // offset -48
-    u16 Y_mask[8];          // offset -32
-    u16 round_1bit[8];      // offset -16
-
-    s16 Y_coefficients[8];  // offset 0
-    s16 GCr_coefficients[8];// offset 16
-    s16 GCb_coefficients[8];// offset 32
-    s16 RCr_coefficients[8];// offset 48
-    s16 BCb_coefficients[8];// offset 64
-};
-
-enum
-{
-    C_BIAS     = -0x40,
-    Y_BIAS     = -0x30,
-    Y_MASK     = -0x20,
-    ROUND_1BIT = -0x10,
-
-    Y_COEFF   = 0x00,
-    GCr_COEFF = 0x10,
-    GCb_COEFF = 0x20,
-    RCr_COEFF = 0x30,
-    BCb_COEFF = 0x40
-};
-
-static const __aligned16 SSE2_Tables sse2_tables =
-{
-    SSE_WORDS(0x8000),      // c_bias
-    SSE_BYTES(IPU_Y_BIAS),  // y_bias
-    SSE_WORDS(0xff00),      // y_mask
+    const __m128i c_bias = _mm_set1_epi8(s8(IPU_C_BIAS));
+    const __m128i y_bias = _mm_set1_epi8(IPU_Y_BIAS);
+    const __m128i y_mask = _mm_set1_epi16(s16(0xFF00));
 
     // Specifying round off instead of round down as everywhere else
     // implies that this is right
-    SSE_WORDS(1),           // round_1bit
+    const __m128i round_1bit = _mm_set1_epi16(0x0001);;
 
-    SSE_COEFFICIENTS(IPU_Y_COEFF),
-    SSE_COEFFICIENTS(IPU_GCR_COEFF),
-    SSE_COEFFICIENTS(IPU_GCB_COEFF),
-    SSE_COEFFICIENTS(IPU_RCR_COEFF),
-    SSE_COEFFICIENTS(IPU_BCB_COEFF),
-};
+    const __m128i y_coefficient = _mm_set1_epi16(s16(IPU_Y_COEFF << 2));
+    const __m128i gcr_coefficient = _mm_set1_epi16(s16(u16(IPU_GCR_COEFF) << 2));
+    const __m128i gcb_coefficient = _mm_set1_epi16(s16(u16(IPU_GCB_COEFF) << 2));
+    const __m128i rcr_coefficient = _mm_set1_epi16(s16(IPU_RCR_COEFF << 2));
+    const __m128i bcb_coefficient = _mm_set1_epi16(s16(IPU_BCB_COEFF << 2));
 
-static __aligned16 u16 yuv2rgb_temp[3][8];
+    // Alpha set to 0x80 here. The threshold stuff is done later.
+    const __m128i& alpha = c_bias;
 
-// This could potentially be improved for SSE4
-__ri void yuv2rgb_sse2(void)
-{
-#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
-    __asm {
-        mov eax, 1
-        xor esi, esi
-        xor edi, edi
+    for (int n = 0; n < 8; ++n) {
+        // could skip the loadl_epi64 but most SSE instructions require 128-bit
+        // alignment so two versions would be needed.
+        __m128i cb = _mm_loadl_epi64(reinterpret_cast<__m128i*>(&decoder.mb8.Cb[n][0]));
+        __m128i cr = _mm_loadl_epi64(reinterpret_cast<__m128i*>(&decoder.mb8.Cr[n][0]));
 
-        // Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
-        // This saves 2-3 bytes per instruction where these are used. :)
-        mov ecx, offset yuv2rgb_temp
-        mov edx, offset sse2_tables+64;
+        // (Cb - 128) << 8, (Cr - 128) << 8
+        cb = _mm_xor_si128(cb, c_bias);
+        cr = _mm_xor_si128(cr, c_bias);
+        cb = _mm_unpacklo_epi8(_mm_setzero_si128(), cb);
+        cr = _mm_unpacklo_epi8(_mm_setzero_si128(), cr);
 
-        align 16
-tworows:
-        movq xmm3, qword ptr [decoder.mb8+256+esi]
-        movq xmm1, qword ptr [decoder.mb8+320+esi]
-        pxor xmm2, xmm2
-        pxor xmm0, xmm0
-        // could skip the movq but punpck requires 128-bit alignment
-        // for some reason, so two versions would be needed,
-        // bloating the function (further)
-        punpcklbw xmm2, xmm3
-        punpcklbw xmm0, xmm1
-        // unfortunately I don't think this will matter despite being
-        // technically potentially a little faster, but this is
-        // equivalent to an add or sub
-        pxor xmm2, xmmword ptr [edx+C_BIAS] // xmm2 <-- 8 x (Cb - 128) << 8
-        pxor xmm0, xmmword ptr [edx+C_BIAS] // xmm0 <-- 8 x (Cr - 128) << 8
+        __m128i rc = _mm_mulhi_epi16(cr, rcr_coefficient);
+        __m128i gc = _mm_adds_epi16(_mm_mulhi_epi16(cr, gcr_coefficient), _mm_mulhi_epi16(cb, gcb_coefficient));
+        __m128i bc = _mm_mulhi_epi16(cb, bcb_coefficient);
 
-        movaps xmm1, xmm0
-        movaps xmm3, xmm2
-        pmulhw xmm1, xmmword ptr [edx+GCr_COEFF]
-        pmulhw xmm3, xmmword ptr [edx+GCb_COEFF]
-        pmulhw xmm0, xmmword ptr [edx+RCr_COEFF]
-        pmulhw xmm2, xmmword ptr [edx+BCb_COEFF]
-        paddsw xmm1, xmm3
-        // store for the next line; looking at the code above
-        // compared to the code below, I have to wonder whether
-        // this was worth the hassle
-        movaps xmmword ptr [ecx], xmm0
-        movaps xmmword ptr [ecx+16], xmm1
-        movaps xmmword ptr [ecx+32], xmm2
-        jmp ihatemsvc
+        for (int m = 0; m < 2; ++m) {
+            __m128i y = _mm_load_si128(reinterpret_cast<__m128i*>(&decoder.mb8.Y[n * 2 + m][0]));
+            y = _mm_subs_epu8(y, y_bias);
+            // Y << 8 for pixels 0, 2, 4, 6, 8, 10, 12, 14
+            __m128i y_even = _mm_slli_epi16(y, 8);
+            // Y << 8 for pixels 1, 3, 5, 7 ,9, 11, 13, 15
+            __m128i y_odd = _mm_and_si128(y, y_mask);
 
-        align 16
-onerow:
-        movaps xmm0, xmmword ptr [ecx]
-        movaps xmm1, xmmword ptr [ecx+16]
-        movaps xmm2, xmmword ptr [ecx+32]
+            y_even = _mm_mulhi_epu16(y_even, y_coefficient);
+            y_odd = _mm_mulhi_epu16(y_odd, y_coefficient);
 
-        // If masm directives worked properly in inline asm, I'd be using them,
-        // but I'm not inclined to write ~70 line #defines to simulate them.
-        // Maybe the function's faster like this anyway because it's smaller?
-        // I'd have to write a 70 line #define to benchmark it.
+            __m128i r_even = _mm_adds_epi16(rc, y_even);
+            __m128i r_odd = _mm_adds_epi16(rc, y_odd);
+            __m128i g_even = _mm_adds_epi16(gc, y_even);
+            __m128i g_odd = _mm_adds_epi16(gc, y_odd);
+            __m128i b_even = _mm_adds_epi16(bc, y_even);
+            __m128i b_odd = _mm_adds_epi16(bc, y_odd);
 
-ihatemsvc:
-        movaps xmm3, xmm0
-        movaps xmm4, xmm1
-        movaps xmm5, xmm2
+            // round
+            r_even = _mm_srai_epi16(_mm_add_epi16(r_even, round_1bit), 1);
+            r_odd = _mm_srai_epi16(_mm_add_epi16(r_odd, round_1bit), 1);
+            g_even = _mm_srai_epi16(_mm_add_epi16(g_even, round_1bit), 1);
+            g_odd = _mm_srai_epi16(_mm_add_epi16(g_odd, round_1bit), 1);
+            b_even = _mm_srai_epi16(_mm_add_epi16(b_even, round_1bit), 1);
+            b_odd = _mm_srai_epi16(_mm_add_epi16(b_odd, round_1bit), 1);
 
-        movaps xmm6, xmmword ptr [decoder.mb8+edi]
-        psubusb xmm6, xmmword ptr [edx+Y_BIAS]
-        movaps xmm7, xmm6
-        psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
-        pand xmm7, xmmword ptr [edx+Y_MASK] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
+            // combine even and odd bytes in original order
+            __m128i r = _mm_packus_epi16(r_even, r_odd);
+            __m128i g = _mm_packus_epi16(g_even, g_odd);
+            __m128i b = _mm_packus_epi16(b_even, b_odd);
 
-        pmulhuw xmm6, xmmword ptr [edx+Y_COEFF]
-        pmulhuw xmm7, xmmword ptr [edx+Y_COEFF]
+            r = _mm_unpacklo_epi8(r, _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 2)));
+            g = _mm_unpacklo_epi8(g, _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 2, 3, 2)));
+            b = _mm_unpacklo_epi8(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 2)));
 
-        paddsw xmm0, xmm6
-        paddsw xmm3, xmm7
-        paddsw xmm1, xmm6
-        paddsw xmm4, xmm7
-        paddsw xmm2, xmm6
-        paddsw xmm5, xmm7
+            // Create RGBA (we could generate A here, but we don't) quads
+            __m128i rg_l = _mm_unpacklo_epi8(r, g);
+            __m128i ba_l = _mm_unpacklo_epi8(b, alpha);
+            __m128i rgba_ll = _mm_unpacklo_epi16(rg_l, ba_l);
+            __m128i rgba_lh = _mm_unpackhi_epi16(rg_l, ba_l);
 
-        // 0x80; a constant is probably so much better
-        pcmpeqb xmm7, xmm7
-        psllw xmm7, 15
-        psrlw xmm7, 8
-        packuswb xmm7, xmm7
+            __m128i rg_h = _mm_unpackhi_epi8(r, g);
+            __m128i ba_h = _mm_unpackhi_epi8(b, alpha);
+            __m128i rgba_hl = _mm_unpacklo_epi16(rg_h, ba_h);
+            __m128i rgba_hh = _mm_unpackhi_epi16(rg_h, ba_h);
 
-        // round
-        movaps xmm6, xmmword ptr [edx+ROUND_1BIT]
-        paddw xmm0, xmm6
-        paddw xmm1, xmm6
-        paddw xmm2, xmm6
-        paddw xmm3, xmm6
-        paddw xmm4, xmm6
-        paddw xmm5, xmm6
-        psraw xmm0, 1
-        psraw xmm1, 1
-        psraw xmm2, 1
-        psraw xmm3, 1
-        psraw xmm4, 1
-        psraw xmm5, 1
-        // combine even and odd bytes
-        packuswb xmm0, xmm3
-        packuswb xmm1, xmm4
-        packuswb xmm2, xmm5
-        movhlps xmm3, xmm0
-        movhlps xmm4, xmm1
-        movhlps xmm5, xmm2
-        punpcklbw xmm0, xmm3 // Red bytes, back in order
-        punpcklbw xmm1, xmm4 // Green ""
-        punpcklbw xmm2, xmm5 // Blue ""
-        movaps xmm3, xmm0
-        movaps xmm4, xmm1
-        movaps xmm5, xmm2
-        // Create RGBA (we could generate A here, but we don't) quads
-        punpcklbw xmm0, xmm1
-        punpcklbw xmm2, xmm7
-        movaps xmm1, xmm0
-        punpcklwd xmm0, xmm2
-        punpckhwd xmm1, xmm2
-        punpckhbw xmm3, xmm4
-        punpckhbw xmm5, xmm7
-        movaps xmm4, xmm3
-        punpcklwd xmm3, xmm5
-        punpckhwd xmm4, xmm5
-        // at last
-        movaps xmmword ptr [decoder.rgb32+edi*4+0], xmm0
-        movaps xmmword ptr [decoder.rgb32+edi*4+16], xmm1
-        movaps xmmword ptr [decoder.rgb32+edi*4+32], xmm3
-        movaps xmmword ptr [decoder.rgb32+edi*4+48], xmm4
-        add edi, 16
-        neg eax
-        jl onerow // run twice
-        add esi, 8
-        cmp esi, 64
-        jne tworows
+            _mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][0]), rgba_ll);
+            _mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][4]), rgba_lh);
+            _mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][8]), rgba_hl);
+            _mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][12]), rgba_hh);
+        }
     }
-#elif defined(__GNUC__)
-    // offset to the middle of the sse2 table, so that we can use 1-byte address displacement
-    // to access all fields:
-    static const u8* sse2_tableoffset = ((u8*)&sse2_tables) + 64;
-    static const u8* mb8 = (u8*)&decoder.mb8;
-    static u8* rgb32 = (u8*)&decoder.rgb32;
-    __asm__ __volatile__ (
-        ".intel_syntax noprefix\n"
-        "xor esi, esi\n"
-        "xor edi, edi\n"
-        ".align 16\n"
-        "tworows_%=:\n"
-        "movq xmm3, qword ptr [%[mb8]+256+esi]\n"
-        "movq xmm1, qword ptr [%[mb8]+320+esi]\n"
-        "pxor xmm2, xmm2\n"
-        "pxor xmm0, xmm0\n"
-        // could skip the movq but punpck requires 128-bit alignment
-        // for some reason, so two versions would be needed,
-        // bloating the function (further)
-        "punpcklbw xmm2, xmm3\n"
-        "punpcklbw xmm0, xmm1\n"
-        // unfortunately I don't think this will matter despite being
-        // technically potentially a little faster, but this is
-        // equivalent to an add or sub
-        "pxor xmm2, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm2 <-- 8 x (Cb - 128) << 8
-        "pxor xmm0, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm0 <-- 8 x (Cr - 128) << 8
-        "movaps xmm1, xmm0\n"
-        "movaps xmm3, xmm2\n"
-        "pmulhw xmm1, xmmword ptr [%[sse2_tables]+%c[GCr_COEFF]]\n"
-        "pmulhw xmm3, xmmword ptr [%[sse2_tables]+%c[GCb_COEFF]]\n"
-        "pmulhw xmm0, xmmword ptr [%[sse2_tables]+%c[RCr_COEFF]]\n"
-        "pmulhw xmm2, xmmword ptr [%[sse2_tables]+%c[BCb_COEFF]]\n"
-        "paddsw xmm1, xmm3\n"
-        // store for the next line; looking at the code above
-        // compared to the code below, I have to wonder whether
-        // this was worth the hassle
-        "movaps xmmword ptr [%[yuv2rgb_temp]], xmm0\n"
-        "movaps xmmword ptr [%[yuv2rgb_temp]+16], xmm1\n"
-        "movaps xmmword ptr [%[yuv2rgb_temp]+32], xmm2\n"
-        "jmp ihategcctoo_%=\n"
-        ".align 16\n"
-        "onerow_%=:\n"
-        "movaps xmm0, xmmword ptr [%[yuv2rgb_temp]]\n"
-        "movaps xmm1, xmmword ptr [%[yuv2rgb_temp]+16]\n"
-        "movaps xmm2, xmmword ptr [%[yuv2rgb_temp]+32]\n"
-        "ihategcctoo_%=:\n"
-        "movaps xmm3, xmm0\n"
-        "movaps xmm4, xmm1\n"
-        "movaps xmm5, xmm2\n"
-        "movaps xmm6, xmmword ptr [%[mb8]+edi]\n"
-        "psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n"
-        "movaps xmm7, xmm6\n"
-        "psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
-        "pand xmm7, xmmword ptr [%[sse2_tables]+%c[Y_MASK]]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
-        "pmulhuw xmm6, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
-        "pmulhuw xmm7, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
-        "paddsw xmm0, xmm6\n"
-        "paddsw xmm3, xmm7\n"
-        "paddsw xmm1, xmm6\n"
-        "paddsw xmm4, xmm7\n"
-        "paddsw xmm2, xmm6\n"
-        "paddsw xmm5, xmm7\n"
-        // 0x80; a constant is probably so much better
-        "pcmpeqb xmm7, xmm7\n"
-        "psllw xmm7, 15\n"
-        "psrlw xmm7, 8\n"
-        "packuswb xmm7, xmm7\n"
-        // round
-        "movaps xmm6, xmmword ptr [%[sse2_tables]+%c[ROUND_1BIT]]\n"
-        "paddw xmm0, xmm6\n"
-        "paddw xmm1, xmm6\n"
-        "paddw xmm2, xmm6\n"
-        "paddw xmm3, xmm6\n"
-        "paddw xmm4, xmm6\n"
-        "paddw xmm5, xmm6\n"
-        "psraw xmm0, 1\n"
-        "psraw xmm1, 1\n"
-        "psraw xmm2, 1\n"
-        "psraw xmm3, 1\n"
-        "psraw xmm4, 1\n"
-        "psraw xmm5, 1\n"
-        // combine even and odd bytes
-        "packuswb xmm0, xmm3\n"
-        "packuswb xmm1, xmm4\n"
-        "packuswb xmm2, xmm5\n"
-        "movhlps xmm3, xmm0\n"
-        "movhlps xmm4, xmm1\n"
-        "movhlps xmm5, xmm2\n"
-        "punpcklbw xmm0, xmm3\n" // Red bytes, back in order
-        "punpcklbw xmm1, xmm4\n" // Green ""
-        "punpcklbw xmm2, xmm5\n" // Blue ""
-        "movaps xmm3, xmm0\n"
-        "movaps xmm4, xmm1\n"
-        "movaps xmm5, xmm2\n"
-        // Create RGBA (we could generate A here, but we don't) quads
-        "punpcklbw xmm0, xmm1\n"
-        "punpcklbw xmm2, xmm7\n"
-        "movaps xmm1, xmm0\n"
-        "punpcklwd xmm0, xmm2\n"
-        "punpckhwd xmm1, xmm2\n"
-        "punpckhbw xmm3, xmm4\n"
-        "punpckhbw xmm5, xmm7\n"
-        "movaps xmm4, xmm3\n"
-        "punpcklwd xmm3, xmm5\n"
-        "punpckhwd xmm4, xmm5\n"
-        // at last
-        "movaps xmmword ptr [%[rgb32]+edi*4+0], xmm0\n"
-        "movaps xmmword ptr [%[rgb32]+edi*4+16], xmm1\n"
-        "movaps xmmword ptr [%[rgb32]+edi*4+32], xmm3\n"
-        "movaps xmmword ptr [%[rgb32]+edi*4+48], xmm4\n"
-        "add edi, 16\n"
-        // run twice the onerow <=> edi = 16 or 48 or 80 etc... <=> check bit 5
-        "test edi, 16\n"
-        "jnz onerow_%=\n"
-        "add esi, 8\n"
-        "cmp esi, 64\n"
-        "jne tworows_%=\n"
-        ".att_syntax\n"
-        :
-        : [C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
-          [ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
-          [GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
-          // Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
-          // This saves 2-3 bytes per instruction where these are used. :)
-          [yuv2rgb_temp]"c"(yuv2rgb_temp), [sse2_tables]"d"(sse2_tableoffset),
-          [mb8]"r"(mb8), [rgb32]"r"(rgb32)
-        : "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
-    );
-#else
-# error Unsupported compiler
-#endif
 }
-#endif
-#endif
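
(Editorial aside on the arithmetic in the new code above - my own reading,
not something the commit states. The coefficient vectors are pre-shifted
left by 2 so that the mulhi lands on the right fixed-point scale:)

    // cr holds (Cr - 128) << 8 in each signed word, the coefficient holds
    // k << 2, and _mm_mulhi_epi16 keeps the high 16 bits of the product:
    //   (((Cr - 128) << 8) * (k << 2)) >> 16  ==  ((Cr - 128) * k) >> 6
    __m128i rc = _mm_mulhi_epi16(cr, rcr_coefficient);

    // Adding round_1bit (0x0001) before the arithmetic shift makes the final
    // halving round to nearest rather than truncate:
    r_even = _mm_srai_epi16(_mm_add_epi16(r_even, round_1bit), 1); // (x+1)>>1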

pcsx2/IPU/yuv2rgb.h

@@ -1,5 +1,5 @@
 /* PCSX2 - PS2 Emulator for PCs
- * Copyright (C) 2002-2010 PCSX2 Dev Team
+ * Copyright (C) 2002-2016 PCSX2 Dev Team
  *
  * PCSX2 is free software: you can redistribute it and/or modify it under the terms
  * of the GNU Lesser General Public License as published by the Free Software Found-
@@ -16,9 +16,6 @@
 #pragma once
 
 extern void yuv2rgb_reference();
-#ifdef _M_X86_32
 #define yuv2rgb yuv2rgb_sse2
 extern void yuv2rgb_sse2();
-#else
-#define yuv2rgb yuv2rgb_reference
-#endif
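
(For context, a hypothetical call site under the header as I read it after
this change - the function name below is illustrative, not from the diff:)

    #include "yuv2rgb.h"

    void decode_macroblock_sketch()
    {
        // yuv2rgb now always expands to yuv2rgb_sse2(); the intrinsics build
        // for both 32-bit and 64-bit x86, so the _M_X86_32-only guard and the
        // reference fallback macro are gone.
        yuv2rgb();
    }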