ipu: Convert yuv2rgb sse2 inline assembly to intrinsics

It's more portable.

Use _mm_shuffle_epi32 (pshufd) instead of _mm_movehl_ps (movhlps) - I think
it avoids data bypass delays when going from the integer to the float domain
on older processors, and Agner Fog's instruction tables indicate that it has
the same latency and occasionally higher throughput (depending on the CPU).

Also swap the order of _mm_xor_si128 and _mm_unpacklo_epi8 so that the same
constant can be used for both the C bias and alpha.
This commit is contained in:
Jonathan Li 2016-05-12 18:27:05 +01:00
parent 571432a7aa
commit eaa4abea45
2 changed files with 75 additions and 341 deletions

View File

@ -1,5 +1,5 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
* Copyright (C) 2002-2016 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
@ -58,357 +58,94 @@ void yuv2rgb_reference(void)
}
}
#if defined(_M_X86_32)
// TODO OSX optimize me
#if defined(__clang__) && !defined(__linux__)
void yuv2rgb_sse2() {
yuv2rgb_reference();
}
#else
// Everything below is bit accurate to the IPU specification (except maybe rounding).
// Know the specification before you touch it.
#define SSE_BYTES(x) {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x}
#define SSE_WORDS(x) {x, x, x, x, x, x, x, x}
#define SSE_COEFFICIENTS(x) SSE_WORDS((s16)((x)<<2))
struct SSE2_Tables
// Suikoden Tactics FMV speed results: Reference - ~72fps, SSE2 - ~120fps
// An AVX2 version is only slightly faster than an SSE2 version (+2-3fps)
// (or I'm a poor optimiser), though it might be worth attempting again
// once we've ported to 64 bits (the extra registers should help).
__ri void yuv2rgb_sse2()
{
u16 C_bias[8]; // offset -64
u8 Y_bias[16]; // offset -48
u16 Y_mask[8]; // offset -32
u16 round_1bit[8]; // offset -16
s16 Y_coefficients[8]; // offset 0
s16 GCr_coefficients[8];// offset 16
s16 GCb_coefficients[8];// offset 32
s16 RCr_coefficients[8];// offset 48
s16 BCb_coefficients[8];// offset 64
};
enum
{
C_BIAS = -0x40,
Y_BIAS = -0x30,
Y_MASK = -0x20,
ROUND_1BIT = -0x10,
Y_COEFF = 0x00,
GCr_COEFF = 0x10,
GCb_COEFF = 0x20,
RCr_COEFF = 0x30,
BCb_COEFF = 0x40
};
static const __aligned16 SSE2_Tables sse2_tables =
{
SSE_WORDS(0x8000), // c_bias
SSE_BYTES(IPU_Y_BIAS), // y_bias
SSE_WORDS(0xff00), // y_mask
const __m128i c_bias = _mm_set1_epi8(s8(IPU_C_BIAS));
const __m128i y_bias = _mm_set1_epi8(IPU_Y_BIAS);
const __m128i y_mask = _mm_set1_epi16(s16(0xFF00));
// Specifying round off instead of round down as everywhere else
// implies that this is right
SSE_WORDS(1), // round_1bit
const __m128i round_1bit = _mm_set1_epi16(0x0001);;
SSE_COEFFICIENTS(IPU_Y_COEFF),
SSE_COEFFICIENTS(IPU_GCR_COEFF),
SSE_COEFFICIENTS(IPU_GCB_COEFF),
SSE_COEFFICIENTS(IPU_RCR_COEFF),
SSE_COEFFICIENTS(IPU_BCB_COEFF),
};
const __m128i y_coefficient = _mm_set1_epi16(s16(IPU_Y_COEFF << 2));
const __m128i gcr_coefficient = _mm_set1_epi16(s16(u16(IPU_GCR_COEFF) << 2));
const __m128i gcb_coefficient = _mm_set1_epi16(s16(u16(IPU_GCB_COEFF) << 2));
const __m128i rcr_coefficient = _mm_set1_epi16(s16(IPU_RCR_COEFF << 2));
const __m128i bcb_coefficient = _mm_set1_epi16(s16(IPU_BCB_COEFF << 2));
static __aligned16 u16 yuv2rgb_temp[3][8];
// Alpha set to 0x80 here. The threshold stuff is done later.
const __m128i& alpha = c_bias;
// This could potentially be improved for SSE4
__ri void yuv2rgb_sse2(void)
{
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
__asm {
mov eax, 1
xor esi, esi
xor edi, edi
for (int n = 0; n < 8; ++n) {
// could skip the loadl_epi64 but most SSE instructions require 128-bit
// alignment so two versions would be needed.
__m128i cb = _mm_loadl_epi64(reinterpret_cast<__m128i*>(&decoder.mb8.Cb[n][0]));
__m128i cr = _mm_loadl_epi64(reinterpret_cast<__m128i*>(&decoder.mb8.Cr[n][0]));
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
// This saves 2-3 bytes per instruction where these are used. :)
mov ecx, offset yuv2rgb_temp
mov edx, offset sse2_tables+64;
// (Cb - 128) << 8, (Cr - 128) << 8
cb = _mm_xor_si128(cb, c_bias);
cr = _mm_xor_si128(cr, c_bias);
cb = _mm_unpacklo_epi8(_mm_setzero_si128(), cb);
cr = _mm_unpacklo_epi8(_mm_setzero_si128(), cr);
align 16
tworows:
movq xmm3, qword ptr [decoder.mb8+256+esi]
movq xmm1, qword ptr [decoder.mb8+320+esi]
pxor xmm2, xmm2
pxor xmm0, xmm0
// could skip the movq but punpck requires 128-bit alignment
// for some reason, so two versions would be needed,
// bloating the function (further)
punpcklbw xmm2, xmm3
punpcklbw xmm0, xmm1
// unfortunately I don't think this will matter despite being
// technically potentially a little faster, but this is
// equivalent to an add or sub
pxor xmm2, xmmword ptr [edx+C_BIAS] // xmm2 <-- 8 x (Cb - 128) << 8
pxor xmm0, xmmword ptr [edx+C_BIAS] // xmm0 <-- 8 x (Cr - 128) << 8
__m128i rc = _mm_mulhi_epi16(cr, rcr_coefficient);
__m128i gc = _mm_adds_epi16(_mm_mulhi_epi16(cr, gcr_coefficient), _mm_mulhi_epi16(cb, gcb_coefficient));
__m128i bc = _mm_mulhi_epi16(cb, bcb_coefficient);
movaps xmm1, xmm0
movaps xmm3, xmm2
pmulhw xmm1, xmmword ptr [edx+GCr_COEFF]
pmulhw xmm3, xmmword ptr [edx+GCb_COEFF]
pmulhw xmm0, xmmword ptr [edx+RCr_COEFF]
pmulhw xmm2, xmmword ptr [edx+BCb_COEFF]
paddsw xmm1, xmm3
// store for the next line; looking at the code above
// compared to the code below, I have to wonder whether
// this was worth the hassle
movaps xmmword ptr [ecx], xmm0
movaps xmmword ptr [ecx+16], xmm1
movaps xmmword ptr [ecx+32], xmm2
jmp ihatemsvc
for (int m = 0; m < 2; ++m) {
__m128i y = _mm_load_si128(reinterpret_cast<__m128i*>(&decoder.mb8.Y[n * 2 + m][0]));
y = _mm_subs_epu8(y, y_bias);
// Y << 8 for pixels 0, 2, 4, 6, 8, 10, 12, 14
__m128i y_even = _mm_slli_epi16(y, 8);
// Y << 8 for pixels 1, 3, 5, 7, 9, 11, 13, 15
__m128i y_odd = _mm_and_si128(y, y_mask);
align 16
onerow:
movaps xmm0, xmmword ptr [ecx]
movaps xmm1, xmmword ptr [ecx+16]
movaps xmm2, xmmword ptr [ecx+32]
y_even = _mm_mulhi_epu16(y_even, y_coefficient);
y_odd = _mm_mulhi_epu16(y_odd, y_coefficient);
// If masm directives worked properly in inline asm, I'd be using them,
// but I'm not inclined to write ~70 line #defines to simulate them.
// Maybe the function's faster like this anyway because it's smaller?
// I'd have to write a 70 line #define to benchmark it.
ihatemsvc:
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
movaps xmm6, xmmword ptr [decoder.mb8+edi]
psubusb xmm6, xmmword ptr [edx+Y_BIAS]
movaps xmm7, xmm6
psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
pand xmm7, xmmword ptr [edx+Y_MASK] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
pmulhuw xmm6, xmmword ptr [edx+Y_COEFF]
pmulhuw xmm7, xmmword ptr [edx+Y_COEFF]
paddsw xmm0, xmm6
paddsw xmm3, xmm7
paddsw xmm1, xmm6
paddsw xmm4, xmm7
paddsw xmm2, xmm6
paddsw xmm5, xmm7
// 0x80; a constant is probably so much better
pcmpeqb xmm7, xmm7
psllw xmm7, 15
psrlw xmm7, 8
packuswb xmm7, xmm7
__m128i r_even = _mm_adds_epi16(rc, y_even);
__m128i r_odd = _mm_adds_epi16(rc, y_odd);
__m128i g_even = _mm_adds_epi16(gc, y_even);
__m128i g_odd = _mm_adds_epi16(gc, y_odd);
__m128i b_even = _mm_adds_epi16(bc, y_even);
__m128i b_odd = _mm_adds_epi16(bc, y_odd);
// round
movaps xmm6, xmmword ptr [edx+ROUND_1BIT]
paddw xmm0, xmm6
paddw xmm1, xmm6
paddw xmm2, xmm6
paddw xmm3, xmm6
paddw xmm4, xmm6
paddw xmm5, xmm6
psraw xmm0, 1
psraw xmm1, 1
psraw xmm2, 1
psraw xmm3, 1
psraw xmm4, 1
psraw xmm5, 1
r_even = _mm_srai_epi16(_mm_add_epi16(r_even, round_1bit), 1);
r_odd = _mm_srai_epi16(_mm_add_epi16(r_odd, round_1bit), 1);
g_even = _mm_srai_epi16(_mm_add_epi16(g_even, round_1bit), 1);
g_odd = _mm_srai_epi16(_mm_add_epi16(g_odd, round_1bit), 1);
b_even = _mm_srai_epi16(_mm_add_epi16(b_even, round_1bit), 1);
b_odd = _mm_srai_epi16(_mm_add_epi16(b_odd, round_1bit), 1);
// combine even and odd bytes
packuswb xmm0, xmm3
packuswb xmm1, xmm4
packuswb xmm2, xmm5
movhlps xmm3, xmm0
movhlps xmm4, xmm1
movhlps xmm5, xmm2
punpcklbw xmm0, xmm3 // Red bytes, back in order
punpcklbw xmm1, xmm4 // Green ""
punpcklbw xmm2, xmm5 // Blue ""
movaps xmm3, xmm0
movaps xmm4, xmm1
movaps xmm5, xmm2
// combine even and odd bytes in original order
__m128i r = _mm_packus_epi16(r_even, r_odd);
__m128i g = _mm_packus_epi16(g_even, g_odd);
__m128i b = _mm_packus_epi16(b_even, b_odd);
r = _mm_unpacklo_epi8(r, _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 2)));
g = _mm_unpacklo_epi8(g, _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 2, 3, 2)));
b = _mm_unpacklo_epi8(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 2)));
// Create RGBA (we could generate A here, but we don't) quads
punpcklbw xmm0, xmm1
punpcklbw xmm2, xmm7
movaps xmm1, xmm0
punpcklwd xmm0, xmm2
punpckhwd xmm1, xmm2
__m128i rg_l = _mm_unpacklo_epi8(r, g);
__m128i ba_l = _mm_unpacklo_epi8(b, alpha);
__m128i rgba_ll = _mm_unpacklo_epi16(rg_l, ba_l);
__m128i rgba_lh = _mm_unpackhi_epi16(rg_l, ba_l);
punpckhbw xmm3, xmm4
punpckhbw xmm5, xmm7
movaps xmm4, xmm3
punpcklwd xmm3, xmm5
punpckhwd xmm4, xmm5
__m128i rg_h = _mm_unpackhi_epi8(r, g);
__m128i ba_h = _mm_unpackhi_epi8(b, alpha);
__m128i rgba_hl = _mm_unpacklo_epi16(rg_h, ba_h);
__m128i rgba_hh = _mm_unpackhi_epi16(rg_h, ba_h);
// at last
movaps xmmword ptr [decoder.rgb32+edi*4+0], xmm0
movaps xmmword ptr [decoder.rgb32+edi*4+16], xmm1
movaps xmmword ptr [decoder.rgb32+edi*4+32], xmm3
movaps xmmword ptr [decoder.rgb32+edi*4+48], xmm4
add edi, 16
neg eax
jl onerow // run twice
add esi, 8
cmp esi, 64
jne tworows
_mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][0]), rgba_ll);
_mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][4]), rgba_lh);
_mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][8]), rgba_hl);
_mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][12]), rgba_hh);
}
}
#elif defined(__GNUC__)
// offset to the middle of the sse2 table, so that we can use 1-byte address displacement
// to access all fields:
static const u8* sse2_tableoffset = ((u8*)&sse2_tables) + 64;
static const u8* mb8 = (u8*)&decoder.mb8;
static u8* rgb32 = (u8*)&decoder.rgb32;
__asm__ __volatile__ (
".intel_syntax noprefix\n"
"xor esi, esi\n"
"xor edi, edi\n"
".align 16\n"
"tworows_%=:\n"
"movq xmm3, qword ptr [%[mb8]+256+esi]\n"
"movq xmm1, qword ptr [%[mb8]+320+esi]\n"
"pxor xmm2, xmm2\n"
"pxor xmm0, xmm0\n"
// could skip the movq but punpck requires 128-bit alignment
// for some reason, so two versions would be needed,
// bloating the function (further)
"punpcklbw xmm2, xmm3\n"
"punpcklbw xmm0, xmm1\n"
// unfortunately I don't think this will matter despite being
// technically potentially a little faster, but this is
// equivalent to an add or sub
"pxor xmm2, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm2 <-- 8 x (Cb - 128) << 8
"pxor xmm0, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm0 <-- 8 x (Cr - 128) << 8
"movaps xmm1, xmm0\n"
"movaps xmm3, xmm2\n"
"pmulhw xmm1, xmmword ptr [%[sse2_tables]+%c[GCr_COEFF]]\n"
"pmulhw xmm3, xmmword ptr [%[sse2_tables]+%c[GCb_COEFF]]\n"
"pmulhw xmm0, xmmword ptr [%[sse2_tables]+%c[RCr_COEFF]]\n"
"pmulhw xmm2, xmmword ptr [%[sse2_tables]+%c[BCb_COEFF]]\n"
"paddsw xmm1, xmm3\n"
// store for the next line; looking at the code above
// compared to the code below, I have to wonder whether
// this was worth the hassle
"movaps xmmword ptr [%[yuv2rgb_temp]], xmm0\n"
"movaps xmmword ptr [%[yuv2rgb_temp]+16], xmm1\n"
"movaps xmmword ptr [%[yuv2rgb_temp]+32], xmm2\n"
"jmp ihategcctoo_%=\n"
".align 16\n"
"onerow_%=:\n"
"movaps xmm0, xmmword ptr [%[yuv2rgb_temp]]\n"
"movaps xmm1, xmmword ptr [%[yuv2rgb_temp]+16]\n"
"movaps xmm2, xmmword ptr [%[yuv2rgb_temp]+32]\n"
"ihategcctoo_%=:\n"
"movaps xmm3, xmm0\n"
"movaps xmm4, xmm1\n"
"movaps xmm5, xmm2\n"
"movaps xmm6, xmmword ptr [%[mb8]+edi]\n"
"psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n"
"movaps xmm7, xmm6\n"
"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
"pand xmm7, xmmword ptr [%[sse2_tables]+%c[Y_MASK]]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
"pmulhuw xmm6, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
"pmulhuw xmm7, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
"paddsw xmm0, xmm6\n"
"paddsw xmm3, xmm7\n"
"paddsw xmm1, xmm6\n"
"paddsw xmm4, xmm7\n"
"paddsw xmm2, xmm6\n"
"paddsw xmm5, xmm7\n"
// 0x80; a constant is probably so much better
"pcmpeqb xmm7, xmm7\n"
"psllw xmm7, 15\n"
"psrlw xmm7, 8\n"
"packuswb xmm7, xmm7\n"
// round
"movaps xmm6, xmmword ptr [%[sse2_tables]+%c[ROUND_1BIT]]\n"
"paddw xmm0, xmm6\n"
"paddw xmm1, xmm6\n"
"paddw xmm2, xmm6\n"
"paddw xmm3, xmm6\n"
"paddw xmm4, xmm6\n"
"paddw xmm5, xmm6\n"
"psraw xmm0, 1\n"
"psraw xmm1, 1\n"
"psraw xmm2, 1\n"
"psraw xmm3, 1\n"
"psraw xmm4, 1\n"
"psraw xmm5, 1\n"
// combine even and odd bytes
"packuswb xmm0, xmm3\n"
"packuswb xmm1, xmm4\n"
"packuswb xmm2, xmm5\n"
"movhlps xmm3, xmm0\n"
"movhlps xmm4, xmm1\n"
"movhlps xmm5, xmm2\n"
"punpcklbw xmm0, xmm3\n" // Red bytes, back in order
"punpcklbw xmm1, xmm4\n" // Green ""
"punpcklbw xmm2, xmm5\n" // Blue ""
"movaps xmm3, xmm0\n"
"movaps xmm4, xmm1\n"
"movaps xmm5, xmm2\n"
// Create RGBA (we could generate A here, but we don't) quads
"punpcklbw xmm0, xmm1\n"
"punpcklbw xmm2, xmm7\n"
"movaps xmm1, xmm0\n"
"punpcklwd xmm0, xmm2\n"
"punpckhwd xmm1, xmm2\n"
"punpckhbw xmm3, xmm4\n"
"punpckhbw xmm5, xmm7\n"
"movaps xmm4, xmm3\n"
"punpcklwd xmm3, xmm5\n"
"punpckhwd xmm4, xmm5\n"
// at last
"movaps xmmword ptr [%[rgb32]+edi*4+0], xmm0\n"
"movaps xmmword ptr [%[rgb32]+edi*4+16], xmm1\n"
"movaps xmmword ptr [%[rgb32]+edi*4+32], xmm3\n"
"movaps xmmword ptr [%[rgb32]+edi*4+48], xmm4\n"
"add edi, 16\n"
// run twice the onerow <=> edi = 16 or 48 or 80 etc... <=> check bit 5
"test edi, 16\n"
"jnz onerow_%=\n"
"add esi, 8\n"
"cmp esi, 64\n"
"jne tworows_%=\n"
".att_syntax\n"
:
:[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
[ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
[GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
// This saves 2-3 bytes per instruction where these are used. :)
[yuv2rgb_temp]"c"(yuv2rgb_temp), [sse2_tables]"d"(sse2_tableoffset),
[mb8]"r"(mb8), [rgb32]"r"(rgb32)
: "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
);
#else
# error Unsupported compiler
#endif
}
#endif
#endif

View File

@ -1,5 +1,5 @@
/* PCSX2 - PS2 Emulator for PCs
* Copyright (C) 2002-2010 PCSX2 Dev Team
* Copyright (C) 2002-2016 PCSX2 Dev Team
*
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
* of the GNU Lesser General Public License as published by the Free Software Found-
@ -16,9 +16,6 @@
#pragma once
extern void yuv2rgb_reference();
#ifdef _M_X86_32
#define yuv2rgb yuv2rgb_sse2
extern void yuv2rgb_sse2();
#else
#define yuv2rgb yuv2rgb_reference
#endif