mirror of https://github.com/PCSX2/pcsx2.git
ipu: Convert yuv2rgb sse2 inline assembly to intrinsics
Intrinsics are more portable than inline assembly. Use _mm_shuffle_epi32 instead of _mm_movehl_ps: I think it avoids data bypass delays caused by crossing from the integer to the float domain on older processors, and Agner Fog's instruction tables indicate that the instruction has the same latency and occasionally higher throughput (depending on the CPU). Also swap the order of _mm_xor_si128 and _mm_unpacklo_epi8 so that the same constant can be used for both the C bias and alpha.
This commit is contained in:
parent
571432a7aa
commit
eaa4abea45
|
@ -1,5 +1,5 @@
|
||||||
/* PCSX2 - PS2 Emulator for PCs
|
/* PCSX2 - PS2 Emulator for PCs
|
||||||
* Copyright (C) 2002-2010 PCSX2 Dev Team
|
* Copyright (C) 2002-2016 PCSX2 Dev Team
|
||||||
*
|
*
|
||||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||||
|
@ -58,357 +58,94 @@ void yuv2rgb_reference(void)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined(_M_X86_32)
|
// Suikoden Tactics FMV speed results: Reference - ~72fps, SSE2 - ~120fps
|
||||||
// TODO OSX optimize me
|
// An AVX2 version is only slightly faster than an SSE2 version (+2-3fps)
|
||||||
#if defined(__clang__) && !defined(__linux__)
|
// (or I'm a poor optimiser), though it might be worth attempting again
|
||||||
void yuv2rgb_sse2() {
|
// once we've ported to 64 bits (the extra registers should help).
|
||||||
yuv2rgb_reference();
|
__ri void yuv2rgb_sse2()
|
||||||
}
|
|
||||||
#else
|
|
||||||
// Everything below is bit accurate to the IPU specification (except maybe rounding).
|
|
||||||
// Know the specification before you touch it.
|
|
||||||
#define SSE_BYTES(x) {x, x, x, x, x, x, x, x, x, x, x, x, x, x, x, x}
|
|
||||||
#define SSE_WORDS(x) {x, x, x, x, x, x, x, x}
|
|
||||||
#define SSE_COEFFICIENTS(x) SSE_WORDS((s16)((x)<<2))
|
|
||||||
|
|
||||||
struct SSE2_Tables
|
|
||||||
{
|
{
|
||||||
u16 C_bias[8]; // offset -64
|
const __m128i c_bias = _mm_set1_epi8(s8(IPU_C_BIAS));
|
||||||
u8 Y_bias[16]; // offset -48
|
const __m128i y_bias = _mm_set1_epi8(IPU_Y_BIAS);
|
||||||
u16 Y_mask[8]; // offset -32
|
const __m128i y_mask = _mm_set1_epi16(s16(0xFF00));
|
||||||
u16 round_1bit[8]; // offset -16
|
|
||||||
|
|
||||||
s16 Y_coefficients[8]; // offset 0
|
|
||||||
s16 GCr_coefficients[8];// offset 16
|
|
||||||
s16 GCb_coefficients[8];// offset 32
|
|
||||||
s16 RCr_coefficients[8];// offset 48
|
|
||||||
s16 BCb_coefficients[8];// offset 64
|
|
||||||
};
|
|
||||||
|
|
||||||
enum
|
|
||||||
{
|
|
||||||
C_BIAS = -0x40,
|
|
||||||
Y_BIAS = -0x30,
|
|
||||||
Y_MASK = -0x20,
|
|
||||||
ROUND_1BIT = -0x10,
|
|
||||||
|
|
||||||
Y_COEFF = 0x00,
|
|
||||||
GCr_COEFF = 0x10,
|
|
||||||
GCb_COEFF = 0x20,
|
|
||||||
RCr_COEFF = 0x30,
|
|
||||||
BCb_COEFF = 0x40
|
|
||||||
};
|
|
||||||
|
|
||||||
static const __aligned16 SSE2_Tables sse2_tables =
|
|
||||||
{
|
|
||||||
SSE_WORDS(0x8000), // c_bias
|
|
||||||
SSE_BYTES(IPU_Y_BIAS), // y_bias
|
|
||||||
SSE_WORDS(0xff00), // y_mask
|
|
||||||
|
|
||||||
// Specifying round off instead of round down as everywhere else
|
// Specifying round off instead of round down as everywhere else
|
||||||
// implies that this is right
|
// implies that this is right
|
||||||
SSE_WORDS(1), // round_1bit
|
const __m128i round_1bit = _mm_set1_epi16(0x0001);;
|
||||||
|
|
||||||
SSE_COEFFICIENTS(IPU_Y_COEFF),
|
const __m128i y_coefficient = _mm_set1_epi16(s16(IPU_Y_COEFF << 2));
|
||||||
SSE_COEFFICIENTS(IPU_GCR_COEFF),
|
const __m128i gcr_coefficient = _mm_set1_epi16(s16(u16(IPU_GCR_COEFF) << 2));
|
||||||
SSE_COEFFICIENTS(IPU_GCB_COEFF),
|
const __m128i gcb_coefficient = _mm_set1_epi16(s16(u16(IPU_GCB_COEFF) << 2));
|
||||||
SSE_COEFFICIENTS(IPU_RCR_COEFF),
|
const __m128i rcr_coefficient = _mm_set1_epi16(s16(IPU_RCR_COEFF << 2));
|
||||||
SSE_COEFFICIENTS(IPU_BCB_COEFF),
|
const __m128i bcb_coefficient = _mm_set1_epi16(s16(IPU_BCB_COEFF << 2));
|
||||||
};
|
|
||||||
|
|
||||||
static __aligned16 u16 yuv2rgb_temp[3][8];
|
// Alpha set to 0x80 here. The threshold stuff is done later.
|
||||||
|
const __m128i& alpha = c_bias;
|
||||||
|
|
||||||
// This could potentially be improved for SSE4
|
for (int n = 0; n < 8; ++n) {
|
||||||
__ri void yuv2rgb_sse2(void)
|
// could skip the loadl_epi64 but most SSE instructions require 128-bit
|
||||||
{
|
// alignment so two versions would be needed.
|
||||||
#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
|
__m128i cb = _mm_loadl_epi64(reinterpret_cast<__m128i*>(&decoder.mb8.Cb[n][0]));
|
||||||
__asm {
|
__m128i cr = _mm_loadl_epi64(reinterpret_cast<__m128i*>(&decoder.mb8.Cr[n][0]));
|
||||||
mov eax, 1
|
|
||||||
xor esi, esi
|
|
||||||
xor edi, edi
|
|
||||||
|
|
||||||
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
|
// (Cb - 128) << 8, (Cr - 128) << 8
|
||||||
// This saves 2-3 bytes per instruction where these are used. :)
|
cb = _mm_xor_si128(cb, c_bias);
|
||||||
mov ecx, offset yuv2rgb_temp
|
cr = _mm_xor_si128(cr, c_bias);
|
||||||
mov edx, offset sse2_tables+64;
|
cb = _mm_unpacklo_epi8(_mm_setzero_si128(), cb);
|
||||||
|
cr = _mm_unpacklo_epi8(_mm_setzero_si128(), cr);
|
||||||
|
|
||||||
align 16
|
__m128i rc = _mm_mulhi_epi16(cr, rcr_coefficient);
|
||||||
tworows:
|
__m128i gc = _mm_adds_epi16(_mm_mulhi_epi16(cr, gcr_coefficient), _mm_mulhi_epi16(cb, gcb_coefficient));
|
||||||
movq xmm3, qword ptr [decoder.mb8+256+esi]
|
__m128i bc = _mm_mulhi_epi16(cb, bcb_coefficient);
|
||||||
movq xmm1, qword ptr [decoder.mb8+320+esi]
|
|
||||||
pxor xmm2, xmm2
|
|
||||||
pxor xmm0, xmm0
|
|
||||||
// could skip the movq but punpck requires 128-bit alignment
|
|
||||||
// for some reason, so two versions would be needed,
|
|
||||||
// bloating the function (further)
|
|
||||||
punpcklbw xmm2, xmm3
|
|
||||||
punpcklbw xmm0, xmm1
|
|
||||||
// unfortunately I don't think this will matter despite being
|
|
||||||
// technically potentially a little faster, but this is
|
|
||||||
// equivalent to an add or sub
|
|
||||||
pxor xmm2, xmmword ptr [edx+C_BIAS] // xmm2 <-- 8 x (Cb - 128) << 8
|
|
||||||
pxor xmm0, xmmword ptr [edx+C_BIAS] // xmm0 <-- 8 x (Cr - 128) << 8
|
|
||||||
|
|
||||||
movaps xmm1, xmm0
|
for (int m = 0; m < 2; ++m) {
|
||||||
movaps xmm3, xmm2
|
__m128i y = _mm_load_si128(reinterpret_cast<__m128i*>(&decoder.mb8.Y[n * 2 + m][0]));
|
||||||
pmulhw xmm1, xmmword ptr [edx+GCr_COEFF]
|
y = _mm_subs_epu8(y, y_bias);
|
||||||
pmulhw xmm3, xmmword ptr [edx+GCb_COEFF]
|
// Y << 8 for pixels 0, 2, 4, 6, 8, 10, 12, 14
|
||||||
pmulhw xmm0, xmmword ptr [edx+RCr_COEFF]
|
__m128i y_even = _mm_slli_epi16(y, 8);
|
||||||
pmulhw xmm2, xmmword ptr [edx+BCb_COEFF]
|
// Y << 8 for pixels 1, 3, 5, 7, 9, 11, 13, 15
|
||||||
paddsw xmm1, xmm3
|
__m128i y_odd = _mm_and_si128(y, y_mask);
|
||||||
// store for the next line; looking at the code above
|
|
||||||
// compared to the code below, I have to wonder whether
|
|
||||||
// this was worth the hassle
|
|
||||||
movaps xmmword ptr [ecx], xmm0
|
|
||||||
movaps xmmword ptr [ecx+16], xmm1
|
|
||||||
movaps xmmword ptr [ecx+32], xmm2
|
|
||||||
jmp ihatemsvc
|
|
||||||
|
|
||||||
align 16
|
y_even = _mm_mulhi_epu16(y_even, y_coefficient);
|
||||||
onerow:
|
y_odd = _mm_mulhi_epu16(y_odd, y_coefficient);
|
||||||
movaps xmm0, xmmword ptr [ecx]
|
|
||||||
movaps xmm1, xmmword ptr [ecx+16]
|
|
||||||
movaps xmm2, xmmword ptr [ecx+32]
|
|
||||||
|
|
||||||
// If masm directives worked properly in inline asm, I'd be using them,
|
__m128i r_even = _mm_adds_epi16(rc, y_even);
|
||||||
// but I'm not inclined to write ~70 line #defines to simulate them.
|
__m128i r_odd = _mm_adds_epi16(rc, y_odd);
|
||||||
// Maybe the function's faster like this anyway because it's smaller?
|
__m128i g_even = _mm_adds_epi16(gc, y_even);
|
||||||
// I'd have to write a 70 line #define to benchmark it.
|
__m128i g_odd = _mm_adds_epi16(gc, y_odd);
|
||||||
|
__m128i b_even = _mm_adds_epi16(bc, y_even);
|
||||||
|
__m128i b_odd = _mm_adds_epi16(bc, y_odd);
|
||||||
|
|
||||||
ihatemsvc:
|
// round
|
||||||
movaps xmm3, xmm0
|
r_even = _mm_srai_epi16(_mm_add_epi16(r_even, round_1bit), 1);
|
||||||
movaps xmm4, xmm1
|
r_odd = _mm_srai_epi16(_mm_add_epi16(r_odd, round_1bit), 1);
|
||||||
movaps xmm5, xmm2
|
g_even = _mm_srai_epi16(_mm_add_epi16(g_even, round_1bit), 1);
|
||||||
|
g_odd = _mm_srai_epi16(_mm_add_epi16(g_odd, round_1bit), 1);
|
||||||
|
b_even = _mm_srai_epi16(_mm_add_epi16(b_even, round_1bit), 1);
|
||||||
|
b_odd = _mm_srai_epi16(_mm_add_epi16(b_odd, round_1bit), 1);
|
||||||
|
|
||||||
movaps xmm6, xmmword ptr [decoder.mb8+edi]
|
// combine even and odd bytes in original order
|
||||||
psubusb xmm6, xmmword ptr [edx+Y_BIAS]
|
__m128i r = _mm_packus_epi16(r_even, r_odd);
|
||||||
movaps xmm7, xmm6
|
__m128i g = _mm_packus_epi16(g_even, g_odd);
|
||||||
psllw xmm6, 8 // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
|
__m128i b = _mm_packus_epi16(b_even, b_odd);
|
||||||
pand xmm7, xmmword ptr [edx+Y_MASK] // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
|
|
||||||
|
|
||||||
pmulhuw xmm6, xmmword ptr [edx+Y_COEFF]
|
r = _mm_unpacklo_epi8(r, _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||||
pmulhuw xmm7, xmmword ptr [edx+Y_COEFF]
|
g = _mm_unpacklo_epi8(g, _mm_shuffle_epi32(g, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||||
|
b = _mm_unpacklo_epi8(b, _mm_shuffle_epi32(b, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||||
|
|
||||||
paddsw xmm0, xmm6
|
// Create RGBA (we could generate A here, but we don't) quads
|
||||||
paddsw xmm3, xmm7
|
__m128i rg_l = _mm_unpacklo_epi8(r, g);
|
||||||
paddsw xmm1, xmm6
|
__m128i ba_l = _mm_unpacklo_epi8(b, alpha);
|
||||||
paddsw xmm4, xmm7
|
__m128i rgba_ll = _mm_unpacklo_epi16(rg_l, ba_l);
|
||||||
paddsw xmm2, xmm6
|
__m128i rgba_lh = _mm_unpackhi_epi16(rg_l, ba_l);
|
||||||
paddsw xmm5, xmm7
|
|
||||||
|
|
||||||
// 0x80; a constant is probably so much better
|
__m128i rg_h = _mm_unpackhi_epi8(r, g);
|
||||||
pcmpeqb xmm7, xmm7
|
__m128i ba_h = _mm_unpackhi_epi8(b, alpha);
|
||||||
psllw xmm7, 15
|
__m128i rgba_hl = _mm_unpacklo_epi16(rg_h, ba_h);
|
||||||
psrlw xmm7, 8
|
__m128i rgba_hh = _mm_unpackhi_epi16(rg_h, ba_h);
|
||||||
packuswb xmm7, xmm7
|
|
||||||
|
|
||||||
// round
|
_mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][0]), rgba_ll);
|
||||||
movaps xmm6, xmmword ptr [edx+ROUND_1BIT]
|
_mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][4]), rgba_lh);
|
||||||
paddw xmm0, xmm6
|
_mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][8]), rgba_hl);
|
||||||
paddw xmm1, xmm6
|
_mm_store_si128(reinterpret_cast<__m128i*>(&decoder.rgb32.c[n * 2 + m][12]), rgba_hh);
|
||||||
paddw xmm2, xmm6
|
}
|
||||||
paddw xmm3, xmm6
|
|
||||||
paddw xmm4, xmm6
|
|
||||||
paddw xmm5, xmm6
|
|
||||||
psraw xmm0, 1
|
|
||||||
psraw xmm1, 1
|
|
||||||
psraw xmm2, 1
|
|
||||||
psraw xmm3, 1
|
|
||||||
psraw xmm4, 1
|
|
||||||
psraw xmm5, 1
|
|
||||||
|
|
||||||
// combine even and odd bytes
|
|
||||||
packuswb xmm0, xmm3
|
|
||||||
packuswb xmm1, xmm4
|
|
||||||
packuswb xmm2, xmm5
|
|
||||||
movhlps xmm3, xmm0
|
|
||||||
movhlps xmm4, xmm1
|
|
||||||
movhlps xmm5, xmm2
|
|
||||||
punpcklbw xmm0, xmm3 // Red bytes, back in order
|
|
||||||
punpcklbw xmm1, xmm4 // Green ""
|
|
||||||
punpcklbw xmm2, xmm5 // Blue ""
|
|
||||||
movaps xmm3, xmm0
|
|
||||||
movaps xmm4, xmm1
|
|
||||||
movaps xmm5, xmm2
|
|
||||||
|
|
||||||
// Create RGBA (we could generate A here, but we don't) quads
|
|
||||||
punpcklbw xmm0, xmm1
|
|
||||||
punpcklbw xmm2, xmm7
|
|
||||||
movaps xmm1, xmm0
|
|
||||||
punpcklwd xmm0, xmm2
|
|
||||||
punpckhwd xmm1, xmm2
|
|
||||||
|
|
||||||
punpckhbw xmm3, xmm4
|
|
||||||
punpckhbw xmm5, xmm7
|
|
||||||
movaps xmm4, xmm3
|
|
||||||
punpcklwd xmm3, xmm5
|
|
||||||
punpckhwd xmm4, xmm5
|
|
||||||
|
|
||||||
// at last
|
|
||||||
movaps xmmword ptr [decoder.rgb32+edi*4+0], xmm0
|
|
||||||
movaps xmmword ptr [decoder.rgb32+edi*4+16], xmm1
|
|
||||||
movaps xmmword ptr [decoder.rgb32+edi*4+32], xmm3
|
|
||||||
movaps xmmword ptr [decoder.rgb32+edi*4+48], xmm4
|
|
||||||
|
|
||||||
add edi, 16
|
|
||||||
|
|
||||||
neg eax
|
|
||||||
jl onerow // run twice
|
|
||||||
|
|
||||||
add esi, 8
|
|
||||||
cmp esi, 64
|
|
||||||
jne tworows
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#elif defined(__GNUC__)
|
|
||||||
|
|
||||||
// offset to the middle of the sse2 table, so that we can use 1-byte address displacement
|
|
||||||
// to access all fields:
|
|
||||||
static const u8* sse2_tableoffset = ((u8*)&sse2_tables) + 64;
|
|
||||||
static const u8* mb8 = (u8*)&decoder.mb8;
|
|
||||||
static u8* rgb32 = (u8*)&decoder.rgb32;
|
|
||||||
|
|
||||||
__asm__ __volatile__ (
|
|
||||||
".intel_syntax noprefix\n"
|
|
||||||
"xor esi, esi\n"
|
|
||||||
"xor edi, edi\n"
|
|
||||||
|
|
||||||
".align 16\n"
|
|
||||||
"tworows_%=:\n"
|
|
||||||
"movq xmm3, qword ptr [%[mb8]+256+esi]\n"
|
|
||||||
"movq xmm1, qword ptr [%[mb8]+320+esi]\n"
|
|
||||||
"pxor xmm2, xmm2\n"
|
|
||||||
"pxor xmm0, xmm0\n"
|
|
||||||
// could skip the movq but punpck requires 128-bit alignment
|
|
||||||
// for some reason, so two versions would be needed,
|
|
||||||
// bloating the function (further)
|
|
||||||
"punpcklbw xmm2, xmm3\n"
|
|
||||||
"punpcklbw xmm0, xmm1\n"
|
|
||||||
// unfortunately I don't think this will matter despite being
|
|
||||||
// technically potentially a little faster, but this is
|
|
||||||
// equivalent to an add or sub
|
|
||||||
"pxor xmm2, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm2 <-- 8 x (Cb - 128) << 8
|
|
||||||
"pxor xmm0, xmmword ptr [%[sse2_tables]+%c[C_BIAS]]\n" // xmm0 <-- 8 x (Cr - 128) << 8
|
|
||||||
|
|
||||||
"movaps xmm1, xmm0\n"
|
|
||||||
"movaps xmm3, xmm2\n"
|
|
||||||
"pmulhw xmm1, xmmword ptr [%[sse2_tables]+%c[GCr_COEFF]]\n"
|
|
||||||
"pmulhw xmm3, xmmword ptr [%[sse2_tables]+%c[GCb_COEFF]]\n"
|
|
||||||
"pmulhw xmm0, xmmword ptr [%[sse2_tables]+%c[RCr_COEFF]]\n"
|
|
||||||
"pmulhw xmm2, xmmword ptr [%[sse2_tables]+%c[BCb_COEFF]]\n"
|
|
||||||
"paddsw xmm1, xmm3\n"
|
|
||||||
// store for the next line; looking at the code above
|
|
||||||
// compared to the code below, I have to wonder whether
|
|
||||||
// this was worth the hassle
|
|
||||||
"movaps xmmword ptr [%[yuv2rgb_temp]], xmm0\n"
|
|
||||||
"movaps xmmword ptr [%[yuv2rgb_temp]+16], xmm1\n"
|
|
||||||
"movaps xmmword ptr [%[yuv2rgb_temp]+32], xmm2\n"
|
|
||||||
"jmp ihategcctoo_%=\n"
|
|
||||||
|
|
||||||
".align 16\n"
|
|
||||||
"onerow_%=:\n"
|
|
||||||
"movaps xmm0, xmmword ptr [%[yuv2rgb_temp]]\n"
|
|
||||||
"movaps xmm1, xmmword ptr [%[yuv2rgb_temp]+16]\n"
|
|
||||||
"movaps xmm2, xmmword ptr [%[yuv2rgb_temp]+32]\n"
|
|
||||||
|
|
||||||
"ihategcctoo_%=:\n"
|
|
||||||
"movaps xmm3, xmm0\n"
|
|
||||||
"movaps xmm4, xmm1\n"
|
|
||||||
"movaps xmm5, xmm2\n"
|
|
||||||
|
|
||||||
"movaps xmm6, xmmword ptr [%[mb8]+edi]\n"
|
|
||||||
"psubusb xmm6, xmmword ptr [%[sse2_tables]+%c[Y_BIAS]]\n"
|
|
||||||
"movaps xmm7, xmm6\n"
|
|
||||||
"psllw xmm6, 8\n" // xmm6 <- Y << 8 for pixels 0,2,4,6,8,10,12,14
|
|
||||||
"pand xmm7, xmmword ptr [%[sse2_tables]+%c[Y_MASK]]\n" // xmm7 <- Y << 8 for pixels 1,3,5,7,9,11,13,15
|
|
||||||
|
|
||||||
"pmulhuw xmm6, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
|
|
||||||
"pmulhuw xmm7, xmmword ptr [%[sse2_tables]+%c[Y_COEFF]]\n"
|
|
||||||
|
|
||||||
"paddsw xmm0, xmm6\n"
|
|
||||||
"paddsw xmm3, xmm7\n"
|
|
||||||
"paddsw xmm1, xmm6\n"
|
|
||||||
"paddsw xmm4, xmm7\n"
|
|
||||||
"paddsw xmm2, xmm6\n"
|
|
||||||
"paddsw xmm5, xmm7\n"
|
|
||||||
|
|
||||||
// 0x80; a constant is probably so much better
|
|
||||||
"pcmpeqb xmm7, xmm7\n"
|
|
||||||
"psllw xmm7, 15\n"
|
|
||||||
"psrlw xmm7, 8\n"
|
|
||||||
"packuswb xmm7, xmm7\n"
|
|
||||||
|
|
||||||
// round
|
|
||||||
"movaps xmm6, xmmword ptr [%[sse2_tables]+%c[ROUND_1BIT]]\n"
|
|
||||||
"paddw xmm0, xmm6\n"
|
|
||||||
"paddw xmm1, xmm6\n"
|
|
||||||
"paddw xmm2, xmm6\n"
|
|
||||||
"paddw xmm3, xmm6\n"
|
|
||||||
"paddw xmm4, xmm6\n"
|
|
||||||
"paddw xmm5, xmm6\n"
|
|
||||||
"psraw xmm0, 1\n"
|
|
||||||
"psraw xmm1, 1\n"
|
|
||||||
"psraw xmm2, 1\n"
|
|
||||||
"psraw xmm3, 1\n"
|
|
||||||
"psraw xmm4, 1\n"
|
|
||||||
"psraw xmm5, 1\n"
|
|
||||||
|
|
||||||
// combine even and odd bytes
|
|
||||||
"packuswb xmm0, xmm3\n"
|
|
||||||
"packuswb xmm1, xmm4\n"
|
|
||||||
"packuswb xmm2, xmm5\n"
|
|
||||||
"movhlps xmm3, xmm0\n"
|
|
||||||
"movhlps xmm4, xmm1\n"
|
|
||||||
"movhlps xmm5, xmm2\n"
|
|
||||||
"punpcklbw xmm0, xmm3\n" // Red bytes, back in order
|
|
||||||
"punpcklbw xmm1, xmm4\n" // Green ""
|
|
||||||
"punpcklbw xmm2, xmm5\n" // Blue ""
|
|
||||||
"movaps xmm3, xmm0\n"
|
|
||||||
"movaps xmm4, xmm1\n"
|
|
||||||
"movaps xmm5, xmm2\n"
|
|
||||||
|
|
||||||
// Create RGBA (we could generate A here, but we don't) quads
|
|
||||||
"punpcklbw xmm0, xmm1\n"
|
|
||||||
"punpcklbw xmm2, xmm7\n"
|
|
||||||
"movaps xmm1, xmm0\n"
|
|
||||||
"punpcklwd xmm0, xmm2\n"
|
|
||||||
"punpckhwd xmm1, xmm2\n"
|
|
||||||
|
|
||||||
"punpckhbw xmm3, xmm4\n"
|
|
||||||
"punpckhbw xmm5, xmm7\n"
|
|
||||||
"movaps xmm4, xmm3\n"
|
|
||||||
"punpcklwd xmm3, xmm5\n"
|
|
||||||
"punpckhwd xmm4, xmm5\n"
|
|
||||||
|
|
||||||
// at last
|
|
||||||
"movaps xmmword ptr [%[rgb32]+edi*4+0], xmm0\n"
|
|
||||||
"movaps xmmword ptr [%[rgb32]+edi*4+16], xmm1\n"
|
|
||||||
"movaps xmmword ptr [%[rgb32]+edi*4+32], xmm3\n"
|
|
||||||
"movaps xmmword ptr [%[rgb32]+edi*4+48], xmm4\n"
|
|
||||||
|
|
||||||
"add edi, 16\n"
|
|
||||||
|
|
||||||
// run twice the onerow <=> edi = 16 or 48 or 80 etc... <=> check bit 5
|
|
||||||
"test edi, 16\n"
|
|
||||||
"jnz onerow_%=\n"
|
|
||||||
|
|
||||||
"add esi, 8\n"
|
|
||||||
"cmp esi, 64\n"
|
|
||||||
"jne tworows_%=\n"
|
|
||||||
".att_syntax\n"
|
|
||||||
:
|
|
||||||
:[C_BIAS]"i"(C_BIAS), [Y_BIAS]"i"(Y_BIAS), [Y_MASK]"i"(Y_MASK),
|
|
||||||
[ROUND_1BIT]"i"(ROUND_1BIT), [Y_COEFF]"i"(Y_COEFF), [GCr_COEFF]"i"(GCr_COEFF),
|
|
||||||
[GCb_COEFF]"i"(GCb_COEFF), [RCr_COEFF]"i"(RCr_COEFF), [BCb_COEFF]"i"(BCb_COEFF),
|
|
||||||
// Use ecx and edx as base pointers, to allow for Mod/RM form on memOps.
|
|
||||||
// This saves 2-3 bytes per instruction where these are used. :)
|
|
||||||
[yuv2rgb_temp]"c"(yuv2rgb_temp), [sse2_tables]"d"(sse2_tableoffset),
|
|
||||||
[mb8]"r"(mb8), [rgb32]"r"(rgb32)
|
|
||||||
: "esi", "edi", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "memory"
|
|
||||||
);
|
|
||||||
#else
|
|
||||||
# error Unsupported compiler
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
#endif
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/* PCSX2 - PS2 Emulator for PCs
|
/* PCSX2 - PS2 Emulator for PCs
|
||||||
* Copyright (C) 2002-2010 PCSX2 Dev Team
|
* Copyright (C) 2002-2016 PCSX2 Dev Team
|
||||||
*
|
*
|
||||||
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
* PCSX2 is free software: you can redistribute it and/or modify it under the terms
|
||||||
* of the GNU Lesser General Public License as published by the Free Software Found-
|
* of the GNU Lesser General Public License as published by the Free Software Found-
|
||||||
|
@ -16,9 +16,6 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
extern void yuv2rgb_reference();
|
extern void yuv2rgb_reference();
|
||||||
#ifdef _M_X86_32
|
|
||||||
#define yuv2rgb yuv2rgb_sse2
|
#define yuv2rgb yuv2rgb_sse2
|
||||||
extern void yuv2rgb_sse2();
|
extern void yuv2rgb_sse2();
|
||||||
#else
|
|
||||||
#define yuv2rgb yuv2rgb_reference
|
|
||||||
#endif
|
|
||||||
|
|
Loading…
Reference in New Issue