mirror of https://github.com/PCSX2/pcsx2.git
IPU: ARM64 compatibility
parent 4e0e8cef54
commit 7d098674f2
@@ -1374,6 +1374,7 @@ __fi static bool mpeg2_slice()
//Cr bias - 8 * 8
//Cb bias - 8 * 8

#if defined(_M_X86)
__m128i zeroreg = _mm_setzero_si128();

for (uint i = 0; i < (256+64+64) / 32; ++i)
@@ -1388,6 +1389,24 @@ __fi static bool mpeg2_slice()
s += 32;
d += 32;
}
#elif defined(_M_ARM64)
uint8x16_t zeroreg = vmovq_n_u8(0);

for (uint i = 0; i < (256 + 64 + 64) / 32; ++i)
{
//*d++ = *s++;
uint8x16_t woot1 = vld1q_u8((uint8_t*)s);
uint8x16_t woot2 = vld1q_u8((uint8_t*)s + 16);
vst1q_u8((uint8_t*)d, vzip1q_u8(woot1, zeroreg));
vst1q_u8((uint8_t*)d + 16, vzip2q_u8(woot1, zeroreg));
vst1q_u8((uint8_t*)d + 32, vzip1q_u8(woot2, zeroreg));
vst1q_u8((uint8_t*)d + 48, vzip2q_u8(woot2, zeroreg));
s += 32;
d += 32;
}
#else
#error Unsupported arch
#endif
}
}
else
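
The ARM64 path above mirrors the SSE2 one: vzip1q_u8/vzip2q_u8 against a zero register are the NEON counterparts of _mm_unpacklo_epi8/_mm_unpackhi_epi8 with zero, so each of the 32 source bytes handled per iteration is zero-extended into a 16-bit lane. A minimal scalar sketch of the same widening copy (the function and buffer names below are illustrative, not taken from the source):

#include <cstddef>
#include <cstdint>

// Zero-extend `count` 8-bit coefficients into 16-bit storage - the scalar
// equivalent of the unpack/zip-with-zero loops in mpeg2_slice().
static void widen_u8_to_u16(const uint8_t* src, uint16_t* dst, size_t count)
{
	for (size_t i = 0; i < count; ++i)
		dst[i] = src[i];
}
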
@@ -11,11 +11,18 @@
MULTI_ISA_UNSHARED_START

void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte);

#if defined(_M_X86)
void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte);
#endif

__ri void ipu_dither(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
{
#if defined(_M_X86)
ipu_dither_sse2(rgb32, rgb16, dte);
#else
ipu_dither_reference(rgb32, rgb16, dte);
#endif
}

__ri void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
@@ -53,6 +60,8 @@ __ri void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &
}
}

#if defined(_M_X86)

__ri void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
{
const __m128i alpha_test = _mm_set1_epi16(0x40);
@@ -110,4 +119,6 @@ __ri void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16
}
}

#endif

MULTI_ISA_UNSHARED_END
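
Note that dithering itself gains no NEON kernel here: ipu_dither resolves to ipu_dither_sse2 only on x86 and to the scalar ipu_dither_reference everywhere else, ARM64 included. A stand-alone sketch of that compile-time dispatch shape (the kernel/dispatch names are hypothetical; only the _M_X86 macro test comes from the source):

#include <cstdio>

static void kernel_reference() { std::puts("reference path"); }
#if defined(_M_X86)
static void kernel_sse2() { std::puts("SSE2 path"); }
#endif

static void dispatch()
{
#if defined(_M_X86)
	kernel_sse2();      // vectorised x86 kernel
#else
	kernel_reference(); // portable fallback, also used on ARM64
#endif
}

int main()
{
	dispatch();
	return 0;
}
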
@@ -168,7 +168,7 @@ static constexpr VLC_ALIGNED16 MVtab MV_10[] = {

static constexpr DMVtab DMV_2[] = {
-	{0, 1}, {0, 1}, {1, 2}, {-1, 2}};
+	{0, 1}, {0, 1}, {1, 2}, {(s8)-1, 2}};

static constexpr VLC_ALIGNED16 CBPtab CBP_7[] = {
@@ -42,6 +42,8 @@ void yuv2rgb_reference(void)
}
}

#if defined(_M_X86)

// Suikoden Tactics FMV speed results: Reference - ~72fps, SSE2 - ~120fps
// An AVX2 version is only slightly faster than an SSE2 version (+2-3fps)
// (or I'm a poor optimiser), though it might be worth attempting again
@@ -134,4 +136,121 @@ __ri void yuv2rgb_sse2()
}
}

#elif defined(_M_ARM64)

#if defined(_MSC_VER) && !defined(__clang__)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif

#define MULHI16(a, b) vshrq_n_s16(vqdmulhq_s16((a), (b)), 1)
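// A close NEON stand-in for SSE2's _mm_mulhi_epi16: vqdmulhq_s16 returns the
// saturated high half of (2*a*b), and the extra >>1 restores (a*b) >> 16.
// (The only divergence is the saturating corner case a == b == -32768.)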

__ri void yuv2rgb_neon()
{
const int8x16_t c_bias = vdupq_n_s8(s8(IPU_C_BIAS));
const uint8x16_t y_bias = vdupq_n_u8(IPU_Y_BIAS);
const int16x8_t y_mask = vdupq_n_s16(s16(0xFF00));
// Specifying round off instead of round down as everywhere else
// implies that this is right
const int16x8_t round_1bit = vdupq_n_s16(0x0001);

const int16x8_t y_coefficient = vdupq_n_s16(s16(IPU_Y_COEFF << 2));
const int16x8_t gcr_coefficient = vdupq_n_s16(s16(u16(IPU_GCR_COEFF) << 2));
const int16x8_t gcb_coefficient = vdupq_n_s16(s16(u16(IPU_GCB_COEFF) << 2));
const int16x8_t rcr_coefficient = vdupq_n_s16(s16(IPU_RCR_COEFF << 2));
const int16x8_t bcb_coefficient = vdupq_n_s16(s16(IPU_BCB_COEFF << 2));

// Alpha set to 0x80 here. The threshold stuff is done later.
const uint8x16_t alpha = vreinterpretq_u8_s8(c_bias);

for (int n = 0; n < 8; ++n)
{
// could skip the loadl_epi64 but most SSE instructions require 128-bit
// alignment so two versions would be needed.
int8x16_t cb = vcombine_s8(vld1_s8(reinterpret_cast<s8*>(&decoder.mb8.Cb[n][0])), vdup_n_s8(0));
int8x16_t cr = vcombine_s8(vld1_s8(reinterpret_cast<s8*>(&decoder.mb8.Cr[n][0])), vdup_n_s8(0));

// (Cb - 128) << 8, (Cr - 128) << 8
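// (The XOR with the 0x80 bias flips the top bit, i.e. forms value - 128 as a signed
// byte; zipping a zero byte in below each sample then provides the << 8.)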
cb = veorq_s8(cb, c_bias);
cr = veorq_s8(cr, c_bias);
cb = vzip1q_s8(vdupq_n_s8(0), cb);
cr = vzip1q_s8(vdupq_n_s8(0), cr);

int16x8_t rc = MULHI16(vreinterpretq_s16_s8(cr), rcr_coefficient);
int16x8_t gc = vqaddq_s16(MULHI16(vreinterpretq_s16_s8(cr), gcr_coefficient), MULHI16(vreinterpretq_s16_s8(cb), gcb_coefficient));
int16x8_t bc = MULHI16(vreinterpretq_s16_s8(cb), bcb_coefficient);

for (int m = 0; m < 2; ++m)
{
uint8x16_t y = vld1q_u8(&decoder.mb8.Y[n * 2 + m][0]);
y = vqsubq_u8(y, y_bias);
// Y << 8 for pixels 0, 2, 4, 6, 8, 10, 12, 14
int16x8_t y_even = vshlq_n_s16(vreinterpretq_s16_u8(y), 8);
// Y << 8 for pixels 1, 3, 5, 7, 9, 11, 13, 15
int16x8_t y_odd = vandq_s16(vreinterpretq_s16_u8(y), y_mask);

// y_even = _mm_mulhi_epu16(y_even, y_coefficient);
// y_odd = _mm_mulhi_epu16(y_odd, y_coefficient);
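// (The unsigned 16-bit "mulhi" is emulated below: vmull_u16/vmull_high_u16 form the
// full 32-bit products and vuzp2q_u16 keeps only the high half of each product.)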

uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_s16(y_even));
uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_s16(y_coefficient));
uint32x4_t ab3210 = vmull_u16(a3210, b3210);
uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_s16(y_even), vreinterpretq_u16_s16(y_coefficient));
y_even = vreinterpretq_s16_u16(vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)));

a3210 = vget_low_u16(vreinterpretq_u16_s16(y_odd));
b3210 = vget_low_u16(vreinterpretq_u16_s16(y_coefficient));
ab3210 = vmull_u16(a3210, b3210);
ab7654 = vmull_high_u16(vreinterpretq_u16_s16(y_odd), vreinterpretq_u16_s16(y_coefficient));
y_odd = vreinterpretq_s16_u16(vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)));

int16x8_t r_even = vqaddq_s16(rc, y_even);
int16x8_t r_odd = vqaddq_s16(rc, y_odd);
int16x8_t g_even = vqaddq_s16(gc, y_even);
int16x8_t g_odd = vqaddq_s16(gc, y_odd);
int16x8_t b_even = vqaddq_s16(bc, y_even);
int16x8_t b_odd = vqaddq_s16(bc, y_odd);

// round
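// (add round_1bit then arithmetic shift right by 1: the final divide-by-2
// rounds to nearest instead of truncating)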
r_even = vshrq_n_s16(vaddq_s16(r_even, round_1bit), 1);
r_odd = vshrq_n_s16(vaddq_s16(r_odd, round_1bit), 1);
g_even = vshrq_n_s16(vaddq_s16(g_even, round_1bit), 1);
g_odd = vshrq_n_s16(vaddq_s16(g_odd, round_1bit), 1);
b_even = vshrq_n_s16(vaddq_s16(b_even, round_1bit), 1);
b_odd = vshrq_n_s16(vaddq_s16(b_odd, round_1bit), 1);

// combine even and odd bytes in original order
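// (vqmovun_s16 narrows each 16-bit channel to u8 with unsigned saturation; zipping the
// low half with a copy of the high half re-interleaves even/odd pixels back to 0..15.)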
uint8x16_t r = vcombine_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x16_t g = vcombine_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x16_t b = vcombine_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));

r = vzip1q_u8(r, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(r), 1)));
g = vzip1q_u8(g, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(g), 1)));
b = vzip1q_u8(b, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(b), 1)));

// Create RGBA (we could generate A here, but we don't) quads
uint8x16_t rg_l = vzip1q_u8(r, g);
uint8x16_t ba_l = vzip1q_u8(b, alpha);
uint16x8_t rgba_ll = vzip1q_u16(vreinterpretq_u16_u8(rg_l), vreinterpretq_u16_u8(ba_l));
uint16x8_t rgba_lh = vzip2q_u16(vreinterpretq_u16_u8(rg_l), vreinterpretq_u16_u8(ba_l));

uint8x16_t rg_h = vzip2q_u8(r, g);
uint8x16_t ba_h = vzip2q_u8(b, alpha);
uint16x8_t rgba_hl = vzip1q_u16(vreinterpretq_u16_u8(rg_h), vreinterpretq_u16_u8(ba_h));
uint16x8_t rgba_hh = vzip2q_u16(vreinterpretq_u16_u8(rg_h), vreinterpretq_u16_u8(ba_h));

vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][0]), vreinterpretq_u8_u16(rgba_ll));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][4]), vreinterpretq_u8_u16(rgba_lh));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][8]), vreinterpretq_u8_u16(rgba_hl));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][12]), vreinterpretq_u8_u16(rgba_hh));
}
}
}

#undef MULHI16

#endif

MULTI_ISA_UNSHARED_END
@@ -7,5 +7,14 @@

MULTI_ISA_DEF(extern void yuv2rgb_reference();)

#if defined(_M_X86)

#define yuv2rgb yuv2rgb_sse2
MULTI_ISA_DEF(extern void yuv2rgb_sse2();)

#elif defined(_M_ARM64)

#define yuv2rgb yuv2rgb_neon
MULTI_ISA_DEF(extern void yuv2rgb_neon();)

#endif
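
Call sites simply invoke yuv2rgb(): the macro maps it to yuv2rgb_sse2 on x86 builds and to yuv2rgb_neon on ARM64 builds, while yuv2rgb_reference remains available on every target. A rough usage sketch (the wrapper function below is hypothetical):

// Convert the decoder's current YCbCr macroblock (decoder.mb8) to RGB32 (decoder.rgb32),
// using whichever kernel the yuv2rgb macro selected for this architecture.
static void decode_macroblock_colors()
{
	yuv2rgb();
}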