IPU: ARM64 compatibility

Stenzek 2024-03-21 17:05:22 +10:00 committed by Connor McLaughlin
parent 4e0e8cef54
commit 7d098674f2
5 changed files with 159 additions and 1 deletion

View File

@@ -1374,6 +1374,7 @@ __fi static bool mpeg2_slice()
//Cr bias - 8 * 8
//Cb bias - 8 * 8
#if defined(_M_X86)
__m128i zeroreg = _mm_setzero_si128();
for (uint i = 0; i < (256+64+64) / 32; ++i)
@@ -1388,6 +1389,24 @@ __fi static bool mpeg2_slice()
s += 32;
d += 32;
}
#elif defined(_M_ARM64)
uint8x16_t zeroreg = vmovq_n_u8(0);
for (uint i = 0; i < (256 + 64 + 64) / 32; ++i)
{
//*d++ = *s++;
uint8x16_t woot1 = vld1q_u8((uint8_t*)s);
uint8x16_t woot2 = vld1q_u8((uint8_t*)s + 16);
vst1q_u8((uint8_t*)d, vzip1q_u8(woot1, zeroreg));
vst1q_u8((uint8_t*)d + 16, vzip2q_u8(woot1, zeroreg));
vst1q_u8((uint8_t*)d + 32, vzip1q_u8(woot2, zeroreg));
vst1q_u8((uint8_t*)d + 48, vzip2q_u8(woot2, zeroreg));
s += 32;
d += 32;
}
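// Note: vzip1q_u8/vzip2q_u8 against the zero register interleave a zero byte
// after each source byte, zero-extending u8 to u16 for the 16-bit destination;
// this mirrors the unpack-against-zeroreg idiom the SSE2 branch uses.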
#else
#error Unsupported arch
#endif
}
}
else

View File

@@ -11,11 +11,18 @@
MULTI_ISA_UNSHARED_START
void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte);
#if defined(_M_X86)
void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte);
#endif
__ri void ipu_dither(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
{
#if defined(_M_X86)
ipu_dither_sse2(rgb32, rgb16, dte);
#else
ipu_dither_reference(rgb32, rgb16, dte);
#endif
}
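// Note that non-x86 builds (including ARM64) take the scalar
// ipu_dither_reference() path here; this commit adds no NEON dither variant.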
__ri void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
@@ -53,6 +60,8 @@ __ri void ipu_dither_reference(const macroblock_rgb32 &rgb32, macroblock_rgb16 &
}
}
#if defined(_M_X86)
__ri void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16, int dte)
{
const __m128i alpha_test = _mm_set1_epi16(0x40);
@@ -110,4 +119,6 @@ __ri void ipu_dither_sse2(const macroblock_rgb32 &rgb32, macroblock_rgb16 &rgb16
}
}
#endif
MULTI_ISA_UNSHARED_END

View File

@@ -168,7 +168,7 @@ static constexpr VLC_ALIGNED16 MVtab MV_10[] = {
static constexpr DMVtab DMV_2[] = {
-	{0, 1}, {0, 1}, {1, 2}, {-1, 2}};
+	{0, 1}, {0, 1}, {1, 2}, {(s8)-1, 2}};
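// The explicit (s8)-1 cast is presumably needed because plain char is
// unsigned in the AArch64 ABI, so the bare -1 initializer triggers a
// narrowing diagnostic there.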
static constexpr VLC_ALIGNED16 CBPtab CBP_7[] = {

View File

@@ -42,6 +42,8 @@ void yuv2rgb_reference(void)
}
}
#if defined(_M_X86)
// Suikoden Tactics FMV speed results: Reference - ~72fps, SSE2 - ~120fps
// An AVX2 version is only slightly faster than an SSE2 version (+2-3fps)
// (or I'm a poor optimiser), though it might be worth attempting again
@@ -134,4 +136,121 @@ __ri void yuv2rgb_sse2()
}
}
#elif defined(_M_ARM64)
#if defined(_MSC_VER) && !defined(__clang__)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#define MULHI16(a, b) vshrq_n_s16(vqdmulhq_s16((a), (b)), 1)
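// MULHI16 stands in for SSE2's _mm_mulhi_epi16: vqdmulhq_s16 returns the high
// half of the doubled product, (2*a*b) >> 16, and the outer shift undoes the
// doubling. The two differ only when both operands are -32768 (vqdmulh
// saturates), a value the constant coefficients below should never take.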
__ri void yuv2rgb_neon()
{
const int8x16_t c_bias = vdupq_n_s8(s8(IPU_C_BIAS));
const uint8x16_t y_bias = vdupq_n_u8(IPU_Y_BIAS);
const int16x8_t y_mask = vdupq_n_s16(s16(0xFF00));
// Specifying round-to-nearest here, instead of the round-down used
// everywhere else, implies that this rounding is intentional.
const int16x8_t round_1bit = vdupq_n_s16(0x0001);
const int16x8_t y_coefficient = vdupq_n_s16(s16(IPU_Y_COEFF << 2));
const int16x8_t gcr_coefficient = vdupq_n_s16(s16(u16(IPU_GCR_COEFF) << 2));
const int16x8_t gcb_coefficient = vdupq_n_s16(s16(u16(IPU_GCB_COEFF) << 2));
const int16x8_t rcr_coefficient = vdupq_n_s16(s16(IPU_RCR_COEFF << 2));
const int16x8_t bcb_coefficient = vdupq_n_s16(s16(IPU_BCB_COEFF << 2));
// Alpha set to 0x80 here. The threshold stuff is done later.
const uint8x16_t alpha = vreinterpretq_u8_s8(c_bias);
for (int n = 0; n < 8; ++n)
{
// Load 8 bytes of chroma into the low vector half; the SSE2 version uses
// loadl_epi64 here because a full 16-byte load would need separate
// aligned/unaligned variants.
int8x16_t cb = vcombine_s8(vld1_s8(reinterpret_cast<s8*>(&decoder.mb8.Cb[n][0])), vdup_n_s8(0));
int8x16_t cr = vcombine_s8(vld1_s8(reinterpret_cast<s8*>(&decoder.mb8.Cr[n][0])), vdup_n_s8(0));
// (Cb - 128) << 8, (Cr - 128) << 8
cb = veorq_s8(cb, c_bias);
cr = veorq_s8(cr, c_bias);
cb = vzip1q_s8(vdupq_n_s8(0), cb);
cr = vzip1q_s8(vdupq_n_s8(0), cr);
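// XORing the unsigned chroma byte with the 0x80 bias flips its sign bit,
// which equals subtracting 128; zipping a zero byte below each value then
// produces the (Cb - 128) << 8 / (Cr - 128) << 8 form noted above.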
int16x8_t rc = MULHI16(vreinterpretq_s16_s8(cr), rcr_coefficient);
int16x8_t gc = vqaddq_s16(MULHI16(vreinterpretq_s16_s8(cr), gcr_coefficient), MULHI16(vreinterpretq_s16_s8(cb), gcb_coefficient));
int16x8_t bc = MULHI16(vreinterpretq_s16_s8(cb), bcb_coefficient);
for (int m = 0; m < 2; ++m)
{
uint8x16_t y = vld1q_u8(&decoder.mb8.Y[n * 2 + m][0]);
y = vqsubq_u8(y, y_bias);
// Y << 8 for pixels 0, 2, 4, 6, 8, 10, 12, 14
int16x8_t y_even = vshlq_n_s16(vreinterpretq_s16_u8(y), 8);
// Y << 8 for pixels 1, 3, 5, 7, 9, 11, 13, 15
int16x8_t y_odd = vandq_s16(vreinterpretq_s16_u8(y), y_mask);
// y_even = _mm_mulhi_epu16(y_even, y_coefficient);
// y_odd = _mm_mulhi_epu16(y_odd, y_coefficient);
uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_s16(y_even));
uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_s16(y_coefficient));
uint32x4_t ab3210 = vmull_u16(a3210, b3210);
uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_s16(y_even), vreinterpretq_u16_s16(y_coefficient));
y_even = vreinterpretq_s16_u16(vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)));
a3210 = vget_low_u16(vreinterpretq_u16_s16(y_odd));
b3210 = vget_low_u16(vreinterpretq_u16_s16(y_coefficient));
ab3210 = vmull_u16(a3210, b3210);
ab7654 = vmull_high_u16(vreinterpretq_u16_s16(y_odd), vreinterpretq_u16_s16(y_coefficient));
y_odd = vreinterpretq_s16_u16(vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)));
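// The y_even and y_odd blocks above emulate the commented _mm_mulhi_epu16:
// the widening vmull_u16/vmull_high_u16 pair forms full 32-bit products, and
// vuzp2q_u16 keeps the odd (upper) u16 half of each product, i.e. the
// unsigned high-half multiply.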
int16x8_t r_even = vqaddq_s16(rc, y_even);
int16x8_t r_odd = vqaddq_s16(rc, y_odd);
int16x8_t g_even = vqaddq_s16(gc, y_even);
int16x8_t g_odd = vqaddq_s16(gc, y_odd);
int16x8_t b_even = vqaddq_s16(bc, y_even);
int16x8_t b_odd = vqaddq_s16(bc, y_odd);
// round
r_even = vshrq_n_s16(vaddq_s16(r_even, round_1bit), 1);
r_odd = vshrq_n_s16(vaddq_s16(r_odd, round_1bit), 1);
g_even = vshrq_n_s16(vaddq_s16(g_even, round_1bit), 1);
g_odd = vshrq_n_s16(vaddq_s16(g_odd, round_1bit), 1);
b_even = vshrq_n_s16(vaddq_s16(b_even, round_1bit), 1);
b_odd = vshrq_n_s16(vaddq_s16(b_odd, round_1bit), 1);
// combine even and odd bytes in original order
uint8x16_t r = vcombine_u8(vqmovun_s16(r_even), vqmovun_s16(r_odd));
uint8x16_t g = vcombine_u8(vqmovun_s16(g_even), vqmovun_s16(g_odd));
uint8x16_t b = vcombine_u8(vqmovun_s16(b_even), vqmovun_s16(b_odd));
r = vzip1q_u8(r, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(r), 1)));
g = vzip1q_u8(g, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(g), 1)));
b = vzip1q_u8(b, vreinterpretq_u8_u64(vdupq_laneq_u64(vreinterpretq_u64_u8(b), 1)));
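// Each of r/g/b holds the even pixels in its low 8 bytes and the odd pixels
// in its high 8 bytes; duplicating the high half and zipping it against the
// low half restores the original pixel order.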
// Create RGBA quads (we could generate A here, but we don't)
uint8x16_t rg_l = vzip1q_u8(r, g);
uint8x16_t ba_l = vzip1q_u8(b, alpha);
uint16x8_t rgba_ll = vzip1q_u16(vreinterpretq_u16_u8(rg_l), vreinterpretq_u16_u8(ba_l));
uint16x8_t rgba_lh = vzip2q_u16(vreinterpretq_u16_u8(rg_l), vreinterpretq_u16_u8(ba_l));
uint8x16_t rg_h = vzip2q_u8(r, g);
uint8x16_t ba_h = vzip2q_u8(b, alpha);
uint16x8_t rgba_hl = vzip1q_u16(vreinterpretq_u16_u8(rg_h), vreinterpretq_u16_u8(ba_h));
uint16x8_t rgba_hh = vzip2q_u16(vreinterpretq_u16_u8(rg_h), vreinterpretq_u16_u8(ba_h));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][0]), vreinterpretq_u8_u16(rgba_ll));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][4]), vreinterpretq_u8_u16(rgba_lh));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][8]), vreinterpretq_u8_u16(rgba_hl));
vst1q_u8(reinterpret_cast<u8*>(&decoder.rgb32.c[n * 2 + m][12]), vreinterpretq_u8_u16(rgba_hh));
}
}
}
#undef MULHI16
#endif
MULTI_ISA_UNSHARED_END

View File

@@ -7,5 +7,14 @@
MULTI_ISA_DEF(extern void yuv2rgb_reference();)
#if defined(_M_X86)
#define yuv2rgb yuv2rgb_sse2
MULTI_ISA_DEF(extern void yuv2rgb_sse2();)
#elif defined(_M_ARM64)
#define yuv2rgb yuv2rgb_neon
MULTI_ISA_DEF(extern void yuv2rgb_neon();)
#endif
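// With this mapping, callers simply invoke yuv2rgb() and the preprocessor
// selects the ISA-specific symbol: yuv2rgb_sse2 on x86, yuv2rgb_neon on ARM64.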