IPU: Fix regression from #10617

This commit is contained in:
Stenzek 2024-01-14 14:57:10 +10:00 committed by Connor McLaughlin
parent 3d13c5d13c
commit 0bbde2ca52
2 changed files with 20 additions and 18 deletions

View File

@ -55,6 +55,11 @@ using r128 = __m128i;
return _mm_set_epi64x(0, val);
}
// Broadcasts a single 32-bit value into all four 32-bit lanes of an r128.
[[maybe_unused]] __fi static r128 r128_from_u32_dup(u32 val)
{
	// _mm_set1_epi32 takes a signed int; the cast only makes the implicit conversion explicit.
	const r128 splat = _mm_set1_epi32(static_cast<int>(val));
	return splat;
}
[[maybe_unused]] __fi static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1)
{
return _mm_setr_epi32(lo0, lo1, hi0, hi1);
@ -150,6 +155,11 @@ using r128 = uint32x4_t;
return vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(val), vcreate_u64(0)));
}
// Replicates a single 32-bit value across every 32-bit lane of the NEON r128 vector.
[[maybe_unused]] __fi static r128 r128_from_u32_dup(u32 val)
{
	const r128 splat = vdupq_n_u32(val);
	return splat;
}
[[maybe_unused]] __fi static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1)
{
const u32 values[4] = {lo0, lo1, hi0, hi1};

View File

@ -346,19 +346,19 @@ __ri static void IDCT_Copy(s16* block, u8* dest, const int stride)
// stride = increment for dest in 16-bit units (typically either 8 [128 bits] or 16 [256 bits]).
__ri static void IDCT_Add(s16* block, s16* dest, const int stride)
__ri static void IDCT_Add(const int last, s16* block, s16* dest, const int stride)
{
// on the IPU, stride is always guaranteed to be a multiple of QWC (bottom 3 bits are 0).
if ((block[0] & 7) == 4)
if (last != 129 || (block[0] & 7) == 4)
{
IDCT_Block(block);
__m128 zero = _mm_setzero_ps();
const r128 zero = r128_zero();
for (int i = 0; i < 8; i++)
{
_mm_store_ps((float*)dest, _mm_load_ps((float*)block));
_mm_store_ps((float*)block, zero);
r128_store(dest, r128_load(block));
r128_store(block, zero);
dest += stride;
block += 8;
@ -366,14 +366,12 @@ __ri static void IDCT_Add(s16* block, s16* dest, const int stride)
}
else
{
s16 DC = ((int)block[0] + 4) >> 3;
s16 dcf[2] = {DC, DC};
const u16 DC = static_cast<u16>((static_cast<s32>(block[0]) + 4) >> 3);
const r128 dc128 = r128_from_u32_dup(static_cast<u32>(DC) | (static_cast<u32>(DC) << 16));
block[0] = block[63] = 0;
__m128 dc128 = _mm_set_ps1(*(float*)dcf);
for (int i = 0; i < 8; ++i)
_mm_store_ps((float*)(dest + (stride * i)), dc128);
r128_store((dest + (stride * i)), dc128);
}
}
@ -942,20 +940,14 @@ __ri static bool slice_intra_DCT(const int cc, u8 * const dest, const int stride
__ri static bool slice_non_intra_DCT(s16 * const dest, const int stride, const bool skip)
{
int last;
if (!skip)
{
std::memset(decoder.DCTblock, 0, sizeof(decoder.DCTblock));
}
int last = 0;
if (!get_non_intra_block(&last))
{
return false;
}
IDCT_Add(decoder.DCTblock, dest, stride);
IDCT_Add(last, decoder.DCTblock, dest, stride);
return true;
}