mirror of https://github.com/PCSX2/pcsx2.git
IPU: Fix regression from #10617
This commit is contained in:
parent
3d13c5d13c
commit
0bbde2ca52
|
@ -55,6 +55,11 @@ using r128 = __m128i;
|
|||
return _mm_set_epi64x(0, val);
|
||||
}
|
||||
|
||||
// Builds an r128 whose four 32-bit lanes all hold `val` (SSE2 broadcast).
[[maybe_unused]] __fi static r128 r128_from_u32_dup(u32 val)
|
||||
{
|
||||
return _mm_set1_epi32(val);
|
||||
}
|
||||
|
||||
[[maybe_unused]] __fi static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1)
|
||||
{
|
||||
return _mm_setr_epi32(lo0, lo1, hi0, hi1);
|
||||
|
@ -150,6 +155,11 @@ using r128 = uint32x4_t;
|
|||
return vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(val), vcreate_u64(0)));
|
||||
}
|
||||
|
||||
// Builds an r128 whose four 32-bit lanes all hold `val` (NEON duplicate).
[[maybe_unused]] __fi static r128 r128_from_u32_dup(u32 val)
|
||||
{
|
||||
return vdupq_n_u32(val);
|
||||
}
|
||||
|
||||
[[maybe_unused]] __fi static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1)
|
||||
{
|
||||
const u32 values[4] = {lo0, lo1, hi0, hi1};
|
||||
|
|
|
@ -346,19 +346,19 @@ __ri static void IDCT_Copy(s16* block, u8* dest, const int stride)
|
|||
|
||||
|
||||
// stride = increment for dest in 16-bit units (typically either 8 [128 bits] or 16 [256 bits]).
|
||||
__ri static void IDCT_Add(s16* block, s16* dest, const int stride)
|
||||
__ri static void IDCT_Add(const int last, s16* block, s16* dest, const int stride)
|
||||
{
|
||||
// On the IPU, stride is always guaranteed to be a multiple of QWC (bottom 3 bits are 0).
|
||||
|
||||
if ((block[0] & 7) == 4)
|
||||
if (last != 129 || (block[0] & 7) == 4)
|
||||
{
|
||||
IDCT_Block(block);
|
||||
|
||||
__m128 zero = _mm_setzero_ps();
|
||||
const r128 zero = r128_zero();
|
||||
for (int i = 0; i < 8; i++)
|
||||
{
|
||||
_mm_store_ps((float*)dest, _mm_load_ps((float*)block));
|
||||
_mm_store_ps((float*)block, zero);
|
||||
r128_store(dest, r128_load(block));
|
||||
r128_store(block, zero);
|
||||
|
||||
dest += stride;
|
||||
block += 8;
|
||||
|
@ -366,14 +366,12 @@ __ri static void IDCT_Add(s16* block, s16* dest, const int stride)
|
|||
}
|
||||
else
|
||||
{
|
||||
s16 DC = ((int)block[0] + 4) >> 3;
|
||||
s16 dcf[2] = {DC, DC};
|
||||
const u16 DC = static_cast<u16>((static_cast<s32>(block[0]) + 4) >> 3);
|
||||
const r128 dc128 = r128_from_u32_dup(static_cast<u32>(DC) | (static_cast<u32>(DC) << 16));
|
||||
block[0] = block[63] = 0;
|
||||
|
||||
__m128 dc128 = _mm_set_ps1(*(float*)dcf);
|
||||
|
||||
for (int i = 0; i < 8; ++i)
|
||||
_mm_store_ps((float*)(dest + (stride * i)), dc128);
|
||||
r128_store((dest + (stride * i)), dc128);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -942,20 +940,14 @@ __ri static bool slice_intra_DCT(const int cc, u8 * const dest, const int stride
|
|||
|
||||
__ri static bool slice_non_intra_DCT(s16 * const dest, const int stride, const bool skip)
|
||||
{
|
||||
int last;
|
||||
|
||||
if (!skip)
|
||||
{
|
||||
std::memset(decoder.DCTblock, 0, sizeof(decoder.DCTblock));
|
||||
}
|
||||
|
||||
int last = 0;
|
||||
if (!get_non_intra_block(&last))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
IDCT_Add(decoder.DCTblock, dest, stride);
|
||||
|
||||
IDCT_Add(last, decoder.DCTblock, dest, stride);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue