diff --git a/common/SingleRegisterTypes.h b/common/SingleRegisterTypes.h index e6c35e40ec..723fc902e0 100644 --- a/common/SingleRegisterTypes.h +++ b/common/SingleRegisterTypes.h @@ -55,6 +55,11 @@ using r128 = __m128i; return _mm_set_epi64x(0, val); } +[[maybe_unused]] __fi static r128 r128_from_u32_dup(u32 val) +{ + return _mm_set1_epi32(val); +} + [[maybe_unused]] __fi static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1) { return _mm_setr_epi32(lo0, lo1, hi0, hi1); @@ -150,6 +155,11 @@ using r128 = uint32x4_t; return vreinterpretq_u32_u64(vcombine_u64(vcreate_u64(val), vcreate_u64(0))); } +[[maybe_unused]] __fi static r128 r128_from_u32_dup(u32 val) +{ + return vdupq_n_u32(val); +} + [[maybe_unused]] __fi static r128 r128_from_u32x4(u32 lo0, u32 lo1, u32 hi0, u32 hi1) { const u32 values[4] = {lo0, lo1, hi0, hi1}; diff --git a/pcsx2/IPU/IPU_MultiISA.cpp b/pcsx2/IPU/IPU_MultiISA.cpp index 5ffb0a1bf2..a3e803668d 100644 --- a/pcsx2/IPU/IPU_MultiISA.cpp +++ b/pcsx2/IPU/IPU_MultiISA.cpp @@ -346,19 +346,19 @@ __ri static void IDCT_Copy(s16* block, u8* dest, const int stride) // stride = increment for dest in 16-bit units (typically either 8 [128 bits] or 16 [256 bits]). -__ri static void IDCT_Add(s16* block, s16* dest, const int stride) +__ri static void IDCT_Add(const int last, s16* block, s16* dest, const int stride) { // on the IPU, stride is always assured to be multiples of QWC (bottom 3 bits are 0). 
- if ((block[0] & 7) == 4) + if (last != 129 || (block[0] & 7) == 4) { IDCT_Block(block); - __m128 zero = _mm_setzero_ps(); + const r128 zero = r128_zero(); for (int i = 0; i < 8; i++) { - _mm_store_ps((float*)dest, _mm_load_ps((float*)block)); - _mm_store_ps((float*)block, zero); + r128_store(dest, r128_load(block)); + r128_store(block, zero); dest += stride; block += 8; @@ -366,14 +366,12 @@ __ri static void IDCT_Add(s16* block, s16* dest, const int stride) } else { - s16 DC = ((int)block[0] + 4) >> 3; - s16 dcf[2] = {DC, DC}; + const u16 DC = static_cast<u16>((static_cast<s32>(block[0]) + 4) >> 3); + const r128 dc128 = r128_from_u32_dup(static_cast<u32>(DC) | (static_cast<u32>(DC) << 16)); block[0] = block[63] = 0; - __m128 dc128 = _mm_set_ps1(*(float*)dcf); - for (int i = 0; i < 8; ++i) - _mm_store_ps((float*)(dest + (stride * i)), dc128); + r128_store((dest + (stride * i)), dc128); } } @@ -942,20 +940,14 @@ __ri static bool slice_intra_DCT(const int cc, u8 * const dest, const int stride __ri static bool slice_non_intra_DCT(s16 * const dest, const int stride, const bool skip) { - int last; - if (!skip) - { std::memset(decoder.DCTblock, 0, sizeof(decoder.DCTblock)); - } + int last = 0; if (!get_non_intra_block(&last)) - { return false; - } - - IDCT_Add(decoder.DCTblock, dest, stride); + IDCT_Add(last, decoder.DCTblock, dest, stride); return true; }