From f29346f0b73c55ede1a3fb4dd1bab82775ff7259 Mon Sep 17 00:00:00 2001 From: Ziemas Date: Tue, 17 Oct 2023 09:41:25 +0200 Subject: [PATCH] SPU2: Optimize reverb resampling [SAVEVERSION+] --- pcsx2/GS/GSVector4i.h | 5 ++ pcsx2/GS/GSVector8i.h | 5 ++ pcsx2/SPU2/Reverb.cpp | 168 +++++++++++++++++++++++++++++++++++------- pcsx2/SPU2/defs.h | 4 +- pcsx2/SaveState.h | 2 +- 5 files changed, 154 insertions(+), 30 deletions(-) diff --git a/pcsx2/GS/GSVector4i.h b/pcsx2/GS/GSVector4i.h index c34e62365f..4c9a120852 100644 --- a/pcsx2/GS/GSVector4i.h +++ b/pcsx2/GS/GSVector4i.h @@ -846,6 +846,11 @@ public: return GSVector4i(_mm_adds_epi16(m, v.m)); } + __forceinline GSVector4i hadds16(const GSVector4i& v) const + { + return GSVector4i(_mm_hadds_epi16(m, v.m)); + } + __forceinline GSVector4i addus8(const GSVector4i& v) const { return GSVector4i(_mm_adds_epu8(m, v.m)); diff --git a/pcsx2/GS/GSVector8i.h b/pcsx2/GS/GSVector8i.h index 8bff2eef42..2cd2032f13 100644 --- a/pcsx2/GS/GSVector8i.h +++ b/pcsx2/GS/GSVector8i.h @@ -765,6 +765,11 @@ public: return GSVector8i(_mm256_adds_epi16(m, v.m)); } + __forceinline GSVector8i hadds16(const GSVector8i& v) const + { + return GSVector8i(_mm256_hadds_epi16(m, v.m)); + } + __forceinline GSVector8i addus8(const GSVector8i& v) const { return GSVector8i(_mm256_adds_epu8(m, v.m)); diff --git a/pcsx2/SPU2/Reverb.cpp b/pcsx2/SPU2/Reverb.cpp index 5718706711..f1369e9903 100644 --- a/pcsx2/SPU2/Reverb.cpp +++ b/pcsx2/SPU2/Reverb.cpp @@ -15,6 +15,8 @@ #include "PrecompiledHeader.h" #include "Global.h" +#include "GS/GSVector.h" + #include @@ -55,7 +57,7 @@ void V_Core::AnalyzeReverbPreset() static constexpr u32 NUM_TAPS = 39; // 39 tap filter, the 0's could be optimized out -static constexpr std::array filter_coefs = { +static constexpr std::array filter_down_coefs alignas(32) = { -1, 0, 2, @@ -97,39 +99,147 @@ static constexpr std::array filter_coefs = { -1, }; +static constexpr std::array make_up_coefs() +{ + std::array ret = {}; + + for (u32 i = 0; i < NUM_TAPS; i++) + { + ret[i] = static_cast(std::clamp(filter_down_coefs[i] * 2, INT16_MIN, INT16_MAX)); + } + + return ret; +} + +static constexpr std::array filter_up_coefs alignas(32) = make_up_coefs(); + s32 __forceinline V_Core::ReverbDownsample(bool right) { int index = (RevbSampleBufPos - NUM_TAPS) & 63; - s32 out = 0; - for (int i = 0; i < NUM_TAPS; i++) - { - out += RevbDownBuf[right][index + i] * filter_coefs[i]; - } +#if _M_SSE >= 0x501 + auto c = GSVector8i::load(&filter_down_coefs[0]); + auto s = GSVector8i::load(&RevbDownBuf[right][index]); + auto acc = s.mul16hrs(c); - out >>= 15; - out = std::clamp(out, INT16_MIN, INT16_MAX); + c = GSVector8i::load(&filter_down_coefs[16]); + s = GSVector8i::load(&RevbDownBuf[right][index + 16]); + acc = acc.adds16(s.mul16hrs(c)); - return out; + c = GSVector8i::load(&filter_down_coefs[32]); + s = GSVector8i::load(&RevbDownBuf[right][index + 32]); + acc = acc.adds16(s.mul16hrs(c)); + + acc = acc.adds16(acc.ba()); + + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); +#else + auto c = GSVector4i::load(&filter_down_coefs[0]); + auto s = GSVector4i::load(&RevbDownBuf[right][index]); + auto acc = s.mul16hrs(c); + + c = GSVector4i::load(&filter_down_coefs[8]); + s = GSVector4i::load(&RevbDownBuf[right][index + 8]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector4i::load(&filter_down_coefs[16]); + s = GSVector4i::load(&RevbDownBuf[right][index + 16]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector4i::load(&filter_down_coefs[24]); + s = GSVector4i::load(&RevbDownBuf[right][index + 24]); + acc = acc.adds16(s.mul16hrs(c)); + + c = GSVector4i::load(&filter_down_coefs[32]); + s = GSVector4i::load(&RevbDownBuf[right][index + 32]); + acc = acc.adds16(s.mul16hrs(c)); + + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); + acc = acc.hadds16(acc); +#endif + + return acc.I16[0]; } StereoOut32 __forceinline V_Core::ReverbUpsample() { int index = (RevbSampleBufPos - NUM_TAPS) & 63; - s32 ls = 0, rs = 0; - for (int i = 0; i < NUM_TAPS; i++) - { - ls += RevbUpBuf[0][index + i] * (filter_coefs[i] * 2); - rs += RevbUpBuf[1][index + i] * (filter_coefs[i] * 2); - } +#if _M_SSE >= 0x501 + auto c = GSVector8i::load(&filter_up_coefs[0]); + auto l = GSVector8i::load(&RevbUpBuf[0][index]); + auto r = GSVector8i::load(&RevbUpBuf[1][index]); - ls >>= 15; - ls = std::clamp(ls, INT16_MIN, INT16_MAX); - rs >>= 15; - rs = std::clamp(rs, INT16_MIN, INT16_MAX); + auto lacc = l.mul16hrs(c); + auto racc = r.mul16hrs(c); - return {ls, rs}; + c = GSVector8i::load(&filter_up_coefs[16]); + l = GSVector8i::load(&RevbUpBuf[0][index + 16]); + r = GSVector8i::load(&RevbUpBuf[1][index + 16]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector8i::load(&filter_up_coefs[32]); + l = GSVector8i::load(&RevbUpBuf[0][index + 32]); + r = GSVector8i::load(&RevbUpBuf[1][index + 32]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + lacc = lacc.adds16(lacc.ba()); + racc = racc.adds16(racc.ba()); + + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); +#else + auto c = GSVector4i::load(&filter_up_coefs[0]); + auto l = GSVector4i::load(&RevbUpBuf[0][index]); + auto r = GSVector4i::load(&RevbUpBuf[1][index]); + + auto lacc = l.mul16hrs(c); + auto racc = r.mul16hrs(c); + + c = GSVector4i::load(&filter_up_coefs[8]); + l = GSVector4i::load(&RevbUpBuf[0][index + 8]); + r = GSVector4i::load(&RevbUpBuf[1][index + 8]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[16]); + l = GSVector4i::load(&RevbUpBuf[0][index + 16]); + r = GSVector4i::load(&RevbUpBuf[1][index + 16]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[24]); + l = GSVector4i::load(&RevbUpBuf[0][index + 24]); + r = GSVector4i::load(&RevbUpBuf[1][index + 24]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + c = GSVector4i::load(&filter_up_coefs[32]); + l = GSVector4i::load(&RevbUpBuf[0][index + 32]); + r = GSVector4i::load(&RevbUpBuf[1][index + 32]); + lacc = lacc.adds16(l.mul16hrs(c)); + racc = racc.adds16(r.mul16hrs(c)); + + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + lacc = lacc.hadds16(lacc); + + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); + racc = racc.hadds16(racc); +#endif + + return {lacc.I16[0], racc.I16[0]}; } __forceinline s32 V_Core::RevbGetIndexer(s32 offset) @@ -151,10 +261,12 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input) return StereoOut32::Empty; } - RevbDownBuf[0][RevbSampleBufPos] = Input.Left; - RevbDownBuf[1][RevbSampleBufPos] = Input.Right; - RevbDownBuf[0][RevbSampleBufPos + 64] = Input.Left; - RevbDownBuf[1][RevbSampleBufPos + 64] = Input.Right; + auto input = clamp_mix(Input); + + RevbDownBuf[0][RevbSampleBufPos] = input.Left; + RevbDownBuf[1][RevbSampleBufPos] = input.Right; + RevbDownBuf[0][RevbSampleBufPos | 64] = input.Left; + RevbDownBuf[1][RevbSampleBufPos | 64] = input.Right; bool R = Cycles & 1; @@ -234,11 +346,13 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input) _spu2mem[apf2_dst] = clamp_mix(apf2); } - RevbUpBuf[R][RevbSampleBufPos] = clamp_mix(out); + out = clamp_mix(out); + + RevbUpBuf[R][RevbSampleBufPos] = out; RevbUpBuf[!R][RevbSampleBufPos] = 0; - RevbUpBuf[R][RevbSampleBufPos + 64] = clamp_mix(out); - RevbUpBuf[!R][RevbSampleBufPos + 64] = 0; + RevbUpBuf[R][RevbSampleBufPos | 64] = out; + RevbUpBuf[!R][RevbSampleBufPos | 64] = 0; RevbSampleBufPos = (RevbSampleBufPos + 1) & 63; diff --git a/pcsx2/SPU2/defs.h b/pcsx2/SPU2/defs.h index 6733ad0ac0..dfd6845b47 100644 --- a/pcsx2/SPU2/defs.h +++ b/pcsx2/SPU2/defs.h @@ -422,8 +422,8 @@ struct V_Core V_Reverb Revb; // Reverb Registers - s32 RevbDownBuf[2][64 * 2]; // Downsample buffer for reverb, one for each channel - s32 RevbUpBuf[2][64 * 2]; // Upsample buffer for reverb, one for each channel + s16 RevbDownBuf[2][64 * 2]; // Downsample buffer for reverb, one for each channel + s16 RevbUpBuf[2][64 * 2]; // Upsample buffer for reverb, one for each channel u32 RevbSampleBufPos; u32 EffectsStartA; u32 EffectsEndA; diff --git a/pcsx2/SaveState.h b/pcsx2/SaveState.h index bc79047f38..6b5c804c9e 100644 --- a/pcsx2/SaveState.h +++ b/pcsx2/SaveState.h @@ -37,7 +37,7 @@ enum class FreezeAction // [SAVEVERSION+] // This informs the auto updater that the users savestates will be invalidated. -static const u32 g_SaveVersion = (0x9A47 << 16) | 0x0000; +static const u32 g_SaveVersion = (0x9A48 << 16) | 0x0000; // the freezing data between submodules and core