SPU2: Multi-ISA resampling functions

This commit is contained in:
Ziemas 2023-10-17 16:01:12 +02:00 committed by refractionpcsx2
parent f29346f0b7
commit 5eacc7c327
6 changed files with 283 additions and 195 deletions

View File

@ -280,6 +280,10 @@ set(pcsx2SPU2Sources
SPU2/Wavedump_wav.cpp
)
# SPU2 sources kept out of the shared list: they are either added to each GS-${isa}
# library or appended back onto pcsx2SPU2Sources, depending on the branch taken below.
set(pcsx2SPU2SourcesUnshared SPU2/ReverbResample.cpp)
# SPU2 headers
set(pcsx2SPU2Headers
SPU2/Debug.h
@ -762,7 +766,7 @@ if(DISABLE_ADVANCE_SIMD)
# Note: ld64 (macOS's linker) does not act the same way when presented with .a files, unless linked with `-force_load` (cmake WHOLE_ARCHIVE).
set(is_first_isa "1")
foreach(isa "sse4" "avx" "avx2")
add_library(GS-${isa} STATIC ${pcsx2GSSourcesUnshared} ${pcsx2IPUSourcesUnshared})
add_library(GS-${isa} STATIC ${pcsx2GSSourcesUnshared} ${pcsx2IPUSourcesUnshared} ${pcsx2SPU2SourcesUnshared})
target_link_libraries(GS-${isa} PRIVATE PCSX2_FLAGS)
target_compile_definitions(GS-${isa} PRIVATE MULTI_ISA_UNSHARED_COMPILATION=isa_${isa} MULTI_ISA_IS_FIRST=${is_first_isa} ${pcsx2_defs_${isa}})
target_compile_options(GS-${isa} PRIVATE ${compile_options_${isa}})
@ -778,6 +782,7 @@ if(DISABLE_ADVANCE_SIMD)
else()
list(APPEND pcsx2GSSources ${pcsx2GSSourcesUnshared})
list(APPEND pcsx2IPUSources ${pcsx2IPUSourcesUnshared})
list(APPEND pcsx2SPU2Sources ${pcsx2SPU2SourcesUnshared})
endif()
# DebugTools sources

View File

@ -19,7 +19,6 @@
#include <array>
void V_Core::AnalyzeReverbPreset()
{
Console.WriteLn("Reverb Parameter Update for Core %d:", Index);
@ -55,193 +54,6 @@ void V_Core::AnalyzeReverbPreset()
Console.WriteLn("----------------------------------------------------------");
}
// Number of meaningful FIR coefficients. The tables hold 48 entries; the ones past
// NUM_TAPS are left zero-initialised.
static constexpr u32 NUM_TAPS = 39;

// 39-tap filter, symmetric around the 16384 centre tap (index 19).
// Every other coefficient is 0 and could be optimized out.
// NOTE(review): alignas(32) is presumably what lets load<true> use aligned loads - confirm in GSVector.h.
static constexpr std::array<s16, 48> filter_down_coefs alignas(32) = {
	-1, 0, 2, 0, -10, 0, 35, 0,
	-103, 0, 266, 0, -616, 0, 1332, 0,
	-2960, 0, 10246, 16384, 10246, 0, -2960, 0,
	1332, 0, -616, 0, 266, 0, -103, 0,
	35, 0, -10, 0, 2, 0, -1,
};
static constexpr std::array<s16, 48> make_up_coefs()
{
std::array<s16, 48> ret = {};
for (u32 i = 0; i < NUM_TAPS; i++)
{
ret[i] = static_cast<s16>(std::clamp<s32>(filter_down_coefs[i] * 2, INT16_MIN, INT16_MAX));
}
return ret;
}
static constexpr std::array<s16, 48> filter_up_coefs alignas(32) = make_up_coefs();
// Filters one channel of the downsample ring buffer with filter_down_coefs.
// The window starts NUM_TAPS entries behind RevbSampleBufPos (wrapped into 0..63)
// and the result is lane 0 of the reduced 16-bit accumulator.
// NOTE(review): mul16hrs/adds16/hadds16 presumably wrap PMULHRSW/PADDSW/PHADDSW - confirm in GS/GSVector.h.
s32 __forceinline V_Core::ReverbDownsample(bool right)
{
	const int first = (RevbSampleBufPos - NUM_TAPS) & 63;

#if _M_SSE >= 0x501
	using Vec = GSVector8i; // processes the window 16 entries at a time
	constexpr int STEP = 16;
#else
	using Vec = GSVector4i; // processes the window 8 entries at a time
	constexpr int STEP = 8;
#endif

	// The first chunk seeds the accumulator.
	auto coefs = Vec::load<true>(&filter_down_coefs[0]);
	auto window = Vec::load<false>(&RevbDownBuf[right][first]);
	auto acc = window.mul16hrs(coefs);

	// Remaining chunks, in ascending order, folded in with adds16.
	for (int off = STEP; off < static_cast<int>(NUM_TAPS); off += STEP)
	{
		coefs = Vec::load<true>(&filter_down_coefs[off]);
		window = Vec::load<false>(&RevbDownBuf[right][first + off]);
		acc = acc.adds16(window.mul16hrs(coefs));
	}

#if _M_SSE >= 0x501
	// Extra fold step that only the 256-bit path needs.
	acc = acc.adds16(acc.ba());
#endif

	// Three horizontal-add passes before lane 0 is read out.
	for (int pass = 0; pass < 3; pass++)
		acc = acc.hadds16(acc);

	return acc.I16[0];
}
// Filters both channels of the upsample ring buffer with filter_up_coefs and
// returns {left, right}, each taken from lane 0 of its reduced accumulator.
// The window starts NUM_TAPS entries behind RevbSampleBufPos (wrapped into 0..63).
// NOTE(review): mul16hrs/adds16/hadds16 presumably wrap PMULHRSW/PADDSW/PHADDSW - confirm in GS/GSVector.h.
StereoOut32 __forceinline V_Core::ReverbUpsample()
{
	const int first = (RevbSampleBufPos - NUM_TAPS) & 63;

#if _M_SSE >= 0x501
	using Vec = GSVector8i; // processes the window 16 entries at a time
	constexpr int STEP = 16;
#else
	using Vec = GSVector4i; // processes the window 8 entries at a time
	constexpr int STEP = 8;
#endif

	// The first chunk seeds both accumulators; the coefficients are shared by both channels.
	auto coefs = Vec::load<true>(&filter_up_coefs[0]);
	auto lwin = Vec::load<false>(&RevbUpBuf[0][first]);
	auto rwin = Vec::load<false>(&RevbUpBuf[1][first]);
	auto lacc = lwin.mul16hrs(coefs);
	auto racc = rwin.mul16hrs(coefs);

	// Remaining chunks, in ascending order, folded in with adds16.
	for (int off = STEP; off < static_cast<int>(NUM_TAPS); off += STEP)
	{
		coefs = Vec::load<true>(&filter_up_coefs[off]);
		lwin = Vec::load<false>(&RevbUpBuf[0][first + off]);
		rwin = Vec::load<false>(&RevbUpBuf[1][first + off]);
		lacc = lacc.adds16(lwin.mul16hrs(coefs));
		racc = racc.adds16(rwin.mul16hrs(coefs));
	}

#if _M_SSE >= 0x501
	// Extra fold step that only the 256-bit path needs.
	lacc = lacc.adds16(lacc.ba());
	racc = racc.adds16(racc.ba());
#endif

	// Three horizontal-add passes per channel before lane 0 is read out.
	for (int pass = 0; pass < 3; pass++)
		lacc = lacc.hadds16(lacc);
	for (int pass = 0; pass < 3; pass++)
		racc = racc.hadds16(racc);

	return {lacc.I16[0], racc.I16[0]};
}
__forceinline s32 V_Core::RevbGetIndexer(s32 offset)
{
u32 start = EffectsStartA & 0x3f'ffff;
@ -325,7 +137,7 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input)
s32 in, same, diff, apf1, apf2, out;
#define MUL(x, y) ((x) * (y) >> 15)
in = MUL(R ? Revb.IN_COEF_R : Revb.IN_COEF_L, ReverbDownsample(R));
in = MUL(R ? Revb.IN_COEF_R : Revb.IN_COEF_L, ReverbDownsample(*this, R));
same = MUL(Revb.IIR_VOL, in + MUL(Revb.WALL_VOL, _spu2mem[same_src]) - _spu2mem[same_prv]) + _spu2mem[same_prv];
diff = MUL(Revb.IIR_VOL, in + MUL(Revb.WALL_VOL, _spu2mem[diff_src]) - _spu2mem[diff_prv]) + _spu2mem[diff_prv];
@ -356,5 +168,5 @@ StereoOut32 V_Core::DoReverb(const StereoOut32& Input)
RevbSampleBufPos = (RevbSampleBufPos + 1) & 63;
return ReverbUpsample();
return ReverbUpsample(*this);
}

View File

@ -0,0 +1,257 @@
#include "GS/GSVector.h"
#include "Global.h"
MULTI_ISA_UNSHARED_START
// Number of meaningful FIR coefficients. The tables hold 48 entries; the ones past
// NUM_TAPS are left zero-initialised.
static constexpr u32 NUM_TAPS = 39;

// 39-tap filter, symmetric around the 16384 centre tap (index 19).
// Every other coefficient is 0 and could be optimized out.
// NOTE(review): alignas(32) is presumably what lets load<true> use aligned loads - confirm in GSVector.h.
static constexpr std::array<s16, 48> filter_down_coefs alignas(32) = {
	-1, 0, 2, 0, -10, 0, 35, 0,
	-103, 0, 266, 0, -616, 0, 1332, 0,
	-2960, 0, 10246, 16384, 10246, 0, -2960, 0,
	1332, 0, -616, 0, 266, 0, -103, 0,
	35, 0, -10, 0, 2, 0, -1,
};
static constexpr std::array<s16, 48> make_up_coefs()
{
std::array<s16, 48> ret = {};
for (u32 i = 0; i < NUM_TAPS; i++)
{
ret[i] = static_cast<s16>(std::clamp<s32>(filter_down_coefs[i] * 2, INT16_MIN, INT16_MAX));
}
return ret;
}
static constexpr std::array<s16, 48> filter_up_coefs alignas(32) = make_up_coefs();
// Scalar version of the downsampling filter.
//
// Multiplies NUM_TAPS consecutive entries of core.RevbDownBuf[right] by
// filter_down_coefs, accumulates the products in an s32, shifts the sum right by 15
// and returns it through clamp_mix(). The window starts NUM_TAPS entries behind
// core.RevbSampleBufPos, wrapped into 0..63.
//
// core:  core whose downsample buffer is read.
// right: selects RevbDownBuf[1] when true, RevbDownBuf[0] otherwise.
//
// NOTE(review): the buffer is indexed up to index + NUM_TAPS - 1 without wrapping, so
// it presumably holds more than 64 entries per channel - confirm against V_Core.
s32 __forceinline ReverbDownsample_reference(V_Core& core, bool right)
{
	int index = (core.RevbSampleBufPos - NUM_TAPS) & 63;
	s32 out = 0;

	// NUM_TAPS is unsigned; cast once so the loop does not mix signed and unsigned operands.
	for (int i = 0; i < static_cast<int>(NUM_TAPS); i++)
	{
		out += core.RevbDownBuf[right][index + i] * filter_down_coefs[i];
	}

	out >>= 15;

	return clamp_mix(out);
}
#if _M_SSE >= 0x501
// GSVector8i version of the downsampling filter: three 16-entry chunks cover the
// 48-entry coefficient table. Returns lane 0 of the reduced 16-bit accumulator.
// NOTE(review): mul16hrs/adds16/hadds16 presumably wrap VPMULHRSW/VPADDSW/VPHADDSW - confirm in GS/GSVector.h.
s32 __forceinline ReverbDownsample_avx(V_Core& core, bool right)
{
	const int first = (core.RevbSampleBufPos - NUM_TAPS) & 63;

	// Chunk 0 seeds the accumulator.
	auto coefs = GSVector8i::load<true>(&filter_down_coefs[0]);
	auto window = GSVector8i::load<false>(&core.RevbDownBuf[right][first]);
	auto sum = window.mul16hrs(coefs);

	// Chunks at offsets 16 and 32.
	for (int off = 16; off < 48; off += 16)
	{
		coefs = GSVector8i::load<true>(&filter_down_coefs[off]);
		window = GSVector8i::load<false>(&core.RevbDownBuf[right][first + off]);
		sum = sum.adds16(window.mul16hrs(coefs));
	}

	// Fold via ba(), then three horizontal-add passes before lane 0 is read out.
	sum = sum.adds16(sum.ba());
	for (int pass = 0; pass < 3; pass++)
		sum = sum.hadds16(sum);

	return sum.I16[0];
}
#endif
// GSVector4i version of the downsampling filter: five 8-entry chunks cover the
// first 40 coefficient slots. Returns lane 0 of the reduced 16-bit accumulator.
// NOTE(review): mul16hrs/adds16/hadds16 presumably wrap PMULHRSW/PADDSW/PHADDSW - confirm in GS/GSVector.h.
s32 __forceinline ReverbDownsample_sse(V_Core& core, bool right)
{
	const int first = (core.RevbSampleBufPos - NUM_TAPS) & 63;

	// Chunk 0 seeds the accumulator.
	auto coefs = GSVector4i::load<true>(&filter_down_coefs[0]);
	auto window = GSVector4i::load<false>(&core.RevbDownBuf[right][first]);
	auto sum = window.mul16hrs(coefs);

	// Chunks at offsets 8, 16, 24 and 32.
	for (int off = 8; off < 40; off += 8)
	{
		coefs = GSVector4i::load<true>(&filter_down_coefs[off]);
		window = GSVector4i::load<false>(&core.RevbDownBuf[right][first + off]);
		sum = sum.adds16(window.mul16hrs(coefs));
	}

	// Three horizontal-add passes before lane 0 is read out.
	for (int pass = 0; pass < 3; pass++)
		sum = sum.hadds16(sum);

	return sum.I16[0];
}
// Entry point for this translation unit's ISA: forwards to the GSVector8i
// implementation when _M_SSE >= 0x501 and to the GSVector4i one otherwise.
s32 ReverbDownsample(V_Core& core, bool right)
{
#if _M_SSE < 0x501
	return ReverbDownsample_sse(core, right);
#else
	return ReverbDownsample_avx(core, right);
#endif
}
// Scalar version of the upsampling filter.
//
// For each channel, multiplies NUM_TAPS consecutive entries of core.RevbUpBuf[ch] by
// filter_up_coefs, accumulates the products in an s32, shifts the sum right by 15 and
// passes it through clamp_mix(). The window starts NUM_TAPS entries behind
// core.RevbSampleBufPos, wrapped into 0..63.
//
// Returns {left, right}, built from RevbUpBuf[0] and RevbUpBuf[1] respectively.
//
// NOTE(review): the buffers are indexed up to index + NUM_TAPS - 1 without wrapping, so
// they presumably hold more than 64 entries per channel - confirm against V_Core.
StereoOut32 __forceinline ReverbUpsample_reference(V_Core& core)
{
	int index = (core.RevbSampleBufPos - NUM_TAPS) & 63;
	s32 l = 0, r = 0;

	// NUM_TAPS is unsigned; cast once so the loop does not mix signed and unsigned operands.
	for (int i = 0; i < static_cast<int>(NUM_TAPS); i++)
	{
		l += core.RevbUpBuf[0][index + i] * filter_up_coefs[i];
		r += core.RevbUpBuf[1][index + i] * filter_up_coefs[i];
	}

	l >>= 15;
	r >>= 15;

	return {clamp_mix(l), clamp_mix(r)};
}
#if _M_SSE >= 0x501
// GSVector8i version of the upsampling filter: three 16-entry chunks per channel,
// sharing each coefficient load. Returns {left, right} from lane 0 of each accumulator.
// NOTE(review): mul16hrs/adds16/hadds16 presumably wrap VPMULHRSW/VPADDSW/VPHADDSW - confirm in GS/GSVector.h.
StereoOut32 __forceinline ReverbUpsample_avx(V_Core& core)
{
	const int first = (core.RevbSampleBufPos - NUM_TAPS) & 63;

	// Chunk 0 seeds both accumulators.
	auto coefs = GSVector8i::load<true>(&filter_up_coefs[0]);
	auto lwin = GSVector8i::load<false>(&core.RevbUpBuf[0][first]);
	auto rwin = GSVector8i::load<false>(&core.RevbUpBuf[1][first]);
	auto lsum = lwin.mul16hrs(coefs);
	auto rsum = rwin.mul16hrs(coefs);

	// Chunks at offsets 16 and 32.
	for (int off = 16; off < 48; off += 16)
	{
		coefs = GSVector8i::load<true>(&filter_up_coefs[off]);
		lwin = GSVector8i::load<false>(&core.RevbUpBuf[0][first + off]);
		rwin = GSVector8i::load<false>(&core.RevbUpBuf[1][first + off]);
		lsum = lsum.adds16(lwin.mul16hrs(coefs));
		rsum = rsum.adds16(rwin.mul16hrs(coefs));
	}

	// Fold via ba(), then three horizontal-add passes per channel.
	lsum = lsum.adds16(lsum.ba());
	rsum = rsum.adds16(rsum.ba());

	for (int pass = 0; pass < 3; pass++)
		lsum = lsum.hadds16(lsum);
	for (int pass = 0; pass < 3; pass++)
		rsum = rsum.hadds16(rsum);

	return {lsum.I16[0], rsum.I16[0]};
}
#endif
// GSVector4i version of the upsampling filter: five 8-entry chunks per channel,
// sharing each coefficient load. Returns {left, right} from lane 0 of each accumulator.
// NOTE(review): mul16hrs/adds16/hadds16 presumably wrap PMULHRSW/PADDSW/PHADDSW - confirm in GS/GSVector.h.
StereoOut32 __forceinline ReverbUpsample_sse(V_Core& core)
{
	const int first = (core.RevbSampleBufPos - NUM_TAPS) & 63;

	// Chunk 0 seeds both accumulators.
	auto coefs = GSVector4i::load<true>(&filter_up_coefs[0]);
	auto lwin = GSVector4i::load<false>(&core.RevbUpBuf[0][first]);
	auto rwin = GSVector4i::load<false>(&core.RevbUpBuf[1][first]);
	auto lsum = lwin.mul16hrs(coefs);
	auto rsum = rwin.mul16hrs(coefs);

	// Chunks at offsets 8, 16, 24 and 32.
	for (int off = 8; off < 40; off += 8)
	{
		coefs = GSVector4i::load<true>(&filter_up_coefs[off]);
		lwin = GSVector4i::load<false>(&core.RevbUpBuf[0][first + off]);
		rwin = GSVector4i::load<false>(&core.RevbUpBuf[1][first + off]);
		lsum = lsum.adds16(lwin.mul16hrs(coefs));
		rsum = rsum.adds16(rwin.mul16hrs(coefs));
	}

	// Three horizontal-add passes per channel before lane 0 is read out.
	for (int pass = 0; pass < 3; pass++)
		lsum = lsum.hadds16(lsum);
	for (int pass = 0; pass < 3; pass++)
		rsum = rsum.hadds16(rsum);

	return {lsum.I16[0], rsum.I16[0]};
}
// Entry point for this translation unit's ISA: forwards to the GSVector8i
// implementation when _M_SSE >= 0x501 and to the GSVector4i one otherwise.
StereoOut32 ReverbUpsample(V_Core& core)
{
#if _M_SSE < 0x501
	return ReverbUpsample_sse(core);
#else
	return ReverbUpsample_avx(core);
#endif
}
MULTI_ISA_UNSHARED_END

View File

@ -19,6 +19,8 @@
#include "SPU2/SndOut.h"
#include "SPU2/Global.h"
#include "GS/MultiISA.h"
#include <array>
// --------------------------------------------------------------------------------------
@ -486,9 +488,6 @@ struct V_Core
StereoOut32 DoReverb(const StereoOut32& Input);
s32 RevbGetIndexer(s32 offset);
s32 ReverbDownsample(bool right);
StereoOut32 ReverbUpsample();
StereoOut32 ReadInput();
StereoOut32 ReadInput_HiFi();
@ -537,6 +536,14 @@ struct V_Core
void FinishDMAwrite();
};
// Declarations of the ISA-specific reverb resamplers implemented in SPU2/ReverbResample.cpp.
// NOTE(review): MULTI_ISA_DEF presumably repeats these declarations for every compiled
// ISA variant (see GS/MultiISA.h) - confirm.
MULTI_ISA_DEF(
StereoOut32 ReverbUpsample(V_Core& core);
s32 ReverbDownsample(V_Core& core, bool right);
)
// Function pointers through which the resamplers are called; V_Core::Init assigns
// them from MULTI_ISA_SELECT(...).
extern StereoOut32 (*ReverbUpsample)(V_Core& core);
extern s32 (*ReverbDownsample)(V_Core& core, bool right);
extern V_Core Cores[2];
extern V_SPDIF Spdif;

View File

@ -46,6 +46,9 @@ int PlayMode;
static bool has_to_call_irq[2] = { false, false };
static bool has_to_call_irq_dma[2] = { false, false };
// Storage for the reverb resampler entry points. V_Core::Init assigns both from
// MULTI_ISA_SELECT(...); until then they hold no usable target.
StereoOut32 (*ReverbUpsample)(V_Core& core);
s32 (*ReverbDownsample)(V_Core& core, bool right);
static bool psxmode = false;
@ -111,6 +114,9 @@ void V_Core::Init(int index)
if (SPU2::MsgToConsole())
SPU2::ConLog("* SPU2: Init SPU2 core %d \n", index);
ReverbDownsample = MULTI_ISA_SELECT(ReverbDownsample);
ReverbUpsample = MULTI_ISA_SELECT(ReverbUpsample);
//memset(this, 0, sizeof(V_Core));
// Explicitly initializing variables instead.
Mute = false;

View File

@ -264,6 +264,7 @@
<ClCompile Include="SPU2\Mixer.cpp" />
<ClCompile Include="SPU2\ReadInput.cpp" />
<ClCompile Include="SPU2\Reverb.cpp" />
<ClCompile Include="SPU2\ReverbResample.cpp" />
<ClCompile Include="SPU2\spu2.cpp" />
<ClCompile Include="IPU\IPUdma.cpp" />
<ClCompile Include="IPU\IPUdither.cpp" />
@ -872,4 +873,4 @@
<Import Condition="$(Configuration.Contains(Debug)) Or $(Configuration.Contains(Devel))" Project="$(SolutionDir)3rdparty\winpixeventruntime\WinPixEventRuntime.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>