SPU2: Use 16-bit samples for output

It's clamped to 16-bit in the output anyway. Volume application is moved to the audio thread, so the stretcher has higher precision if the user chooses a low volume. Also vectorizes the int->float conversion (since it happens on the EE thread, it's beneficial for performance).
2023-01-18 19:40:05 +10:00 · 2023-01-18 19:40:05 +10:00 · 76fa37019e
parent d94e861a78
commit 76fa37019e
5 changed files with 347 additions and 514 deletions
--- a/pcsx2/SPU2/DplIIdecoder.cpp
+++ b/pcsx2/SPU2/DplIIdecoder.cpp
@ -42,19 +42,19 @@ static float LMax = 0, RMax = 0;
 static float AccL = 0;
 static float AccR = 0;

-const float Scale = 4294967296.0f; // tweak this value to change the overall output volume
+constexpr float Scale = 4294967296.0f; // tweak this value to change the overall output volume

-const float GainL = 0.80f * Scale;
-const float GainR = 0.80f * Scale;
+constexpr float GainL = 0.80f * Scale;
+constexpr float GainR = 0.80f * Scale;

-const float GainC = 0.75f * Scale;
+constexpr float GainC = 0.75f * Scale;

-const float GainSL = 0.90f * Scale;
-const float GainSR = 0.90f * Scale;
+constexpr float GainSL = 0.90f * Scale;
+constexpr float GainSR = 0.90f * Scale;

-const float GainLFE = 0.90f * Scale;
+constexpr float GainLFE = 0.90f * Scale;

-const float AddCLR = 0.20f * Scale; // Stereo expansion
+constexpr float AddCLR = 0.20f * Scale; // Stereo expansion

 extern void ResetDplIIDecoder()
 {
@ -66,10 +66,10 @@ extern void ResetDplIIDecoder()
 	AccR = 0;
 }

-void ProcessDplIISample32(const StereoOut32& src, Stereo51Out32DplII* s)
+void ProcessDplIISample32(const StereoOut16& src, Stereo51Out32DplII* s)
 {
-	float IL = src.Left / (float)(1 << (SndOutVolumeShift + 16));
-	float IR = src.Right / (float)(1 << (SndOutVolumeShift + 16));
+	float IL = src.Left / (float)(1 << 16);
+	float IR = src.Right / (float)(1 << 16);

 	// Calculate center channel and LFE
 	float C = (IL + IR) * 0.5f;
@ -121,7 +121,7 @@ void ProcessDplIISample32(const StereoOut32& src, Stereo51Out32DplII* s)
 	s->RightBack = (s32)(SR * GainSR);
 }

-void ProcessDplIISample16(const StereoOut32& src, Stereo51Out16DplII* s)
+void ProcessDplIISample16(const StereoOut16& src, Stereo51Out16DplII* s)
 {
 	Stereo51Out32DplII ss;
 	ProcessDplIISample32(src, &ss);
@ -134,10 +134,10 @@ void ProcessDplIISample16(const StereoOut32& src, Stereo51Out16DplII* s)
 	s->RightBack = ss.RightBack >> 16;
 }

-void ProcessDplSample32(const StereoOut32& src, Stereo51Out32Dpl* s)
+void ProcessDplSample32(const StereoOut16& src, Stereo51Out32Dpl* s)
 {
-	float ValL = src.Left / (float)(1 << (SndOutVolumeShift + 16));
-	float ValR = src.Right / (float)(1 << (SndOutVolumeShift + 16));
+	float ValL = src.Left / (float)(1 << 16);
+	float ValR = src.Right / (float)(1 << 16);

 	float C = (ValL + ValR) * 0.5f; //+15.8
 	float S = (ValL - ValR) * 0.5f;
@ -157,7 +157,7 @@ void ProcessDplSample32(const StereoOut32& src, Stereo51Out32Dpl* s)
 	s->RightBack = (s32)(S * GainSR);
 }

-void ProcessDplSample16(const StereoOut32& src, Stereo51Out16Dpl* s)
+void ProcessDplSample16(const StereoOut16& src, Stereo51Out16Dpl* s)
 {
 	Stereo51Out32Dpl ss;
 	ProcessDplSample32(src, &ss);
--- a/pcsx2/SPU2/Mixer.cpp
+++ b/pcsx2/SPU2/Mixer.cpp
@ -28,8 +28,6 @@ static const s32 tbl_XA_Factor[16][2] =
 		{98, -55},
 		{122, -60}};

-float SPU2::FinalVolume = 1.0f;
-
 // Performs a 64-bit multiplication between two values and returns the
 // high 32 bits as a result (discarding the fractional 32 bits).
 // The combined fractional bits of both inputs must be 32 bits for this
@ -692,17 +690,10 @@ __forceinline
 	}
 	else
 	{
-		Out.Left = MulShr32(Out.Left << SndOutVolumeShift, Cores[1].MasterVol.Left.Value);
-		Out.Right = MulShr32(Out.Right << SndOutVolumeShift, Cores[1].MasterVol.Right.Value);
-
-		
-		
+		Out.Left = MulShr32(Out.Left, Cores[1].MasterVol.Left.Value);
+		Out.Right = MulShr32(Out.Right, Cores[1].MasterVol.Right.Value);
 	}

-	// Configurable output volume
-	Out.Left *= SPU2::FinalVolume;
-	Out.Right *= SPU2::FinalVolume;
-
 	// Final Clamp!
 	// Like any good audio system, the PS2 pumps the volume and incurs some distortion in its
 	// output, giving us a nice thumpy sound at times.  So we add 1 above (2x volume pump) and
@ -711,12 +702,9 @@ __forceinline
 	// Edit: I'm sorry Jake, but I know of no good audio system that arbitrary distorts and clips
 	// output by design.
 	// Good thing though that this code gets the volume exactly right, as per tests :)
-	Out = clamp_mix(Out, SndOutVolumeShift);
+	Out = clamp_mix(Out);

-	SndBuffer::Write(Out);
-
-	if (SampleRate == 96000) // Double up samples for 96khz (Port Audio Non-Exclusive)
-		SndBuffer::Write(Out);
+	SndBuffer::Write(StereoOut16(Out));

 	// Update AutoDMA output positioning
 	OutPos++;
@ -743,13 +731,3 @@ __forceinline
 		}
 	}
 }
-
-s32 SPU2::GetOutputVolume()
-{
-	return static_cast<s32>(std::round(FinalVolume * 100.0f));
-}
-
-void SPU2::SetOutputVolume(s32 volume)
-{
-	FinalVolume = static_cast<float>(std::clamp<s32>(volume, 0, Pcsx2Config::SPU2Options::MAX_VOLUME)) / 100.0f;
-}
--- a/pcsx2/SPU2/Mixer.h
+++ b/pcsx2/SPU2/Mixer.h
@ -15,14 +15,9 @@

 #pragma once

-namespace SPU2
-{
-extern float FinalVolume;
-}
-
 struct StereoOut32
 {
-	static StereoOut32 Empty;
+	static const StereoOut32 Empty;

 	s32 Left;
 	s32 Right;
@ -39,11 +34,6 @@ struct StereoOut32
 	{
 	}

-	StereoOut32(const StereoOut16& src);
-	explicit StereoOut32(const StereoOutFloat& src);
-
-	StereoOut16 DownSample() const;
-
 	StereoOut32 operator*(const int& factor) const
 	{
 		return StereoOut32(
@ -69,12 +59,6 @@ struct StereoOut32
 	{
 		return StereoOut32(Left / src, Right / src);
 	}
-
-	void ResampleFrom(const StereoOut32& src)
-	{
-		this->Left = src.Left << 2;
-		this->Right = src.Right << 2;
-	}
 };

 extern void Mix();
--- a/pcsx2/SPU2/SndOut.cpp
+++ b/pcsx2/SPU2/SndOut.cpp
@ -23,64 +23,39 @@

 #include "SoundTouch.h"

-StereoOut32 StereoOut32::Empty(0, 0);
+const StereoOut32 StereoOut32::Empty(0, 0);

-StereoOut32::StereoOut32(const StereoOut16& src)
-	: Left(src.Left)
-	, Right(src.Right)
+namespace
 {
-}
-
-StereoOut32::StereoOut32(const StereoOutFloat& src)
-	: Left(static_cast<s32>(src.Left * 2147483647.0f))
-	, Right(static_cast<s32>(src.Right * 2147483647.0f))
-{
-}
-
-StereoOut16 StereoOut32::DownSample() const
-{
-	return StereoOut16(
-		Left >> SndOutVolumeShift,
-		Right >> SndOutVolumeShift);
-}
-
-StereoOut32 StereoOut16::UpSample() const
-{
-	return StereoOut32(
-		Left << SndOutVolumeShift,
-		Right << SndOutVolumeShift);
-}
-
-namespace {
-class NullOutModule final : public SndOutModule
-{
-public:
-	bool Init() override { return true; }
-	void Close() override {}
-	void SetPaused(bool paused) override {}
-	int GetEmptySampleCount() override { return 0; }
-
-	const char* GetIdent() const override
+	class NullOutModule final : public SndOutModule
 	{
-		return "nullout";
-	}
+	public:
+		bool Init() override { return true; }
+		void Close() override {}
+		void SetPaused(bool paused) override {}
+		int GetEmptySampleCount() override { return 0; }

-	const char* GetLongName() const override
-	{
-		return "No Sound (Emulate SPU2 only)";
-	}
+		const char* GetIdent() const override
+		{
+			return "nullout";
+		}

-	const char* const* GetBackendNames() const override
-	{
-		return nullptr;
-	}
+		const char* GetLongName() const override
+		{
+			return "No Sound (Emulate SPU2 only)";
+		}

-	std::vector<SndOutDeviceInfo> GetOutputDeviceList(const char* driver) const override
-	{
-		return {};
-	}
-};
-}
+		const char* const* GetBackendNames() const override
+		{
+			return nullptr;
+		}
+
+		std::vector<SndOutDeviceInfo> GetOutputDeviceList(const char* driver) const override
+		{
+			return {};
+		}
+	};
+} // namespace

 static NullOutModule s_NullOut;
 static SndOutModule* NullOut = &s_NullOut;
@ -118,7 +93,9 @@ const char* const* GetOutputModuleBackends(const char* omodid)
 }

 SndOutDeviceInfo::SndOutDeviceInfo(std::string name_, std::string display_name_, u32 minimum_latency_)
-	: name(std::move(name_)), display_name(std::move(display_name_)), minimum_latency_frames(minimum_latency_)
+	: name(std::move(name_))
+	, display_name(std::move(display_name_))
+	, minimum_latency_frames(minimum_latency_)
 {
 }

@ -132,21 +109,77 @@ std::vector<SndOutDeviceInfo> GetOutputDeviceList(const char* omodid, const char
 	return ret;
 }

-StereoOut32* SndBuffer::m_buffer;
-s32 SndBuffer::m_size;
-alignas(4) volatile s32 SndBuffer::m_rpos;
-alignas(4) volatile s32 SndBuffer::m_wpos;
+namespace SndBuffer
+{
+	static float s_final_volume = 1.0f;

-bool SndBuffer::m_underrun_freeze;
-StereoOut32* SndBuffer::sndTempBuffer = nullptr;
-StereoOut16* SndBuffer::sndTempBuffer16 = nullptr;
-int SndBuffer::sndTempProgress = 0;
+	static bool s_underrun_freeze = 0;

-int GetAlignedBufferSize(int comp)
+	// data prediction amount, used to "commit" data that hasn't
+	// finished timestretch processing.
+	static s32 s_predict_data = 0;
+
+	// records last buffer status (fill %, range -100 to 100, with 0 being 50% full)
+	static float s_last_pct = 0;
+
+	static float s_last_emergency_adj = 0.0f;
+	static float s_cTempo = 1.0f;
+	static float s_eTempo = 1.0f;
+	static int s_ss_freeze = 0;
+
+	static std::unique_ptr<StereoOut16[]> s_staging_buffer;
+	static std::unique_ptr<float[]> s_float_buffer;
+
+	static int s_staging_progress = 0;
+
+	static std::unique_ptr<StereoOut16[]> s_output_buffer;
+	static s32 s_output_buffer_size = 0;
+
+	// TODO: Replace these with proper atomics.
+	alignas(4) static volatile s32 m_rpos = 0;
+	alignas(4) static volatile s32 m_wpos = 0;
+
+	static bool CheckUnderrunStatus(int& nSamples, int& quietSampleCount);
+
+	static void soundtouchInit();
+	static void soundtouchClearContents();
+	static void soundtouchCleanup();
+	static void timeStretchWrite();
+	static void timeStretchUnderrun();
+	static s32 timeStretchOverrun();
+
+	static void PredictDataWrite(int samples);
+	static float GetStatusPct();
+	static void UpdateTempoChangeSoundTouch();
+	static void UpdateTempoChangeSoundTouch2();
+
+	static void _WriteSamples(StereoOut16* bData, int nSamples);
+
+	static void _WriteSamples_Safe(StereoOut16* bData, int nSamples);
+	static void _ReadSamples_Safe(StereoOut16* bData, int nSamples);
+
+	static void _WriteSamples_Internal(StereoOut16* bData, int nSamples);
+	static void _DropSamples_Internal(int nSamples);
+	static void _ReadSamples_Internal(StereoOut16* bData, int nSamples);
+
+	static int _GetApproximateDataInBuffer();
+} // namespace SndBuffer
+
+static int GetAlignedBufferSize(int comp)
 {
 	return (comp + SndOutPacketSize - 1) & ~(SndOutPacketSize - 1);
 }

+s32 SPU2::GetOutputVolume()
+{
+	return static_cast<s32>(std::round(SndBuffer::s_final_volume * 100.0f));
+}
+
+void SPU2::SetOutputVolume(s32 volume)
+{
+	SndBuffer::s_final_volume = static_cast<float>(std::clamp<s32>(volume, 0, Pcsx2Config::SPU2Options::MAX_VOLUME)) / 100.0f;
+}
+
 // Returns TRUE if there is data to be output, or false if no data
 // is available to be copied.
 bool SndBuffer::CheckUnderrunStatus(int& nSamples, int& quietSampleCount)
@ -154,9 +187,9 @@ bool SndBuffer::CheckUnderrunStatus(int& nSamples, int& quietSampleCount)
 	quietSampleCount = 0;

 	int data = _GetApproximateDataInBuffer();
-	if (m_underrun_freeze)
+	if (s_underrun_freeze)
 	{
-		int toFill = m_size / ((EmuConfig.SPU2.SynchMode == Pcsx2Config::SPU2Options::SynchronizationMode::NoSync) ? 32 : 400); // TimeStretch and Async off?
+		int toFill = s_output_buffer_size / ((EmuConfig.SPU2.SynchMode == Pcsx2Config::SPU2Options::SynchronizationMode::NoSync) ? 32 : 400); // TimeStretch and Async off?
 		toFill = GetAlignedBufferSize(toFill);

 		// toFill is now aligned to a SndOutPacket
@ -168,16 +201,16 @@ bool SndBuffer::CheckUnderrunStatus(int& nSamples, int& quietSampleCount)
 			return false;
 		}

-		m_underrun_freeze = false;
+		s_underrun_freeze = false;
 		if (SPU2::MsgOverruns())
 			SPU2::ConLog(" * SPU2 > Underrun compensation (%d packets buffered)\n", toFill / SndOutPacketSize);
-		lastPct = 0.0; // normalize timestretcher
+		s_last_pct = 0.0; // normalize timestretcher
 	}
 	else if (data < nSamples)
 	{
 		quietSampleCount = nSamples - data;
 		nSamples = data;
-		m_underrun_freeze = true;
+		s_underrun_freeze = true;

 		if (EmuConfig.SPU2.SynchMode == Pcsx2Config::SPU2Options::SynchronizationMode::TimeStretch) // TimeStrech on
 			timeStretchUnderrun();
@ -191,37 +224,37 @@ bool SndBuffer::CheckUnderrunStatus(int& nSamples, int& quietSampleCount)
 int SndBuffer::_GetApproximateDataInBuffer()
 {
 	// WARNING: not necessarily 100% up to date by the time it's used, but it will have to do.
-	return (m_wpos + m_size - m_rpos) % m_size;
+	return (m_wpos + s_output_buffer_size - m_rpos) % s_output_buffer_size;
 }

-void SndBuffer::_WriteSamples_Internal(StereoOut32* bData, int nSamples)
+void SndBuffer::_WriteSamples_Internal(StereoOut16* bData, int nSamples)
 {
 	// WARNING: This assumes the write will NOT wrap around,
 	// and also assumes there's enough free space in the buffer.

-	memcpy(m_buffer + m_wpos, bData, nSamples * sizeof(StereoOut32));
-	m_wpos = (m_wpos + nSamples) % m_size;
+	std::memcpy(s_output_buffer.get() + m_wpos, bData, nSamples * sizeof(StereoOut16));
+	m_wpos = (m_wpos + nSamples) % s_output_buffer_size;
 }

 void SndBuffer::_DropSamples_Internal(int nSamples)
 {
-	m_rpos = (m_rpos + nSamples) % m_size;
+	m_rpos = (m_rpos + nSamples) % s_output_buffer_size;
 }

-void SndBuffer::_ReadSamples_Internal(StereoOut32* bData, int nSamples)
+void SndBuffer::_ReadSamples_Internal(StereoOut16* bData, int nSamples)
 {
 	// WARNING: This assumes the read will NOT wrap around,
 	// and also assumes there's enough data in the buffer.
-	memcpy(bData, m_buffer + m_rpos, nSamples * sizeof(StereoOut32));
+	std::memcpy(bData, s_output_buffer.get() + m_rpos, nSamples * sizeof(StereoOut16));
 	_DropSamples_Internal(nSamples);
 }

-void SndBuffer::_WriteSamples_Safe(StereoOut32* bData, int nSamples)
+void SndBuffer::_WriteSamples_Safe(StereoOut16* bData, int nSamples)
 {
 	// WARNING: This code assumes there's only ONE writing process.
-	if ((m_size - m_wpos) < nSamples)
+	if ((s_output_buffer_size - m_wpos) < nSamples)
 	{
-		const int b1 = m_size - m_wpos;
+		const int b1 = s_output_buffer_size - m_wpos;
 		const int b2 = nSamples - b1;

 		_WriteSamples_Internal(bData, b1);
@ -233,12 +266,12 @@ void SndBuffer::_WriteSamples_Safe(StereoOut32* bData, int nSamples)
 	}
 }

-void SndBuffer::_ReadSamples_Safe(StereoOut32* bData, int nSamples)
+void SndBuffer::_ReadSamples_Safe(StereoOut16* bData, int nSamples)
 {
 	// WARNING: This code assumes there's only ONE reading process.
-	if ((m_size - m_rpos) < nSamples)
+	if ((s_output_buffer_size - m_rpos) < nSamples)
 	{
-		const int b1 = m_size - m_rpos;
+		const int b1 = s_output_buffer_size - m_rpos;
 		const int b2 = nSamples - b1;

 		_ReadSamples_Internal(bData, b1);
@ -250,6 +283,15 @@ void SndBuffer::_ReadSamples_Safe(StereoOut32* bData, int nSamples)
 	}
 }

+static __fi StereoOut16 ApplyVolume(StereoOut16 frame, float volume)
+{
+	// TODO: This could be done with SSE/NEON, but we'd only be processing half our vector width.
+	// It happens on the audio thread anyway, so no biggie, but someone might want to do it at some point.
+	return StereoOut16(
+		static_cast<s16>(std::clamp(static_cast<float>(frame.Left) * volume, -32768.0f, 32767.0f)),
+		static_cast<s16>(std::clamp(static_cast<float>(frame.Right) * volume, -32768.0f, 32767.0f)));
+}
+
 // Note: When using with 32 bit output buffers, the user of this function is responsible
 // for shifting the values to where they need to be manually.  The fixed point depth of
 // the sample output is determined by the SndOutVolumeShift, which is the number of bits
@ -276,20 +318,34 @@ void SndBuffer::ReadSamples(T* bData, int nSamples)
 		pxAssume(nSamples <= SndOutPacketSize);

 		// WARNING: This code assumes there's only ONE reading process.
-		int b1 = m_size - m_rpos;
+		int b1 = s_output_buffer_size - m_rpos;

 		if (b1 > nSamples)
 			b1 = nSamples;

-		// First part
-		for (int i = 0; i < b1; i++)
-			bData[i].ResampleFrom(m_buffer[i + m_rpos]);
-
-		// Second part
 		const int b2 = nSamples - b1;
-		for (int i = 0; i < b2; i++)
-			bData[i + b1].ResampleFrom(m_buffer[i]);
-	
+
+		if (std::is_same_v<T, StereoOut16> && s_final_volume == 1.0f)
+		{
+			// First part
+			if (b1 > 0)
+				std::memcpy(bData, &s_output_buffer[m_rpos], sizeof(StereoOut16) * b1);
+
+			// Second part
+			if (b2 > 0)
+				std::memcpy(bData + b1, s_output_buffer.get(), sizeof(StereoOut16) * b2);
+		}
+		else
+		{
+			// First part
+			for (int i = 0; i < b1; i++)
+				bData[i].SetFrom(ApplyVolume(s_output_buffer[i + m_rpos], s_final_volume));
+
+			// Second part
+			for (int i = 0; i < b2; i++)
+				bData[i + b1].SetFrom(ApplyVolume(s_output_buffer[i], s_final_volume));
+		}
+
 		_DropSamples_Internal(nSamples);
 	}

@ -301,9 +357,6 @@ void SndBuffer::ReadSamples(T* bData, int nSamples)
 }

 template void SndBuffer::ReadSamples(StereoOut16*, int);
-template void SndBuffer::ReadSamples(StereoOut32*, int);
-
-//template void SndBuffer::ReadSamples(StereoOutFloat*);
 template void SndBuffer::ReadSamples(Stereo21Out16*, int);
 template void SndBuffer::ReadSamples(Stereo40Out16*, int);
 template void SndBuffer::ReadSamples(Stereo41Out16*, int);
@ -312,18 +365,9 @@ template void SndBuffer::ReadSamples(Stereo51Out16Dpl*, int);
 template void SndBuffer::ReadSamples(Stereo51Out16DplII*, int);
 template void SndBuffer::ReadSamples(Stereo71Out16*, int);

-template void SndBuffer::ReadSamples(Stereo20Out32*, int);
-template void SndBuffer::ReadSamples(Stereo21Out32*, int);
-template void SndBuffer::ReadSamples(Stereo40Out32*, int);
-template void SndBuffer::ReadSamples(Stereo41Out32*, int);
-template void SndBuffer::ReadSamples(Stereo51Out32*, int);
-template void SndBuffer::ReadSamples(Stereo51Out32Dpl*, int);
-template void SndBuffer::ReadSamples(Stereo51Out32DplII*, int);
-template void SndBuffer::ReadSamples(Stereo71Out32*, int);
-
-void SndBuffer::_WriteSamples(StereoOut32* bData, int nSamples)
+void SndBuffer::_WriteSamples(StereoOut16* bData, int nSamples)
 {
-	m_predictData = 0;
+	s_predict_data = 0;

 	// Problem:
 	//  If the SPU2 gets out of sync with the SndOut device, the writepos of the
@ -336,7 +380,7 @@ void SndBuffer::_WriteSamples(StereoOut32* bData, int nSamples)
 	//  The older portion of the buffer is discarded rather than incoming data,
 	//  so that the overall audio synchronization is better.

-	const int free = m_size - _GetApproximateDataInBuffer(); // -1, but the <= handles that
+	const int free = s_output_buffer_size - _GetApproximateDataInBuffer(); // -1, but the <= handles that
 	if (free <= nSamples)
 	{
 // Disabled since the lock-free queue can't handle changing the read end from the write thread
@ -367,7 +411,7 @@ void SndBuffer::_WriteSamples(StereoOut32* bData, int nSamples)
 #else
 		if (SPU2::MsgOverruns())
 			SPU2::ConLog(" * SPU2 > Overrun! 1 packet tossed)\n");
-		lastPct = 0.0; // normalize the timestretcher
+		s_last_pct = 0.0; // normalize the timestretcher

 		// Toss the packet because we overran the buffer.
 		return;
@ -391,13 +435,13 @@ bool SndBuffer::Init(const char* modname)
 	m_wpos = 0;

 	const float latencyMS = EmuConfig.SPU2.Latency * 16;
-	m_size = GetAlignedBufferSize((int)(latencyMS * SampleRate / 1000.0f));
-	m_buffer = new StereoOut32[m_size];
-	m_underrun_freeze = false;
+	s_output_buffer_size = GetAlignedBufferSize((int)(latencyMS * SampleRate / 1000.0f));
+	s_output_buffer = std::make_unique<StereoOut16[]>(s_output_buffer_size);
+	s_underrun_freeze = false;

-	sndTempBuffer = new StereoOut32[SndOutPacketSize];
-	sndTempBuffer16 = new StereoOut16[SndOutPacketSize * 2]; // in case of leftovers.
-	sndTempProgress = 0;
+	s_staging_buffer = std::make_unique<StereoOut16[]>(SndOutPacketSize);
+	s_float_buffer = std::make_unique<float[]>(SndOutPacketSize * 2);
+	s_staging_progress = 0;

 	soundtouchInit(); // initializes the timestretching

@ -421,20 +465,14 @@ void SndBuffer::Cleanup()

 	soundtouchCleanup();

-	safe_delete_array(m_buffer);
-	safe_delete_array(sndTempBuffer);
-	safe_delete_array(sndTempBuffer16);
+	s_output_buffer.reset();
+	s_staging_buffer.reset();
 }

-int SndBuffer::m_dsp_progress = 0;
-
-int SndBuffer::m_timestretch_progress = 0;
-int SndBuffer::ssFreeze = 0;
-
 void SndBuffer::ClearContents()
 {
-	SndBuffer::soundtouchClearContents();
-	SndBuffer::ssFreeze = 256; //Delays sound output for about 1 second.
+	soundtouchClearContents();
+	s_ss_freeze = 256; //Delays sound output for about 1 second.
 }

 void SndBuffer::ResetBuffers()
@ -448,36 +486,35 @@ void SPU2::SetOutputPaused(bool paused)
 	s_output_module->SetPaused(paused);
 }

-void SndBuffer::Write(const StereoOut32& Sample)
+void SndBuffer::Write(StereoOut16 Sample)
 {
 #ifdef PCSX2_DEVBUILD
 	// Log final output to wavefile.
-	WaveDump::WriteCore(1, CoreSrc_External, Sample.DownSample());
+	WaveDump::WriteCore(1, CoreSrc_External, Sample);
 #endif

 	if (WavRecordEnabled)
-		RecordWrite(Sample.DownSample());
+		RecordWrite(Sample);

-	sndTempBuffer[sndTempProgress++] = Sample;
+	s_staging_buffer[s_staging_progress++] = Sample;

 	// If we haven't accumulated a full packet yet, do nothing more:
-	if (sndTempProgress < SndOutPacketSize)
+	if (s_staging_progress < SndOutPacketSize)
 		return;
-	sndTempProgress = 0;
+	s_staging_progress = 0;

 	//Don't play anything directly after loading a savestate, avoids static killing your speakers.
-	if (ssFreeze > 0)
+	if (s_ss_freeze > 0)
 	{
-		ssFreeze--;
-		// Play silence
-		std::fill_n(sndTempBuffer, SndOutPacketSize, StereoOut32{});
+		s_ss_freeze--;
+		std::memset(s_staging_buffer.get(), 0, sizeof(StereoOut16) * SndOutPacketSize);
 	}
 	else
 	{
-		if (EmuConfig.SPU2.SynchMode == Pcsx2Config::SPU2Options::SynchronizationMode::TimeStretch) // TimeStrech on
+		if (EmuConfig.SPU2.SynchMode == Pcsx2Config::SPU2Options::SynchronizationMode::TimeStretch)
 			timeStretchWrite();
 		else
-			_WriteSamples(sndTempBuffer, SndOutPacketSize);
+			_WriteSamples(s_staging_buffer.get(), SndOutPacketSize);
 	}
 }

@ -490,20 +527,9 @@ void SndBuffer::Write(const StereoOut32& Sample)

 static std::unique_ptr<soundtouch::SoundTouch> pSoundTouch = nullptr;

-// data prediction amount, used to "commit" data that hasn't
-// finished timestretch processing.
-s32 SndBuffer::m_predictData;
-
-// records last buffer status (fill %, range -100 to 100, with 0 being 50% full)
-float SndBuffer::lastPct;
-float SndBuffer::lastEmergencyAdj;
-
-float SndBuffer::cTempo = 1;
-float SndBuffer::eTempo = 1;
-
 void SndBuffer::PredictDataWrite(int samples)
 {
-	m_predictData += samples;
+	s_predict_data += samples;
 }

 // Calculate the buffer status percentage.
@ -521,8 +547,8 @@ float SndBuffer::GetStatusPct()
 	//ConLog( "Data %d >>> driver: %d   predict: %d\n", m_data, drvempty, m_predictData );

 	const int data = _GetApproximateDataInBuffer();
-	float result = static_cast<float>(data + m_predictData - drvempty) - (m_size / 16);
-	result /= (m_size / 16);
+	float result = static_cast<float>(data + s_predict_data - drvempty) - (s_output_buffer_size / 16);
+	result /= (s_output_buffer_size / 16);
 	return result;
 }

@ -611,8 +637,8 @@ void SndBuffer::UpdateTempoChangeSoundTouch2()
 	float baseTargetFullness = static_cast<double>(targetSamplesReservoir); ///(double)m_size;//0.05;

 	//state vars
-	static bool inside_hysteresis;      //=false;
-	static int hys_ok_count;            //=0;
+	static bool inside_hysteresis; //=false;
+	static int hys_ok_count; //=0;
 	static float dynamicTargetFullness; //=baseTargetFullness;
 	if (gRequestStretcherReset >= STRETCHER_RESET_THRESHOLD)
 	{
@ -686,7 +712,7 @@ void SndBuffer::UpdateTempoChangeSoundTouch2()
 		if (Common::Timer::ConvertValueToSeconds(now - last) > 1.0f)
 		{ //report buffers state and tempo adjust every second
 			SPU2::ConLog("buffers: %4d ms (%3.0f%%), tempo: %f, comp: %2.3f, iters: %d, (N-IPS:%d -> avg:%d, minokc:%d, div:%d) reset:%d\n",
-				   (int)(data / 48), (double)(100.0 * bufferFullness / baseTargetFullness), (double)tempoAdjust, (double)(dynamicTargetFullness / baseTargetFullness), iters, (int)targetIPS, AVERAGING_WINDOW, hys_min_ok_count, compensationDivider, gRequestStretcherReset);
+				(int)(data / 48), (double)(100.0 * bufferFullness / baseTargetFullness), (double)tempoAdjust, (double)(dynamicTargetFullness / baseTargetFullness), iters, (int)targetIPS, AVERAGING_WINDOW, hys_min_ok_count, compensationDivider, gRequestStretcherReset);
 			last = now;
 			iters = 0;
 		}
@ -704,11 +730,11 @@ void SndBuffer::UpdateTempoChangeSoundTouch2()
 void SndBuffer::UpdateTempoChangeSoundTouch()
 {
 	const float statusPct = GetStatusPct();
-	const float pctChange = statusPct - lastPct;
+	const float pctChange = statusPct - s_last_pct;

 	float tempoChange;
 	float emergencyAdj = 0;
-	float newcee = cTempo; // workspace var. for cTempo
+	float newcee = s_cTempo; // workspace var. for cTempo

 	// IMPORTANT!
 	// If you plan to tweak these values, make sure you're using a release build
@ -761,10 +787,10 @@ void SndBuffer::UpdateTempoChangeSoundTouch()
 	// cope with low fps (underruns) than it is high fps (overruns).  So to help out a
 	// little, the low-end portions of this check are less forgiving than the high-sides.

-	if (cTempo < 0.965f || cTempo > 1.060f ||
+	if (s_cTempo < 0.965f || s_cTempo > 1.060f ||
 		pctChange < -0.38f || pctChange > 0.54f ||
 		statusPct < -0.42f || statusPct > 0.70f ||
-		eTempo < 0.89f || eTempo > 1.19f)
+		s_eTempo < 0.89f || s_eTempo > 1.19f)
 	{
 		//printf("Emergency stretch: cTempo = %f eTempo = %f pctChange = %f statusPct = %f\n",cTempo,eTempo,pctChange,statusPct);
 		emergencyAdj = (pow(statusPct * statusWeight, 3.0f) * statusRange);
@ -774,10 +800,10 @@ void SndBuffer::UpdateTempoChangeSoundTouch()
 	// It helps make the system 'feel' a little smarter by  giving it at least
 	// one packet worth of history to help work off of:

-	emergencyAdj = (emergencyAdj * 0.75f) + (lastEmergencyAdj * 0.25f);
+	emergencyAdj = (emergencyAdj * 0.75f) + (s_last_emergency_adj * 0.25f);

-	lastEmergencyAdj = emergencyAdj;
-	lastPct = statusPct;
+	s_last_emergency_adj = emergencyAdj;
+	s_last_pct = statusPct;

 	// Accumulate a fraction of the tempo change into the tempo itself.
 	// This helps the system run "smarter" to games that run consistently
@ -792,7 +818,7 @@ void SndBuffer::UpdateTempoChangeSoundTouch()
 	// to the current tempo.  (otherwise tempos rate of change at the extremes would
 	// be too drastic)

-	float newTempo = newcee + (emergencyAdj * cTempo);
+	float newTempo = newcee + (emergencyAdj * s_cTempo);

 	// ... and as a final optimization, only stretch if the new tempo is outside
 	// a nominal threshold.  Keep this threshold check small, because it could
@ -800,19 +826,19 @@ void SndBuffer::UpdateTempoChangeSoundTouch()
 	// is usually better/safer)
 	if (newTempo < 0.970f || newTempo > 1.045f)
 	{
-		cTempo = static_cast<float>(newcee);
+		s_cTempo = static_cast<float>(newcee);

 		if (newTempo < 0.10f)
 			newTempo = 0.10f;
 		else if (newTempo > 10.0f)
 			newTempo = 10.0f;

-		if (cTempo < 0.15f)
-			cTempo = 0.15f;
-		else if (cTempo > 7.5f)
-			cTempo = 7.5f;
+		if (s_cTempo < 0.15f)
+			s_cTempo = 0.15f;
+		else if (s_cTempo > 7.5f)
+			s_cTempo = 7.5f;

-		pSoundTouch->setTempo(eTempo = static_cast<float>(newTempo));
+		pSoundTouch->setTempo(s_eTempo = static_cast<float>(newTempo));

 		/*ConLog("* SPU2: [Nominal %d%%] [Emergency: %d%%] (baseTempo: %d%% ) (newTempo: %d%%) (buffer: %d%%)\n",
 			//(relation < 0.0) ? "Normalize" : "",
@ -828,16 +854,16 @@ void SndBuffer::UpdateTempoChangeSoundTouch()
 		// Nominal operation -- turn off stretching.
 		// note: eTempo 'slides' toward 1.0 for smoother audio and better
 		// protection against spikes.
-		if (cTempo != 1.0f)
+		if (s_cTempo != 1.0f)
 		{
-			cTempo = 1.0f;
-			eTempo = (1.0f + eTempo) * 0.5f;
-			pSoundTouch->setTempo(eTempo);
+			s_cTempo = 1.0f;
+			s_eTempo = (1.0f + s_eTempo) * 0.5f;
+			pSoundTouch->setTempo(s_eTempo);
 		}
 		else
 		{
-			if (eTempo != cTempo)
-				pSoundTouch->setTempo(eTempo = cTempo);
+			if (s_eTempo != s_cTempo)
+				pSoundTouch->setTempo(s_eTempo = s_cTempo);
 		}
 	}
 }
@ -847,7 +873,7 @@ void SndBuffer::UpdateTempoChangeAsyncMixing()
 {
 	const float statusPct = GetStatusPct();

-	lastPct = statusPct;
+	s_last_pct = statusPct;
 	if (statusPct < -0.1f)
 	{
 		TickInterval -= 4;
@ -873,10 +899,10 @@ void SndBuffer::timeStretchUnderrun()
 	gRequestStretcherReset++;
 	// timeStretcher failed it's job.  We need to slow down the audio some.

-	cTempo -= (cTempo * 0.12f);
-	eTempo -= (eTempo * 0.30f);
-	if (eTempo < 0.1f)
-		eTempo = 0.1f;
+	s_cTempo -= (s_cTempo * 0.12f);
+	s_eTempo -= (s_eTempo * 0.30f);
+	if (s_eTempo < 0.1f)
+		s_eTempo = 0.1f;
 	//	pSoundTouch->setTempo( eTempo );
 	//pSoundTouch->setTempoChange(-30); // temporary (until stretcher is called) slow down
 }
@ -885,10 +911,10 @@ s32 SndBuffer::timeStretchOverrun()
 {
 	// If we overran it means the timestretcher failed.  We need to speed
 	// up audio playback.
-	cTempo += cTempo * 0.12f;
-	eTempo += eTempo * 0.40f;
-	if (eTempo > 7.5f)
-		eTempo = 7.5f;
+	s_cTempo += s_cTempo * 0.12f;
+	s_eTempo += s_eTempo * 0.40f;
+	if (s_eTempo > 7.5f)
+		s_eTempo = 7.5f;
 	//pSoundTouch->setTempo( eTempo );
 	//pSoundTouch->setTempoChange(30);// temporary (until stretcher is called) speed up

@ -898,24 +924,58 @@ s32 SndBuffer::timeStretchOverrun()
 	return SndOutPacketSize * 2;
 }

-static void CvtPacketToFloat(StereoOut32* srcdest)
+static constexpr float S16_TO_FLOAT = 1.0f / 32767.0f;
+static constexpr float FLOAT_TO_S16 = 32767.0f;
+
+static void ConvertPacketToFloat(const StereoOut16* src, float* dst)
 {
-	StereoOutFloat* dest = (StereoOutFloat*)srcdest;
-	const StereoOut32* src = (StereoOut32*)srcdest;
-	for (uint i = 0; i < SndOutPacketSize; ++i, ++dest, ++src)
-		*dest = (StereoOutFloat)*src;
+	static_assert((SndOutPacketSize % 4) == 0);
+	constexpr u32 iterations = SndOutPacketSize / 4;
+
+	const __m128 S16_TO_FLOAT_V = _mm_set1_ps(S16_TO_FLOAT);
+
+	for (u32 i = 0; i < iterations; i++)
+	{
+		const __m128i sv = _mm_load_si128(reinterpret_cast<const __m128i*>(src));
+		src += 4;
+
+		__m128i iv1 = _mm_unpacklo_epi16(sv, sv); // [0, 0, 1, 1, 2, 2, 3, 3]
+		__m128i iv2 = _mm_unpackhi_epi16(sv, sv); // [4, 4, 5, 5, 6, 6, 7, 7]
+		iv1 = _mm_srai_epi32(iv1, 16); // [0, 1, 2, 3]
+		iv2 = _mm_srai_epi32(iv2, 16); // [4, 5, 6, 7]
+		__m128 fv1 = _mm_cvtepi32_ps(iv1); // [f0, f1, f2, f3]
+		__m128 fv2 = _mm_cvtepi32_ps(iv2); // [f4, f5, f6, f7]
+		fv1 = _mm_mul_ps(fv1, S16_TO_FLOAT_V);
+		fv2 = _mm_mul_ps(fv2, S16_TO_FLOAT_V);
+
+		_mm_store_ps(dst + 0, fv1);
+		_mm_store_ps(dst + 4, fv2);
+		dst += 8;
+	}
 }

-// Parameter note: Size should always be a multiple of 128, thanks!
-static void CvtPacketToInt(StereoOut32* srcdest, uint size)
+static void ConvertPacketToInt(StereoOut16* dst, const float* src, uint size)
 {
-	//pxAssume( (size & 127) == 0 );
+	static_assert((SndOutPacketSize % 4) == 0);
+	constexpr u32 iterations = SndOutPacketSize / 4;

-	const StereoOutFloat* src = (StereoOutFloat*)srcdest;
-	StereoOut32* dest = srcdest;
+	const __m128 FLOAT_TO_S16_V = _mm_set1_ps(FLOAT_TO_S16);

-	for (uint i = 0; i < size; ++i, ++dest, ++src)
-		*dest = (StereoOut32)*src;
+	for (u32 i = 0; i < iterations; i++)
+	{
+		__m128 fv1 = _mm_load_ps(src + 0);
+		__m128 fv2 = _mm_load_ps(src + 4);
+		src += 8;
+
+		fv1 = _mm_mul_ps(fv1, FLOAT_TO_S16_V);
+		fv2 = _mm_mul_ps(fv2, FLOAT_TO_S16_V);
+		__m128i iv1 = _mm_cvtps_epi32(fv1);
+		__m128i iv2 = _mm_cvtps_epi32(fv2);
+
+		__m128i iv = _mm_packs_epi32(iv1, iv2);
+		_mm_store_si128(reinterpret_cast<__m128i*>(dst), iv);
+		dst += 4;
+	}
 }

 void SndBuffer::timeStretchWrite()
@ -926,20 +986,20 @@ void SndBuffer::timeStretchWrite()
 	// suddenly we'll get several chunks back at once.  Thus we use
 	// data prediction to make the timestretcher more responsive.

-	PredictDataWrite((int)(SndOutPacketSize / eTempo));
-	CvtPacketToFloat(sndTempBuffer);
+	PredictDataWrite((int)(SndOutPacketSize / s_eTempo));
+	ConvertPacketToFloat(s_staging_buffer.get(), s_float_buffer.get());

-	pSoundTouch->putSamples((float*)sndTempBuffer, SndOutPacketSize);
+	pSoundTouch->putSamples(s_float_buffer.get(), SndOutPacketSize);

 	int tempProgress;
-	while (tempProgress = pSoundTouch->receiveSamples((float*)sndTempBuffer, SndOutPacketSize),
-		   tempProgress != 0)
+	while (tempProgress = pSoundTouch->receiveSamples(s_float_buffer.get(), SndOutPacketSize),
+		tempProgress != 0)
 	{
 		// Hint: It's assumed that pSoundTouch will return chunks of 128 bytes (it always does as
 		// long as the SSE optimizations are enabled), which means we can do our own SSE opts here.

-		CvtPacketToInt(sndTempBuffer, tempProgress);
-		_WriteSamples(sndTempBuffer, tempProgress);
+		ConvertPacketToInt(s_staging_buffer.get(), s_float_buffer.get(), tempProgress);
+		_WriteSamples(s_staging_buffer.get(), tempProgress);
 	}

 #ifdef SPU2X_USE_OLD_STRETCHER
@ -966,12 +1026,12 @@ void SndBuffer::soundtouchInit()

 	// some timestretch management vars:

-	cTempo = 1.0;
-	eTempo = 1.0;
-	lastPct = 0;
-	lastEmergencyAdj = 0;
+	s_cTempo = 1.0;
+	s_eTempo = 1.0;
+	s_last_pct = 0;
+	s_last_emergency_adj = 0;

-	m_predictData = 0;
+	s_predict_data = 0;
 }

 // reset timestretch management vars, and delay updates a bit:
@ -983,12 +1043,12 @@ void SndBuffer::soundtouchClearContents()
 	pSoundTouch->clear();
 	pSoundTouch->setTempo(1);

-	cTempo = 1.0;
-	eTempo = 1.0;
-	lastPct = 0;
-	lastEmergencyAdj = 0;
+	s_cTempo = 1.0;
+	s_eTempo = 1.0;
+	s_last_pct = 0;
+	s_last_emergency_adj = 0;

-	m_predictData = 0;
+	s_predict_data = 0;
 }

 void SndBuffer::soundtouchCleanup()
--- a/pcsx2/SPU2/SndOut.h
+++ b/pcsx2/SPU2/SndOut.h
@ -20,14 +20,7 @@
 // Number of stereo samples per SndOut block.
 // All drivers must work in units of this size when communicating with
 // SndOut.
-static const int SndOutPacketSize = 64;
-
-// Overall master volume shift; this is meant to be a precision value and does not affect
-// actual output volumes.  It converts SPU2 16 bit volumes to 32-bit volumes, and likewise
-// downsamples 32 bit samples to 16 bit sound driver output (this way timestretching and
-// DSP effects get better precision results)
-static const int SndOutVolumeShift = 12;
-static const int SndOutVolumeShift32 = 16 - SndOutVolumeShift; // shift up, not down
+static constexpr int SndOutPacketSize = 64;

 // Samplerate of the SPU2. For accurate playback we need to match this
 // exactly.  Trying to scale samplerates and maintain SPU2's Ts timing accuracy
@ -57,70 +50,46 @@ struct Stereo51Out16Dpl; // similar to DplII but without rear balancing
 struct Stereo51Out32Dpl;

 extern void ResetDplIIDecoder();
-extern void ProcessDplIISample16(const StereoOut32& src, Stereo51Out16DplII* s);
-extern void ProcessDplIISample32(const StereoOut32& src, Stereo51Out32DplII* s);
-extern void ProcessDplSample16(const StereoOut32& src, Stereo51Out16Dpl* s);
-extern void ProcessDplSample32(const StereoOut32& src, Stereo51Out32Dpl* s);
+extern void ProcessDplIISample16(const StereoOut16& src, Stereo51Out16DplII* s);
+extern void ProcessDplIISample32(const StereoOut16& src, Stereo51Out32DplII* s);
+extern void ProcessDplSample16(const StereoOut16& src, Stereo51Out16Dpl* s);
+extern void ProcessDplSample32(const StereoOut16& src, Stereo51Out32Dpl* s);

 struct StereoOut16
 {
 	s16 Left;
 	s16 Right;

-	StereoOut16()
+	__fi StereoOut16()
 		: Left(0)
 		, Right(0)
 	{
 	}

-	StereoOut16(const StereoOut32& src)
+	__fi StereoOut16(const StereoOut32& src)
 		: Left((s16)src.Left)
 		, Right((s16)src.Right)
 	{
 	}

-	StereoOut16(s16 left, s16 right)
+	__fi StereoOut16(s16 left, s16 right)
 		: Left(left)
 		, Right(right)
 	{
 	}

-	StereoOut32 UpSample() const;
-
-	void ResampleFrom(const StereoOut32& src)
-	{
-		// Use StereoOut32's built in conversion
-		*this = src.DownSample();
-	}
-};
-
-struct StereoOutFloat
-{
-	float Left;
-	float Right;
-
-	StereoOutFloat()
-		: Left(0)
-		, Right(0)
+	__fi StereoOut16 ApplyVolume(float volume)
 	{
+		return StereoOut16(
+			static_cast<s16>(std::clamp(static_cast<float>(Left) * volume, -32768.0f, 32767.0f)),
+			static_cast<s16>(std::clamp(static_cast<float>(Right) * volume, -32768.0f, 32767.0f))
+		);
 	}

-	explicit StereoOutFloat(const StereoOut32& src)
-		: Left(src.Left / 2147483647.0f)
-		, Right(src.Right / 2147483647.0f)
-	{
-	}
-
-	explicit StereoOutFloat(s32 left, s32 right)
-		: Left(left / 2147483647.0f)
-		, Right(right / 2147483647.0f)
-	{
-	}
-
-	StereoOutFloat(float left, float right)
-		: Left(left)
-		, Right(right)
+	__fi void SetFrom(const StereoOut16& src)
 	{
+		Left = src.Left;
+		Right = src.Right;
 	}
 };

@ -130,11 +99,11 @@ struct Stereo21Out16
 	s16 Right;
 	s16 LFE;

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut16& src)
 	{
-		Left = src.Left >> SndOutVolumeShift;
-		Right = src.Right >> SndOutVolumeShift;
-		LFE = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+		Left = src.Left;
+		Right = src.Right;
+		LFE = (src.Left + src.Right) >> 1;
 	}
 };

@ -145,28 +114,12 @@ struct Stereo40Out16
 	s16 LeftBack;
 	s16 RightBack;

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut16& src)
 	{
-		Left = src.Left >> SndOutVolumeShift;
-		Right = src.Right >> SndOutVolumeShift;
-		LeftBack = src.Left >> SndOutVolumeShift;
-		RightBack = src.Right >> SndOutVolumeShift;
-	}
-};
-
-struct Stereo40Out32
-{
-	s32 Left;
-	s32 Right;
-	s32 LeftBack;
-	s32 RightBack;
-
-	void ResampleFrom(const StereoOut32& src)
-	{
-		Left = src.Left << SndOutVolumeShift32;
-		Right = src.Right << SndOutVolumeShift32;
-		LeftBack = src.Left << SndOutVolumeShift32;
-		RightBack = src.Right << SndOutVolumeShift32;
+		Left = src.Left;
+		Right = src.Right;
+		LeftBack = src.Left;
+		RightBack = src.Right;
 	}
 };

@ -178,13 +131,13 @@ struct Stereo41Out16
 	s16 LeftBack;
 	s16 RightBack;

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut16& src)
 	{
-		Left = src.Left >> SndOutVolumeShift;
-		Right = src.Right >> SndOutVolumeShift;
-		LFE = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
-		LeftBack = src.Left >> SndOutVolumeShift;
-		RightBack = src.Right >> SndOutVolumeShift;
+		Left = src.Left;
+		Right = src.Right;
+		LFE = (src.Left + src.Right) >> 1;
+		LeftBack = src.Left;
+		RightBack = src.Right;
 	}
 };

@ -202,14 +155,14 @@ struct Stereo51Out16
 	// systems do to their own low pass / crossover.  Manual lowpass is wasted effort
 	// and can't match solid state results anyway.

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut16& src)
 	{
-		Left = src.Left >> SndOutVolumeShift;
-		Right = src.Right >> SndOutVolumeShift;
-		Center = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+		Left = src.Left;
+		Right = src.Right;
+		Center = (src.Left + src.Right) >> 1;
 		LFE = Center;
-		LeftBack = src.Left >> SndOutVolumeShift;
-		RightBack = src.Right >> SndOutVolumeShift;
+		LeftBack = src.Left >> 1;
+		RightBack = src.Right >> 1;
 	}
 };

@ -222,7 +175,7 @@ struct Stereo51Out16DplII
 	s16 LeftBack;
 	s16 RightBack;

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut16& src)
 	{
 		ProcessDplIISample16(src, this);
 	}
@ -237,7 +190,7 @@ struct Stereo51Out32DplII
 	s32 LeftBack;
 	s32 RightBack;

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut32& src)
 	{
 		ProcessDplIISample32(src, this);
 	}
@ -252,7 +205,7 @@ struct Stereo51Out16Dpl
 	s16 LeftBack;
 	s16 RightBack;

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut16& src)
 	{
 		ProcessDplSample16(src, this);
 	}
@ -267,7 +220,7 @@ struct Stereo51Out32Dpl
 	s32 LeftBack;
 	s32 RightBack;

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut32& src)
 	{
 		ProcessDplSample32(src, this);
 	}
@ -284,178 +237,36 @@ struct Stereo71Out16
 	s16 LeftSide;
 	s16 RightSide;

-	void ResampleFrom(const StereoOut32& src)
+	__fi void SetFrom(const StereoOut16& src)
 	{
-		Left = src.Left >> SndOutVolumeShift;
-		Right = src.Right >> SndOutVolumeShift;
-		Center = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+		Left = src.Left;
+		Right = src.Right;
+		Center = (src.Left + src.Right) >> 1;
 		LFE = Center;
-		LeftBack = src.Left >> SndOutVolumeShift;
-		RightBack = src.Right >> SndOutVolumeShift;
+		LeftBack = src.Left;
+		RightBack = src.Right;

-		LeftSide = src.Left >> (SndOutVolumeShift + 1);
-		RightSide = src.Right >> (SndOutVolumeShift + 1);
+		LeftSide = src.Left >> 1;
+		RightSide = src.Right >> 1;
 	}
 };

-struct Stereo71Out32
+namespace SndBuffer
 {
-	s32 Left;
-	s32 Right;
-	s32 Center;
-	s32 LFE;
-	s32 LeftBack;
-	s32 RightBack;
-	s32 LeftSide;
-	s32 RightSide;
-
-	void ResampleFrom(const StereoOut32& src)
-	{
-		Left = src.Left << SndOutVolumeShift32;
-		Right = src.Right << SndOutVolumeShift32;
-		Center = (src.Left + src.Right) << (SndOutVolumeShift32 - 1);
-		LFE = Center;
-		LeftBack = src.Left << SndOutVolumeShift32;
-		RightBack = src.Right << SndOutVolumeShift32;
-
-		LeftSide = src.Left << (SndOutVolumeShift32 - 1);
-		RightSide = src.Right << (SndOutVolumeShift32 - 1);
-	}
-};
-
-struct Stereo20Out32
-{
-	s32 Left;
-	s32 Right;
-
-	void ResampleFrom(const StereoOut32& src)
-	{
-		Left = src.Left << SndOutVolumeShift32;
-		Right = src.Right << SndOutVolumeShift32;
-	}
-};
-
-struct Stereo21Out32
-{
-	s32 Left;
-	s32 Right;
-	s32 LFE;
-
-	void ResampleFrom(const StereoOut32& src)
-	{
-		Left = src.Left << SndOutVolumeShift32;
-		Right = src.Right << SndOutVolumeShift32;
-		LFE = (src.Left + src.Right) << (SndOutVolumeShift32 - 1);
-	}
-};
-
-struct Stereo41Out32
-{
-	s32 Left;
-	s32 Right;
-	s32 LFE;
-	s32 LeftBack;
-	s32 RightBack;
-
-	void ResampleFrom(const StereoOut32& src)
-	{
-		Left = src.Left << SndOutVolumeShift32;
-		Right = src.Right << SndOutVolumeShift32;
-		LFE = (src.Left + src.Right) << (SndOutVolumeShift32 - 1);
-
-		LeftBack = src.Left << SndOutVolumeShift32;
-		RightBack = src.Right << SndOutVolumeShift32;
-	}
-};
-
-struct Stereo51Out32
-{
-	s32 Left;
-	s32 Right;
-	s32 Center;
-	s32 LFE;
-	s32 LeftBack;
-	s32 RightBack;
-
-	void ResampleFrom(const StereoOut32& src)
-	{
-		Left = src.Left << SndOutVolumeShift32;
-		Right = src.Right << SndOutVolumeShift32;
-		Center = (src.Left + src.Right) << (SndOutVolumeShift32 - 1);
-		LFE = Center;
-		LeftBack = src.Left << SndOutVolumeShift32;
-		RightBack = src.Right << SndOutVolumeShift32;
-	}
-};
-
-// Developer Note: This is a static class only (all static members).
-class SndBuffer
-{
-private:
-	static bool m_underrun_freeze;
-	static s32 m_predictData;
-	static float lastPct;
-
-	static StereoOut32* sndTempBuffer;
-	static StereoOut16* sndTempBuffer16;
-
-	static int sndTempProgress;
-	static int m_dsp_progress;
-
-	static int m_timestretch_progress;
-	static int m_timestretch_writepos;
-
-	static StereoOut32* m_buffer;
-	static s32 m_size;
-
-	alignas(4) static volatile s32 m_rpos;
-	alignas(4) static volatile s32 m_wpos;
-
-	static float lastEmergencyAdj;
-	static float cTempo;
-	static float eTempo;
-	static int ssFreeze;
-
-	static bool CheckUnderrunStatus(int& nSamples, int& quietSampleCount);
-
-	static void soundtouchInit();
-	static void soundtouchClearContents();
-	static void soundtouchCleanup();
-	static void timeStretchWrite();
-	static void timeStretchUnderrun();
-	static s32 timeStretchOverrun();
-
-	static void PredictDataWrite(int samples);
-	static float GetStatusPct();
-	static void UpdateTempoChangeSoundTouch();
-	static void UpdateTempoChangeSoundTouch2();
-
-	static void _WriteSamples(StereoOut32* bData, int nSamples);
-
-	static void _WriteSamples_Safe(StereoOut32* bData, int nSamples);
-	static void _ReadSamples_Safe(StereoOut32* bData, int nSamples);
-
-	static void _WriteSamples_Internal(StereoOut32* bData, int nSamples);
-	static void _DropSamples_Internal(int nSamples);
-	static void _ReadSamples_Internal(StereoOut32* bData, int nSamples);
-
-	static int _GetApproximateDataInBuffer();
-
-public:
-	static void UpdateTempoChangeAsyncMixing();
-	static bool Init(const char* modname);
-	static void Cleanup();
-	static void Write(const StereoOut32& Sample);
-	static void ClearContents();
-	static void ResetBuffers();
+	void UpdateTempoChangeAsyncMixing();
+	bool Init(const char* modname);
+	void Cleanup();
+	void Write(StereoOut16 Sample);
+	void ClearContents();
+	void ResetBuffers();

 	// Note: When using with 32 bit output buffers, the user of this function is responsible
 	// for shifting the values to where they need to be manually.  The fixed point depth of
 	// the sample output is determined by the SndOutVolumeShift, which is the number of bits
 	// to shift right to get a 16 bit result.
 	template <typename T>
-	static void ReadSamples(T* bData, int nSamples = SndOutPacketSize);
-};
+	void ReadSamples(T* bData, int nSamples = SndOutPacketSize);
+}

 class SndOutModule
 {