diff --git a/plugins/spu2-x/src/3rdparty/liba52/parse.c b/plugins/spu2-x/src/3rdparty/liba52/parse.c
index 3626944f65..7245a315c9 100644
--- a/plugins/spu2-x/src/3rdparty/liba52/parse.c
+++ b/plugins/spu2-x/src/3rdparty/liba52/parse.c
@@ -39,7 +39,7 @@
 void * memalign (size_t align, size_t size);
 #else
 /* assume malloc alignment is sufficient */
-#define memalign(align,size) malloc (size)
+#define memalign(align,m_size) malloc (m_size)
 #endif
 
 typedef struct {
diff --git a/plugins/spu2-x/src/ADSR.cpp b/plugins/spu2-x/src/ADSR.cpp
index e99484cf3c..fe4f3ae536 100644
--- a/plugins/spu2-x/src/ADSR.cpp
+++ b/plugins/spu2-x/src/ADSR.cpp
@@ -216,7 +216,7 @@ bool V_ADSR::Calculate()
 #define VOLFLAG_EXPONENTIAL		(1ul<<2)
 #define VOLFLAG_SLIDE_ENABLE	(1ul<<3)
 
-void V_Volume::Update()
+void V_VolumeSlide::Update()
 {
 	if( !(Mode & VOLFLAG_SLIDE_ENABLE) ) return;
 		
diff --git a/plugins/spu2-x/src/BaseTypes.h b/plugins/spu2-x/src/BaseTypes.h
index 78570cc09d..25f3e40dd2 100644
--- a/plugins/spu2-x/src/BaseTypes.h
+++ b/plugins/spu2-x/src/BaseTypes.h
@@ -27,6 +27,8 @@
 using std::string;
 using std::wstring;
 
+#include "PS2Edefs.h"
+
 //////////////////////////////////////////////////////////////////////////
 // Override Win32 min/max macros with the STL's type safe and macro
 // free varieties (much safer!)
@@ -84,4 +86,40 @@ static const bool IsDebugBuild = false;
 
 #endif
 
+struct StereoOut16;
+struct StereoOutFloat;
+
+struct StereoOut32		// Stereo sample pair holding one s32 per channel.
+{
+	static StereoOut32 Empty;		// declared only; the definition lives outside this struct
+
+	s32 Left;
+	s32 Right;
+	// Default construction zeroes both channels.
+	StereoOut32() :
+		Left( 0 ),
+		Right( 0 )
+	{
+	}
+
+	StereoOut32( s32 left, s32 right ) :
+		Left( left ),
+		Right( right )
+	{
+	}
+	// Conversions from the other sample formats (declared here, defined elsewhere).
+	StereoOut32( const StereoOut16& src );
+	explicit StereoOut32( const StereoOutFloat& src );
+	
+	StereoOut16 DownSample() const;
+	// Channel-wise sum.  Marked const so a const StereoOut32 (or const reference) can be the left operand.
+	StereoOut32 operator+( const StereoOut32& right ) const
+	{
+		return StereoOut32(
+			Left + right.Left,
+			Right + right.Right
+		);
+	}
+};
+
 #endif
diff --git a/plugins/spu2-x/src/Debug.cpp b/plugins/spu2-x/src/Debug.cpp
index 79aedee3bc..3c001cfb70 100644
--- a/plugins/spu2-x/src/Debug.cpp
+++ b/plugins/spu2-x/src/Debug.cpp
@@ -71,6 +71,27 @@ void ConLog(const char *fmt, ...) {
 #endif
 }
 
+void V_VolumeSlide::DebugDump( FILE* dump, const char* title, const char* nameLR )	// prints this slide's Reg_VOL, Value, Mode and Increment to 'dump' as hex
+{
+	fprintf( dump, "%s Volume for %s Channel:\t%x\n"
+		"  - Value:     %x\n"
+		"  - Mode:      %x\n"
+		"  - Increment: %x\n",
+		title, nameLR, Reg_VOL, Value, Mode, Increment);	// Reg_VOL lands on the heading line, after the tab
+}
+
+void V_VolumeSlideLR::DebugDump( FILE* dump, const char* title )	// dumps the Left then the Right slide under the same title
+{
+	Left.DebugDump( dump, title, "Left" );
+	Right.DebugDump( dump, title, "Right" );
+}
+
+void V_VolumeLR::DebugDump( FILE* dump, const char* title )	// prints the Left and Right values as hex, one line per channel
+{
+	fprintf( dump, "Volume for %s (%s Channel):\t%x\n", title, "Left", Left );
+	fprintf( dump, "Volume for %s (%s Channel):\t%x\n", title, "Right", Right );
+}
+
 void DoFullDump()
 {
 #ifdef SPU2_LOG
@@ -98,32 +119,18 @@ void DoFullDump()
 
 	if(!CoresDump()) return;
 	dump = _wfopen( CoresDumpFileName, _T("wt") );
-	if (dump) {
+	if (dump)
+	{
 		for(c=0;c<2;c++)
 		{
 			fprintf(dump,"#### CORE %d DUMP.\n",c);
-			fprintf(dump,"Master Volume for Left Channel: %x\n"
-						 "  - Value:     %x\n"
-						 "  - Mode:      %x\n"
-						 "  - Increment: %x\n",
-						 Cores[c].MasterL.Reg_VOL,
-						 Cores[c].MasterL.Value,
-						 Cores[c].MasterL.Mode,
-						 Cores[c].MasterL.Increment);
-			fprintf(dump,"Master Volume for Right Channel: %x\n"
-						 "  - Value:     %x\n"
-						 "  - Mode:      %x\n"
-						 "  - Increment: %x\n",
-						 Cores[c].MasterR.Reg_VOL,
-						 Cores[c].MasterR.Value,
-						 Cores[c].MasterR.Mode,
-						 Cores[c].MasterR.Increment);
-			fprintf(dump,"Volume for External Data Input (Left Channel):  %x\n",Cores[c].ExtL);
-			fprintf(dump,"Volume for External Data Input (Right Channel): %x\n",Cores[c].ExtR);
-			fprintf(dump,"Volume for Sound Data Input (Left Channel):     %x\n",Cores[c].InpL);
-			fprintf(dump,"Volume for Sound Data Input (Right Channel):    %x\n",Cores[c].InpR);
-			fprintf(dump,"Volume for Output from Effects (Left Channel):  %x\n",Cores[c].FxL);
-			fprintf(dump,"Volume for Output from Effects (Right Channel): %x\n",Cores[c].FxR);
+
+			Cores[c].MasterVol.DebugDump( dump, "Master" );
+
+			Cores[c].ExtVol.DebugDump( dump, "External Data Input" );
+			Cores[c].InpVol.DebugDump( dump, "Voice Data Input [dry]" );
+			Cores[c].FxVol.DebugDump( dump, "Effects/Reverb [wet]" );
+
 			fprintf(dump,"Interrupt Address:          %x\n",Cores[c].IRQA);
 			fprintf(dump,"DMA Transfer Start Address: %x\n",Cores[c].TSA);
 			fprintf(dump,"External Input to Direct Output (Left):    %s\n",Cores[c].ExtDryL?"Yes":"No");
@@ -156,24 +163,11 @@ void DoFullDump()
 			fprintf(dump,"  - ENDX:   %x\n",Cores[c].Regs.VMIXER);
 			fprintf(dump,"  - STATX:  %x\n",Cores[c].Regs.VMIXEL);
 			fprintf(dump,"  - ATTR:   %x\n",Cores[c].Regs.VMIXER);
-			for(v=0;v<24;v++) {
+			for(v=0;v<24;v++)
+			{
 				fprintf(dump,"Voice %d:\n",v);
-				fprintf(dump,"  - Volume for Left Channel: %x\n"
-							 "     - Value:     %x\n"
-							 "     - Mode:      %x\n"
-							 "     - Increment: %x\n",
-							 Cores[c].Voices[v].VolumeL.Reg_VOL,
-							 Cores[c].Voices[v].VolumeL.Value,
-							 Cores[c].Voices[v].VolumeL.Mode,
-							 Cores[c].Voices[v].VolumeL.Increment);
-				fprintf(dump,"  - Volume for Right Channel: %x\n"
-							 "     - Value:     %x\n"
-							 "     - Mode:      %x\n"
-							 "     - Increment: %x\n",
-							 Cores[c].Voices[v].VolumeR.Reg_VOL,
-							 Cores[c].Voices[v].VolumeR.Value,
-							 Cores[c].Voices[v].VolumeR.Mode,
-							 Cores[c].Voices[v].VolumeR.Increment);
+				Cores[c].Voices[v].Volume.DebugDump( dump, "" );
+				
 				fprintf(dump,"  - ADSR Envelope: %x & %x\n"
 							 "     - Ar: %x\n"
 							 "     - Am: %x\n"
@@ -197,6 +191,7 @@ void DoFullDump()
 							 Cores[c].Voices[v].ADSR.ReleaseMode,
 							 Cores[c].Voices[v].ADSR.Phase,
 							 Cores[c].Voices[v].ADSR.Value);
+
 				fprintf(dump,"  - Pitch:     %x\n",Cores[c].Voices[v].Pitch);
 				fprintf(dump,"  - Modulated: %s\n",Cores[c].Voices[v].Modulated?"Yes":"No");
 				fprintf(dump,"  - Source:    %s\n",Cores[c].Voices[v].Noise?"Noise":"Wave");
@@ -204,12 +199,12 @@ void DoFullDump()
 				fprintf(dump,"  - Direct Output for Right Channel:  %s\n",Cores[c].Voices[v].DryR?"Yes":"No");
 				fprintf(dump,"  - Effects Output for Left Channel:  %s\n",Cores[c].Voices[v].WetL?"Yes":"No");
 				fprintf(dump,"  - Effects Output for Right Channel: %s\n",Cores[c].Voices[v].WetR?"Yes":"No");
-				fprintf(dump,"  - Loop Start Adress:  %x\n",Cores[c].Voices[v].LoopStartA);
-				fprintf(dump,"  - Sound Start Adress: %x\n",Cores[c].Voices[v].StartA);
-				fprintf(dump,"  - Next Data Adress:   %x\n",Cores[c].Voices[v].NextA);
-				fprintf(dump,"  - Play Start Cycle:   %d\n",Cores[c].Voices[v].PlayCycle);
-				fprintf(dump,"  - Play Status:        %s\n",(Cores[c].Voices[v].ADSR.Phase>0)?"Playing":"Not Playing");
-				fprintf(dump,"  - Block Sample:       %d\n",Cores[c].Voices[v].SCurrent);
+				fprintf(dump,"  - Loop Start Address:  %x\n",Cores[c].Voices[v].LoopStartA);
+				fprintf(dump,"  - Sound Start Address: %x\n",Cores[c].Voices[v].StartA);
+				fprintf(dump,"  - Next Data Address:   %x\n",Cores[c].Voices[v].NextA);
+				fprintf(dump,"  - Play Start Cycle:    %d\n",Cores[c].Voices[v].PlayCycle);
+				fprintf(dump,"  - Play Status:         %s\n",(Cores[c].Voices[v].ADSR.Phase>0)?"Playing":"Not Playing");
+				fprintf(dump,"  - Block Sample:        %d\n",Cores[c].Voices[v].SCurrent);
 			}
 			fprintf(dump,"#### END OF DUMP.\n\n");
 		}
diff --git a/plugins/spu2-x/src/Debug.h b/plugins/spu2-x/src/Debug.h
index b6d29e837a..ce79684b90 100644
--- a/plugins/spu2-x/src/Debug.h
+++ b/plugins/spu2-x/src/Debug.h
@@ -52,9 +52,10 @@ namespace WaveDump
 	,	CoreSrc_Count
 	};	
 
-	void Open();
-	void Close();
-	void WriteCore( uint coreidx, CoreSourceType src, s16 left, s16 right );
+	extern void Open();
+	extern void Close();
+	extern void WriteCore( uint coreidx, CoreSourceType src, s16 left, s16 right );
+	extern void WriteCore( uint coreidx, CoreSourceType src, const StereoOut16& sample );
 }
 
 using WaveDump::CoreSrc_Input;
diff --git a/plugins/spu2-x/src/Decoder.cpp b/plugins/spu2-x/src/Decoder.cpp
index ac7efcf57f..b1d0142cb7 100644
--- a/plugins/spu2-x/src/Decoder.cpp
+++ b/plugins/spu2-x/src/Decoder.cpp
@@ -58,7 +58,6 @@ int state=0;
 FILE *fSpdifDump;
 
 extern u32 core;
-void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR);
 
 union spdif_frame { // total size: 32bits
 	struct {
@@ -132,22 +131,23 @@ s32 stoi(sample_t n) //input: [-1..1]
 
 void spdif_update()
 {
-	s32 Data,Zero;
+	StereoOut32 Data;
 
 	core=0;
 	V_Core& thiscore( Cores[core] );
 	for(int i=0;i<data_rate;i++)
 	{
-		ReadInput(thiscore, Data,Zero);
+		// Right side data should be zero / ignored
+		ReadInput( thiscore, Data );
 		
 		if(fSpdifDump)
 		{
-			fwrite(&Data,4,1,fSpdifDump);
-			fwrite(&Zero,4,1,fSpdifDump);
+			fwrite(&Data.Left,4,1,fSpdifDump);
+			fwrite(&Data.Right,4,1,fSpdifDump);		// zero side.
 		}
 
 		if(ac3dec)
-			spdif_Write(Data);
+			spdif_Write(Data.Left);
 	}
 
 	if(!ac3dec) return;
diff --git a/plugins/spu2-x/src/DllInterface.cpp b/plugins/spu2-x/src/DllInterface.cpp
index 5548b26a23..8c5f2a1327 100644
--- a/plugins/spu2-x/src/DllInterface.cpp
+++ b/plugins/spu2-x/src/DllInterface.cpp
@@ -120,7 +120,7 @@ EXPORT_C_(void) SPU2about()
 
 EXPORT_C_(s32) SPU2test()
 {
-	return SndTest();
+	return SndBuffer::Test();
 }
 
 EXPORT_C_(s32) SPU2init() 
@@ -228,22 +228,20 @@ EXPORT_C_(s32) SPU2open(void *pDsp)
 	debugDialogOpen=1;
 	}*/
 
-	spu2open=true;
-	if (!SndInit())
+	spu2open = true;
+	try
 	{
+		SndBuffer::Init();
 		spdif_init();
-
 		DspLoadLibrary(dspPlugin,dspPluginModule);
-		
 		WaveDump::Open();
-
-		return 0;
 	}
-	else 
+	catch( ... )
 	{
 		SPU2close();
 		return -1;
-	};
+	}
+	return 0;
 }
 
 EXPORT_C_(void) SPU2close() 
@@ -253,7 +251,7 @@ EXPORT_C_(void) SPU2close()
 
 	DspCloseLibrary();
 	spdif_shutdown();
-	SndClose();
+	SndBuffer::Cleanup();
 
 	spu2open = false;
 }
diff --git a/plugins/spu2-x/src/Mixer.cpp b/plugins/spu2-x/src/Mixer.cpp
index 6dadeb4cd8..ed556afdfd 100644
--- a/plugins/spu2-x/src/Mixer.cpp
+++ b/plugins/spu2-x/src/Mixer.cpp
@@ -61,11 +61,17 @@ __forceinline s32 MulShr32( s32 srcval, s32 mulval )
 	// It won't fly on big endian machines though... :)
 }
 
-__forceinline s32 clamp_mix(s32 x, u8 bitshift)
+__forceinline s32 clamp_mix( s32 x, u8 bitshift )
 {
 	return GetClamped( x, -0x8000<<bitshift, 0x7fff<<bitshift );
 }
 
+__forceinline void clamp_mix( StereoOut32& sample, u8 bitshift )	// stereo counterpart of the s32 clamp_mix; returns nothing, so the result is left in 'sample'
+{
+	Clampify( sample.Left, -0x8000<<bitshift, 0x7fff<<bitshift );	// same bounds as the s32 overload; presumably Clampify clamps its first argument in place (defined elsewhere)
+	Clampify( sample.Right, -0x8000<<bitshift, 0x7fff<<bitshift );
+}
+
 static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& prev1, s32& prev2)
 {
 	const s32 header = *block;
@@ -171,7 +177,7 @@ int g_counter_cache_ignores = 0;
 #define XAFLAG_LOOP			(1ul<<1)
 #define XAFLAG_LOOP_START	(1ul<<2)
 
-static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Voice& vc, s32& Data) 
+static s32 __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Voice& vc ) 
 {
 	if (vc.SCurrent<28)
 	{
@@ -259,19 +265,19 @@ static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Vo
 	IncrementNextA( thiscore, vc );
 
 _skipIncrement:
-	Data = vc.SBuffer[vc.SCurrent++];
+	return vc.SBuffer[vc.SCurrent++];
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //
-static void __forceinline GetNoiseValues(s32& VD) 
+static s32 __forceinline GetNoiseValues()
 {
 	static s32 Seed = 0x41595321;
+	s32 retval = 0x8000;
 
-	if(Seed&0x100) VD = (s32)((Seed&0xff)<<8);
-	else if(!(Seed&0xffff)) VD = (s32)0x8000;
-	else VD = (s32)0x7fff;
+	if( Seed&0x100 ) retval = (Seed&0xff) << 8;
+	else if( Seed&0xffff ) retval = 0x7fff;
 
 	__asm {
 		MOV eax,Seed
@@ -284,6 +290,7 @@ static void __forceinline GetNoiseValues(s32& VD)
 		ROR eax,3
 		MOV Seed,eax
 	}
+	return retval;
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -299,6 +306,22 @@ static __forceinline s32 ApplyVolume(s32 data, s32 volume)
 	return MulShr32( data<<1, volume );
 }
 
+static __forceinline StereoOut32 ApplyVolume( const StereoOut32& data, const V_VolumeLR& volume )	// per-channel form of the s32 ApplyVolume
+{
+	return StereoOut32(		// 'data' is const and left untouched: callers must use the returned value
+		ApplyVolume( data.Left, volume.Left ),
+		ApplyVolume( data.Right, volume.Right )
+	);
+}
+
+static __forceinline StereoOut32 ApplyVolume( const StereoOut32& data, const V_VolumeSlideLR& volume )	// same as the V_VolumeLR overload, but reads each slide's Value member
+{
+	return StereoOut32(		// 'data' is const and left untouched: callers must use the returned value
+		ApplyVolume( data.Left, volume.Left.Value ),
+		ApplyVolume( data.Right, volume.Right.Value )
+	);
+}
+
 static void __forceinline UpdatePitch( V_Voice& vc )
 {
 	s32 pitch;
@@ -339,14 +362,12 @@ static __forceinline void CalculateADSR( V_Core& thiscore, V_Voice& vc )
 }
 
 // Returns a 16 bit result in Value.
-static void __forceinline GetVoiceValues_Linear(V_Core& thiscore, V_Voice& vc, s32& Value)
+static s32 __forceinline GetVoiceValues_Linear( V_Core& thiscore, V_Voice& vc )
 {
 	while( vc.SP > 0 )
 	{
 		vc.PV2 = vc.PV1;
-
-		GetNextDataBuffered( thiscore, vc, vc.PV1 );
-
+		vc.PV1 = GetNextDataBuffered( thiscore, vc );
 		vc.SP -= 4096;
 	}
 
@@ -358,28 +379,28 @@ static void __forceinline GetVoiceValues_Linear(V_Core& thiscore, V_Voice& vc, s
 
 	if(Interpolation==0)
 	{
-		Value = ApplyVolume( vc.PV1, vc.ADSR.Value );
+		return ApplyVolume( vc.PV1, vc.ADSR.Value );
 	} 
 	else //if(Interpolation==1) //must be linear
 	{
 		s32 t0 = vc.PV2 - vc.PV1;
-		Value = MulShr32( (vc.PV1<<1) - ((t0*vc.SP)>>11), vc.ADSR.Value );
+		return MulShr32( (vc.PV1<<1) - ((t0*vc.SP)>>11), vc.ADSR.Value );
 	}
 }
 
 // Returns a 16 bit result in Value.
-static void __forceinline GetVoiceValues_Cubic(V_Core& thiscore, V_Voice& vc, s32& Value)
+static s32 __forceinline GetVoiceValues_Cubic( V_Core& thiscore, V_Voice& vc )
 {
 	while( vc.SP > 0 )
 	{
-		vc.PV4=vc.PV3;
-		vc.PV3=vc.PV2;
-		vc.PV2=vc.PV1;
+		vc.PV4 = vc.PV3;
+		vc.PV3 = vc.PV2;
+		vc.PV2 = vc.PV1;
 
-		GetNextDataBuffered( thiscore, vc, vc.PV1 );
-		vc.PV1<<=2;
+		vc.PV1 = GetNextDataBuffered( thiscore, vc );
+		vc.PV1 <<= 2;
 		vc.SPc = vc.SP&4095;	// just the fractional part, please!
-		vc.SP-=4096;
+		vc.SP -= 4096;
 	}
 
 	CalculateADSR( thiscore, vc );
@@ -398,35 +419,37 @@ static void __forceinline GetVoiceValues_Cubic(V_Core& thiscore, V_Voice& vc, s3
 	// Note!  It's very important that ADSR stay as accurate as possible.  By the way
 	// it is used, various sound effects can end prematurely if we truncate more than
 	// one or two bits.
-	Value = MulShr32( val, vc.ADSR.Value>>1 );
+	return MulShr32( val, vc.ADSR.Value>>1 );
 }
 
 // Noise values need to be mixed without going through interpolation, since it
 // can wreak havoc on the noise (causing muffling or popping).  Not that this noise
 // generator is accurate in its own right.. but eh, ah well :)
-static void __forceinline __fastcall GetNoiseValues(V_Core& thiscore, V_Voice& vc, s32& Data)
+static s32 __forceinline __fastcall GetNoiseValues( V_Core& thiscore, V_Voice& vc )
 {
-	while(vc.SP>=4096) 
+	s32 retval = GetNoiseValues();
+
+	/*while(vc.SP>=4096)
 	{
-		GetNoiseValues( Data );
+		retval = GetNoiseValues();
 		vc.SP-=4096;
-	}
+	}*/
 
 	// GetNoiseValues can't set the phase zero on us unexpectedly
 	// like GetVoiceValues can.  Better assert just in case though..
-	jASSUME( vc.ADSR.Phase != 0 );	
+	jASSUME( vc.ADSR.Phase != 0 );
 
 	CalculateADSR( thiscore, vc );
 
 	// Yup, ADSR applies even to noise sources...
-	Data = MulShr32( Data, vc.ADSR.Value );
+	return ApplyVolume( retval, vc.ADSR.Value );
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //
 
-void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR) 
+void __fastcall ReadInput( V_Core& thiscore, StereoOut32& PData ) 
 {
 	if((thiscore.AutoDMACtrl&(core+1))==(core+1))
 	{
@@ -442,17 +465,17 @@ void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR)
 			// so we just downgrade it to 16 bits for now.
 			
 #ifdef PCM24_S1_INTERLEAVE
-			*PDataL=*(((s32*)(thiscore.ADMATempBuffer+(thiscore.InputPos<<1))));
-			*PDataR=*(((s32*)(thiscore.ADMATempBuffer+(thiscore.InputPos<<1)+2)));
+			PData.Left  = *(((s32*)(thiscore.ADMATempBuffer+(thiscore.InputPos<<1))));	// PData.Left is an s32 member, not a pointer: assign it directly
+			PData.Right = *(((s32*)(thiscore.ADMATempBuffer+(thiscore.InputPos<<1)+2)));
 #else
 			s32 *pl=(s32*)&(thiscore.ADMATempBuffer[thiscore.InputPos]);
 			s32 *pr=(s32*)&(thiscore.ADMATempBuffer[thiscore.InputPos+0x200]);
-			PDataL=*pl;
-			PDataR=*pr;
+			PData.Left = *pl;
+			PData.Right = *pr;
 #endif
 
-			PDataL>>=1; //give 31 bit data (SndOut downsamples the rest of the way)
-			PDataR>>=1;
+			PData.Left >>= 2; //give 30 bit data (SndOut downsamples the rest of the way)
+			PData.Right >>= 2;
 
 			thiscore.InputPos+=2;
 			if((thiscore.InputPos==0x100)||(thiscore.InputPos>=0x200)) {
@@ -495,8 +518,8 @@ void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR)
 
 			s32 *pl=(s32*)&(thiscore.ADMATempBuffer[thiscore.InputPos]);
 			s32 *pr=(s32*)&(thiscore.ADMATempBuffer[thiscore.InputPos+0x200]);
-			PDataL=*pl;
-			PDataR=*pr;
+			PData.Left  = *pl;
+			PData.Right = *pr;
 
 			thiscore.InputPos+=2;
 			if(thiscore.InputPos>=0x200) {
@@ -540,16 +563,16 @@ void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR)
 			else
 			{
 				// Using the temporary buffer because this area gets overwritten by some other code.
-				//*PDataL=(s32)*(s16*)(spu2mem+0x2000+(core<<10)+thiscore.InputPos);
-				//*PDataR=(s32)*(s16*)(spu2mem+0x2200+(core<<10)+thiscore.InputPos);
+				//*PData.Left  = (s32)*(s16*)(spu2mem+0x2000+(core<<10)+thiscore.InputPos);
+				//*PData.Right = (s32)*(s16*)(spu2mem+0x2200+(core<<10)+thiscore.InputPos);
 
-				tl=(s32)thiscore.ADMATempBuffer[thiscore.InputPos];
-				tr=(s32)thiscore.ADMATempBuffer[thiscore.InputPos+0x200];
+				tl = (s32)thiscore.ADMATempBuffer[thiscore.InputPos];
+				tr = (s32)thiscore.ADMATempBuffer[thiscore.InputPos+0x200];
 
 			}
 
-			PDataL=tl;
-			PDataR=tr;
+			PData.Left  = tl;
+			PData.Right = tr;
 
 			thiscore.InputPos++;
 			if((thiscore.InputPos==0x100)||(thiscore.InputPos>=0x200)) {
@@ -585,9 +608,10 @@ void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR)
 			}
 		}
 	}
-	else {
-		PDataL=0;
-		PDataR=0;
+	else
+	{
+		PData.Left  = 0;
+		PData.Right = 0;
 	}
 }
 
@@ -595,29 +619,21 @@ void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR)
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //
 
-static void __forceinline __fastcall ReadInputPV(V_Core& thiscore, s32& ValL,s32& ValR) 
+static __forceinline StereoOut32 ReadInputPV( V_Core& thiscore ) 
 {
-	s32 DL=0, DR=0;
-
 	u32 pitch=AutoDMAPlayRate[core];
 
 	if(pitch==0) pitch=48000;
 	
-	thiscore.ADMAPV+=pitch;
+	thiscore.ADMAPV += pitch;
 	while(thiscore.ADMAPV>=48000) 
 	{
-		ReadInput(thiscore, DL,DR);
-		thiscore.ADMAPV-=48000;
-		thiscore.ADMAPL=DL;
-		thiscore.ADMAPR=DR;
+		ReadInput( thiscore, thiscore.ADMAP );
+		thiscore.ADMAPV -= 48000;
 	}
 
-	ValL=thiscore.ADMAPL;
-	ValR=thiscore.ADMAPR;
-
 	// Apply volumes:
-	ValL = ApplyVolume( ValL, thiscore.InpL );
-	ValR = ApplyVolume( ValR, thiscore.InpR );
+	return ApplyVolume( thiscore.ADMAP, thiscore.InpVol );
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////
@@ -637,108 +653,107 @@ static __forceinline void spu2M_WriteFast( u32 addr, s16 value )
 }
 
 
-static __forceinline void MixVoice( V_Core& thiscore, V_Voice& vc, s32& VValL, s32& VValR )
+static __forceinline StereoOut32 MixVoice( V_Core& thiscore, V_Voice& vc )
 {
-	s32 Value=0;
-
-	VValL = 0;
-	VValR = 0;
-
 	// Most games don't use much volume slide effects.  So only call the UpdateVolume
 	// methods when needed by checking the flag outside the method here...
 
-	vc.VolumeL.Update();
-	vc.VolumeR.Update();
+	vc.Volume.Update();
 
+	// SPU2 Note: The spu2 continues to process voices for eternity, always, so we
+	// have to run through all the motions of updating the voice regardless of its
+	// audible status.  Otherwise IRQs might not trigger and emulation might fail.
+	
 	if( vc.ADSR.Phase > 0 )
 	{
 		UpdatePitch( vc );
 
+		s32 Value;
+
 		if( vc.Noise )
-			GetNoiseValues( thiscore, vc, Value );
+			Value = GetNoiseValues( thiscore, vc );
 		else
 		{
 			if( Interpolation == 2 )
-				GetVoiceValues_Cubic( thiscore, vc, Value );
+				Value = GetVoiceValues_Cubic( thiscore, vc );
 			else
-				GetVoiceValues_Linear( thiscore, vc, Value );
+				Value = GetVoiceValues_Linear( thiscore, vc );
 		}
 
-		// Record the output (used for modulation effects)
+		// Note: All values recorded into OutX (may be used for modulation later)
 		vc.OutX = Value;
 
 		if( IsDevBuild )
-			DebugCores[core].Voices[voice].displayPeak = max(DebugCores[core].Voices[voice].displayPeak,abs(Value));
+			DebugCores[core].Voices[voice].displayPeak = max(DebugCores[core].Voices[voice].displayPeak,abs(vc.OutX));
 
-		// TODO : Implement this using high-def MulShr32.
-		//   vc.VolumeL/R are 15 bits.  Value should be 32 bits (but is currently 16)
+		// Write-back of raw voice data (post ADSR applied)
 
-		VValL = ApplyVolume(Value,vc.VolumeL.Value);
-		VValR = ApplyVolume(Value,vc.VolumeR.Value);
+		if (voice==1)      spu2M_WriteFast( 0x400 + (core<<12) + OutPos, vc.OutX );
+		else if (voice==3) spu2M_WriteFast( 0x600 + (core<<12) + OutPos, vc.OutX );
+
+		return ApplyVolume( StereoOut32( Value, Value ), vc.Volume );
 	}
+	else
+	{
+		// Write-back of raw voice data (some zeros since the voice is "dead")
 
-	// Write-back of raw voice data (post ADSR applied)
-
-	if (voice==1)      spu2M_WriteFast( 0x400 + (core<<12) + OutPos, (s16)Value );
-	else if (voice==3) spu2M_WriteFast( 0x600 + (core<<12) + OutPos, (s16)Value );
-
+		if (voice==1)      spu2M_WriteFast( 0x400 + (core<<12) + OutPos, 0 );
+		else if (voice==3) spu2M_WriteFast( 0x600 + (core<<12) + OutPos, 0 );
+		
+		return StereoOut32( 0, 0 );
+	}
 }
 
 
-static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
+static StereoOut32 __fastcall MixCore( const StereoOut32& Input, const StereoOut32& Ext )
 {
-	s32 RVL,RVR;
-	s32 SDL=0,SDR=0;
-	s32 SWL=0,SWR=0;
-
 	V_Core& thiscore( Cores[core] );
+	thiscore.MasterVol.Update();
+
+	StereoOut32 Dry(0,0), Wet(0,0);
 
 	for( voice=0; voice<24; ++voice )
 	{
-		s32 VValL,VValR;
-
 		V_Voice& vc( thiscore.Voices[voice] );
-		MixVoice( thiscore, vc, VValL, VValR );
+		StereoOut32 VVal( MixVoice( thiscore, vc ) );
 		
 		// Note: Results from MixVoice are ranged at 16 bits.
-		// Following muls are toggles only (0 or 1)
 
-		SDL += VValL & vc.DryL;
-		SDR += VValR & vc.DryR;
-		SWL += VValL & vc.WetL;
-		SWR += VValR & vc.WetR;
+		Dry.Left += VVal.Left & vc.DryL;
+		Dry.Right += VVal.Right & vc.DryR;
+		Wet.Left += VVal.Left & vc.WetL;
+		Wet.Right += VVal.Right & vc.WetR;
 	}
 	
 	// Saturate final result to standard 16 bit range.
-	SDL = clamp_mix( SDL );
-	SDR = clamp_mix( SDR );
-	SWL = clamp_mix( SWL );
-	SWR = clamp_mix( SWR );
+	clamp_mix( Dry );
+	clamp_mix( Wet );
 	
 	// Write Mixed results To Output Area
-	spu2M_WriteFast( 0x1000 + (core<<12) + OutPos, (s16)SDL );
-	spu2M_WriteFast( 0x1200 + (core<<12) + OutPos, (s16)SDR );
-	spu2M_WriteFast( 0x1400 + (core<<12) + OutPos, (s16)SWL );
-	spu2M_WriteFast( 0x1600 + (core<<12) + OutPos, (s16)SWR );
+	spu2M_WriteFast( 0x1000 + (core<<12) + OutPos, Dry.Left );
+	spu2M_WriteFast( 0x1200 + (core<<12) + OutPos, Dry.Right );
+	spu2M_WriteFast( 0x1400 + (core<<12) + OutPos, Wet.Left );
+	spu2M_WriteFast( 0x1600 + (core<<12) + OutPos, Wet.Right );
 	
 	// Write mixed results to logfile (if enabled)
 	
-	WaveDump::WriteCore( core, CoreSrc_DryVoiceMix, SDL, SDR );
-	WaveDump::WriteCore( core, CoreSrc_WetVoiceMix, SWL, SWR );
-
-	s32 TDL,TDR;
+	WaveDump::WriteCore( core, CoreSrc_DryVoiceMix, Dry );
+	WaveDump::WriteCore( core, CoreSrc_WetVoiceMix, Wet );
 
 	// Mix in the Input data
-	TDL = OutL & thiscore.InpDryL;
-	TDR = OutR & thiscore.InpDryR;
 
+	StereoOut32 TD(
+		Input.Left & thiscore.InpDryL,
+		Input.Right & thiscore.InpDryR
+	);
+	
 	// Mix in the Voice data
-	TDL += SDL & thiscore.SndDryL;
-	TDR += SDR & thiscore.SndDryR;
+	TD.Left += Dry.Left & thiscore.SndDryL;
+	TD.Right += Dry.Right & thiscore.SndDryR;
 
 	// Mix in the External (nothing/core0) data
-	TDL += ExtL & thiscore.ExtDryL;
-	TDR += ExtR & thiscore.ExtDryR;
+	TD.Left += Ext.Left & thiscore.ExtDryL;
+	TD.Right += Ext.Right & thiscore.ExtDryR;
 	
 	if( !EffectsDisabled )
 	{
@@ -747,138 +762,106 @@ static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 
 		if( thiscore.FxEnable )
 		{
-			s32 TWL,TWR;
-
 			// Mix Input, Voice, and External data:
-			TWL = OutL & thiscore.InpWetL;
-			TWR = OutR & thiscore.InpWetR;
-			TWL += SWL & thiscore.SndWetL;
-			TWR += SWR & thiscore.SndWetR;
-			TWL += ExtL & thiscore.ExtWetL; 
-			TWR += ExtR & thiscore.ExtWetR;
+			StereoOut32 TW(
+				Input.Left & thiscore.InpWetL,
+				Input.Right & thiscore.InpWetR
+			);
+			
+			TW.Left += Wet.Left & thiscore.SndWetL;
+			TW.Right += Wet.Right & thiscore.SndWetR;
+			TW.Left += Ext.Left & thiscore.ExtWetL; 
+			TW.Right += Ext.Right & thiscore.ExtWetR;
 
-			WaveDump::WriteCore( core, CoreSrc_PreReverb, TWL, TWR );
+			WaveDump::WriteCore( core, CoreSrc_PreReverb, TW );
 
-			DoReverb( thiscore, RVL, RVR, TWL, TWR );
+			StereoOut32 RV( DoReverb( thiscore, TW ) );
 
 			// Volume boost after effects application.  Boosting volume prior to effects
 			// causes slight overflows in some games, and the volume boost is required.
 			// (like all over volumes on SPU2, reverb coefficients and stuff are signed,
 			// range -50% to 50%, thus *2 is needed)
 			
-			RVL *= 2;
-			RVR *= 2;
+			RV.Left  *= 2;
+			RV.Right *= 2;
 
-			WaveDump::WriteCore( core, CoreSrc_PostReverb, RVL, RVR );
-
-			TWL = ApplyVolume(RVL,thiscore.FxL);
-			TWR = ApplyVolume(RVR,thiscore.FxR);
+			WaveDump::WriteCore( core, CoreSrc_PostReverb, RV );
 
 			// Mix Dry+Wet
-			OutL = TDL + TWL;
-			OutR = TDR + TWR;
+			return StereoOut32( TD + ApplyVolume( RV, thiscore.FxVol ) );
 		}
 		else
 		{
 			WaveDump::WriteCore( core, CoreSrc_PreReverb, 0, 0 );
 			WaveDump::WriteCore( core, CoreSrc_PostReverb, 0, 0 );
-			OutL = TDL;
-			OutR = TDR;
 		}
 	}
-	else
-	{
-		OutL = TDL;
-		OutR = TDR;
-	}
-
-	// Apply Master Volume.  The core will need this when the function returns.
-	
-	thiscore.MasterL.Update();
-	thiscore.MasterR.Update();
+	return TD;
 }
 
 // used to throttle the output rate of cache stat reports
 static int p_cachestat_counter=0;
 
-void Mix() 
+__forceinline void Mix() 
 {
-	s32 ExtL=0, ExtR=0, OutL, OutR;
-
 	// ****  CORE ZERO  ****
+	core = 0;
 
-	core=0;
-	if( (PlayMode&4) == 0 )
-	{
-		// get input data from input buffers
-		ReadInputPV(Cores[0], ExtL, ExtR);
-		WaveDump::WriteCore( 0, CoreSrc_Input, ExtL, ExtR );
-	}
+	// Note: Playmode 4 is SPDIF, which overrides other inputs.
+	StereoOut32 Ext( (PlayMode&4) ? StereoOut32::Empty : ReadInputPV( Cores[0] ) );
+	WaveDump::WriteCore( 0, CoreSrc_Input, Ext );
 
-	MixCore( ExtL, ExtR, 0, 0 );
+	Ext = MixCore( Ext, StereoOut32::Empty );
 
 	if( (PlayMode & 4) || (Cores[0].Mute!=0) )
-	{
-		ExtL=0;
-		ExtR=0;
-	}
+		Ext = StereoOut32( 0, 0 );
 	else
 	{
-		ExtL = ApplyVolume( ExtL, Cores[0].MasterL.Value );
-		ExtR = ApplyVolume( ExtR, Cores[0].MasterR.Value );
+		Ext = ApplyVolume( Ext, Cores[0].MasterVol );
+		clamp_mix( Ext );
 	}
-	
+
 	// Commit Core 0 output to ram before mixing Core 1:
-	
-	ExtL = clamp_mix( ExtL );
-	ExtR = clamp_mix( ExtR );
 
-	spu2M_WriteFast( 0x800 + OutPos, ExtL );
-	spu2M_WriteFast( 0xA00 + OutPos, ExtR );
-
-	WaveDump::WriteCore( 0, CoreSrc_External, ExtL, ExtR );
+	spu2M_WriteFast( 0x800 + OutPos, Ext.Left );
+	spu2M_WriteFast( 0xA00 + OutPos, Ext.Right );
+	WaveDump::WriteCore( 0, CoreSrc_External, Ext );
 
 	// ****  CORE ONE  ****
 
 	core = 1;
-	if( (PlayMode&8) != 8 )
-	{
-		ReadInputPV(Cores[1], OutL, OutR);	// get input data from input buffers
-		WaveDump::WriteCore( 1, CoreSrc_Input, OutL, OutR );
-	}
+	StereoOut32 Out( (PlayMode&8) ? StereoOut32::Empty : ReadInputPV( Cores[1] ) );
+	WaveDump::WriteCore( 1, CoreSrc_Input, Out );
 
-	// Apply volume to the external (Core 0) input data.
-
-	MixCore( OutL, OutR, ApplyVolume( ExtL, Cores[1].ExtL), ApplyVolume( ExtR, Cores[1].ExtR) );
+	// Apply Core 1's external volume to the Core 0 output; ApplyVolume returns the scaled copy, so pass that on.
+	Out = MixCore( Out, ApplyVolume( Ext, Cores[1].ExtVol ) );
 
 	if( PlayMode & 8 )
 	{
 		// Experimental CDDA support
 		// The CDDA overrides all other mixer output.  It's a direct feed!
 
-		ReadInput(Cores[1], OutL, OutR);
+		ReadInput( Cores[1], Out );
 		//WaveLog::WriteCore( 1, "CDDA-32", OutL, OutR );
 	}
 	else
 	{
-		OutL = MulShr32( OutL<<10, Cores[1].MasterL.Value );
-		OutR = MulShr32( OutR<<10, Cores[1].MasterR.Value );
+		Out.Left = MulShr32( Out.Left<<SndOutVolumeShift, Cores[1].MasterVol.Left.Value );
+		Out.Right = MulShr32( Out.Right<<SndOutVolumeShift, Cores[1].MasterVol.Right.Value );
 
-		// Final Clamp.
+		// Final Clamp!
 		// This could be circumvented by using 1/2th total output volume, although
-		// I suspect clamping at the higher volume is more true to the PS2's real
-		// implementation.
+		// I suspect this approach (clamping at the higher volume) is more true to the
+		// PS2's real implementation.
 
-		OutL = clamp_mix( OutL, SndOutVolumeShift );
-		OutR = clamp_mix( OutR, SndOutVolumeShift );
+		clamp_mix( Out, SndOutVolumeShift );
 	}
 
 	// Update spdif (called each sample)
 	if(PlayMode&4)
 		spdif_update();
 
-	// AddToBuffer
-	SndWrite(OutL, OutR);
+	SndBuffer::Write( Out );
 	
 	// Update AutoDMA output positioning
 	OutPos++;
diff --git a/plugins/spu2-x/src/RegTable.cpp b/plugins/spu2-x/src/RegTable.cpp
index f22df6686b..c74e2bdc88 100644
--- a/plugins/spu2-x/src/RegTable.cpp
+++ b/plugins/spu2-x/src/RegTable.cpp
@@ -31,14 +31,14 @@ const u16 zero=0;
 	PCORE(c,Voices[v].##p)
 
 #define PVC(c,v) \
-	PVCP(c,v,VolumeL.Reg_VOL), \
-	PVCP(c,v,VolumeR.Reg_VOL), \
+	PVCP(c,v,Volume.Left.Reg_VOL), \
+	PVCP(c,v,Volume.Right.Reg_VOL), \
 	PVCP(c,v,Pitch), \
 	PVCP(c,v,ADSR.Reg_ADSR1), \
 	PVCP(c,v,ADSR.Reg_ADSR2), \
 	PVCP(c,v,ADSR.Value)+1, \
-	PVCP(c,v,VolumeL.Value)+1, \
-	PVCP(c,v,VolumeR.Value)+1
+	PVCP(c,v,Volume.Left.Value)+1, \
+	PVCP(c,v,Volume.Right.Value)+1
 
 #define PVCA(c,v) \
 	PVCP(c,v,StartA)+1, \
@@ -247,16 +247,16 @@ u16* regtable[0x800] =
 	PRAW(0x758),PRAW(0x75A),PRAW(0x75C),PRAW(0x75E),
 
 	//0x760: weird area
-	PCORE(0,MasterL.Reg_VOL),
-	PCORE(0,MasterR.Reg_VOL),
-	PCORE(0,FxL)+1,
-	PCORE(0,FxR)+1,
-	PCORE(0,ExtL)+1,
-	PCORE(0,ExtR)+1,
-	PCORE(0,InpL)+1,
-	PCORE(0,InpR)+1,
-	PCORE(0,MasterL.Value)+1,
-	PCORE(0,MasterR.Value)+1,
+	PCORE(0,MasterVol.Left.Reg_VOL),
+	PCORE(0,MasterVol.Right.Reg_VOL),
+	PCORE(0,FxVol.Left)+1,
+	PCORE(0,FxVol.Right)+1,
+	PCORE(0,ExtVol.Left)+1,
+	PCORE(0,ExtVol.Right)+1,
+	PCORE(0,InpVol.Left)+1,
+	PCORE(0,InpVol.Right)+1,
+	PCORE(0,MasterVol.Left.Value)+1,
+	PCORE(0,MasterVol.Right.Value)+1,
 	PCORE(0,Revb.IIR_ALPHA),
 	PCORE(0,Revb.ACC_COEF_A),
 	PCORE(0,Revb.ACC_COEF_B),
@@ -268,16 +268,16 @@ u16* regtable[0x800] =
 	PCORE(0,Revb.IN_COEF_L),
 	PCORE(0,Revb.IN_COEF_R),
 
-	PCORE(1,MasterL.Reg_VOL),
-	PCORE(1,MasterR.Reg_VOL),
-	PCORE(1,FxL)+1,
-	PCORE(1,FxR)+1,
-	PCORE(1,ExtL)+1,
-	PCORE(1,ExtR)+1,
-	PCORE(1,InpL)+1,
-	PCORE(1,InpR)+1,
-	PCORE(1,MasterL.Value)+1,
-	PCORE(1,MasterR.Value)+1,
+	PCORE(1,MasterVol.Left.Reg_VOL),
+	PCORE(1,MasterVol.Right.Reg_VOL),
+	PCORE(1,FxVol.Left)+1,
+	PCORE(1,FxVol.Right)+1,
+	PCORE(1,ExtVol.Left)+1,
+	PCORE(1,ExtVol.Right)+1,
+	PCORE(1,InpVol.Left)+1,
+	PCORE(1,InpVol.Right)+1,
+	PCORE(1,MasterVol.Left.Value)+1,
+	PCORE(1,MasterVol.Right.Value)+1,
 	PCORE(1,Revb.IIR_ALPHA),
 	PCORE(1,Revb.ACC_COEF_A),
 	PCORE(1,Revb.ACC_COEF_B),
diff --git a/plugins/spu2-x/src/Reverb.cpp b/plugins/spu2-x/src/Reverb.cpp
index 4860bd7f22..8520830356 100644
--- a/plugins/spu2-x/src/Reverb.cpp
+++ b/plugins/spu2-x/src/Reverb.cpp
@@ -24,20 +24,18 @@
 static LPF_data lowpass_left( 11000, SampleRate );
 static LPF_data lowpass_right( 11000, SampleRate );
 
-static s32 EffectsBufferIndexer( V_Core& thiscore, s32 offset )
+static __forceinline s32 RevbGetIndexer( V_Core& thiscore, s32 offset )
 {
-	u32 pos = thiscore.EffectsStartA + thiscore.ReverbX + offset;
+	u32 pos = thiscore.ReverbX + offset;
 
 	// Need to use modulus here, because games can and will drop the buffer size
 	// without notice, and it leads to offsets several times past the end of the buffer.
 
 	if( pos > thiscore.EffectsEndA )
 	{
-		pos = thiscore.EffectsStartA + ((thiscore.ReverbX + offset) % (u32)thiscore.EffectsBufferSize);
-	}
-	else if( pos < thiscore.EffectsStartA )
-	{
-		pos = thiscore.EffectsEndA+1 - ((thiscore.ReverbX + offset) % (u32)thiscore.EffectsBufferSize );
+		//pos = thiscore.EffectsStartA + ((thiscore.ReverbX + offset) % (u32)thiscore.EffectsBufferSize);
+		pos -= thiscore.EffectsEndA+1;
+		pos += thiscore.EffectsStartA;
 	}
 	return pos;
 } 
@@ -52,15 +50,16 @@ void Reverb_AdvanceBuffer( V_Core& thiscore )
 {
 	if( (Cycles & 1) && (thiscore.EffectsBufferSize > 0) )
 	{
-		thiscore.ReverbX += 1;
-		if(thiscore.ReverbX >= (u32)thiscore.EffectsBufferSize )
-			thiscore.ReverbX %= (u32)thiscore.EffectsBufferSize;
+		thiscore.ReverbX = RevbGetIndexer( thiscore, 1 );
+		//thiscore.ReverbX += 1;
+		//if(thiscore.ReverbX >= (u32)thiscore.EffectsBufferSize )
+		//	thiscore.ReverbX %= (u32)thiscore.EffectsBufferSize;
 	}
 }
 
 /////////////////////////////////////////////////////////////////////////////////////////
 
-void DoReverb( V_Core& thiscore, s32& OutL, s32& OutR, s32 InL, s32 InR)
+StereoOut32 DoReverb( V_Core& thiscore, const StereoOut32& Input )
 {
 	// Reverb processing occurs at 24khz, so we skip processing every other sample,
 	// and use the previous calculation for this core instead.
@@ -68,84 +67,90 @@ void DoReverb( V_Core& thiscore, s32& OutL, s32& OutR, s32 InL, s32 InR)
 	if( thiscore.EffectsBufferSize <= 0 )
 	{
 		// StartA is past EndA, so effects are disabled.
-		OutL = InL;
-		OutR = InR;
 		//ConLog( " * SPU2: Effects disabled due to leapfrogged EffectsStart." );
-		return;
+		return Input;
 	}
 
-	if((Cycles&1)==0) 
+	if( (Cycles&1)==0 )
 	{
-		OutL = thiscore.LastEffectL;
-		OutR = thiscore.LastEffectR;
-		
-		thiscore.LastEffectL = InL;
-		thiscore.LastEffectR = InR;
+		StereoOut32 retval( thiscore.LastEffect );
+		thiscore.LastEffect = Input;
+		return retval;
 	}
 	else  
 	{
+		if( thiscore.RevBuffers.NeedsUpdated )
+			thiscore.UpdateEffectsBufferSize();
+
 		// Advance the current reverb buffer pointer, and cache the read/write addresses we'll be
 		// needing for this session of reverb.
 		
-		const u32 src_a0 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_SRC_A0 );
-		const u32 src_a1 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_SRC_A1 );
-		const u32 src_b0 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_SRC_B0 );
-		const u32 src_b1 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_SRC_B1 );
+		const u32 src_a0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_SRC_A0 );
+		const u32 src_a1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_SRC_A1 );
+		const u32 src_b0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_SRC_B0 );
+		const u32 src_b1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_SRC_B1 );
 
-		const u32 dest_a0 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_DEST_A0 );
-		const u32 dest_a1 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_DEST_A1 );
-		const u32 dest_b0 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_DEST_B0 );
-		const u32 dest_b1 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_DEST_B1 );
+		const u32 dest_a0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_DEST_A0 );
+		const u32 dest_a1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_DEST_A1 );
+		const u32 dest_b0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_DEST_B0 );
+		const u32 dest_b1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_DEST_B1 );
 		
-		const u32 dest2_a0 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_DEST_A0 + 1 );
-		const u32 dest2_a1 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_DEST_A1 + 1 );
-		const u32 dest2_b0 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_DEST_B0 + 1 );
-		const u32 dest2_b1 = EffectsBufferIndexer( thiscore, thiscore.Revb.IIR_DEST_B1 + 1 );
+		const u32 dest2_a0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_DEST_A0 + 1 );
+		const u32 dest2_a1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_DEST_A1 + 1 );
+		const u32 dest2_b0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_DEST_B0 + 1 );
+		const u32 dest2_b1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.IIR_DEST_B1 + 1 );
 		
-		const u32 acc_src_a0 = EffectsBufferIndexer( thiscore, thiscore.Revb.ACC_SRC_A0 );
-		const u32 acc_src_b0 = EffectsBufferIndexer( thiscore, thiscore.Revb.ACC_SRC_B0 );
-		const u32 acc_src_c0 = EffectsBufferIndexer( thiscore, thiscore.Revb.ACC_SRC_C0 );
-		const u32 acc_src_d0 = EffectsBufferIndexer( thiscore, thiscore.Revb.ACC_SRC_D0 );
+		const u32 acc_src_a0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.ACC_SRC_A0 );
+		const u32 acc_src_b0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.ACC_SRC_B0 );
+		const u32 acc_src_c0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.ACC_SRC_C0 );
+		const u32 acc_src_d0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.ACC_SRC_D0 );
 
-		const u32 acc_src_a1 = EffectsBufferIndexer( thiscore, thiscore.Revb.ACC_SRC_A1 );
-		const u32 acc_src_b1 = EffectsBufferIndexer( thiscore, thiscore.Revb.ACC_SRC_B1 );
-		const u32 acc_src_c1 = EffectsBufferIndexer( thiscore, thiscore.Revb.ACC_SRC_C1 );
-		const u32 acc_src_d1 = EffectsBufferIndexer( thiscore, thiscore.Revb.ACC_SRC_D1 );
+		const u32 acc_src_a1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.ACC_SRC_A1 );
+		const u32 acc_src_b1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.ACC_SRC_B1 );
+		const u32 acc_src_c1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.ACC_SRC_C1 );
+		const u32 acc_src_d1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.ACC_SRC_D1 );
 
-		const u32 fb_src_a0 = EffectsBufferIndexer( thiscore, thiscore.Revb.MIX_DEST_A0 - thiscore.Revb.FB_SRC_A );
-		const u32 fb_src_a1 = EffectsBufferIndexer( thiscore, thiscore.Revb.MIX_DEST_A1 - thiscore.Revb.FB_SRC_A );
-		const u32 fb_src_b0 = EffectsBufferIndexer( thiscore, thiscore.Revb.MIX_DEST_B0 - thiscore.Revb.FB_SRC_B );
-		const u32 fb_src_b1 = EffectsBufferIndexer( thiscore, thiscore.Revb.MIX_DEST_B1 - thiscore.Revb.FB_SRC_B );
+		const u32 fb_src_a0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.FB_SRC_A0 );
+		const u32 fb_src_a1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.FB_SRC_A1 );
+		const u32 fb_src_b0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.FB_SRC_B0 );
+		const u32 fb_src_b1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.FB_SRC_B1 );
 
-		const u32 mix_dest_a0 = EffectsBufferIndexer( thiscore, thiscore.Revb.MIX_DEST_A0 );
-		const u32 mix_dest_a1 = EffectsBufferIndexer( thiscore, thiscore.Revb.MIX_DEST_A1 );
-		const u32 mix_dest_b0 = EffectsBufferIndexer( thiscore, thiscore.Revb.MIX_DEST_B0 );
-		const u32 mix_dest_b1 = EffectsBufferIndexer( thiscore, thiscore.Revb.MIX_DEST_B1 );
+		const u32 mix_dest_a0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.MIX_DEST_A0 );
+		const u32 mix_dest_a1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.MIX_DEST_A1 );
+		const u32 mix_dest_b0 = RevbGetIndexer( thiscore, thiscore.RevBuffers.MIX_DEST_B0 );
+		const u32 mix_dest_b1 = RevbGetIndexer( thiscore, thiscore.RevBuffers.MIX_DEST_B1 );
 
 		// -----------------------------------------
 		//    End Buffer Pointers, Begin Reverb!
 		// -----------------------------------------
 
-		const s32 INPUT_SAMPLE_L = (thiscore.LastEffectL+InL);
-		const s32 INPUT_SAMPLE_R = (thiscore.LastEffectR+InR);
+		StereoOut32 INPUT_SAMPLE( thiscore.LastEffect + Input );
 		
-		//const s32 INPUT_SAMPLE_L = (s32)( lowpass_left.sample( (thiscore.LastEffectL+InL)/65536.0 ) * 65536.0 );
-		//const s32 INPUT_SAMPLE_R = (s32)( lowpass_right.sample( (thiscore.LastEffectR+InR)/65536.0 ) * 65536.0 );
+		const s32 IIR_INPUT_A0 = ((_spu2mem[src_a0] * thiscore.Revb.IIR_COEF) + (INPUT_SAMPLE.Left * thiscore.Revb.IN_COEF_L))>>16;
+		const s32 IIR_INPUT_A1 = ((_spu2mem[src_a1] * thiscore.Revb.IIR_COEF) + (INPUT_SAMPLE.Right * thiscore.Revb.IN_COEF_R))>>16;
+		const s32 IIR_INPUT_B0 = ((_spu2mem[src_b0] * thiscore.Revb.IIR_COEF) + (INPUT_SAMPLE.Left * thiscore.Revb.IN_COEF_L))>>16;
+		const s32 IIR_INPUT_B1 = ((_spu2mem[src_b1] * thiscore.Revb.IIR_COEF) + (INPUT_SAMPLE.Right * thiscore.Revb.IN_COEF_R))>>16;
 
-		const s32 IIR_INPUT_A0 = ((_spu2mem[src_a0] * thiscore.Revb.IIR_COEF) + (INPUT_SAMPLE_L * thiscore.Revb.IN_COEF_L))>>16;
-		const s32 IIR_INPUT_A1 = ((_spu2mem[src_a1] * thiscore.Revb.IIR_COEF) + (INPUT_SAMPLE_R * thiscore.Revb.IN_COEF_R))>>16;
-		const s32 IIR_INPUT_B0 = ((_spu2mem[src_b0] * thiscore.Revb.IIR_COEF) + (INPUT_SAMPLE_L * thiscore.Revb.IN_COEF_L))>>16;
-		const s32 IIR_INPUT_B1 = ((_spu2mem[src_b1] * thiscore.Revb.IIR_COEF) + (INPUT_SAMPLE_R * thiscore.Revb.IN_COEF_R))>>16;
+		//const s32 IIR_A0 = (IIR_INPUT_A0 * thiscore.Revb.IIR_ALPHA) + (_spu2mem[dest_a0] * (0x7fff - thiscore.Revb.IIR_ALPHA));
+		//const s32 IIR_A1 = (IIR_INPUT_A1 * thiscore.Revb.IIR_ALPHA) + (_spu2mem[dest_a1] * (0x7fff - thiscore.Revb.IIR_ALPHA));
+		//const s32 IIR_B0 = (IIR_INPUT_B0 * thiscore.Revb.IIR_ALPHA) + (_spu2mem[dest_b0] * (0x7fff - thiscore.Revb.IIR_ALPHA));
+		//const s32 IIR_B1 = (IIR_INPUT_B1 * thiscore.Revb.IIR_ALPHA) + (_spu2mem[dest_b1] * (0x7fff - thiscore.Revb.IIR_ALPHA));
 
-		const s32 IIR_A0 = (IIR_INPUT_A0 * thiscore.Revb.IIR_ALPHA) + (_spu2mem[dest_a0] * (0x7fff - thiscore.Revb.IIR_ALPHA));
-		const s32 IIR_A1 = (IIR_INPUT_A1 * thiscore.Revb.IIR_ALPHA) + (_spu2mem[dest_a1] * (0x7fff - thiscore.Revb.IIR_ALPHA));
-		const s32 IIR_B0 = (IIR_INPUT_B0 * thiscore.Revb.IIR_ALPHA) + (_spu2mem[dest_b0] * (0x7fff - thiscore.Revb.IIR_ALPHA));
-		const s32 IIR_B1 = (IIR_INPUT_B1 * thiscore.Revb.IIR_ALPHA) + (_spu2mem[dest_b1] * (0x7fff - thiscore.Revb.IIR_ALPHA));
+		//_spu2mem[dest2_a0] = clamp_mix( IIR_A0 >> 16 );
+		//_spu2mem[dest2_a1] = clamp_mix( IIR_A1 >> 16 );
+		//_spu2mem[dest2_b0] = clamp_mix( IIR_B0 >> 16 );
+		//_spu2mem[dest2_b1] = clamp_mix( IIR_B1 >> 16 );
 
-		_spu2mem[dest2_a0] = clamp_mix( IIR_A0 >> 16 );
-		_spu2mem[dest2_a1] = clamp_mix( IIR_A1 >> 16 );
-		_spu2mem[dest2_b0] = clamp_mix( IIR_B0 >> 16 );
-		_spu2mem[dest2_b1] = clamp_mix( IIR_B1 >> 16 );
+		// Faster single-mul approach to interpolation.  The >>16 must be parenthesized: '+' binds tighter than '>>'.
+		const s32 IIR_A0 = IIR_INPUT_A0 + (((_spu2mem[dest_a0]-IIR_INPUT_A0) * thiscore.Revb.IIR_ALPHA)>>16);
+		const s32 IIR_A1 = IIR_INPUT_A1 + (((_spu2mem[dest_a1]-IIR_INPUT_A1) * thiscore.Revb.IIR_ALPHA)>>16);
+		const s32 IIR_B0 = IIR_INPUT_B0 + (((_spu2mem[dest_b0]-IIR_INPUT_B0) * thiscore.Revb.IIR_ALPHA)>>16);
+		const s32 IIR_B1 = IIR_INPUT_B1 + (((_spu2mem[dest_b1]-IIR_INPUT_B1) * thiscore.Revb.IIR_ALPHA)>>16);
+
+		_spu2mem[dest2_a0] = clamp_mix( IIR_A0 );
+		_spu2mem[dest2_a1] = clamp_mix( IIR_A1 );
+		_spu2mem[dest2_b0] = clamp_mix( IIR_B0 );
+		_spu2mem[dest2_b1] = clamp_mix( IIR_B1 );
 
 		const s32 ACC0 =
 			((_spu2mem[acc_src_a0] * thiscore.Revb.ACC_COEF_A)) +
@@ -161,8 +166,6 @@ void DoReverb( V_Core& thiscore, s32& OutL, s32& OutR, s32 InL, s32 InR)
 
 		const s32 FB_A0 = (_spu2mem[fb_src_a0] * thiscore.Revb.FB_ALPHA);
 		const s32 FB_A1 = (_spu2mem[fb_src_a1] * thiscore.Revb.FB_ALPHA);
-		const s32 FB_B0 = (_spu2mem[fb_src_b0] * (0x7fff - thiscore.Revb.FB_ALPHA)); //>>16;
-		const s32 FB_B1 = (_spu2mem[fb_src_b1] * (0x7fff - thiscore.Revb.FB_ALPHA)); //>>16;
 
 		const s32 fb_xor_a0 = (_spu2mem[fb_src_a0] * ( thiscore.Revb.FB_ALPHA ^ 0x8000 ))>>2;
 		const s32 fb_xor_a1 = (_spu2mem[fb_src_a1] * ( thiscore.Revb.FB_ALPHA ^ 0x8000 ))>>2;
@@ -172,12 +175,13 @@ void DoReverb( V_Core& thiscore, s32& OutL, s32& OutR, s32 InL, s32 InR)
 		_spu2mem[mix_dest_b0] = clamp_mix( (MulShr32(thiscore.Revb.FB_ALPHA<<14, ACC0) - fb_xor_a0 - ((_spu2mem[fb_src_b0] * thiscore.Revb.FB_X)>>2)) >> 14 );
 		_spu2mem[mix_dest_b1] = clamp_mix( (MulShr32(thiscore.Revb.FB_ALPHA<<14, ACC1) - fb_xor_a1 - ((_spu2mem[fb_src_b1] * thiscore.Revb.FB_X)>>2)) >> 14 );
 
-		thiscore.LastEffectL = clamp_mix(_spu2mem[mix_dest_a0] + _spu2mem[mix_dest_b0]);
-		thiscore.LastEffectR = clamp_mix(_spu2mem[mix_dest_a1] + _spu2mem[mix_dest_b1]);
+		thiscore.LastEffect.Left  = _spu2mem[mix_dest_a0] + _spu2mem[mix_dest_b0];
+		thiscore.LastEffect.Right = _spu2mem[mix_dest_a1] + _spu2mem[mix_dest_b1];
+		clamp_mix( thiscore.LastEffect );
 		
-		//OutL = thiscore.LastEffectL;
-		//OutR = thiscore.LastEffectR;
-		OutL = (s32)(lowpass_left.sample( thiscore.LastEffectL / 32768.0 ) * 32768.0);
-		OutR = (s32)(lowpass_right.sample( thiscore.LastEffectR / 32768.0 ) * 32768.0);
+		thiscore.LastEffect.Left = (s32)(lowpass_left.sample( thiscore.LastEffect.Left / 32768.0 ) * 32768.0);
+		thiscore.LastEffect.Right = (s32)(lowpass_right.sample( thiscore.LastEffect.Right / 32768.0 ) * 32768.0);
+
+		return thiscore.LastEffect;
 	} 
 }
diff --git a/plugins/spu2-x/src/SndOut.cpp b/plugins/spu2-x/src/SndOut.cpp
index 8621c0a4a8..7a29816fe5 100644
--- a/plugins/spu2-x/src/SndOut.cpp
+++ b/plugins/spu2-x/src/SndOut.cpp
@@ -19,24 +19,45 @@
  * 
  */
 
-// [TODO] : The layout of this code file is now a complete hackish mess after
-// numerous timestretch-related additions.  The whole thing should really be
-// rethought and redone at this point.
-
 #include "spu2.h"
-#include "SoundTouch/SoundTouch.h"
-#include "SoundTouch/WavFile.h"
 
-#include <new>
 
-static int ts_stats_stretchblocks = 0;
-static int ts_stats_normalblocks = 0;
-static int ts_stats_logcounter = 0;
+StereoOut32 StereoOut32::Empty( 0, 0 );
+
+StereoOut32::StereoOut32( const StereoOut16& src ) :
+	Left( src.Left ),
+	Right( src.Right )
+{
+}
+
+StereoOut32::StereoOut32( const StereoOutFloat& src ) :
+	Left( (s32)(src.Left * 2147483647.0) ),		// scale in double: 2147483647.0f rounds up to 2^31 as a float, which overflows s32 at full scale
+	Right( (s32)(src.Right * 2147483647.0) )
+{
+}
+
+StereoOut16 StereoOut32::DownSample() const
+{
+	return StereoOut16(
+		Left >> SndOutVolumeShift,
+		Right >> SndOutVolumeShift
+	);
+}
+
+StereoOut32 StereoOut16::UpSample() const
+{
+	return StereoOut32(
+		Left << SndOutVolumeShift,
+		Right << SndOutVolumeShift
+	);
+
+}
+
 
 class NullOutModule: public SndOutModule
 {
 public:
-	s32  Init(SndBuffer *)  { return 0; }
+	s32  Init()  { return 0; }
 	void Close() { }
 	s32  Test() const { return 0; }
 	void Configure(HWND parent)  { }
@@ -61,7 +82,6 @@ SndOutModule* mods[]=
 	XAudio2Out,
 	DSoundOut,
 	WaveOut,
-	//ASIOOut,
 	NULL		// signals the end of our list
 };
 
@@ -77,528 +97,173 @@ int FindOutputModuleById( const wchar_t* omodid )
 	return modcnt;
 }
 
+StereoOut32 *SndBuffer::m_buffer;
+s32 SndBuffer::m_size;
+s32 SndBuffer::m_rpos;
+s32 SndBuffer::m_wpos;
+s32 SndBuffer::m_data;
 
-__forceinline s16 SndScaleVol( s32 inval )
+bool SndBuffer::m_underrun_freeze;
+StereoOut32* SndBuffer::sndTempBuffer = NULL;
+StereoOut16* SndBuffer::sndTempBuffer16 = NULL;
+int SndBuffer::sndTempProgress = 0;
+
+int GetAlignedBufferSize( int comp )
 {
-	return inval >> SndOutVolumeShift;
+	return (comp + SndOutPacketSize-1) & ~(SndOutPacketSize-1);
 }
 
-
-// records last buffer status (fill %, range -100 to 100, with 0 being 50% full)
-float lastPct;
-float lastEmergencyAdj;
-
-float cTempo=1;
-float eTempo = 1;
-int freezeTempo = 0;
-
-soundtouch::SoundTouch* pSoundTouch=NULL;
-
-
-//usefull when timestretch isn't available 
-
-class SndBufferImpl: public SndBuffer
+// Returns TRUE if there is data to be output, or false if no data
+// is available to be copied.
+bool SndBuffer::CheckUnderrunStatus( int& nSamples, int& quietSampleCount )
 {
-private:
-	s32 *buffer;
-	s32 size;
-	s32 rpos;
-	s32 wpos;
-	s32 data;
+	quietSampleCount = 0;
+	if( m_underrun_freeze )
+	{			
+		int toFill = (int)(m_size * ( timeStretchDisabled ? 0.50f : 0.1f ) );
+		toFill = GetAlignedBufferSize( toFill );
 
-	// data prediction amount, used to "commit" data that hasn't
-	// finished timestretch processing.
-	s32 predictData;
+		// toFill is now aligned to a SndOutPacket
 
-	bool pw;
-	bool underrun_freeze;
-
-protected:
-	int GetAlignedBufferSize( int comp )
-	{
-		return (comp + SndOutPacketSize-1) & ~(SndOutPacketSize-1);
-	}
-
-public:
-	SndBufferImpl( float latencyMS )
-	{
-		rpos=0;
-		wpos=0;
-		data=0;
-		size=GetAlignedBufferSize( (int)(latencyMS * SampleRate / 500.0f ) );
-		buffer = new s32[size];
-		pw=false;
-		underrun_freeze = false;
-		predictData = 0;
-	}
-
-	virtual ~SndBufferImpl()
-	{
-		delete buffer;
-	}
-
-	virtual void WriteSamples(s32 *bData, int nSamples)
-	{
-		int free = size-data;
-		predictData = 0;
-
-		jASSUME( data <= size );
-
-		// Problem:
-		//  If the SPU2 gets out of sync with the SndOut device, the writepos of the
-		//  circular buffer will overtake the readpos, leading to a prolonged period
-		//  of hopscotching read/write accesses (ie, lots of staticy crap sound for
-		//  several seconds).
-		//
-		// Compromise:
-		//  When an overrun occurs, we adapt by discarding a portion of the buffer.
-		//  The older portion of the buffer is discarded rather than incoming data,
-		//  so that the overall audio synchronization is better.
-		
-		if( free < nSamples )
+		if( m_data < toFill )
 		{
-			// Buffer overrun!
-			// Dump samples from the read portion of the buffer instead of dropping
-			// the newly written stuff.
-
-			s32 comp;
-
-			if( !timeStretchDisabled )
-			{
-				// If we overran it means the timestretcher failed.  We need to speed
-				// up audio playback.
-				cTempo += cTempo * 0.12f;
-				eTempo += eTempo * 0.40f;
-				if( eTempo > 7.5f ) eTempo = 7.5f;
-				pSoundTouch->setTempo( eTempo );
-
-				// Throw out just a little bit (two packets worth) to help
-				// give the TS some room to work:
-
-				comp = SndOutPacketSize*2;
-			}
-			else
-			{
-				// Toss half the buffer plus whatever's being written anew:
-				comp = GetAlignedBufferSize( (size + nSamples ) / 2 );
-				if( comp > (size-SndOutPacketSize) ) comp = size-SndOutPacketSize;
-			}
-
-			data -= comp;
-			rpos = (rpos+comp)%size;
-			if( MsgOverruns() )
-				ConLog(" * SPU2 > Overrun Compensation (%d packets tossed)\n", comp / SndOutPacketSize );
-			lastPct = 0.0;		// normalize the timestretcher
+			quietSampleCount = nSamples;
+			return false;
 		}
 
-		// copy in two phases, since there's a chance the packet
-		// wraps around the buffer (it'd be nice to deal in packets only, but
-		// the timestretcher and DSP options require flexibility).
-
-		const int endPos = wpos + nSamples;
-		const int secondCopyLen = endPos - size;
-		s32* wposbuffer = &buffer[wpos];
-
-		data += nSamples;
-		if( secondCopyLen > 0 )
-		{
-			nSamples -= secondCopyLen;
-			memcpy( buffer, &bData[nSamples], secondCopyLen * sizeof( *bData ) );
-			wpos = secondCopyLen;
-		}
-		else
-			wpos += nSamples;
-
-		memcpy( wposbuffer, bData, nSamples * sizeof( *bData ) );
+		m_underrun_freeze = false;
+		if( MsgOverruns() )
+			ConLog(" * SPU2 > Underrun compensation (%d packets buffered)\n", toFill / SndOutPacketSize );
+		lastPct = 0.0;		// normalize timestretcher
 	}
-
-	protected:
-	// Returns TRUE if there is data to be output, or false if no data
-	// is available to be copied.
-	bool CheckUnderrunStatus( int& nSamples, int& quietSampleCount )
+	else if( m_data < nSamples )
 	{
-		quietSampleCount = 0;
-		if( underrun_freeze )
-		{			
-			int toFill = (int)(size * ( timeStretchDisabled ? 0.50f : 0.1f ) );
-			toFill = GetAlignedBufferSize( toFill );
+		nSamples = m_data;
+		quietSampleCount = SndOutPacketSize - m_data;
+		m_underrun_freeze = true;
 
-			// toFill is now aligned to a SndOutPacket
+		if( !timeStretchDisabled )
+			timeStretchUnderrun();
 
-			if( data < toFill )
-			{
-				quietSampleCount = nSamples;
-				return false;
-			}
-
-			underrun_freeze = false;
-			if( MsgOverruns() )
-				ConLog(" * SPU2 > Underrun compensation (%d packets buffered)\n", toFill / SndOutPacketSize );
-			lastPct = 0.0;		// normalize timestretcher
-		}
-		else if( data < nSamples )
-		{
-			nSamples = data;
-			quietSampleCount = SndOutPacketSize - data;
-			underrun_freeze = true;
-
-			if( !timeStretchDisabled )
-			{
-				// timeStretcher failed it's job.  We need to slow down the audio some.
-
-				cTempo -= (cTempo * 0.12f);
-				eTempo -= (eTempo * 0.30f);
-				if( eTempo < 0.1f ) eTempo = 0.1f;
-				pSoundTouch->setTempo( eTempo );
-			}
-
-			return nSamples != 0;
-		}
-
-		return true;
+		return nSamples != 0;
 	}
 
-public:
-	void ReadSamples( s16* bData )
-	{
-		int nSamples = SndOutPacketSize;
-
-		// Problem:
-		//  If the SPU2 gets even the least bit out of sync with the SndOut device,
-		//  the readpos of the circular buffer will overtake the writepos,
-		//  leading to a prolonged period of hopscotching read/write accesses (ie,
-		//  lots of staticy crap sound for several seconds).
-		//
-		// Fix:
-		//  If the read position overtakes the write position, abort the
-		//  transfer immediately and force the SndOut driver to wait until
-		//  the read buffer has filled up again before proceeding.
-		//  This will cause one brief hiccup that can never exceed the user's
-		//  set buffer length in duration.
-
-		int quietSamples;
-		if( CheckUnderrunStatus( nSamples, quietSamples ) )
-		{
-			jASSUME( nSamples <= SndOutPacketSize );
-
-			// [Air] [TODO]: This loop is probably a candidiate for SSE2 optimization.
-
-			const int endPos = rpos + nSamples;
-			const int secondCopyLen = endPos - size;
-			const s32* rposbuffer = &buffer[rpos];
-
-			data -= nSamples;
-
-			if( secondCopyLen > 0 )
-			{
-				nSamples -= secondCopyLen;
-				for( int i=0; i<secondCopyLen; i++ )
-					bData[nSamples+i] = SndScaleVol( buffer[i] );
-				rpos = secondCopyLen;
-			}
-			else
-				rpos += nSamples;
-
-			for( int i=0; i<nSamples; i++ )
-				bData[i] = SndScaleVol( rposbuffer[i] );
-		}
-
-		// If quietSamples != 0 it means we have an underrun...
-		// Let's just dull out some silence, because that's usually the least
-		// painful way of dealing with underruns:
-		memset( bData, 0, quietSamples * sizeof(*bData) );
-	}
-
-	void ReadSamples( s32* bData )
-	{
-		int nSamples = SndOutPacketSize;
-
-		// Problem:
-		//  If the SPU2 gets even the least bit out of sync with the SndOut device,
-		//  the readpos of the circular buffer will overtake the writepos,
-		//  leading to a prolonged period of hopscotching read/write accesses (ie,
-		//  lots of staticy crap sound for several seconds).
-		//
-		// Fix:
-		//  If the read position overtakes the write position, abort the
-		//  transfer immediately and force the SndOut driver to wait until
-		//  the read buffer has filled up again before proceeding.
-		//  This will cause one brief hiccup that can never exceed the user's
-		//  set buffer length in duration.
-
-		int quietSamples;
-		if( CheckUnderrunStatus( nSamples, quietSamples ) )
-		{
-			// nSamples is garaunteed non-zero if CheckUnderrunStatus
-			// returned true.
-
-			const int endPos = rpos + nSamples;
-			const int secondCopyLen = endPos - size;
-			const int oldrpos = rpos;
-
-			data -= nSamples;
-
-			if( secondCopyLen > 0 )
-			{
-				nSamples -= secondCopyLen;
-				memcpy( &bData[nSamples], buffer, secondCopyLen * sizeof( *bData ) );
-				rpos = secondCopyLen;
-			}
-			else
-				rpos += nSamples;
-
-			memcpy( bData, &buffer[oldrpos], nSamples * sizeof( *bData ) );
-		}
-
-		// If quietSamples != 0 it means we have an underrun...
-		// Let's just dull out some silence, because that's usually the least
-		// painful way of dealing with underruns:
-		memset( bData, 0, quietSamples * sizeof(*bData) );
-	}
-
-	void PredictDataWrite( int samples )
-	{
-		predictData += samples;
-	}
-
-	virtual void PauseOnWrite(bool doPause) { pw = doPause; }
-
-	// Calculate the buffer status percentage.
-	// Returns range from -1.0 to 1.0
-	//    1.0 = buffer overflow!
-	//    0.0 = buffer nominal (50% full)
-	//   -1.0 = buffer underflow!
-	float GetStatusPct()
-	{
-		// Get the buffer status of the output driver too, so that we can
-		// obtain a more accurate overall buffer status.
-
-		int drvempty = mods[OutputModule]->GetEmptySampleCount(); // / 2;
-
-		//ConLog( "Data %d >>> driver: %d   predict: %d\n", data, drvempty, predictData );
-
-		float result = (float)(data + predictData - drvempty) - (size/2);
-		result /= (size/2);
-		return result;
-	}
-
-};
-
-SndBufferImpl *sndBuffer=NULL;
-
-s32* sndTempBuffer=NULL;
-s32 sndTempProgress=NULL;
-s16* sndTempBuffer16=NULL;
-
-void UpdateTempoChange()
-{
-	if( --freezeTempo > 0 )
-	{
-		return;
-	}
-
-	float statusPct = sndBuffer->GetStatusPct();
-	float pctChange = statusPct - lastPct;
-
-	float tempoChange;
-	float emergencyAdj = 0;
-	float newcee = cTempo;		// workspace var. for cTempo
-
-	// IMPORTANT!
-	// If you plan to tweak these values, make sure you're using a release build
-	// OUTSIDE THE DEBUGGER to test it!  The Visual Studio debugger can really cause
-	// erratic behavior in the audio buffers, and makes the timestretcher seem a
-	// lot more inconsistent than it really is.
-
-	// We have two factors.
-	//   * Distance from nominal buffer status (50% full)
-	//   * The change from previous update to this update.
-
-	// Prediction based on the buffer change:
-	// (linear seems to work better here)
-
-	tempoChange = pctChange * 0.75f;
-
-	if( statusPct * tempoChange < 0.0f )
-	{
-		// only apply tempo change if it is in synch with the buffer status.
-		// In other words, if the buffer is high (over 0%), and is decreasing,
-		// ignore it.  It'll just muck things up.
-
-		tempoChange = 0;
-	}
-
-	// Sudden spikes in framerate can cause the nominal buffer status
-	// to go critical, in which case we have to enact an emergency
-	// stretch. The following cubic formulas do that.  Values near
-	// the extremeites give much larger results than those near 0.
-	// And the value is added only this time, and does not accumulate.
-	// (otherwise a large value like this would cause problems down the road)
-
-	// Constants:
-	// Weight - weights the statusPct's "emergency" consideration.
-	//   higher values here will make the buffer perform more drastic
-	//   compensations at the outer edges of the buffer (at -75 or +75%
-	//   or beyond, for example).
-
-	// Range - scales the adjustment to the given range (more or less).
-	//   The actual range is dependent on the weight used, so if you increase
-	//   Weight you'll usually want to decrease Range somewhat to compensate.
-
-	// Prediction based on the buffer fill status:
-
-	const float statusWeight = 2.99f;
-	const float statusRange = 0.068f;
-
-	// "non-emergency" deadzone:  In this area stretching will be strongly discouraged.
-	// Note: due tot he nature of timestretch latency, it's always a wee bit harder to
-	// cope with low fps (underruns) tha it is high fps (overruns).  So to help out a
-	// little, the low-end portions of this check are less forgiving than the high-sides.
-
-	if( cTempo < 0.965f || cTempo > 1.060f ||
-		pctChange < -0.38f || pctChange > 0.54f ||
-		statusPct < -0.32f || statusPct > 0.39f ||
-		eTempo < 0.89f || eTempo > 1.19f )
-	{
-		emergencyAdj = ( pow( statusPct*statusWeight, 3.0f ) * statusRange);
-	}
-
-	// Smooth things out by factoring our previous adjustment into this one.
-	// It helps make the system 'feel' a little smarter by  giving it at least
-	// one packet worth of history to help work off of:
-
-	emergencyAdj = (emergencyAdj * 0.75f) + (lastEmergencyAdj * 0.25f );
-
-	lastEmergencyAdj = emergencyAdj;
-	lastPct = statusPct;
-
-	// Accumulate a fraction of the tempo change into the tempo itself.
-	// This helps the system run "smarter" to games that run consistently
-	// fast or slow by altering the base tempo to something closer to the
-	// game's active speed.  In tests most games normalize within 2 seconds
-	// at 100ms latency, which is pretty good (larger buffers normalize even
-	// quicker).
-
-	newcee += newcee * (tempoChange+emergencyAdj) * 0.03f;
-
-	// Apply tempoChange as a scale of cTempo.  That way the effect is proportional
-	// to the current tempo.  (otherwise tempos rate of change at the extremes would
-	// be too drastic)
-
-	float newTempo = newcee + ( emergencyAdj * cTempo );
-
-	// ... and as a final optimization, only stretch if the new tempo is outside
-	// a nominal threshold.  Keep this threshold check small, because it could
-	// cause some serious side effects otherwise. (enlarging the cTempo check above
-	// is usually better/safer)
-	if( newTempo < 0.970f || newTempo > 1.045f )
-	{
-		cTempo = (float)newcee;
-
-		if( newTempo < 0.10f ) newTempo = 0.10f;
-		else if( newTempo > 10.0f ) newTempo = 10.0f;
-
-		if( cTempo < 0.15f ) cTempo = 0.15f;
-		else if( cTempo > 7.5f ) cTempo = 7.5f;
-
-		pSoundTouch->setTempo( eTempo = (float)newTempo );
-		ts_stats_stretchblocks++;
-
-		/*ConLog(" * SPU2: [Nominal %d%%] [Emergency: %d%%] (baseTempo: %d%% ) (newTempo: %d%%) (buffer: %d%%)\n",
-			//(relation < 0.0) ? "Normalize" : "",
-			(int)(tempoChange * 100.0 * 0.03),
-			(int)(emergencyAdj * 100.0),
-			(int)(cTempo * 100.0),
-			(int)(newTempo * 100.0),
-			(int)(statusPct * 100.0)
-		);*/
-	}
-	else
-	{
-		// Nominal operation -- turn off stretching.
-		// note: eTempo 'slides' toward 1.0 for smoother audio and better
-		// protection against spikes.
-		if( cTempo != 1.0f )
-		{
-			cTempo = 1.0f;
-			eTempo = ( 1.0f + eTempo ) * 0.5f;
-			pSoundTouch->setTempo( eTempo );
-		}
-		else
-		{
-			if( eTempo != cTempo )
-				pSoundTouch->setTempo( eTempo=cTempo );
-			ts_stats_normalblocks++;
-		}
-	}
+	return true;
 }
 
-void soundtouchInit()
-{
-	pSoundTouch = new soundtouch::SoundTouch();
-	pSoundTouch->setSampleRate(SampleRate);
-    pSoundTouch->setChannels(2);
-
-    pSoundTouch->setSetting( SETTING_USE_QUICKSEEK, 0 );
-    pSoundTouch->setSetting( SETTING_USE_AA_FILTER, 0 );
-
-	pSoundTouch->setSetting( SETTING_SEQUENCE_MS, SoundtouchCfg::SequenceLenMS );
-	pSoundTouch->setSetting( SETTING_SEEKWINDOW_MS, SoundtouchCfg::SeekWindowMS );
-	pSoundTouch->setSetting( SETTING_OVERLAP_MS, SoundtouchCfg::OverlapMS );
-
-	pSoundTouch->setTempo(1);
-
-	// some timestretch management vars:
-
-	cTempo = 1.0;
-	eTempo = 1.0;
-	lastPct = 0;
-	lastEmergencyAdj = 0;
-
-	// just freeze tempo changes for a while at startup.
-	// the driver buffers are bogus anyway.
-	freezeTempo = 8;
-}
-
-static void _sndInitFail()
+void SndBuffer::_InitFail()
 {
 	// If a failure occurs, just initialize the NoSound driver.  This'll allow
 	// the game to emulate properly (hopefully), albeit without sound.
 	OutputModule = FindOutputModuleById( NullOut.GetIdent() );
-	mods[OutputModule]->Init( sndBuffer );
+	mods[OutputModule]->Init();
 }
 
-s32 SndInit()
+void SndBuffer::_WriteSamples(StereoOut32 *bData, int nSamples)	// Appends nSamples entries from bData to the circular buffer, discarding the oldest buffered data on overrun.
+{
+	int free = m_size-m_data;	// unused entries left in the circular buffer
+	m_predictData = 0;			// NOTE(review): presumably clears the timestretch prediction amount -- confirm against its declaration
+
+	jASSUME( m_data <= m_size );
+
+	// Problem:
+	//  If the SPU2 gets out of sync with the SndOut device, the writepos of the
+	//  circular buffer will overtake the readpos, leading to a prolonged period
+	//  of hopscotching read/write accesses (ie, lots of staticy crap sound for
+	//  several seconds).
+	//
+	// Compromise:
+	//  When an overrun occurs, we adapt by discarding a portion of the buffer.
+	//  The older portion of the buffer is discarded rather than incoming data,
+	//  so that the overall audio synchronization is better.
+
+	if( free < nSamples )
+	{
+		// Buffer overrun!
+		// Dump samples from the read portion of the buffer instead of dropping
+		// the newly written stuff.
+
+		s32 comp;	// number of buffered entries to discard
+
+		if( !timeStretchDisabled )
+		{
+			comp = timeStretchOverrun();
+		}
+		else
+		{
+			// Toss half the buffer plus whatever's being written anew:
+			comp = GetAlignedBufferSize( (m_size + nSamples ) / 2 );
+			if( comp > (m_size-SndOutPacketSize) ) comp = m_size-SndOutPacketSize;	// cap so at most (m_size - one packet) entries are tossed
+		}
+
+		m_data -= comp;						// drop 'comp' of the oldest buffered entries...
+		m_rpos = (m_rpos+comp) % m_size;	// ...by advancing the read position past them, with wrap-around
+		if( MsgOverruns() )
+			ConLog(" * SPU2 > Overrun Compensation (%d packets tossed)\n", comp / SndOutPacketSize );
+		lastPct = 0.0;		// normalize the timestretcher
+	}
+
+	// copy in two phases, since there's a chance the packet
+	// wraps around the buffer (it'd be nice to deal in packets only, but
+	// the timestretcher and DSP options require flexibility).
+
+	const int endPos = m_wpos + nSamples;
+	const int secondCopyLen = endPos - m_size;		// entries that spill past the end of the buffer (<= 0 when the write does not wrap)
+	StereoOut32* wposbuffer = &m_buffer[m_wpos];	// destination of the first copy; captured before m_wpos is updated below
+
+	m_data += nSamples;
+	if( secondCopyLen > 0 )
+	{
+		nSamples -= secondCopyLen;		// nSamples now counts only the part that fits before the end of the buffer
+		memcpy( m_buffer, &bData[nSamples], secondCopyLen * sizeof( *bData ) );	// wrapped tail goes to the start of the buffer
+		m_wpos = secondCopyLen;
+	}
+	else
+		m_wpos += nSamples;
+
+	memcpy( wposbuffer, bData, nSamples * sizeof( *bData ) );	// first (or only) phase, written at the old write position
+}
+
+void SndBuffer::Init()
 {
 	if( mods[OutputModule] == NULL )
 	{
-		_sndInitFail();
-		return 0;
+		_InitFail();
+		return;
 	}
 
 	// initialize sound buffer
 	// Buffer actually attempts to run ~50%, so allocate near double what
 	// the requested latency is:
 
+
+	m_rpos = 0;
+	m_wpos = 0;
+	m_data = 0;
+
 	try
 	{
-		sndBuffer = new SndBufferImpl( SndOutLatencyMS * (timeStretchDisabled ? 1.5f : 2.0f ) );
-		sndTempBuffer = new s32[SndOutPacketSize];
-		sndTempBuffer16 = new s16[SndOutPacketSize];
+		const float latencyMS = SndOutLatencyMS * (timeStretchDisabled ? 1.5f : 2.0f );
+		m_size = GetAlignedBufferSize( (int)(latencyMS * SampleRate / 1000.0f ) );
+		m_buffer = new StereoOut32[m_size];
+		m_underrun_freeze = false;
+
+		sndTempBuffer = new StereoOut32[SndOutPacketSize];
+		sndTempBuffer16 = new StereoOut16[SndOutPacketSize];
 	}
 	catch( std::bad_alloc& )
 	{
 		// out of memory exception (most likely)
 
-		SysMessage( "Out of memory error occured while initializing SPU2." );
-		_sndInitFail();
-		return 0;
+		SysMessage( "Out of memory error occurred while initializing SPU2." );
+		_InitFail();
+		return;
 	}
 
 	// clear buffers!
 	// Fixes loopy sounds on emu resets.
-	memset( sndTempBuffer, 0, sizeof(s32) * SndOutPacketSize );
-	memset( sndTempBuffer16, 0, sizeof(s16) * SndOutPacketSize );
+	memset( sndTempBuffer, 0, sizeof(StereoOut32) * SndOutPacketSize );
+	memset( sndTempBuffer16, 0, sizeof(StereoOut16) * SndOutPacketSize );
 
 	sndTempProgress = 0;
 
@@ -608,104 +273,78 @@ s32 SndInit()
 	spdif_set51(mods[OutputModule]->Is51Out());
 
 	// initialize module
-	if( mods[OutputModule]->Init(sndBuffer) == -1 )
-	{
-		_sndInitFail();
-	}
-
-	return 0;
+	if( mods[OutputModule]->Init() == -1 ) _InitFail();
 }
 
-void SndClose()
+void SndBuffer::Cleanup()
 {
 	mods[OutputModule]->Close();
 
-	SAFE_DELETE_OBJ( sndBuffer );
+	SAFE_DELETE_ARRAY( m_buffer );
 	SAFE_DELETE_ARRAY( sndTempBuffer );
 	SAFE_DELETE_ARRAY( sndTempBuffer16 );
-	SAFE_DELETE_OBJ( pSoundTouch );
 }
 
-s32 SndWrite(s32 ValL, s32 ValR)
+int SndBuffer::m_dsp_progress = 0;
+int SndBuffer::m_dsp_writepos = 0;
+
+int SndBuffer::m_timestretch_progress = 0;
+
+void SndBuffer::Write( const StereoOut32& Sample )
 {
 	// Log final output to wavefile.
-	WaveDump::WriteCore( 1, CoreSrc_External, SndScaleVol(ValL), SndScaleVol(ValR) );
+	WaveDump::WriteCore( 1, CoreSrc_External, Sample.DownSample() );
+
+	RecordWrite( Sample.DownSample() );
 
-	RecordWrite(SndScaleVol(ValL),SndScaleVol(ValR));
- 
 	if(mods[OutputModule] == &NullOut) // null output doesn't need buffering or stretching! :p
-		return 0;
- 
-	sndTempBuffer[sndTempProgress++] = ValL;
-	sndTempBuffer[sndTempProgress++] = ValR;
- 
+		return;
+
+	sndTempBuffer[sndTempProgress++] = Sample;
+
 	// If we haven't accumulated a full packet yet, do nothing more:
-	if(sndTempProgress < SndOutPacketSize) return 1;
+	if(sndTempProgress < SndOutPacketSize) return;
+	sndTempProgress = 0;
 
-	if(dspPluginEnabled)
+	if( dspPluginEnabled )
 	{
-		for(int i=0;i<SndOutPacketSize;i++) { sndTempBuffer16[i] = SndScaleVol( sndTempBuffer[i] ); }
+		// Convert in, send to winamp DSP, and convert out.
 
-		// send to winamp DSP
-		sndTempProgress = DspProcess(sndTempBuffer16,sndTempProgress>>1)<<1;
+		for( int i=0; i<SndOutPacketSize; ++i, ++m_dsp_writepos ) { sndTempBuffer16[m_dsp_writepos] = sndTempBuffer[i].DownSample(); }
+		m_dsp_progress += DspProcess( (s16*)sndTempBuffer16, SndOutPacketSize );
 
-		for(int i=0;i<sndTempProgress;i++) { sndTempBuffer[i] = sndTempBuffer16[i]<<SndOutVolumeShift; }
-	}
-
-	static int equalized = 0;
-	if( !timeStretchDisabled )
-	{
-		bool progress = false;
-
-		// data prediction helps keep the tempo adjustments more accurate.
-		// The timestretcher returns packets in belated "clump" form.
-		// Meaning that most of the time we'll get nothing back, and then
-		// suddenly we'll get several chunks back at once.  Thus we use
-		// data prediction to make the timestretcher more responsive.
-
-		sndBuffer->PredictDataWrite( (int)( sndTempProgress / eTempo ) );
-		for(int i=0;i<sndTempProgress;i++) { ((float*)sndTempBuffer)[i] = sndTempBuffer[i]/2147483648.0f; }
-
-		pSoundTouch->putSamples((float*)sndTempBuffer, sndTempProgress>>1);
-
-		while( ( sndTempProgress = pSoundTouch->receiveSamples((float*)sndTempBuffer, sndTempProgress>>1)<<1 ) != 0 )
+		// Drain the DSP output one full packet at a time (ei walks the 16-bit buffer):
+		int ei = 0;
+		while( m_dsp_progress >= SndOutPacketSize )
 		{
-			// [Air] [TODO] : Implement an SSE downsampler to int.
-			for(int i=0;i<sndTempProgress;i++)
-			{
-				sndTempBuffer[i] = (s32)(((float*)sndTempBuffer)[i]*2147483648.0f);
-			}
-			sndBuffer->WriteSamples(sndTempBuffer, sndTempProgress);
-			progress = true;
+			for( int i=0; i<SndOutPacketSize; ++i, ++ei ) { sndTempBuffer[i] = sndTempBuffer16[ei].UpSample(); }
+
+			if( !timeStretchDisabled )
+				timeStretchWrite();
+			else
+				_WriteSamples(sndTempBuffer, SndOutPacketSize);	// sndTempProgress was already reset to 0 above
+
+			m_dsp_progress -= SndOutPacketSize;
 		}
-
-		UpdateTempoChange();
-
-		if( MsgOverruns() )
+		m_dsp_writepos = m_dsp_progress;	// append the next packet after the leftovers; NOTE(review): sndTempBuffer16 holds only SndOutPacketSize entries, so leftovers > 0 can still overrun it
+		// Move any leftovers to the front of the dsp buffer (regions may overlap, hence memmove).
+		if( m_dsp_progress > 0 )
 		{
-			if( progress )
-			{
-				if( ++ts_stats_logcounter > 300 )
-				{
-					ts_stats_logcounter = 0;
-					ConLog( " * SPU2 > Timestretch Stats > %d%% of packets stretched.\n",
-						( ts_stats_stretchblocks * 100 ) / ( ts_stats_normalblocks + ts_stats_stretchblocks ) );
-					ts_stats_normalblocks = 0;
-					ts_stats_stretchblocks = 0;
-				}
-			}
+			memmove( sndTempBuffer16, &sndTempBuffer16[ei],
+				sizeof(sndTempBuffer16[0]) * m_dsp_progress
+			);
 		}
 	}
 	else
 	{
-		sndBuffer->WriteSamples(sndTempBuffer, sndTempProgress);
-		sndTempProgress=0;
+		if( !timeStretchDisabled )
+			timeStretchWrite();
+		else
+			_WriteSamples(sndTempBuffer, SndOutPacketSize);
 	}
-
-	return 1;
 }
 
-s32 SndTest()
+s32 SndBuffer::Test()
 {
 	if( mods[OutputModule] == NULL )
 		return -1;
@@ -713,10 +352,11 @@ s32 SndTest()
 	return mods[OutputModule]->Test();
 }
 
-void SndConfigure(HWND parent, u32 module )
+void SndBuffer::Configure(HWND parent, u32 module )
 {
 	if( mods[module] == NULL )
 		return;
 
 	mods[module]->Configure(parent);
 }
+
diff --git a/plugins/spu2-x/src/SndOut.h b/plugins/spu2-x/src/SndOut.h
index ccb5da1355..6314725256 100644
--- a/plugins/spu2-x/src/SndOut.h
+++ b/plugins/spu2-x/src/SndOut.h
@@ -24,40 +24,310 @@
 // Number of stereo samples per SndOut block.
 // All drivers must work in units of this size when communicating with
 // SndOut.
-static const int SndOutPacketSize = 1024;
+static const int SndOutPacketSize = 512;
 
 // Overall master volume shift.
 // Converts the mixer's 32 bit value into a 16 bit value.
-static const int SndOutVolumeShift = 10;
+static const int SndOutVolumeShift = 13;
 
 // Samplerate of the SPU2. For accurate playback we need to match this
 // exactly.  Trying to scale samplerates and maintain SPU2's Ts timing accuracy
 // is too problematic. :)
 static const int SampleRate = 48000;
 
-extern s32  SndInit();
-extern void SndClose();
-extern s32  SndWrite(s32 ValL, s32 ValR);
-extern s32  SndTest();
-extern void SndConfigure(HWND parent, u32 outmodidx );
-extern bool SndGetStats(u32 *written, u32 *played);
-extern s16  SndScaleVol( s32 inval );
-
 int FindOutputModuleById( const wchar_t* omodid );
 
+struct StereoOut16
+{
+	s16 Left;
+	s16 Right;
+
+	StereoOut16() :
+		Left( 0 ),
+		Right( 0 )
+	{
+	}
+
+	StereoOut16( const StereoOut32& src ) :
+		Left( (s16)src.Left ),
+		Right( (s16)src.Right )
+	{
+	}
+
+	StereoOut16( s16 left, s16 right ) :
+		Left( left ),
+		Right( right )
+	{
+	}
+	
+	StereoOut32 UpSample() const;
+
+	void ResampleFrom( const StereoOut32& src )
+	{
+		// Use StereoOut32's built in conversion
+		*this = src.DownSample();
+	}
+};
+
+struct StereoOutFloat
+{
+	float Left;
+	float Right;
+
+	StereoOutFloat() :
+		Left( 0 ),
+		Right( 0 )
+	{
+	}
+
+	explicit StereoOutFloat( const StereoOut32& src ) :
+		Left( src.Left / 2147483647.0f ),
+		Right( src.Right / 2147483647.0f )
+	{
+	}
+
+	explicit StereoOutFloat( s32 left, s32 right ) :
+		Left( left / 2147483647.0f ),
+		Right( right / 2147483647.0f )
+	{
+	}
+
+	StereoOutFloat( float left, float right ) :
+		Left( left ),
+		Right( right )
+	{
+	}
+};
+
+struct Stereo21Out16
+{
+	s16 Left;
+	s16 Right;
+	s16 LFE;
+
+	void ResampleFrom( const StereoOut32& src )
+	{
+		Left = src.Left >> SndOutVolumeShift;
+		Right = src.Right >> SndOutVolumeShift;
+		LFE = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+	}
+};
+
+struct StereoQuadOut16
+{
+	s16 Left;
+	s16 Right;
+	s16 LeftBack;
+	s16 RightBack;
+
+	void ResampleFrom( const StereoOut32& src )
+	{
+		Left = src.Left >> SndOutVolumeShift;
+		Right = src.Right >> SndOutVolumeShift;
+		LeftBack = src.Left >> SndOutVolumeShift;
+		RightBack = src.Right >> SndOutVolumeShift;
+	}
+};
+
+struct Stereo41Out16
+{
+	s16 Left;
+	s16 Right;
+	s16 LFE;
+	s16 LeftBack;
+	s16 RightBack;
+
+	void ResampleFrom( const StereoOut32& src )
+	{
+		Left = src.Left >> SndOutVolumeShift;
+		Right = src.Right >> SndOutVolumeShift;
+		LFE = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+		LeftBack = src.Left >> SndOutVolumeShift;
+		RightBack = src.Right >> SndOutVolumeShift;
+	}
+};
+
+struct Stereo51Out16
+{
+	s16 Left;
+	s16 Right;
+	s16 Center;
+	s16 LFE;
+	s16 LeftBack;
+	s16 RightBack;
+
+	// Implementation Note: Center and Subwoofer/LFE -->
+	// This method is simple and sounds nice.  It relies on the speaker/soundcard
+	// systems to do their own low pass / crossover.  Manual lowpass is wasted effort
+	// and can't match solid state results anyway.
+
+	void ResampleFrom( const StereoOut32& src )
+	{
+		Left = src.Left >> SndOutVolumeShift;
+		Right = src.Right >> SndOutVolumeShift;
+		Center = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+		LFE = Center;
+		LeftBack = src.Left >> SndOutVolumeShift;
+		RightBack = src.Right >> SndOutVolumeShift;
+	}
+};
+
+struct Stereo71Out16
+{
+	s16 Left;
+	s16 Right;
+	s16 Center;
+	s16 LFE;
+	s16 LeftBack;
+	s16 RightBack;
+	s16 LeftSide;
+	s16 RightSide;
+
+	void ResampleFrom( const StereoOut32& src )
+	{
+		Left = src.Left >> SndOutVolumeShift;
+		Right = src.Right >> SndOutVolumeShift;
+		Center = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+		LFE = Center;
+		LeftBack = src.Left >> SndOutVolumeShift;
+		RightBack = src.Right >> SndOutVolumeShift;
+		
+		LeftSide = src.Left >> (SndOutVolumeShift+1);
+		RightSide = src.Right >> (SndOutVolumeShift+1);
+	}
+};
+
+struct Stereo21Out32
+{
+	s32 Left;
+	s32 Right;
+	s32 LFE;
+};
+
+struct Stereo41Out32
+{
+	s32 Left;
+	s32 Right;
+	s32 LFE;
+	s32 LeftBack;
+	s32 RightBack;
+};
+
+struct Stereo51Out32
+{
+	s32 Left;
+	s32 Right;
+	s32 Center;
+	s32 LFE;
+	s32 LeftBack;
+	s32 RightBack;
+};
+
+// Developer Note: This is a static class only (all static members).
 class SndBuffer
 {
+private:
+	static bool m_underrun_freeze;
+	static s32 m_predictData;
+	static float lastPct;
+
+	static StereoOut32* sndTempBuffer;
+	static StereoOut16* sndTempBuffer16;
+	
+	static int sndTempProgress;
+	static int m_dsp_progress;
+	static int m_dsp_writepos;
+
+	static int m_timestretch_progress;
+	static int m_timestretch_writepos;
+
+	static StereoOut32 *m_buffer;
+	static s32 m_size;
+	static s32 m_rpos;
+	static s32 m_wpos;
+	static s32 m_data;
+
+	static float lastEmergencyAdj;
+	static float cTempo;
+	static float eTempo;
+	static int freezeTempo;
+
+
+	static void _InitFail();
+	static void _WriteSamples(StereoOut32* bData, int nSamples);
+	static bool CheckUnderrunStatus( int& nSamples, int& quietSampleCount );
+
+	static void soundtouchInit();
+	static void soundtouchCleanup();
+	static void timeStretchWrite();
+	static void timeStretchUnderrun();
+	static s32 timeStretchOverrun();
+	
+	static void PredictDataWrite( int samples );
+	static float GetStatusPct();
+	static void UpdateTempoChange();
+	
 public:
-	virtual ~SndBuffer() {}
+	static void Init();
+	static void Cleanup();
+	static void Write( const StereoOut32& Sample );
+	static s32 Test();
+	static void Configure(HWND parent, u32 module );
+	
+	// Note: When using with 32 bit output buffers, the user of this function is responsible
+	// for shifting the values to where they need to be manually.  The fixed point depth of
+	// the sample output is determined by the SndOutVolumeShift, which is the number of bits
+	// to shift right to get a 16 bit result.
+	template< typename T >
+	static void ReadSamples( T* bData )
+	{
+		int nSamples = SndOutPacketSize;
 
-	virtual void WriteSamples(s32 *buffer, int nSamples)=0;
-	virtual void PauseOnWrite(bool doPause)=0;
+		// Problem:
+		//  If the SPU2 gets even the least bit out of sync with the SndOut device,
+		//  the readpos of the circular buffer will overtake the writepos,
+		//  leading to a prolonged period of hopscotching read/write accesses (ie,
+		//  lots of staticy crap sound for several seconds).
+		//
+		// Fix:
+		//  If the read position overtakes the write position, abort the
+		//  transfer immediately and force the SndOut driver to wait until
+		//  the read buffer has filled up again before proceeding.
+		//  This will cause one brief hiccup that can never exceed the user's
+		//  set buffer length in duration.
 
-	virtual void ReadSamples( s16* bData )=0;
-	virtual void ReadSamples( s32* bData )=0;
+		int quietSamples;
+		if( CheckUnderrunStatus( nSamples, quietSamples ) )
+		{
+			jASSUME( nSamples <= SndOutPacketSize );
 
-	//virtual s32  GetBufferUsage()=0;
-	//virtual s32  GetBufferSize()=0;
+			// [Air] [TODO]: This loop is probably a candidate for SSE2 optimization.
+
+			const int endPos = m_rpos + nSamples;
+			const int secondCopyLen = endPos - m_size;
+			const StereoOut32* rposbuffer = &m_buffer[m_rpos];
+
+			m_data -= nSamples;
+
+			if( secondCopyLen > 0 )
+			{
+				nSamples -= secondCopyLen;
+				for( int i=0; i<secondCopyLen; i++ )
+					bData[nSamples+i].ResampleFrom( m_buffer[i] );
+				m_rpos = secondCopyLen;
+			}
+			else
+				m_rpos += nSamples;
+
+			for( int i=0; i<nSamples; i++ )
+				bData[i].ResampleFrom( rposbuffer[i] );
+		}
+
+		// If quietSamples != 0 it means we have an underrun...
+		// Let's just dull out some silence, because that's usually the least
+		// painful way of dealing with underruns:
+		memset( bData, 0, quietSamples * sizeof(T) );
+	}
 };
 
 class SndOutModule
@@ -74,7 +344,7 @@ public:
 	// (for use in configuration screen)
 	virtual const wchar_t* GetLongName() const=0;
 
-	virtual s32  Init(SndBuffer *buffer)=0;
+	virtual s32  Init()=0;
 	virtual void Close()=0;
 	virtual s32  Test() const=0;
 	virtual void Configure(HWND parent)=0;
@@ -87,12 +357,9 @@ public:
 
 
 //internal
-extern SndOutModule *WaveOut;
-extern SndOutModule *DSoundOut;
-extern SndOutModule *FModOut;
-extern SndOutModule *ASIOOut;
-extern SndOutModule *XAudio2Out;
-extern SndOutModule *DSound51Out;
+extern SndOutModule* WaveOut;
+extern SndOutModule* DSoundOut;
+extern SndOutModule* XAudio2Out;
 
 extern SndOutModule* mods[];
 
diff --git a/plugins/spu2-x/src/Spu2.cpp b/plugins/spu2-x/src/Spu2.cpp
index c957445be5..8dfbc7db62 100644
--- a/plugins/spu2-x/src/Spu2.cpp
+++ b/plugins/spu2-x/src/Spu2.cpp
@@ -133,6 +133,13 @@ __inline void __fastcall spu2M_Write( u32 addr, u16 value )
 	spu2M_Write( addr, (s16)value );
 }
 
+V_VolumeLR V_VolumeLR::Max( 0x7FFFFFFF );
+V_VolumeSlideLR V_VolumeSlideLR::Max( 0x3FFF, 0x7FFFFFFF );
+
+V_Core::V_Core()
+{
+}
+
 void V_Core::Reset()
 {
 	memset( this, 0, sizeof(V_Core) );
@@ -141,16 +148,12 @@ void V_Core::Reset()
  
 	Regs.STATX=0;
 	Regs.ATTR=0;
-	ExtL = 0x7FFFFFFF;
-	ExtR = 0x7FFFFFFF;
-	InpL = 0x7FFFFFFF;
-	InpR = 0x7FFFFFFF;
-	FxL  = 0x7FFFFFFF;
-	FxR  = 0x7FFFFFFF;
-	MasterL.Reg_VOL= 0x3FFF;
-	MasterR.Reg_VOL= 0x3FFF;
-	MasterL.Value  = 0x7FFFFFFF;
-	MasterR.Value  = 0x7FFFFFFF;
+	ExtVol = V_VolumeLR::Max;
+	InpVol = V_VolumeLR::Max;
+	FxVol  = V_VolumeLR::Max;
+
+	MasterVol = V_VolumeSlideLR::Max;
+
 	ExtWetR = -1;
 	ExtWetL = -1;
 	ExtDryR = -1;
@@ -176,32 +179,94 @@ void V_Core::Reset()
  
 	for( uint v=0; v<24; ++v )
 	{
-		Voices[v].VolumeL.Reg_VOL = 0x3FFF;
-		Voices[v].VolumeR.Reg_VOL = 0x3FFF;
-
-		Voices[v].VolumeL.Value = 0x7FFFFFFF;
-		Voices[v].VolumeR.Value = 0x7FFFFFFF;
+		Voices[v].Volume = V_VolumeSlideLR::Max;
 		
-		Voices[v].ADSR.Value=0;
-		Voices[v].ADSR.Phase=0;
-		Voices[v].Pitch=0x3FFF;
+		Voices[v].ADSR.Value = 0;
+		Voices[v].ADSR.Phase = 0;
+		Voices[v].Pitch = 0x3FFF;
 		Voices[v].DryL = -1;
 		Voices[v].DryR = -1;
 		Voices[v].WetL = -1;
 		Voices[v].WetR = -1;
-		Voices[v].NextA=2800;
-		Voices[v].StartA=2800;
-		Voices[v].LoopStartA=2800;
+		Voices[v].NextA = 2800;
+		Voices[v].StartA = 2800;
+		Voices[v].LoopStartA = 2800;
 	}
-	DMAICounter=0;
-	AdmaInProgress=0;
+	DMAICounter = 0;
+	AdmaInProgress = 0;
  
-	Regs.STATX=0x80;
- }
+	Regs.STATX = 0x80;
+}
+
+s32 V_Core::EffectsBufferIndexer( s32 offset ) const
+{
+	u32 pos = EffectsStartA + ReverbX + offset;
+
+	// Need to use modulus here, because games can and will drop the buffer size
+	// without notice, and it leads to offsets several times past the end of the buffer.
+
+	if( pos > EffectsEndA )
+	{
+		pos = EffectsStartA + ((ReverbX + offset) % (u32)EffectsBufferSize);
+	}
+	else if( pos < EffectsStartA )
+	{
+		pos = EffectsEndA+1 - ((ReverbX + offset) % (u32)EffectsBufferSize );
+	}
+	return pos;
+} 
+
+void V_Core::UpdateFeedbackBuffersA()
+{
+	RevBuffers.FB_SRC_A0 = EffectsBufferIndexer( Revb.MIX_DEST_A0 - Revb.FB_SRC_A );
+	RevBuffers.FB_SRC_A1 = EffectsBufferIndexer( Revb.MIX_DEST_A1 - Revb.FB_SRC_A );
+}
+
+void V_Core::UpdateFeedbackBuffersB()
+{
+	RevBuffers.FB_SRC_B0 = EffectsBufferIndexer( Revb.MIX_DEST_B0 - Revb.FB_SRC_B );
+	RevBuffers.FB_SRC_B1 = EffectsBufferIndexer( Revb.MIX_DEST_B1 - Revb.FB_SRC_B );
+}
 
 void V_Core::UpdateEffectsBufferSize()
 {
-	EffectsBufferSize = EffectsEndA - EffectsStartA + 1;
+	ReverbX = 0;
+
+	const s32 newbufsize = EffectsEndA - EffectsStartA + 1;
+	if( !RevBuffers.NeedsUpdated && newbufsize ==  EffectsBufferSize ) return;
+	// Nothing to rebuild unless the size changed or a reverb register was written.
+	RevBuffers.NeedsUpdated = false;
+	EffectsBufferSize = newbufsize;	// commit the new size; EffectsBufferIndexer takes its modulus from it
+	if( EffectsBufferSize == 0 ) return;
+
+	// Rebuild buffer indexers.
+
+	RevBuffers.ACC_SRC_A0 = EffectsBufferIndexer( Revb.ACC_SRC_A0 );
+	RevBuffers.ACC_SRC_A1 = EffectsBufferIndexer( Revb.ACC_SRC_A1 );
+	RevBuffers.ACC_SRC_B0 = EffectsBufferIndexer( Revb.ACC_SRC_B0 );
+	RevBuffers.ACC_SRC_B1 = EffectsBufferIndexer( Revb.ACC_SRC_B1 );
+	RevBuffers.ACC_SRC_C0 = EffectsBufferIndexer( Revb.ACC_SRC_C0 );
+	RevBuffers.ACC_SRC_C1 = EffectsBufferIndexer( Revb.ACC_SRC_C1 );
+	RevBuffers.ACC_SRC_D0 = EffectsBufferIndexer( Revb.ACC_SRC_D0 );
+	RevBuffers.ACC_SRC_D1 = EffectsBufferIndexer( Revb.ACC_SRC_D1 );
+
+	UpdateFeedbackBuffersA();
+	UpdateFeedbackBuffersB();
+	
+	RevBuffers.IIR_DEST_A0 = EffectsBufferIndexer( Revb.IIR_DEST_A0 );
+	RevBuffers.IIR_DEST_A1 = EffectsBufferIndexer( Revb.IIR_DEST_A1 );
+	RevBuffers.IIR_DEST_B0 = EffectsBufferIndexer( Revb.IIR_DEST_B0 );
+	RevBuffers.IIR_DEST_B1 = EffectsBufferIndexer( Revb.IIR_DEST_B1 );
+	
+	RevBuffers.IIR_SRC_A0 = EffectsBufferIndexer( Revb.IIR_SRC_A0 );
+	RevBuffers.IIR_SRC_A1 = EffectsBufferIndexer( Revb.IIR_SRC_A1 );
+	RevBuffers.IIR_SRC_B0 = EffectsBufferIndexer( Revb.IIR_SRC_B0 );
+	RevBuffers.IIR_SRC_B1 = EffectsBufferIndexer( Revb.IIR_SRC_B1 );
+	
+	RevBuffers.MIX_DEST_A0 = EffectsBufferIndexer( Revb.MIX_DEST_A0 );
+	RevBuffers.MIX_DEST_A1 = EffectsBufferIndexer( Revb.MIX_DEST_A1 );
+	RevBuffers.MIX_DEST_B0 = EffectsBufferIndexer( Revb.MIX_DEST_B0 );
+	RevBuffers.MIX_DEST_B1 = EffectsBufferIndexer( Revb.MIX_DEST_B1 );
 }
 
 void V_Voice::Start()
@@ -379,6 +444,11 @@ static s32 GetVol32( u16 src )
 	return (((s32)src) << 16 ) | ((src<<1) & 0xffff);
 }
 
+void V_VolumeSlide::RegSet( u16 src )
+{
+	Value = GetVol32( src );
+}
+
 void SPU_ps1_write(u32 mem, u16 value) 
 {
 	bool show=true;
@@ -393,15 +463,15 @@ void SPU_ps1_write(u32 mem, u16 value)
 		switch(vval)
 		{
 			case 0: //VOLL (Volume L)
-				Cores[0].Voices[voice].VolumeL.Mode = 0;
-				Cores[0].Voices[voice].VolumeL.Value = GetVol32( value<<1 );
-				Cores[0].Voices[voice].VolumeL.Reg_VOL = value;
+				Cores[0].Voices[voice].Volume.Left.Mode = 0;
+				Cores[0].Voices[voice].Volume.Left.RegSet( value << 1 );
+				Cores[0].Voices[voice].Volume.Left.Reg_VOL = value;
 			break;
 
 			case 1: //VOLR (Volume R)
-				Cores[0].Voices[voice].VolumeR.Mode = 0;
-				Cores[0].Voices[voice].VolumeR.Value = GetVol32( value<<1 );
-				Cores[0].Voices[voice].VolumeR.Reg_VOL = value;
+				Cores[0].Voices[voice].Volume.Right.Mode = 0;
+				Cores[0].Voices[voice].Volume.Right.RegSet( value << 1 );
+				Cores[0].Voices[voice].Volume.Right.Reg_VOL = value;
 			break;
 			
 			case 2:	Cores[0].Voices[voice].Pitch = value; break;
@@ -437,19 +507,22 @@ void SPU_ps1_write(u32 mem, u16 value)
 	else switch(reg)
 	{
 		case 0x1d80://         Mainvolume left
-			Cores[0].MasterL.Mode = 0;
-			Cores[0].MasterL.Value = GetVol32( value );
-			break;
+			Cores[0].MasterVol.Left.Mode = 0;
+			Cores[0].MasterVol.Left.RegSet( value );
+		break;
+
 		case 0x1d82://         Mainvolume right
-			Cores[0].MasterL.Mode = 0;
-			Cores[0].MasterR.Value = GetVol32( value );
-			break;
+			Cores[0].MasterVol.Right.Mode = 0;
+			Cores[0].MasterVol.Right.RegSet( value );
+		break;
+
 		case 0x1d84://         Reverberation depth left
-			Cores[0].FxL = GetVol32( value );
-			break;
+			Cores[0].FxVol.Left = GetVol32( value );
+		break;
+
 		case 0x1d86://         Reverberation depth right
-			Cores[0].FxR = GetVol32( value );
-			break;
+			Cores[0].FxVol.Right = GetVol32( value );
+		break;
 
 		case 0x1d88://         Voice ON  (0-15)
 			SPU2_FastWrite(REG_S_KON,value);
@@ -463,65 +536,74 @@ void SPU_ps1_write(u32 mem, u16 value)
 			break;
 		case 0x1d8e://         Voice OFF (16-23)
 			SPU2_FastWrite(REG_S_KOFF+2,value);
-			break;
+		break;
 
 		case 0x1d90://         Channel FM (pitch lfo) mode (0-15)
 			SPU2_FastWrite(REG_S_PMON,value);
-			break;
+		break;
+		
 		case 0x1d92://         Channel FM (pitch lfo) mode (16-23)
 			SPU2_FastWrite(REG_S_PMON+2,value);
-			break;
+		break;
 
 
 		case 0x1d94://         Channel Noise mode (0-15)
 			SPU2_FastWrite(REG_S_NON,value);
-			break;
+		break;
+		
 		case 0x1d96://         Channel Noise mode (16-23)
 			SPU2_FastWrite(REG_S_NON+2,value);
-			break;
+		break;
 
 		case 0x1d98://         Channel Reverb mode (0-15)
 			SPU2_FastWrite(REG_S_VMIXEL,value);
 			SPU2_FastWrite(REG_S_VMIXER,value);
-			break;
+		break;
+		
 		case 0x1d9a://         Channel Reverb mode (16-23)
 			SPU2_FastWrite(REG_S_VMIXEL+2,value);
 			SPU2_FastWrite(REG_S_VMIXER+2,value);
-			break;
+		break;
+		
 		case 0x1d9c://         Channel Reverb mode (0-15)
 			SPU2_FastWrite(REG_S_VMIXL,value);
 			SPU2_FastWrite(REG_S_VMIXR,value);
-			break;
+		break;
+		
 		case 0x1d9e://         Channel Reverb mode (16-23)
 			SPU2_FastWrite(REG_S_VMIXL+2,value);
 			SPU2_FastWrite(REG_S_VMIXR+2,value);
-			break;
+		break;
 
 		case 0x1da2://         Reverb work area start
-			{
-				u32 val=(u32)value <<8;
+		{
+			u32 val = (u32)value << 8;
 
-				SPU2_FastWrite(REG_A_ESA,  val&0xFFFF);
-				SPU2_FastWrite(REG_A_ESA+2,val>>16);
-			}
-			break;
+			SPU2_FastWrite(REG_A_ESA,  val&0xFFFF);
+			SPU2_FastWrite(REG_A_ESA+2,val>>16);
+		}
+		break;
+		
 		case 0x1da4:
 			Cores[0].IRQA=(u32)value<<8;
-			break;
+		break;
+
 		case 0x1da6:
 			Cores[0].TSA=(u32)value<<8;
-			break;
+		break;
 
 		case 0x1daa:
 			SPU2_FastWrite(REG_C_ATTR,value);
-			break;
+		break;
+
 		case 0x1dae:
 			SPU2_FastWrite(REG_P_STATX,value);
-			break;
+		break;
+
 		case 0x1da8:// Spu Write to Memory
 			DmaWrite(0,value);
 			show=false;
-			break;
+		break;
 	}
 
 	if(show) FileLog("[%10d] (!) SPU write mem %08x value %04x\n",Cycles,mem,value);
@@ -546,27 +628,31 @@ u16 SPU_ps1_read(u32 mem)
 			case 0: //VOLL (Volume L)
 				//value=Cores[0].Voices[voice].VolumeL.Mode;
 				//value=Cores[0].Voices[voice].VolumeL.Value;
-				value=Cores[0].Voices[voice].VolumeL.Reg_VOL;	break;
+				value = Cores[0].Voices[voice].Volume.Left.Reg_VOL;
+			break;
+			
 			case 1: //VOLR (Volume R)
 				//value=Cores[0].Voices[voice].VolumeR.Mode;
 				//value=Cores[0].Voices[voice].VolumeR.Value;
-				value=Cores[0].Voices[voice].VolumeR.Reg_VOL;	break;
-			case 2:	value=Cores[0].Voices[voice].Pitch;			break;
-			case 3:	value=Cores[0].Voices[voice].StartA;	break;
-			case 4: value=Cores[0].Voices[voice].ADSR.Reg_ADSR1;	break;
-			case 5: value=Cores[0].Voices[voice].ADSR.Reg_ADSR2;	break;
-			case 6:	value=Cores[0].Voices[voice].ADSR.Value >> 16;	break;
-			case 7:	value=Cores[0].Voices[voice].LoopStartA;	break;
+				value = Cores[0].Voices[voice].Volume.Right.Reg_VOL;
+			break;
+			
+			case 2:	value = Cores[0].Voices[voice].Pitch;		break;
+			case 3:	value = Cores[0].Voices[voice].StartA;		break;
+			case 4: value = Cores[0].Voices[voice].ADSR.Reg_ADSR1;	break;
+			case 5: value = Cores[0].Voices[voice].ADSR.Reg_ADSR2;	break;
+			case 6:	value = Cores[0].Voices[voice].ADSR.Value >> 16;	break;
+			case 7:	value = Cores[0].Voices[voice].LoopStartA;	break;
 
 			jNO_DEFAULT;
 		}
 	}
 	else switch(reg)
 	{
-		case 0x1d80: value = Cores[0].MasterL.Value>>16; break;
-		case 0x1d82: value = Cores[0].MasterR.Value>>16; break;
-		case 0x1d84: value = Cores[0].FxL>>16;           break;
-		case 0x1d86: value = Cores[0].FxR>>16;           break;
+		case 0x1d80: value = Cores[0].MasterVol.Left.Value >> 16;  break;
+		case 0x1d82: value = Cores[0].MasterVol.Right.Value >> 16; break;
+		case 0x1d84: value = Cores[0].FxVol.Left >> 16;            break;
+		case 0x1d86: value = Cores[0].FxVol.Right >> 16;           break;
 
 		case 0x1d88: value = 0; break;
 		case 0x1d8a: value = 0; break;
@@ -585,8 +671,11 @@ u16 SPU_ps1_read(u32 mem)
 		case 0x1d9e: value = Cores[0].Regs.VMIXL>>16;     break;
 
 		case 0x1da2:
-			value = Cores[0].EffectsStartA>>3;
-			Cores[0].UpdateEffectsBufferSize();
+			if( value != Cores[0].EffectsStartA>>3 )
+			{
+				value = Cores[0].EffectsStartA>>3;
+				Cores[0].UpdateEffectsBufferSize();
+			}
 		break;
 		case 0x1da4: value = Cores[0].IRQA>>3;            break;
 		case 0x1da6: value = Cores[0].TSA>>3;             break;
@@ -607,15 +696,49 @@ u16 SPU_ps1_read(u32 mem)
 	return value;
 }
 
-static u32 SetLoWord( u32 var, u16 writeval )
+// Ah the joys of endian-specific code! :D
+static __forceinline u32 SetHiWord( u32& src, u16 value )
 {
-	return (var & 0xFFFF0000) | writeval;
+	((u16*)&src)[1] = value;
+	return src;
 }
 
-
-static u32 SetHiWord( u32 var, u16 writeval )
+static __forceinline u32 SetLoWord( u32& src, u16 value )
 {
-	return (var & 0x0000FFFF) | (writeval<<16);
+	((u16*)&src)[0] = value;
+	return src;
+}
+
+static __forceinline s32 SetHiWord( s32& src, u16 value )
+{
+	((u16*)&src)[1] = value;
+	return src;
+}
+
+static __forceinline s32 SetLoWord( s32& src, u16 value )
+{
+	((u16*)&src)[0] = value;
+	return src;
+}
+
+static __forceinline u16 GetHiWord( u32& src )
+{
+	return ((u16*)&src)[1];
+}
+
+static __forceinline u16 GetLoWord( u32& src )
+{
+	return ((u16*)&src)[0];
+}
+
+static __forceinline u16 GetHiWord( s32& src )
+{
+	return ((u16*)&src)[1];
+}
+
+static __forceinline u16 GetLoWord( s32& src )
+{
+	return ((u16*)&src)[0];
 }
 
 __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
@@ -637,7 +760,9 @@ __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
 			case 0: //VOLL (Volume L)
 			case 1: //VOLR (Volume R)
 			{
-				V_Volume& thisvol = (param==0) ? thisvoice.VolumeL : thisvoice.VolumeR;
+				V_VolumeSlide& thisvol = (param==0) ? thisvoice.Volume.Left : thisvoice.Volume.Right;
+				thisvol.Reg_VOL = value;
+
 				if (value & 0x8000)		// +Lin/-Lin/+Exp/-Exp
 				{
 					thisvol.Mode = (value & 0xF000)>>12;
@@ -649,11 +774,10 @@ __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
 					// Volumes range from 0x3fff to 0x7fff, with 0x4000 serving as
 					// the "sign" bit, so a simple bitwise extension will do the trick:
 
-					thisvol.Value = GetVol32( value<<1 );
+					thisvol.RegSet( value<<1 );
 					thisvol.Mode = 0;
 					thisvol.Increment = 0;
 				}
-				thisvol.Reg_VOL = value;
 			}
 			break;
 
@@ -677,8 +801,8 @@ __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
 				ConLog( "* SPU2: Mysterious ADSR Volume Set to 0x%x", value );
 			break;
 			
-			case 6:	thisvoice.VolumeL.Value = GetVol32( value ); break;
-			case 7:	thisvoice.VolumeR.Value = GetVol32( value ); break;
+			case 6:	thisvoice.Volume.Left.RegSet( value ); break;
+			case 7:	thisvoice.Volume.Right.RegSet( value ); break;
 
 			jNO_DEFAULT;
 		}
@@ -727,6 +851,15 @@ __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
 		*(regtable[mem>>1]) = value;
 		UpdateSpdifMode();
 	}
+	else if( mem >= R_FB_SRC_A && mem < REG_A_EEA )
+	{
+		// Signal to the Reverb code that the effects buffers need to be re-aligned.
+		// This is both simple, efficient, and safe, since we only want to re-align
+		// buffers after both hi and lo words have been written.
+
+		*(regtable[mem>>1]) = value;
+		Cores[core].RevBuffers.NeedsUpdated = true;
+	}
 	else
 	{
 		switch(omem)
@@ -783,22 +916,22 @@ __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
 
 			case REG_S_PMON:
 				vx=2; for (vc=1;vc<16;vc++) { Cores[core].Voices[vc].Modulated=(s8)((value & vx)/vx); vx<<=1; }
-				Cores[core].Regs.PMON = SetLoWord( Cores[core].Regs.PMON, value );
+				SetLoWord( Cores[core].Regs.PMON, value );
 			break;
 
 			case (REG_S_PMON + 2):
 				vx=1; for (vc=16;vc<24;vc++) { Cores[core].Voices[vc].Modulated=(s8)((value & vx)/vx); vx<<=1; }
-				Cores[core].Regs.PMON = SetHiWord( Cores[core].Regs.PMON, value );
+				SetHiWord( Cores[core].Regs.PMON, value );
 			break;
 
 			case REG_S_NON:
 				vx=1; for (vc=0;vc<16;vc++) { Cores[core].Voices[vc].Noise=(s8)((value & vx)/vx); vx<<=1; }
-				Cores[core].Regs.NON = SetLoWord( Cores[core].Regs.NON, value );
+				SetLoWord( Cores[core].Regs.NON, value );
 			break;
 
 			case (REG_S_NON + 2):
 				vx=1; for (vc=16;vc<24;vc++) { Cores[core].Voices[vc].Noise=(s8)((value & vx)/vx); vx<<=1; }
-				Cores[core].Regs.NON = SetHiWord( Cores[core].Regs.NON, value );
+				SetHiWord( Cores[core].Regs.NON, value );
 			break;
 
 // Games like to repeatedly write these regs over and over with the same value, hence
@@ -895,26 +1028,23 @@ __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
 
 			// Reverb Start and End Address Writes!
 			//  * Yes, these are backwards from all the volumes -- the hiword comes FIRST (wtf!)
-			//  * End position is a hiword only!  Lowword is always ffff.
+			//  * End position is a hiword only!  Loword is always ffff.
 			//  * The Reverb buffer position resets on writes to StartA.  It probably resets
 			//    on writes to End too.  Docs don't say, but they're for PSX, which couldn't
 			//    change the end address anyway.
 
 			case REG_A_ESA:
-				Cores[core].EffectsStartA = (Cores[core].EffectsStartA & 0x0000FFFF) | (value<<16);
-				Cores[core].ReverbX = 0;
+				SetHiWord( Cores[core].EffectsStartA, value );
 				Cores[core].UpdateEffectsBufferSize();
 			break;
 
 			case (REG_A_ESA + 2):
-				Cores[core].EffectsStartA = (Cores[core].EffectsStartA & 0xFFFF0000) | value;
-				Cores[core].ReverbX = 0;
+				SetLoWord( Cores[core].EffectsStartA, value );
 				Cores[core].UpdateEffectsBufferSize();
 			break;
 
 			case REG_A_EEA:
 				Cores[core].EffectsEndA = ((u32)value<<16) | 0xFFFF;
-				Cores[core].ReverbX = 0;
 				Cores[core].UpdateEffectsBufferSize();
 			break;
 			
@@ -923,7 +1053,7 @@ __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
 			case REG_P_MVOLL:
 			case REG_P_MVOLR:
 			{
-				V_Volume& thisvol = (omem==REG_P_MVOLL) ? Cores[core].MasterL : Cores[core].MasterR;
+				V_VolumeSlide& thisvol = (omem==REG_P_MVOLL) ? Cores[core].MasterVol.Left : Cores[core].MasterVol.Right;
 
 				if( value & 0x8000 )	// +Lin/-Lin/+Exp/-Exp
 				{ 
@@ -945,27 +1075,27 @@ __forceinline void SPU2_FastWrite( u32 rmem, u16 value )
 			break;
 
 			case REG_P_EVOLL:
-				Cores[core].FxL = GetVol32( value );
+				Cores[core].FxVol.Left = GetVol32( value );
 			break;
 
 			case REG_P_EVOLR:
-				Cores[core].FxR = GetVol32( value );
+				Cores[core].FxVol.Right = GetVol32( value );
 			break;
 			
 			case REG_P_AVOLL:
-				Cores[core].ExtL = GetVol32( value );
+				Cores[core].ExtVol.Left = GetVol32( value );
 			break;
 
 			case REG_P_AVOLR:
-				Cores[core].ExtR = GetVol32( value );
+				Cores[core].ExtVol.Right = GetVol32( value );
 			break;
 			
 			case REG_P_BVOLL:
-				Cores[core].InpL = GetVol32( value );
+				Cores[core].InpVol.Left = GetVol32( value );
 			break;
 
 			case REG_P_BVOLR:
-				Cores[core].InpR = GetVol32( value );
+				Cores[core].InpVol.Right = GetVol32( value );
 			break;
 
 			case REG_S_ADMAS:
@@ -1012,7 +1142,7 @@ void StartVoices(int core, u32 value)
 					(thisvc.WetL)?"+":"-",(thisvc.WetR)?"+":"-",
 					*(u8*)GetMemPtr(thisvc.StartA),*(u8 *)GetMemPtr((thisvc.StartA)+1),
 					thisvc.Pitch,
-					thisvc.VolumeL.Value,thisvc.VolumeR.Value,
+					thisvc.Volume.Left.Value,thisvc.Volume.Right.Value,
 					thisvc.ADSR.Reg_ADSR1,thisvc.ADSR.Reg_ADSR2);
 			}
 		}
diff --git a/plugins/spu2-x/src/Spu2.h b/plugins/spu2-x/src/Spu2.h
index 992a8c1297..8b21f6a9ef 100644
--- a/plugins/spu2-x/src/Spu2.h
+++ b/plugins/spu2-x/src/Spu2.h
@@ -182,21 +182,25 @@ extern void DspUpdate(); // to let the Dsp process window messages
 
 extern void RecordStart();
 extern void RecordStop();
-extern void RecordWrite(s16 left, s16 right);
+extern void RecordWrite( const StereoOut16& sample );
 
 extern void UpdateSpdifMode();
 extern void LowPassFilterInit();
 extern void InitADSR();
 extern void CalculateADSR( V_Voice& vc );
 
+extern void __fastcall ReadInput( V_Core& thiscore, StereoOut32& PData );
+
+
 //////////////////////////////
 //    The Mixer Section     //
 //////////////////////////////
 
 extern void Mix();
-extern s32 clamp_mix(s32 x, u8 bitshift=0);
+extern s32 clamp_mix( s32 x, u8 bitshift=0 );
+extern void clamp_mix( StereoOut32& sample, u8 bitshift=0 );
 extern void Reverb_AdvanceBuffer( V_Core& thiscore );
-extern void DoReverb( V_Core& thiscore, s32& OutL, s32& OutR, s32 InL, s32 InR);
+extern StereoOut32 DoReverb( V_Core& thiscore, const StereoOut32& Input );
 extern s32 MulShr32( s32 srcval, s32 mulval );
 
 //#define PCM24_S1_INTERLEAVE
diff --git a/plugins/spu2-x/src/Timestretcher.cpp b/plugins/spu2-x/src/Timestretcher.cpp
new file mode 100644
index 0000000000..e82d46165a
--- /dev/null
+++ b/plugins/spu2-x/src/Timestretcher.cpp
@@ -0,0 +1,333 @@
+/* SPU2-X, A plugin for Emulating the Sound Processing Unit of the Playstation 2
+* Developed and maintained by the Pcsx2 Development Team.
+* 
+* Original portions from SPU2ghz are (c) 2008 by David Quintana [gigaherz]
+*
+* This library is free software; you can redistribute it and/or modify it under
+* the terms of the GNU Lesser General Public License as published by the Free 
+* Software Foundation; either version 2.1 of the License, or (at your
+* option) any later version.
+* 
+* This library is distributed in the hope that it will be useful, but WITHOUT 
+* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+* FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
+* for more details.
+* 
+* You should have received a copy of the GNU Lesser General Public License along
+* with this library; if not, write to the Free Software Foundation, Inc., 59
+* Temple Place, Suite 330, Boston, MA  02111-1307  USA
+* 
+*/
+
+#include "spu2.h"
+#include "SoundTouch/SoundTouch.h"
+#include "SoundTouch/WavFile.h"
+
+
+static soundtouch::SoundTouch* pSoundTouch = NULL;
+static int ts_stats_stretchblocks = 0;
+static int ts_stats_normalblocks = 0;
+static int ts_stats_logcounter = 0;
+
+
+// data prediction amount, used to "commit" data that hasn't
+// finished timestretch processing.
+s32 SndBuffer::m_predictData;
+
+// records last buffer status (range -1.0 to 1.0, with 0.0 being 50% full)
+float SndBuffer::lastPct;
+float SndBuffer::lastEmergencyAdj;
+
+float SndBuffer::cTempo = 1;
+float SndBuffer::eTempo = 1;
+int SndBuffer::freezeTempo = 0;
+
+void SndBuffer::PredictDataWrite( int samples )
+{
+	m_predictData += samples;
+}
+
+// Calculate the buffer status percentage.
+// Returns range from -1.0 to 1.0
+//    1.0 = buffer overflow!
+//    0.0 = buffer nominal (50% full)
+//   -1.0 = buffer underflow!
+float SndBuffer::GetStatusPct()
+{
+	// Get the buffer status of the output driver too, so that we can
+	// obtain a more accurate overall buffer status.
+
+	int drvempty = mods[OutputModule]->GetEmptySampleCount(); // / 2;
+
+	//ConLog( "Data %d >>> driver: %d   predict: %d\n", data, drvempty, predictData );
+
+	float result = (float)(m_data + m_predictData - drvempty) - (m_size/2);
+	result /= (m_size/2);
+	return result;
+}
+
+void SndBuffer::UpdateTempoChange()
+{
+	if( --freezeTempo > 0 )
+	{
+		return;
+	}
+
+	float statusPct = GetStatusPct();
+	float pctChange = statusPct - lastPct;
+
+	float tempoChange;
+	float emergencyAdj = 0;
+	float newcee = cTempo;		// workspace var. for cTempo
+
+	// IMPORTANT!
+	// If you plan to tweak these values, make sure you're using a release build
+	// OUTSIDE THE DEBUGGER to test it!  The Visual Studio debugger can really cause
+	// erratic behavior in the audio buffers, and makes the timestretcher seem a
+	// lot more inconsistent than it really is.
+
+	// We have two factors.
+	//   * Distance from nominal buffer status (50% full)
+	//   * The change from previous update to this update.
+
+	// Prediction based on the buffer change:
+	// (linear seems to work better here)
+
+	tempoChange = pctChange * 0.75f;
+
+	if( statusPct * tempoChange < 0.0f )
+	{
+		// only apply tempo change if it is in synch with the buffer status.
+		// In other words, if the buffer is high (over 0%), and is decreasing,
+		// ignore it.  It'll just muck things up.
+
+		tempoChange = 0;
+	}
+
+	// Sudden spikes in framerate can cause the nominal buffer status
+	// to go critical, in which case we have to enact an emergency
+	// stretch. The following cubic formulas do that.  Values near
+	// the extremities give much larger results than those near 0.
+	// And the value is added only this time, and does not accumulate.
+	// (otherwise a large value like this would cause problems down the road)
+
+	// Constants:
+	// Weight - weights the statusPct's "emergency" consideration.
+	//   higher values here will make the buffer perform more drastic
+	//   compensations at the outer edges of the buffer (at -75 or +75%
+	//   or beyond, for example).
+
+	// Range - scales the adjustment to the given range (more or less).
+	//   The actual range is dependent on the weight used, so if you increase
+	//   Weight you'll usually want to decrease Range somewhat to compensate.
+
+	// Prediction based on the buffer fill status:
+
+	const float statusWeight = 2.99f;
+	const float statusRange = 0.068f;
+
+	// "non-emergency" deadzone:  In this area stretching will be strongly discouraged.
+	// Note: due to the nature of timestretch latency, it's always a wee bit harder to
+	// cope with low fps (underruns) than it is high fps (overruns).  So to help out a
+	// little, the low-end portions of this check are less forgiving than the high-sides.
+
+	if( cTempo < 0.965f || cTempo > 1.060f ||
+		pctChange < -0.38f || pctChange > 0.54f ||
+		statusPct < -0.32f || statusPct > 0.39f ||
+		eTempo < 0.89f || eTempo > 1.19f )
+	{
+		emergencyAdj = ( pow( statusPct*statusWeight, 3.0f ) * statusRange);
+	}
+
+	// Smooth things out by factoring our previous adjustment into this one.
+	// It helps make the system 'feel' a little smarter by  giving it at least
+	// one packet worth of history to help work off of:
+
+	emergencyAdj = (emergencyAdj * 0.75f) + (lastEmergencyAdj * 0.25f );
+
+	lastEmergencyAdj = emergencyAdj;
+	lastPct = statusPct;
+
+	// Accumulate a fraction of the tempo change into the tempo itself.
+	// This helps the system run "smarter" to games that run consistently
+	// fast or slow by altering the base tempo to something closer to the
+	// game's active speed.  In tests most games normalize within 2 seconds
+	// at 100ms latency, which is pretty good (larger buffers normalize even
+	// quicker).
+
+	newcee += newcee * (tempoChange+emergencyAdj) * 0.03f;
+
+	// Apply tempoChange as a scale of cTempo.  That way the effect is proportional
+	// to the current tempo.  (otherwise tempos rate of change at the extremes would
+	// be too drastic)
+
+	float newTempo = newcee + ( emergencyAdj * cTempo );
+
+	// ... and as a final optimization, only stretch if the new tempo is outside
+	// a nominal threshold.  Keep this threshold check small, because it could
+	// cause some serious side effects otherwise. (enlarging the cTempo check above
+	// is usually better/safer)
+	if( newTempo < 0.970f || newTempo > 1.045f )
+	{
+		cTempo = (float)newcee;
+
+		if( newTempo < 0.10f ) newTempo = 0.10f;
+		else if( newTempo > 10.0f ) newTempo = 10.0f;
+
+		if( cTempo < 0.15f ) cTempo = 0.15f;
+		else if( cTempo > 7.5f ) cTempo = 7.5f;
+
+		pSoundTouch->setTempo( eTempo = (float)newTempo );
+		ts_stats_stretchblocks++;
+
+		/*ConLog(" * SPU2: [Nominal %d%%] [Emergency: %d%%] (baseTempo: %d%% ) (newTempo: %d%%) (buffer: %d%%)\n",
+			//(relation < 0.0) ? "Normalize" : "",
+			(int)(tempoChange * 100.0 * 0.03),
+			(int)(emergencyAdj * 100.0),
+			(int)(cTempo * 100.0),
+			(int)(newTempo * 100.0),
+			(int)(statusPct * 100.0)
+		);*/
+	}
+	else
+	{
+		// Nominal operation -- turn off stretching.
+		// note: eTempo 'slides' toward 1.0 for smoother audio and better
+		// protection against spikes.
+		if( cTempo != 1.0f )
+		{
+			cTempo = 1.0f;
+			eTempo = ( 1.0f + eTempo ) * 0.5f;
+			pSoundTouch->setTempo( eTempo );
+		}
+		else
+		{
+			if( eTempo != cTempo )
+				pSoundTouch->setTempo( eTempo=cTempo );
+			ts_stats_normalblocks++;
+		}
+	}
+}
+
+void SndBuffer::timeStretchUnderrun()
+{
+	// timeStretcher failed its job.  We need to slow down the audio some.
+
+	cTempo -= (cTempo * 0.12f);
+	eTempo -= (eTempo * 0.30f);
+	if( eTempo < 0.1f ) eTempo = 0.1f;
+	pSoundTouch->setTempo( eTempo );
+}
+
+s32 SndBuffer::timeStretchOverrun()
+{
+	// If we overran it means the timestretcher failed.  We need to speed
+	// up audio playback.
+	cTempo += cTempo * 0.12f;
+	eTempo += eTempo * 0.40f;
+	if( eTempo > 7.5f ) eTempo = 7.5f;
+	pSoundTouch->setTempo( eTempo );
+
+	// Throw out just a little bit (two packets worth) to help
+	// give the TS some room to work:
+
+	return SndOutPacketSize*2;
+}
+
+static void CvtPacketToFloat( StereoOut32* srcdest )
+{
+	StereoOutFloat* dest = (StereoOutFloat*)srcdest;
+	const StereoOut32* src = (StereoOut32*)srcdest;
+	for( uint i=0; i<SndOutPacketSize; ++i, ++dest, ++src )
+		*dest = (StereoOutFloat)*src;
+}
+
+// Parameter note: Size should always be a multiple of 128, thanks!
+static void CvtPacketToInt( StereoOut32* srcdest, uint size )
+{
+	jASSUME( (size & 127) == 0 );
+	
+	const StereoOutFloat* src = (StereoOutFloat*)srcdest;
+	StereoOut32* dest = srcdest;
+
+	for( uint i=0; i<size; ++i, ++dest, ++src )
+		*dest = (StereoOut32)*src;
+}
+
+void SndBuffer::timeStretchWrite()
+{
+	bool progress = false;
+
+	// data prediction helps keep the tempo adjustments more accurate.
+	// The timestretcher returns packets in belated "clump" form.
+	// Meaning that most of the time we'll get nothing back, and then
+	// suddenly we'll get several chunks back at once.  Thus we use
+	// data prediction to make the timestretcher more responsive.
+
+	PredictDataWrite( (int)( SndOutPacketSize / eTempo ) );
+	CvtPacketToFloat( sndTempBuffer );
+
+	pSoundTouch->putSamples( (float*)sndTempBuffer, SndOutPacketSize );
+
+	int tempProgress;
+	while( tempProgress = pSoundTouch->receiveSamples( (float*)sndTempBuffer, SndOutPacketSize),
+		tempProgress != 0 )
+	{
+		// Hint: It's assumed that pSoundTouch will return chunks of 128 bytes (it always does as
+		// long as the SSE optimizations are enabled), which means we can do our own SSE opts here.
+		
+		CvtPacketToInt( sndTempBuffer, tempProgress );
+		_WriteSamples( sndTempBuffer, tempProgress );
+		progress = true;
+	}
+
+	UpdateTempoChange();
+
+	if( MsgOverruns() )
+	{
+		if( progress )
+		{
+			if( ++ts_stats_logcounter > 300 )
+			{
+				ts_stats_logcounter = 0;
+				ConLog( " * SPU2 > Timestretch Stats > %d%% of packets stretched.\n",
+					( ts_stats_stretchblocks * 100 ) / ( ts_stats_normalblocks + ts_stats_stretchblocks ) );
+				ts_stats_normalblocks = 0;
+				ts_stats_stretchblocks = 0;
+			}
+		}
+	}
+}
+
+void SndBuffer::soundtouchInit()
+{
+	pSoundTouch = new soundtouch::SoundTouch();
+	pSoundTouch->setSampleRate(SampleRate);
+	pSoundTouch->setChannels(2);
+
+	pSoundTouch->setSetting( SETTING_USE_QUICKSEEK, 0 );
+	pSoundTouch->setSetting( SETTING_USE_AA_FILTER, 0 );
+
+	pSoundTouch->setSetting( SETTING_SEQUENCE_MS, SoundtouchCfg::SequenceLenMS );
+	pSoundTouch->setSetting( SETTING_SEEKWINDOW_MS, SoundtouchCfg::SeekWindowMS );
+	pSoundTouch->setSetting( SETTING_OVERLAP_MS, SoundtouchCfg::OverlapMS );
+
+	pSoundTouch->setTempo(1);
+
+	// some timestretch management vars:
+
+	cTempo = 1.0;
+	eTempo = 1.0;
+	lastPct = 0;
+	lastEmergencyAdj = 0;
+
+	// just freeze tempo changes for a while at startup.
+	// the driver buffers are bogus anyway.
+	freezeTempo = 8;
+	m_predictData = 0;
+}
+
+void SndBuffer::soundtouchCleanup()
+{
+	SAFE_DELETE_OBJ( pSoundTouch );
+}
diff --git a/plugins/spu2-x/src/Wavedump_wav.cpp b/plugins/spu2-x/src/Wavedump_wav.cpp
index b3f8d225dd..c157328942 100644
--- a/plugins/spu2-x/src/Wavedump_wav.cpp
+++ b/plugins/spu2-x/src/Wavedump_wav.cpp
@@ -83,14 +83,16 @@ namespace WaveDump
 		}
 	}
 
-	void WriteCore( uint coreidx, CoreSourceType src, s16 left, s16 right )
+	void WriteCore( uint coreidx, CoreSourceType src, const StereoOut16& sample )
 	{
 		if( !IsDevBuild ) return;
 		if( m_CoreWav[coreidx][src] != NULL )
-		{
-			s16 buffer[2] = { left, right };
-			m_CoreWav[coreidx][src]->write( buffer, 2 );
-		}
+			m_CoreWav[coreidx][src]->write( (s16*)&sample, 2 );
+	}
+
+	void WriteCore( uint coreidx, CoreSourceType src, s16 left, s16 right )
+	{
+		WriteCore( coreidx, src, StereoOut16( left, right ) );
 	}
 }
 
@@ -116,10 +118,8 @@ void RecordStop()
 	SAFE_DELETE_OBJ( m_wavrecord );
 }
 
-void RecordWrite(s16 left, s16 right)
+void RecordWrite( const StereoOut16& sample )
 {
 	if( m_wavrecord == NULL ) return;
-
-	s16 buffer[2] = { left, right };
-	m_wavrecord->write( buffer, 2 );
+	m_wavrecord->write( (s16*)&sample, 2 );
 }
diff --git a/plugins/spu2-x/src/Win32/Config.cpp b/plugins/spu2-x/src/Win32/Config.cpp
index 8b92f151e5..5e36fe52d8 100644
--- a/plugins/spu2-x/src/Win32/Config.cpp
+++ b/plugins/spu2-x/src/Win32/Config.cpp
@@ -33,30 +33,32 @@ static const int LATENCY_MIN = 40;
 int AutoDMAPlayRate[2] = {0,0};
 
 // MIXING
-int Interpolation=1;
+int Interpolation = 1;
 /* values:
 		0: no interpolation (use nearest)
 		1. linear interpolation
 		2. cubic interpolation
 */
 
-bool EffectsDisabled=false;
+bool EffectsDisabled = false;
 
 // OUTPUT
-int SndOutLatencyMS=160;
-bool timeStretchDisabled=false;
+int SndOutLatencyMS = 160;
+bool timeStretchDisabled = false;
 
-u32 OutputModule=0; //OUTPUT_DSOUND;
+u32 OutputModule = 0;
 
 CONFIG_DSOUNDOUT Config_DSoundOut;
 CONFIG_WAVEOUT Config_WaveOut;
 CONFIG_XAUDIO2 Config_XAudio2;
 
 // DSP
-bool dspPluginEnabled=false;
-int  dspPluginModule=0;
+bool dspPluginEnabled = false;
+int  dspPluginModule = 0;
 wchar_t dspPlugin[256];
 
+bool StereoExpansionDisabled = true;
+
 /*****************************************************************************/
 
 void ReadSettings()
@@ -69,7 +71,8 @@ void ReadSettings()
 	timeStretchDisabled = CfgReadBool( _T("OUTPUT"), _T("Disable_Timestretch"), false );
 	EffectsDisabled = CfgReadBool( _T("MIXING"), _T("Disable_Effects"), false );
 
-	SndOutLatencyMS=CfgReadInt(_T("OUTPUT"),_T("Latency"), 160);
+	StereoExpansionDisabled = CfgReadBool( _T("OUTPUT"), _T("Disable_StereoExpansion"), false );
+	SndOutLatencyMS = CfgReadInt(_T("OUTPUT"),_T("Latency"), 160);
 
 	wchar_t omodid[128];
 	CfgReadStr( _T("OUTPUT"), _T("Output_Module"), omodid, 127, XAudio2Out->GetIdent() );
@@ -118,9 +121,10 @@ void WriteSettings()
 
 	CfgWriteBool(_T("MIXING"),_T("Disable_Effects"),EffectsDisabled);
 
-	CfgWriteStr(_T("OUTPUT"),_T("Output_Module"),mods[OutputModule]->GetIdent() );
-	CfgWriteInt(_T("OUTPUT"),_T("Latency"),SndOutLatencyMS);
-	CfgWriteBool(_T("OUTPUT"),_T("Disable_Timestretch"),timeStretchDisabled);
+	CfgWriteStr(_T("OUTPUT"),_T("Output_Module"), mods[OutputModule]->GetIdent() );
+	CfgWriteInt(_T("OUTPUT"),_T("Latency"), SndOutLatencyMS);
+	CfgWriteBool(_T("OUTPUT"),_T("Disable_Timestretch"), timeStretchDisabled);
+	CfgWriteBool(_T("OUTPUT"),_T("Disable_StereoExpansion"), StereoExpansionDisabled);
 
 	if( Config_DSoundOut.Device.empty() ) Config_DSoundOut.Device = _T("default");
 	if( Config_WaveOut.Device.empty() ) Config_WaveOut.Device = _T("default");
@@ -181,6 +185,7 @@ BOOL CALLBACK ConfigProc(HWND hWnd,UINT uMsg,WPARAM wParam,LPARAM lParam)
 			EnableWindow( GetDlgItem( hWnd, IDC_OPEN_CONFIG_DEBUG ), DebugEnabled );
 			
 			SET_CHECK(IDC_EFFECTS_DISABLE,	EffectsDisabled);
+			SET_CHECK(IDC_EXPANSION_DISABLE,StereoExpansionDisabled);
 			SET_CHECK(IDC_TS_DISABLE,		timeStretchDisabled);
 			SET_CHECK(IDC_DEBUG_ENABLE,		DebugEnabled);
 			SET_CHECK(IDC_DSP_ENABLE,		dspPluginEnabled);
@@ -212,7 +217,7 @@ BOOL CALLBACK ConfigProc(HWND hWnd,UINT uMsg,WPARAM wParam,LPARAM lParam)
 				break;
 
 				case IDC_OUTCONF:
-					SndConfigure( hWnd,
+					SndBuffer::Configure( hWnd,
 						(int)SendMessage(GetDlgItem(hWnd,IDC_OUTPUT),CB_GETCURSEL,0,0)
 					);
 				break;
@@ -234,6 +239,7 @@ BOOL CALLBACK ConfigProc(HWND hWnd,UINT uMsg,WPARAM wParam,LPARAM lParam)
 
 				HANDLE_CHECK(IDC_EFFECTS_DISABLE,EffectsDisabled);
 				HANDLE_CHECK(IDC_DSP_ENABLE,dspPluginEnabled);
+				HANDLE_CHECK(IDC_EXPANSION_DISABLE,StereoExpansionDisabled);
 				HANDLE_CHECKNB(IDC_TS_DISABLE,timeStretchDisabled);
 					EnableWindow( GetDlgItem( hWnd, IDC_OPEN_CONFIG_SOUNDTOUCH ), !timeStretchDisabled );
 				break;
diff --git a/plugins/spu2-x/src/Win32/Config.h b/plugins/spu2-x/src/Win32/Config.h
index a662bc1880..57c6ad734e 100644
--- a/plugins/spu2-x/src/Win32/Config.h
+++ b/plugins/spu2-x/src/Win32/Config.h
@@ -82,6 +82,7 @@ extern int  dspPluginModule;
 
 extern bool	dspPluginEnabled;
 extern bool timeStretchDisabled;
+extern bool StereoExpansionDisabled;
 
 class SoundtouchCfg
 {
@@ -120,12 +121,9 @@ struct CONFIG_XAUDIO2
 	std::wstring Device;
 	s8 NumBuffers;
 
-	bool ExpandTo51;
-
 	CONFIG_XAUDIO2() :
 		Device(),
-		NumBuffers( 2 ),
-		ExpandTo51( true )
+		NumBuffers( 2 )
 	{
 	}
 };
diff --git a/plugins/spu2-x/src/Win32/RealtimeDebugger.cpp b/plugins/spu2-x/src/Win32/RealtimeDebugger.cpp
index c2fdda1a47..d3243bd714 100644
--- a/plugins/spu2-x/src/Win32/RealtimeDebugger.cpp
+++ b/plugins/spu2-x/src/Win32/RealtimeDebugger.cpp
@@ -144,8 +144,8 @@ void UpdateDebugDialog()
 
 				SetDCBrushColor  (hdc,RGB(  0,255,  0));
 
-				int vl = abs(((vc.VolumeL.Value >> 16) * 24) >> 15);
-				int vr = abs(((vc.VolumeR.Value >> 16) * 24) >> 15);
+				int vl = abs(((vc.Volume.Left.Value >> 16) * 24) >> 15);
+				int vr = abs(((vc.Volume.Right.Value >> 16) * 24) >> 15);
 
 				FillRectangle(hdc,IX+38,IY+26 - vl, 4, vl);
 				FillRectangle(hdc,IX+42,IY+26 - vr, 4, vr);
diff --git a/plugins/spu2-x/src/Win32/SndOut_DSound.cpp b/plugins/spu2-x/src/Win32/SndOut_DSound.cpp
index 4c55c053d2..86c72f2320 100644
--- a/plugins/spu2-x/src/Win32/SndOut_DSound.cpp
+++ b/plugins/spu2-x/src/Win32/SndOut_DSound.cpp
@@ -23,6 +23,7 @@
 #include "spu2.h"
 #include "dialogs.h"
 
+#define DIRECTSOUND_VERSION 0x1000
 #include <dsound.h>
 
 static ds_device_data devices[32];
@@ -37,7 +38,6 @@ private:
 
 	static const int PacketsPerBuffer = 1;
 	static const int BufferSize = SndOutPacketSize * PacketsPerBuffer;
-	static const int BufferSizeBytes = BufferSize << 1;
 
 
 	u32 numBuffers;		// cached copy of our configuration setting.
@@ -57,25 +57,26 @@ private:
 
 	HANDLE waitEvent;
 
-	SndBuffer *buff;
-
-	static DWORD CALLBACK RThread(DSound*obj)
+	template< typename T >
+	static DWORD CALLBACK RThread( DSound* obj )
 	{
-		return obj->Thread();
+		return obj->Thread<T>();
 	}
 
+	template< typename T >
 	DWORD CALLBACK Thread()
 	{
+		static const int BufferSizeBytes = BufferSize * sizeof( T );
 
 		while( dsound_running )
 		{
 			u32 rv = WaitForMultipleObjects(numBuffers,buffer_events,FALSE,200);
 	 
-			s16* p1, *oldp1;
+			T* p1, *oldp1;
 			LPVOID p2;
 			DWORD s1,s2;
 	 
-			u32 poffset=BufferSizeBytes * rv;
+			u32 poffset = BufferSizeBytes * rv;
 
 			if( FAILED(buffer->Lock(poffset,BufferSizeBytes,(LPVOID*)&p1,&s1,&p2,&s2,0) ) )
 			{
@@ -86,9 +87,9 @@ private:
 			oldp1 = p1;
 
 			for(int p=0; p<PacketsPerBuffer; p++, p1+=SndOutPacketSize )
-				buff->ReadSamples( p1 );
+				SndBuffer::ReadSamples( p1 );
 
-			buffer->Unlock(oldp1,s1,p2,s2);
+			buffer->Unlock( oldp1, s1, p2, s2 );
 
 			// Set the write pointer to the beginning of the next block.
 			myLastWrite = (poffset + BufferSizeBytes) & ~BufferSizeBytes;
@@ -97,9 +98,8 @@ private:
 	}
 
 public:
-	s32 Init(SndBuffer *sb)
+	s32 Init()
 	{
-		buff = sb;
 		numBuffers = Config_DSoundOut.NumBuffers;
 
 		//
@@ -130,37 +130,46 @@ public:
 		if( FAILED(dsound->SetCooperativeLevel(GetDesktopWindow(),DSSCL_PRIORITY)) )
 			throw std::runtime_error( "DirectSound Error: Cooperative level could not be set." );
 		
+		// Determine the user's speaker configuration, and select an expansion option as needed.
+		// FAIL : Directsound doesn't appear to support audio expansion >_<
+		
+		DWORD speakerConfig = 2;
+		//dsound->GetSpeakerConfig( &speakerConfig );
+
 		IDirectSoundBuffer* buffer_;
  		DSBUFFERDESC desc; 
 	 
 		// Set up WAV format structure. 
 	 
 		memset(&wfx, 0, sizeof(WAVEFORMATEX)); 
-		wfx.wFormatTag = WAVE_FORMAT_PCM;
-		wfx.nSamplesPerSec = SampleRate;
-		wfx.nChannels=2;
-		wfx.wBitsPerSample = 16;
-		wfx.nBlockAlign = 2*2;
-		wfx.nAvgBytesPerSec = SampleRate * wfx.nBlockAlign;
-		wfx.cbSize=0;
+		wfx.wFormatTag		= WAVE_FORMAT_PCM;
+		wfx.nSamplesPerSec	= SampleRate;
+		wfx.nChannels		= speakerConfig;
+		wfx.wBitsPerSample	= 16;
+		wfx.nBlockAlign		= 2*speakerConfig;
+		wfx.nAvgBytesPerSec	= SampleRate * wfx.nBlockAlign;
+		wfx.cbSize			= 0;
+
+		uint BufferSizeBytes = BufferSize * wfx.nBlockAlign;
 	 
 		// Set up DSBUFFERDESC structure. 
 	 
 		memset(&desc, 0, sizeof(DSBUFFERDESC)); 
 		desc.dwSize = sizeof(DSBUFFERDESC); 
 		desc.dwFlags = DSBCAPS_GETCURRENTPOSITION2 | DSBCAPS_CTRLPOSITIONNOTIFY;// _CTRLPAN | DSBCAPS_CTRLVOLUME | DSBCAPS_CTRLFREQUENCY; 
-		desc.dwBufferBytes = BufferSizeBytes * numBuffers; 
-		desc.lpwfxFormat = &wfx; 
+		desc.dwBufferBytes = BufferSizeBytes * numBuffers;
+		desc.lpwfxFormat = &wfx;
 	 
 		desc.dwFlags |= DSBCAPS_LOCSOFTWARE;
 		desc.dwFlags |= DSBCAPS_GLOBALFOCUS;
 	 
-		if( FAILED(dsound->CreateSoundBuffer(&desc,&buffer_,0) ) ||
-			FAILED(buffer_->QueryInterface(IID_IDirectSoundBuffer8,(void**)&buffer)) )
+		if( FAILED(dsound->CreateSoundBuffer(&desc,&buffer_,0) ) )
+			throw std::runtime_error( "DirectSound Error: Interface could not be queried." );
+		
+		if(	FAILED(buffer_->QueryInterface(IID_IDirectSoundBuffer8,(void**)&buffer)) )
 			throw std::runtime_error( "DirectSound Error: Interface could not be queried." );
 
 		buffer_->Release();
-	 
 		verifyc( buffer->QueryInterface(IID_IDirectSoundNotify8,(void**)&buffer_notify) );
 
 		DSBPOSITIONNOTIFY not[MAX_BUFFER_COUNT];
@@ -171,9 +180,9 @@ public:
 			// it was needed for some quirky driver?  Theoretically we want the notification as soon
 			// as possible after the buffer has finished playing.
 
-			buffer_events[i]=CreateEvent(NULL,FALSE,FALSE,NULL);
-			not[i].dwOffset=(wfx.nBlockAlign*2 + BufferSizeBytes*(i+1))%desc.dwBufferBytes;
-			not[i].hEventNotify=buffer_events[i];
+			buffer_events[i] = CreateEvent(NULL,FALSE,FALSE,NULL);
+			not[i].dwOffset = (wfx.nBlockAlign + BufferSizeBytes*(i+1)) % desc.dwBufferBytes;
+			not[i].hEventNotify = buffer_events[i];
 		}
 	 
 		buffer_notify->SetNotificationPositions(numBuffers,not);
@@ -191,9 +200,9 @@ public:
 
 		// Start Thread
 		myLastWrite = 0;
-		dsound_running=true;
-		thread=CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)RThread,this,0,&tid);
-		SetThreadPriority(thread,THREAD_PRIORITY_TIME_CRITICAL);
+		dsound_running = true;
+		thread = CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)RThread<StereoOut16>,this,0,&tid);
+		SetThreadPriority(thread,THREAD_PRIORITY_ABOVE_NORMAL);
 
 		return 0;
 	}
diff --git a/plugins/spu2-x/src/Win32/SndOut_XAudio2.cpp b/plugins/spu2-x/src/Win32/SndOut_XAudio2.cpp
index 8f2f2526cf..bb3759be32 100644
--- a/plugins/spu2-x/src/Win32/SndOut_XAudio2.cpp
+++ b/plugins/spu2-x/src/Win32/SndOut_XAudio2.cpp
@@ -38,7 +38,6 @@ private:
 	class BaseStreamingVoice : public IXAudio2VoiceCallback
 	{
 	protected:
-		SndBuffer* m_sndout;
 		IXAudio2SourceVoice* pSourceVoice;
 		s16* qbuffer;
 
@@ -69,11 +68,10 @@ private:
 			DeleteCriticalSection( &cs );
 		}
 
-		BaseStreamingVoice( SndBuffer* sb, uint numChannels ) :
-			m_sndout( sb ),
+		BaseStreamingVoice( uint numChannels ) :
 			m_nBuffers( Config_XAudio2.NumBuffers ),
 			m_nChannels( numChannels ),
-			m_BufferSize( SndOutPacketSize/2 * m_nChannels * PacketsPerBuffer ),
+			m_BufferSize( SndOutPacketSize * m_nChannels * PacketsPerBuffer ),
 			m_BufferSizeBytes( m_BufferSize * sizeof(s16) )
 		{
 		}
@@ -133,18 +131,25 @@ private:
 			LeaveCriticalSection( &cs );
 		}
 
+		STDMETHOD_(void, OnVoiceProcessingPassStart) () {}
+		STDMETHOD_(void, OnVoiceProcessingPassStart) (UINT32) { };
+		STDMETHOD_(void, OnVoiceProcessingPassEnd) () {}
+		STDMETHOD_(void, OnStreamEnd) () {}
+		STDMETHOD_(void, OnBufferStart) ( void* ) {}
+		STDMETHOD_(void, OnLoopEnd) ( void* ) {}   
+		STDMETHOD_(void, OnVoiceError) (THIS_ void* pBufferContext, HRESULT Error) { };
 	};
 	
-	
-	class StreamingVoice_Stereo : public BaseStreamingVoice
+	template< typename T >
+	class StreamingVoice : public BaseStreamingVoice
 	{
 	public:
-		StreamingVoice_Stereo( SndBuffer* sb, IXAudio2* pXAudio2 ) :
-			BaseStreamingVoice( sb, 2 )
+		StreamingVoice( IXAudio2* pXAudio2 ) :
+			BaseStreamingVoice( sizeof(T) / sizeof( s16 ) )
 		{
 		}
 		
-		virtual ~StreamingVoice_Stereo() {}
+		virtual ~StreamingVoice() {}
 
 		void Init( IXAudio2* pXAudio2 )
 		{
@@ -152,11 +157,6 @@ private:
 		}
 
 	protected:
-		STDMETHOD_(void, OnVoiceProcessingPassStart) () {}
-		STDMETHOD_(void, OnVoiceProcessingPassStart) (UINT32) { };
-		STDMETHOD_(void, OnVoiceProcessingPassEnd) () {}
-		STDMETHOD_(void, OnStreamEnd) () {}
-		STDMETHOD_(void, OnBufferStart) ( void* ) {}
 		STDMETHOD_(void, OnBufferEnd) ( void* context )
 		{
 			EnterCriticalSection( &cs );
@@ -164,10 +164,10 @@ private:
 			// All of these checks are necessary because XAudio2 is wonky shizat.
 			if( pSourceVoice == NULL || context == NULL ) return;
 
-			s16* qb = (s16*)context;
+			T* qb = (T*)context;
 
 			for(int p=0; p<PacketsPerBuffer; p++, qb+=SndOutPacketSize )
-				m_sndout->ReadSamples( qb );
+				SndBuffer::ReadSamples( qb );
 
 			XAUDIO2_BUFFER buf = {0};
 			buf.AudioBytes	= m_BufferSizeBytes;
@@ -177,83 +177,6 @@ private:
 			pSourceVoice->SubmitSourceBuffer( &buf );
 			LeaveCriticalSection( &cs );
 		}
-		STDMETHOD_(void, OnLoopEnd) ( void* ) {}   
-		STDMETHOD_(void, OnVoiceError) (THIS_ void* pBufferContext, HRESULT Error) { };
-
-	};
-
-	class StreamingVoice_Surround51 : public BaseStreamingVoice
-	{
-	public:
-		//LPF_data m_lpf_left;
-		//LPF_data m_lpf_right;
-		
-		s32 buffer[2 * SndOutPacketSize * PacketsPerBuffer];
-
-		StreamingVoice_Surround51( SndBuffer* sb, IXAudio2* pXAudio2 ) :
-			BaseStreamingVoice( sb, 6 )
-			//m_lpf_left( Config_XAudio2.LowpassLFE, SampleRate ),
-			//m_lpf_right( Config_XAudio2.LowpassLFE, SampleRate )
-		{
-		}
-
-		virtual ~StreamingVoice_Surround51() {}
-
-		void Init( IXAudio2* pXAudio2 )
-		{
-			_init( pXAudio2, SPEAKER_5POINT1 );
-		}
-		
-	protected:
-		STDMETHOD_(void, OnVoiceProcessingPassStart) () {}
-		STDMETHOD_(void, OnVoiceProcessingPassStart) (UINT32) { };
-		STDMETHOD_(void, OnVoiceProcessingPassEnd) () {}
-		STDMETHOD_(void, OnStreamEnd) () {}
-		STDMETHOD_(void, OnBufferStart) ( void* ) {}
-		STDMETHOD_(void, OnBufferEnd) ( void* context )
-		{
-			EnterCriticalSection( &cs );
-
-			// All of these checks are necessary because XAudio2 is wonky shizat.
-			if( pSourceVoice == NULL || context == NULL ) return;
-
-			s16* qb = (s16*)context;
-
-			for(int p=0; p<PacketsPerBuffer; p++ )
-			{
-				m_sndout->ReadSamples( buffer );
-				const s32* src = buffer;
-
-				for( int i=0; i<SndOutPacketSize/2; i++, qb+=6, src+=2 )
-				{
-					// Left and right Front!
-					qb[0] = SndScaleVol( src[0] );
-					qb[1] = SndScaleVol( src[1] );
-					
-					// Center and Subwoofer/LFE -->
-					// This method is simple and sounds nice.  It relies on the speaker/soundcard
-					// systems do to their own low pass / crossover.  Manual lowpass is wasted effort
-					// and can't match solid state results anyway.
-					
-					qb[2] = qb[3] = (src[0] + src[1]) >> (SndOutVolumeShift+1);
-					
-					// Left and right rear!
-					qb[4] = SndScaleVol( src[0] );
-					qb[5] = SndScaleVol( src[1] );
-				}
-
-			}
-
-			XAUDIO2_BUFFER buf = { 0 };
-			buf.AudioBytes = m_BufferSizeBytes;
-			buf.pAudioData = (BYTE*)context;
-			buf.pContext = context;
-
-			pSourceVoice->SubmitSourceBuffer( &buf );
-			LeaveCriticalSection( &cs );
-		}
-		STDMETHOD_(void, OnLoopEnd) ( void* ) {}   
-		STDMETHOD_(void, OnVoiceError) (THIS_ void* pBufferContext, HRESULT Error) { };
 
 	};
 
@@ -263,7 +186,7 @@ private:
 
 public:
 
-	s32 Init( SndBuffer *sb )
+	s32 Init()
 	{
 		HRESULT hr;
 
@@ -273,9 +196,8 @@ public:
 		CoInitializeEx( NULL, COINIT_MULTITHREADED );
 
 		UINT32 flags = 0;
-#ifdef _DEBUG
-		flags |= XAUDIO2_DEBUG_ENGINE;
-#endif
+		if( IsDebugBuild )
+			flags |= XAUDIO2_DEBUG_ENGINE;
 
 		if ( FAILED(hr = XAudio2Create( &pXAudio2, flags ) ) )
 		{
@@ -298,18 +220,47 @@ public:
 			return -1;
 		}
 
-		if( Config_XAudio2.ExpandTo51 && deviceDetails.OutputFormat.Format.nChannels >= 6 )
-		{
-			ConLog( "* SPU2 > 5.1 speaker expansion enabled." );
-			voiceContext = new StreamingVoice_Surround51( sb, pXAudio2 );
-		}
-		else
-		{
-			voiceContext = new StreamingVoice_Stereo( sb, pXAudio2 );
-		}
+		if( StereoExpansionDisabled )
+			deviceDetails.OutputFormat.Format.nChannels	= 2;
 
+		// Any windows driver should support stereo at the software level, I should think!
+		jASSUME( deviceDetails.OutputFormat.Format.nChannels > 1 );
+
+		switch( deviceDetails.OutputFormat.Format.nChannels )
+		{
+			case 2:
+				ConLog( "* SPU2 > Using normal 2 speaker stereo output." );
+				voiceContext = new StreamingVoice<StereoOut16>( pXAudio2 );
+			break;
+
+			case 3:
+				ConLog( "* SPU2 > 2.1 speaker expansion enabled." );
+				voiceContext = new StreamingVoice<Stereo21Out16>( pXAudio2 );
+			break;
+
+			case 4:
+				ConLog( "* SPU2 > 4 speaker expansion enabled [quadraphenia]" );
+				voiceContext = new StreamingVoice<StereoQuadOut16>( pXAudio2 );
+			break;
+						
+			case 5:
+				ConLog( "* SPU2 > 4.1 speaker expansion enabled." );
+				voiceContext = new StreamingVoice<Stereo41Out16>( pXAudio2 );
+			break;
+
+			case 6:
+			case 7:
+				ConLog( "* SPU2 > 5.1 speaker expansion enabled." );
+				voiceContext = new StreamingVoice<Stereo51Out16>( pXAudio2 );
+			break;
+
+			default:	// anything 8 or more gets the 7.1 treatment!
+				ConLog( "* SPU2 > 7.1 speaker expansion enabled." );
+				voiceContext = new StreamingVoice<Stereo51Out16>( pXAudio2 );
+			break;
+		}
+		
 		voiceContext->Init( pXAudio2 );
-
 		return 0;
 	}
 
diff --git a/plugins/spu2-x/src/Win32/SndOut_waveOut.cpp b/plugins/spu2-x/src/Win32/SndOut_waveOut.cpp
index 9787a4c9ce..42e9c03c61 100644
--- a/plugins/spu2-x/src/Win32/SndOut_waveOut.cpp
+++ b/plugins/spu2-x/src/Win32/SndOut_waveOut.cpp
@@ -31,14 +31,13 @@ private:
 
 	static const int PacketsPerBuffer = (1024 / SndOutPacketSize);
 	static const int BufferSize = SndOutPacketSize*PacketsPerBuffer;
-	static const int BufferSizeBytes = BufferSize << 1;
 
 	u32 numBuffers;
 	HWAVEOUT hwodevice;
 	WAVEFORMATEX wformat;
 	WAVEHDR whbuffer[MAX_BUFFER_COUNT];
 
-	s16* qbuffer;
+	StereoOut16* qbuffer;
 
 	#define QBUFFER(x) (qbuffer + BufferSize * (x))
 
@@ -46,17 +45,13 @@ private:
 	HANDLE thread;
 	DWORD tid;
 
-	SndBuffer *buff;
-
 	wchar_t ErrText[256];
 
-	static DWORD CALLBACK RThread(WaveOutModule*obj)
-	{
-		return obj->Thread();
-	}
-
+	template< typename T >
 	DWORD CALLBACK Thread()
 	{
+		static const int BufferSizeBytes = BufferSize * sizeof( T );
+
 		while( waveout_running )
 		{
 			bool didsomething = false;
@@ -64,16 +59,16 @@ private:
 			{
 				if(!(whbuffer[i].dwFlags & WHDR_DONE) ) continue;
 
-				WAVEHDR *buf=whbuffer+i;
+				WAVEHDR *buf = whbuffer+i;
 
 				buf->dwBytesRecorded = buf->dwBufferLength;
 
-				s16 *t = (s16*)buf->lpData;
+				T* t = (T*)buf->lpData;
 				for(int p=0; p<PacketsPerBuffer; p++, t+=SndOutPacketSize )
-					buff->ReadSamples( t );
+					SndBuffer::ReadSamples( t );
 
-				whbuffer[i].dwFlags&=~WHDR_DONE;
-				waveOutWrite(hwodevice,buf,sizeof(WAVEHDR));
+				whbuffer[i].dwFlags &= ~WHDR_DONE;
+				waveOutWrite( hwodevice, buf, sizeof(WAVEHDR) );
 				didsomething = true;
 			}
 
@@ -85,25 +80,71 @@ private:
 		return 0;
 	}
 
-public:
-	s32 Init(SndBuffer *sb)
+	template< typename T >		// T is forwarded unchanged to Thread<T>()
+	static DWORD CALLBACK RThread(WaveOutModule*obj)		// static entry point; Init() hands RThread<StereoOut16> to CreateThread with `this` as the argument
+	{
+		return obj->Thread<T>();		// run the member thread loop on the supplied module and return its exit code
+	}
+
+public:
+	s32 Init()
 	{
-		buff = sb;
 		numBuffers = Config_WaveOut.NumBuffers;
 
 		MMRESULT woores;
 
 		if (Test()) return -1;
 
-		wformat.wFormatTag=WAVE_FORMAT_PCM;
-		wformat.nSamplesPerSec=SampleRate;
-		wformat.wBitsPerSample=16;
-		wformat.nChannels=2;
-		wformat.nBlockAlign=((wformat.wBitsPerSample * wformat.nChannels) / 8);
-		wformat.nAvgBytesPerSec=(wformat.nSamplesPerSec * wformat.nBlockAlign);
-		wformat.cbSize=0;
+		// TODO : Use dsound to determine the speaker configuration, and expand audio from there.
+
+		#if 0
+		int speakerConfig;
+
+		if( StereoExpansionDisabled )
+			speakerConfig = 2;
+
+		// Any windows driver should support stereo at the software level, I should think!
+		jASSUME( speakerConfig > 1 );
+		LPTHREAD_START_ROUTINE threadproc;
+
+		switch( speakerConfig )
+		{
+		case 2:
+			ConLog( "* SPU2 > Using normal 2 speaker stereo output." );
+			threadproc = (LPTHREAD_START_ROUTINE)&RThread<StereoOut16>;
+			speakerConfig = 2;
+		break;
+
+		case 4:
+			ConLog( "* SPU2 > 4 speaker expansion enabled [quadraphenia]" );
+			threadproc = (LPTHREAD_START_ROUTINE)&RThread<StereoQuadOut16>;
+			speakerConfig = 4;
+		break;
+
+		case 6:
+		case 7:
+			ConLog( "* SPU2 > 5.1 speaker expansion enabled." );
+			threadproc = (LPTHREAD_START_ROUTINE)&RThread<Stereo51Out16>;
+			speakerConfig = 6;
+		break;
+
+		default:
+			ConLog( "* SPU2 > 7.1 speaker expansion enabled." );
+			threadproc = (LPTHREAD_START_ROUTINE)&RThread<Stereo51Out16>;
+			speakerConfig = 8;
+		break;
+		}
+		#endif
+
+		wformat.wFormatTag		= WAVE_FORMAT_PCM;
+		wformat.nSamplesPerSec	= SampleRate;
+		wformat.wBitsPerSample	= 16;
+		wformat.nChannels		= 2;
+		wformat.nBlockAlign		= ((wformat.wBitsPerSample * wformat.nChannels) / 8);
+		wformat.nAvgBytesPerSec	= (wformat.nSamplesPerSec * wformat.nBlockAlign);
+		wformat.cbSize			= 0;
 		
-		qbuffer=new s16[BufferSize*numBuffers];
+		qbuffer = new StereoOut16[BufferSize*numBuffers];
 
 		woores = waveOutOpen(&hwodevice,WAVE_MAPPER,&wformat,0,0,0);
 		if (woores != MMSYSERR_NOERROR)
@@ -113,6 +154,8 @@ public:
 			return -1;
 		}
 
+		const int BufferSizeBytes = wformat.nBlockAlign * BufferSize;
+
 		for(u32 i=0;i<numBuffers;i++)
 		{
 			whbuffer[i].dwBufferLength=BufferSizeBytes;
@@ -133,7 +176,7 @@ public:
 		// love it needs and won't suck resources idling pointlessly.  Just don't try to
 		// run it in uber-low-latency mode.
 		waveout_running = true;
-		thread = CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)RThread,this,0,&tid);
+		thread = CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)RThread<StereoOut16>,this,0,&tid);
 
 		return 0;
 	}
@@ -276,4 +319,4 @@ public:
 
 } WO;
 
-SndOutModule *WaveOut=&WO;
+SndOutModule *WaveOut = &WO;
diff --git a/plugins/spu2-x/src/Win32/Spu2-X_vs2008.vcproj b/plugins/spu2-x/src/Win32/Spu2-X_vs2008.vcproj
index 9837055fe9..6cbd31dbf1 100644
--- a/plugins/spu2-x/src/Win32/Spu2-X_vs2008.vcproj
+++ b/plugins/spu2-x/src/Win32/Spu2-X_vs2008.vcproj
@@ -53,6 +53,7 @@
 				FavorSizeOrSpeed="1"
 				OmitFramePointers="true"
 				EnableFiberSafeOptimizations="true"
+				AdditionalIncludeDirectories=""
 				PreprocessorDefinitions="SPU2X_DEVBUILD;FLOAT_SAMPLES;NDEBUG;_USRDLL"
 				StringPooling="true"
 				RuntimeLibrary="0"
@@ -608,6 +609,10 @@
 					RelativePath=".\SndOut_XAudio2.cpp"
 					>
 				</File>
+				<File
+					RelativePath="..\Timestretcher.cpp"
+					>
+				</File>
 			</Filter>
 			<Filter
 				Name="decoder"
diff --git a/plugins/spu2-x/src/defs.h b/plugins/spu2-x/src/defs.h
index e7ced8e9c2..9451359986 100644
--- a/plugins/spu2-x/src/defs.h
+++ b/plugins/spu2-x/src/defs.h
@@ -22,7 +22,24 @@
 #ifndef DEFS_H_INCLUDED
 #define DEFS_H_INCLUDED
 
-struct V_Volume
+struct V_VolumeLR		// Left/right pair of plain s32 volume values (no slide state).
+{
+	static V_VolumeLR Max;		// declared only; defined elsewhere (presumably the maximum-volume preset -- confirm at the definition)
+
+	s32 Left;
+	s32 Right;
+
+	V_VolumeLR() {}		// NOTE: leaves Left and Right uninitialized
+	V_VolumeLR( s32 both ) :		// sets both channels to the same value; not explicit, so an s32 converts implicitly
+		Left( both ),
+		Right( both )
+	{
+	}
+	
+	void DebugDump( FILE* dump, const char* title );		// declared only; implementation lives elsewhere
+};
+
+struct V_VolumeSlide
 {
 	// Holds the "original" value of the volume for this voice, prior to slides.
 	// (ie, the volume as written to the register)
@@ -33,9 +50,47 @@ struct V_Volume
 	s8 Mode;
 	
 public:
+	V_VolumeSlide() {}		// NOTE: leaves every member uninitialized
+	V_VolumeSlide( s16 regval, s32 fullvol ) :		// regval -> Reg_VOL, fullvol -> Value
+		Reg_VOL( regval ),
+		Value( fullvol ),
+		Increment( 0 ),		// starts with a zero increment
+		Mode( 0 )		// starts with no mode flags set
+	{
+	}
+
 	void Update();
+	void RegSet( u16 src );		// used to set the volume from a register source (16 bit signed)
+	void DebugDump( FILE* dump, const char* title, const char* nameLR );
+	
 };
 
+struct V_VolumeSlideLR		// Left/right pair of V_VolumeSlide objects, updated together.
+{
+	static V_VolumeSlideLR Max;		// declared only; defined elsewhere (presumably the maximum-volume preset -- confirm at the definition)
+
+	V_VolumeSlide Left;
+	V_VolumeSlide Right;
+
+public:		// redundant in a struct (members are already public)
+	V_VolumeSlideLR() {}		// default-constructs both channels, which leaves their members uninitialized
+		
+	V_VolumeSlideLR( s16 regval, s32 bothval ) :		// gives both channels the same register value and volume
+		Left( regval, bothval ),
+		Right( regval, bothval )
+	{
+	}
+
+	void Update()		// advances both channels: Left first, then Right
+	{
+		Left.Update();
+		Right.Update();
+	}
+	
+	void DebugDump( FILE* dump, const char* title );		// declared only; implementation lives elsewhere
+};
+
+
 struct V_ADSR
 {
 	u16 Reg_ADSR1;
@@ -61,12 +116,10 @@ public:
 
 struct V_Voice
 {
-// SPU2 cycle where the Playing started
-	u32 PlayCycle;
-// Left Volume
-	V_Volume VolumeL;
-// Right Volume
-	V_Volume VolumeR;
+	u32 PlayCycle;		// SPU2 cycle where the Playing started
+
+	V_VolumeSlideLR Volume;
+
 // Envelope
 	V_ADSR ADSR;
 // Pitch (also Reg_PITCH)
@@ -198,6 +251,39 @@ struct V_Reverb
 	u32 MIX_DEST_B1;
 };
 
+struct V_ReverbBuffers		// Signed 32-bit reverb buffer positions; V_Core::RevBuffers describes them as pre-calculated and pre-clipped.
+{
+	s32 FB_SRC_A0;
+	s32 FB_SRC_B0;
+	s32 FB_SRC_A1;
+	s32 FB_SRC_B1;
+
+	s32 IIR_SRC_A0;
+	s32 IIR_SRC_A1;
+	s32 IIR_SRC_B1;		// NOTE(review): B1 is declared before B0 here, unlike the IIR_DEST group -- confirm nothing depends on member order
+	s32 IIR_SRC_B0;
+	s32 IIR_DEST_A0;
+	s32 IIR_DEST_A1;
+	s32 IIR_DEST_B0;
+	s32 IIR_DEST_B1;
+
+	s32 ACC_SRC_A0;
+	s32 ACC_SRC_A1;
+	s32 ACC_SRC_B0;
+	s32 ACC_SRC_B1;
+	s32 ACC_SRC_C0;
+	s32 ACC_SRC_C1;
+	s32 ACC_SRC_D0;
+	s32 ACC_SRC_D1;
+
+	s32 MIX_DEST_A0;
+	s32 MIX_DEST_A1;
+	s32 MIX_DEST_B0;
+	s32 MIX_DEST_B1;
+	
+	bool NeedsUpdated;		// presumably set when the values above are stale and must be recomputed -- TODO confirm against the code that reads it
+};
+
 struct V_SPDIF
 {
 	u16 Out;
@@ -228,22 +314,14 @@ struct V_Core
 {
 // Core Voices
 	V_Voice Voices[24];
-// Master Volume for Left Channel
-	V_Volume MasterL;
-// Master Volume for Right Channel
-	V_Volume MasterR;
-// Volume for External Data Input (Left Channel)
-	s32 ExtL;
-// Volume for External Data Input (Right Channel)
-	s32 ExtR;
-// Volume for Sound Data Input (Left Channel)
-	s32 InpL;
-// Volume for Sound Data Input (Right Channel)
-	s32 InpR;
-// Volume for Output from Effects (Left Channel)
-	s32 FxL;
-// Volume for Output from Effects (Right Channel)
-	s32 FxR;
+
+
+	V_VolumeSlideLR MasterVol;// Master Volume
+	
+	V_VolumeLR ExtVol;		// Volume for External Data Input
+	V_VolumeLR InpVol;		// Volume for Sound Data Input
+	V_VolumeLR FxVol;		// Volume for Output from Effects 
+	
 // Interrupt Address
 	u32 IRQA;
 // DMA Transfer Start Address
@@ -296,6 +374,7 @@ struct V_Core
 
 // Reverb
 	V_Reverb Revb;
+	V_ReverbBuffers RevBuffers;		// buffer pointers for reverb, pre-calculated and pre-clipped.
 	u32 EffectsStartA;
 	u32 EffectsEndA;
 	u32 ReverbX;
@@ -311,8 +390,7 @@ struct V_Core
 	// Last samples to pass through the effects processor.
 	// Used because the effects processor works at 24khz and just pulls
 	// from this for the odd Ts.
-	s16 LastEffectL;
-	s16 LastEffectR;
+	StereoOut32 LastEffect;
 
 	u8 InitDelay;
 
@@ -329,12 +407,15 @@ struct V_Core
 	s16 ADMATempBuffer[0x1000];
 
 	u32 ADMAPV;
-	u32 ADMAPL;
-	u32 ADMAPR;
-
+	StereoOut32 ADMAP;
 
 	void Reset();
 	void UpdateEffectsBufferSize();
+
+	V_Core();		// our badass constructor
+	s32 EffectsBufferIndexer( s32 offset ) const;
+	void UpdateFeedbackBuffersA();
+	void UpdateFeedbackBuffersB();
 };
 
 extern V_Core Cores[2];