SPU2Ghz: Updated to include Jake.Stine's latest work. Some issues still exist, but will be fixed soon.

Also made the speedlimiter switch toggle timestretch for easier debugging and convenience :)

git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@227 a6443dda-0b58-4228-96e9-037be469359c
2008-10-22 16:39:31 +00:00 · 2008-10-22 16:39:31 +00:00 · ee9dcd4130
parent e4b3c6e649
commit ee9dcd4130
8 changed files with 894 additions and 391 deletions
--- a/plugins/spu2ghz/debug.h
+++ b/plugins/spu2ghz/debug.h
@ -21,7 +21,6 @@

 extern FILE *spu2Log;

-extern int Log;
 void FileLog(const char *fmt, ...);
 void ConLog(const char *fmt, ...);

@ -34,4 +33,4 @@ void wavedump_close();
 void wavedump_write(s16 left,s16 right);


-#endif // DEBUG_H_INCLUDED //
+#endif // DEBUG_H_INCLUDED //
--- a/plugins/spu2ghz/defs.h
+++ b/plugins/spu2ghz/defs.h
@ -90,39 +90,68 @@ typedef struct {
 	s32 Prev2;

 	s8 LoopMode;
-	s8 LoopStart;
-	s8 Loop;
-	s8 LoopEnd;
+	s8 LoopFlags;

+// [Air] : Replaced loop flags read from the ADPCM header with
+//  a single LoopFlags value (above) -- more cache-friendly.
+	//s8 LoopStart;
+	//s8 Loop;
+	//s8 LoopEnd;
+
+// Sample pointer (19:12 bit fixed point)
 	s32 SP;

-	s32 PV1;
-	s32 PV2;
-	s32 PV3;
-	s32 PV4;
+// Sample pointer for Cubic Interpolation
+// Cubic interpolation mixes a sample behind Linear, so that it
+// can have sample data to either side of the end points from which
+// to extrapolate.  This SP represents that late sample position.
+	s32 SPc;

+// Previous sample values - used for interpolation
+// [Air] : Inverted order of these members to match the access order in the
+//   code (might improve cache hits).
+	s32 PV4;
+	s32 PV3;
+	s32 PV2;
+	s32 PV1;
+
+// Last outputted audio value, used for voice modulation.
 	s32 OutX;

-	s8 FirstBlock;
+// SBuffer now points directly to an ADPCM cache entry.
+	s16 *SBuffer;

-	s32 PeakX;
-	s32 SampleData;
-
-	// [Air]: Changed SBuffer from 32-bit to 16-bit. (this breaks old savestates)
-	//   Everything stored in SBuffer is 16-bit values, and on modern CPUs the benefit
-	//   of reduced data cache clutter out-weighs the benefit of using 'cpu native' 32-bit
-	//   values. (doesn't apply to SIMD of course, but no SIMD here anyway)
-	//   Because this breaks savestates it might not be worth the bother though.
-	s16 SBuffer[32];
+// sample position within the current decoded packet.
 	s32 SCurrent;

-	s32 displayPeak;
-
-	s32 lastSetStartA;
-
-	s32 lastStopReason;
 } V_Voice;

+#ifndef PUBLIC
+// ** Begin Debug-only variables section **
+// Separated from the V_Voice struct to improve cache performance of
+// the Public Release build.
+struct V_VoiceDebug
+{
+	s8 FirstBlock;
+	s32 SampleData;
+	s32 PeakX;
+	s32 displayPeak;
+	s32 lastSetStartA;
+	s32 lastStopReason;
+};
+
+struct V_CoreDebug
+{
+	V_VoiceDebug Voices[24];
+	s32 AutoDMAPeak;
+// Last Transfer Size
+	u32 lastsize;
+};
+
+// Debug tracking information - 24 voices and 2 cores.
+extern V_CoreDebug DebugCores[2];
+#endif
+
 typedef struct {
 	u16 IN_COEF_L;
 	u16 IN_COEF_R;
@ -256,8 +285,6 @@ typedef struct {
 	u32 EffectsStartA;
 	u32 EffectsEndA;
 	u32 ReverbX;
-// Last Transfer Size
-	u32 lastsize;
 // Registers
 	V_CoreRegs Regs;

@ -279,7 +306,6 @@ typedef struct {
 	u32 ADMAPL;
 	u32 ADMAPR;

-	s32 AutoDMAPeak;
 } V_Core;

 extern V_Core Cores[2];
--- a/plugins/spu2ghz/dma.cpp
+++ b/plugins/spu2ghz/dma.cpp
@ -91,15 +91,14 @@ void DMALogClose() {


 u16 DmaRead(u32 core) {
-	u16 ret;
-	ret=spu2Mu16(Cores[core].TDA);
+	const u16 ret = (u16)spu2M_Read(Cores[core].TDA);
 	Cores[core].TDA++;
 	Cores[core].TDA&=0xfffff;
 	return ret;
 }

 void DmaWrite(u32 core, u16 value) {
-	spu2Mu16(Cores[core].TSA)=value;
+	spu2M_Write( Cores[core].TSA, value );
 	Cores[core].TSA++;
 	Cores[core].TSA&=0xfffff;
 }
@ -207,7 +206,7 @@ void DoDMAWrite(int core,u16 *pMem,u32 size)

 	Cores[core].TDA=Cores[core].TSA;
 	for (i=0;i<size;i++) {
-		spu2Mu16(Cores[core].TDA)=pMem[i];
+		spu2M_Write( Cores[core].TDA, pMem[i] );
 		Cores[core].TDA++;
 		Cores[core].TDA&=0xfffff;
 	}
@ -228,6 +227,8 @@ void DoDMAWrite(int core,u16 *pMem,u32 size)

 void SPU2readDMA(int core, u16* pMem, u32 size) 
 {
+	if( disableEverything ) return;
+
 	if(hasPtr) TimeUpdate(*cPtr,1);

 	u32 i;
@ -254,6 +255,8 @@ void SPU2readDMA(int core, u16* pMem, u32 size)

 void SPU2writeDMA(int core, u16* pMem, u32 size) 
 {
+	if( disableEverything ) return;
+
 	if(hasPtr) TimeUpdate(*cPtr,1);

 	Cores[core].DMAPtr=pMem;
@ -267,7 +270,9 @@ void SPU2writeDMA(int core, u16* pMem, u32 size)
 		return;
 	}

-	Cores[core].lastsize=size;
+	#ifndef PUBLIC
+	DebugCores[core].lastsize=size;
+	#endif
 	Cores[core].TSA&=~7;

 	bool adma_enable = ((Cores[core].AutoDMACtrl&(core+1))==(core+1));
--- a/plugins/spu2ghz/mixer.cpp
+++ b/plugins/spu2ghz/mixer.cpp
@ -51,12 +51,44 @@ double srate_pv=1.0;
 extern u32 PsxRates[160];


+#define SPU2_DYN_MEMLINE 0x3600
+
+// Performs a 64-bit multiplication between two values and returns the
+// high 32 bits as a result (discarding the fractional 32 bits).
+// The combined fractional bits of both inputs must be 32 bits for this
+// to work properly.
+//
+// This is meant to be a drop-in replacement for times when the 'div' part
+// of a MulDiv is a constant.  (example: 1<<8, or 4096, etc)
+//
+// [Air] Performance breakdown: This is over 10 times faster than MulDiv in
+//   a *worst case* scenario.  It's also more accurate since it forces the
+//   caller to extend the inputs so that they make use of all 32 bits of
+//   precision.
+//
+static s32 __forceinline MulShr32( s32 srcval, s32 mulval )
+{
+	s64 tmp = ((s64)srcval * mulval );
+	return ((s32*)&tmp)[1];
+
+	// Performance note: Using the temp var and memory reference
+	// actually ends up being roughly 2x faster than using a bitshift.
+	// It won't fly on big endian machines though... :)
+}
+
+static s32 __forceinline MulShr32su( s32 srcval, u32 mulval )
+{
+ 	s64 tmp = ((s64)srcval * mulval );
+	return ((s32*)&tmp)[1];
+}
+
+
 void InitADSR()                                    // INIT ADSR
 {
 	for (int i=0; i<(32+128); i++)
 		{
 			int shift=(i-32)>>2;
-			__int64 rate=(i&3)+4;
+			s64 rate=(i&3)+4;
 			if (shift<0)
 			{
 				rate>>=-shift;
@ -80,40 +112,9 @@ const s32 f[5][2] ={{    0,   0 },
 					{   98, -55 },
 					{  122, -60 }};

-static s16 __forceinline XA_decode(s32 pred1, s32 pred2, s32 shift, s32& prev1, s32& prev2, s32 data)
+static void __forceinline XA_decode_block(s16* buffer, const s16* block, s32& prev1, s32& prev2)
 {
-	s32 pcm = data>>shift;
-	pcm+=((pred1*prev1)+(pred2*prev2))>>6;
-	if(pcm> 32767) pcm= 32767;
-	if(pcm<-32768) pcm=-32768;
-	prev2=prev1;
-	prev1=pcm;
-	return (s16)pcm;
-}
-
-static s16 __forceinline XA_decode_block(s16* buffer, const s16* block, s32& prev1, s32& prev2)
-{
-	s32 data=*block;
-	s32 Shift	 =  ((data>> 0)&0xF)+16;
-	s32 Predict1 = f[(data>> 4)&0xF][0];
-	s32 Predict2 = f[(data>> 4)&0xF][1];
-
-	for(int i=0;i<7;i++)
-	{
-		s32 SampleData=block[i+1];
-
-		*(buffer++) = XA_decode(Predict1, Predict2, Shift, prev1, prev2, (SampleData<<28)&0xF0000000);
-		*(buffer++) = XA_decode(Predict1, Predict2, Shift, prev1, prev2, (SampleData<<24)&0xF0000000);
-		*(buffer++) = XA_decode(Predict1, Predict2, Shift, prev1, prev2, (SampleData<<20)&0xF0000000);
-		*(buffer++) = XA_decode(Predict1, Predict2, Shift, prev1, prev2, (SampleData<<16)&0xF0000000);
-	}
-
-	return data;
-}
-
-static s16 __forceinline XA_decode_block_fast(s16* buffer, const s16* block, s32& prev1, s32& prev2)
-{
-	s32 header = *block;
+	const s32 header = *block;
 	s32 shift =  ((header>> 0)&0xF)+16;
 	s32 pred1 = f[(header>> 4)&0xF][0];
 	s32 pred2 = f[(header>> 4)&0xF][1];
@ -147,13 +148,11 @@ static s16 __forceinline XA_decode_block_fast(s16* buffer, const s16* block, s32
 		prev2=pcm;
 		prev1=pcm2;
 	}
-
-	return header;
 }

-static s16 __forceinline XA_decode_block_unsaturated(s16* buffer, const s16* block, s32& prev1, s32& prev2)
+static void __forceinline XA_decode_block_unsaturated(s16* buffer, const s16* block, s32& prev1, s32& prev2)
 {
-	s32 header = *block;
+	const s32 header = *block;
 	s32 shift =  ((header>> 0)&0xF)+16;
 	s32 pred1 = f[(header>> 4)&0xF][0];
 	s32 pred2 = f[(header>> 4)&0xF][1];
@ -182,14 +181,14 @@ static s16 __forceinline XA_decode_block_unsaturated(s16* buffer, const s16* blo
 		prev2=pcm;
 		prev1=pcm2;
 	}
-
-	return header;
 }

 static void __forceinline IncrementNextA( const V_Core& thiscore, V_Voice& vc )
 {
 	if((vc.NextA==thiscore.IRQA)&&(thiscore.IRQEnable)) { 
+		#ifndef PUBLIC
 		ConLog(" * SPU2: IRQ Called (IRQ passed).\n"); 
+		#endif
 		Spdif.Info=4<<core;
 		SetIrqCall();
 	}
@ -199,67 +198,110 @@ static void __forceinline IncrementNextA( const V_Core& thiscore, V_Voice& vc )
 }


-static void __fastcall GetNextDataBuffered( V_Core& thiscore, V_Voice& vc, s32& Data) 
-{
-	//static s32 pcm=0;
-	s16 data=0;
+u32 *pcm_cache_flags=NULL;
+s16 *pcm_cache_data=NULL;

-	if (vc.SCurrent>=28) 
+#ifndef PUBLIC
+int g_counter_cache_hits=0;
+int g_counter_cache_misses=0;
+int g_counter_cache_ignores=0;
+#endif
+
+#define XAFLAG_LOOP_END		(1ul<<0)
+#define XAFLAG_LOOP			(1ul<<1)
+#define XAFLAG_LOOP_START	(1ul<<2)
+
+static void __forceinline __fastcall GetNextDataBuffered( V_Core& thiscore, V_Voice& vc, s32& Data) 
+{
+	if (vc.SCurrent<28)
 	{
-		if(vc.LoopEnd)
+		// [Air] : skip the increment?
+		//    (witness one of the rare ideal uses of a goto statement!)
+		if( (vc.SCurrent&3) != 3 ) goto _skipIncrement;
+	}
+	else
+	{
+		if(vc.LoopFlags & XAFLAG_LOOP_END)
 		{
-			if(vc.Loop)
+			if(vc.LoopFlags & XAFLAG_LOOP)
 			{
 				vc.NextA=vc.LoopStartA;
 			}
 			else
 			{
-				if(MsgVoiceOff) ConLog(" * SPU2: Voice Off by EndPoint: %d \n", voice);
 				VoiceStop(core,voice);
 				thiscore.Regs.ENDX|=1<<voice;
-				vc.lastStopReason = 1;
+				#ifndef PUBLIC
+				if(MsgVoiceOff) ConLog(" * SPU2: Voice Off by EndPoint: %d \n", voice);
+				DebugCores[core].Voices[voice].lastStopReason = 1;
+				#endif
 			}
 		}

-		// [Air]: Original ADPCM decoder.
-		//data = XA_decode_block(vc.SBuffer,GetMemPtr(vc.NextA&0xFFFFF), vc.Prev1, vc.Prev2);
+		// We'll need the loop flags and buffer pointers regardless of cache status:
+		// Note to Self : NextA addresses WORDS (not bytes).

-		// [Air]: Testing of a new saturated decoder. (benchmark needed)
-		//   My gut tells me that this should be faster, but you never can tell with these types
-		//   of things.  Benchmark it against the original and see what you think.
+		s16* memptr = GetMemPtr(vc.NextA&0xFFFFF);
+		vc.LoopFlags = *memptr >> 8;	// grab loop flags from the upper byte.
+		int nexta = vc.NextA >> 3;		// 8 words per encoded block.
+		
+		vc.SBuffer = &pcm_cache_data[nexta * 28];

-		//data = XA_decode_block_fast(vc.SBuffer,GetMemPtr(vc.NextA&0xFFFFF), vc.Prev1, vc.Prev2);
+		const u32 flagbitmask = 1ul<<(nexta & 31);  // 32 flags per array entry
+		nexta >>= 5;

-		// [Air]: Testing use of a new unsaturated decoder. (benchmark needed)
-		//   Chances are the saturation isn't needed, but for a very few exception games.
-		//   This is definitely faster than either of the above versions, but the question is by how
-		//   much (biggest impact will be on games like Xenosaga2, which use lots of SPU2 voices).
-		//   If the speed boost is worth it then maybe it should be added as a speedhack option
-		//   in the spu2ghz config.
-
-		data = XA_decode_block_unsaturated(vc.SBuffer,GetMemPtr(vc.NextA&0xFFFFF), vc.Prev1, vc.Prev2);
-
-		vc.LoopEnd  =   (data>> 8)&1;
-		vc.Loop     =   (data>> 9)&1;
-		vc.LoopStart=   (data>>10)&1;
-		vc.SCurrent = 0;
-		vc.FirstBlock = 0;
-
-		if( vc.LoopStart && !vc.LoopMode )
+		if( pcm_cache_flags[nexta] & flagbitmask )
 		{
-			vc.LoopStartA=vc.NextA; 
+			// Cached block!  Read from the cache directly (ie, do nothing)
+
+			#ifndef PUBLIC
+			g_counter_cache_hits++;
+			#endif
+		}
+		else
+		{
+			// Only flag the cache if it's a non-dynamic memory range.
+			if( nexta >= (SPU2_DYN_MEMLINE / (8*32)) )
+				pcm_cache_flags[nexta] |= flagbitmask;
+
+			#ifndef PUBLIC
+			if( nexta < (SPU2_DYN_MEMLINE / (8*32)) )
+				g_counter_cache_ignores++;
+			else
+				g_counter_cache_misses++;
+			#endif
+
+			// saturated decoder
+
+			XA_decode_block(vc.SBuffer, memptr, vc.Prev1, vc.Prev2);
+
+			// [Air]: Testing use of a new unsaturated decoder. (benchmark needed)
+			//   Chances are the saturation isn't needed, but for a very few exception games.
+			//   This is definitely faster than the above version, but is it by enough to
+			//   merit possible lower compatibility?  Especially now that games that make
+			//   heavy use of the SPU2 via music or sfx will mostly use the cache anyway.
+
+			//XA_decode_block_unsaturated( vc.SBuffer, memptr, vc.Prev1, vc.Prev2 );
+
+			//vc.LoopEnd  =   (data>> 8)&1;
+			//vc.Loop     =   (data>> 9)&1;
+			//vc.LoopStart=   (data>>10)&1;
 		}

-		IncrementNextA( thiscore, vc );
+		vc.SCurrent = 0;
+		if( (vc.LoopFlags & XAFLAG_LOOP_START) && !vc.LoopMode )
+		{
+			vc.LoopStartA=vc.NextA;
+		}
+
+		// [Air] : Increment will get called below (change made to avoid needless code cache clutter)		
+		//IncrementNextA( thiscore, vc );
 	}

-	Data=vc.SBuffer[vc.SCurrent];
+	IncrementNextA( thiscore, vc );

-	if((vc.SCurrent&3)==3)
-	{
-		IncrementNextA( thiscore, vc );
-	}
-	vc.SCurrent++;
+_skipIncrement:
+	Data = vc.SBuffer[vc.SCurrent++];
 }

 /////////////////////////////////////////////////////////////////////////////////////////
@ -271,12 +313,17 @@ const int InvExpOffsets[] = { 0,4,6,8,9,10,11,12 };
 static void __forceinline CalculateADSR( V_Voice& vc ) 
 {
 	V_ADSR& env(vc.ADSR);
+
+	if( env.Phase == 0 ) return;
+
 	u32 SLevel=((u32)env.Sl)<<27;
 	u32 off=InvExpOffsets[(env.Value>>28)&7];

 	if(env.Releasing)
 	{
-		if((env.Phase>0)&&(env.Phase<5))
+		// [Air] : Simplified conditional, as phase cannot be zero here.
+		//  (zeros get trapped above)
+		if(/*(env.Phase>0)&&*/(env.Phase<5))
 		{
 			env.Phase=5;
 		}
@ -392,15 +439,17 @@ static void __forceinline CalculateADSR( V_Voice& vc )
 			env.Value=0;
 			break;

-		//jNO_DEFAULT
+		jNO_DEFAULT
 	}

 	if (env.Phase==6) {
+		#ifndef PUBLIC
 		if(MsgVoiceOff) ConLog(" * SPU2: Voice Off by ADSR: %d \n", voice);
+		DebugCores[core].Voices[voice].lastStopReason = 2;
+		#endif
 		VoiceStop(core,voice);
 		Cores[core].Regs.ENDX|=(1<<voice);
 		env.Phase=0;
-		vc.lastStopReason = 2;
 	}
 }

@ -448,36 +497,29 @@ void LowPass(s32& VL, s32& VR)
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //

-static void __fastcall GetVoiceValues(V_Core& thiscore, V_Voice& vc, s32& Value)
+static void __forceinline UpdatePitch( V_Voice& vc )
 {
-	s64 Data=0;
-	s32 DT=0;
+	s32 pitch;

-	// [Air] : Put a scope on the pitch variable, which should help it get optimized to a
-	//   register.
+	// [Air] : re-ordered comparisons: Modulated is much more likely to be zero than voice,
+	//   so with the old ordering it had to check both the voice and Modulated values
+	//   most of the time.  Now it'll just check Modulated and short-circuit past the voice
+	//   check (not that it amounts to much, but eh every little bit helps).
+	if( (vc.Modulated==0) || (voice==0) )
+		pitch=vc.Pitch;
+	else
+		pitch=(vc.Pitch*(32768 + abs(Cores[core].Voices[voice-1].OutX)))>>15;
+	
+	vc.SP+=pitch;
+}
+
+static void __forceinline GetVoiceValues_Linear(V_Core& thiscore, V_Voice& vc, s32& Value)
+{
+	while( vc.SP > 0 )
 	{
-		s32 pitch;
-
-		// [Air] : re-ordered comparisons: Modulated is much more likely to be zero than voice,
-		//   and so the way it was before it's have to check both voice and modulated values
-		//   most of the time.  Now it'll just check Modulated and short-circut past the voice
-		//   check (not that it amounts to much, but eh every little bit helps).
-		if( (vc.Modulated==0) || (voice==0) )
-			pitch=vc.Pitch;
-		else
-			pitch=(vc.Pitch*(32768 + abs(thiscore.Voices[voice-1].OutX)))>>15;
-		
-		vc.SP+=pitch;
-	}
-
-	while(vc.SP>=4096) 
-	{
-		GetNextDataBuffered( thiscore, vc, DT );
-
-		vc.PV4=vc.PV3;
-		vc.PV3=vc.PV2;
 		vc.PV2=vc.PV1;
-		vc.PV1=DT<<16; //32bit processing
+
+		GetNextDataBuffered( thiscore, vc, vc.PV1 );

 		vc.SP-=4096;
 	}
@ -486,89 +528,91 @@ static void __fastcall GetVoiceValues(V_Core& thiscore, V_Voice& vc, s32& Value)

 	if(vc.ADSR.Phase==0)
 	{
-		Value=0;
-		vc.OutX=0;
+		Value = 0;
 	}
 	else
 	{
-		// [Air]: if SP is zero then we landed perfectly on a sample source, no
-		// interpolation necessary (besides being a little faster this is important
-		// too, since the interpolator will pick the wrong sample to mix otherwise).
-
-		if(Interpolation==0 || vc.SP == 0)
+		if(Interpolation==0) // || vc.SP == 0)
 		{
-			Data = vc.PV1;
+			Value = vc.PV1;
 		} 
-		else if(Interpolation==1) //linear
+		else //if(Interpolation==1) //must be linear
 		{
-			// [Air]: Inverted the interpolation delta.  The old way was generating
-			// inverted waveforms.
-			s64 t0 = vc.PV2 - vc.PV1;
-			s64 t1 = vc.PV1;
-			Data = (((t0*vc.SP)>>12) + t1);
+			s32 t0 = vc.PV2 - vc.PV1;
+			s32 t1 = vc.PV1<<12;
+			Value = t1 - (t0*vc.SP);
 		}
-		else // if(Interpolation==2) //must be cubic
-		{
-			s64 a0 = vc.PV1 - vc.PV2 - vc.PV4 + vc.PV3;
-			s64 a1 = vc.PV4 - vc.PV3 - a0;
-			s64 a2 = vc.PV1 - vc.PV4;
-			s64 a3 = vc.PV2;
-			s64 mu = 4096-vc.SP;
-
-			s64 t0 = ((a0   )*mu)>>18;
-			s64 t1 = ((t0+a1)*mu)>>18;
-			s64 t2 = ((t1+a2)*mu)>>18;
-			s64 t3 = ((t2+a3));
-
-			Data = t3;
-		}
-
-		Value=(s32)((Data*vc.ADSR.Value)>>48); //32bit ADSR + convert to 16bit
-
-		// [Air]: Moved abs() to the modulation code above, so that the abs conditionals are
-		//   only run in select cases where modulation is active.
-		vc.OutX=Value;
+		Value = MulShr32su( Value, vc.ADSR.Value>>12 );
 	}
 }

-// [Air]: Noise values need to be mixed without going through interpolation, since it
-//    can wreak havoc on the noise (causing muffling or popping)
-static void __fastcall GetNoiseValues(V_Core& thiscore, V_Voice& vc, s32& Value)
+
+static void __forceinline GetVoiceValues_Cubic(V_Core& thiscore, V_Voice& vc, s32& Value)
 {
-	s64 Data=0;
-	s32 DT=0;
-
+	while( vc.SP > 0 )
 	{
-		s32 pitch;
+		vc.PV4=vc.PV3;
+		vc.PV3=vc.PV2;
+		vc.PV2=vc.PV1;

-		if( (vc.Modulated==0) || (voice==0) )
-			pitch=vc.Pitch;
-		else
-			pitch=(vc.Pitch*(32768 + abs(thiscore.Voices[voice-1].OutX)))>>15;
-		
-		vc.SP+=pitch;
-	}
-
-	while(vc.SP>=4096) 
-	{
-		GetNoiseValues(DT);
+		GetNextDataBuffered( thiscore, vc, vc.PV1 );
+		vc.PV1<<=3;
+		vc.SPc = vc.SP&4095;	// just the fractional part, please!
 		vc.SP-=4096;
 	}

-	Data = DT<<16; //32bit processing
-
 	CalculateADSR( vc );

 	if(vc.ADSR.Phase==0)
 	{
-		Value=0;
-		vc.OutX=0;
+		Value = 0;
 	}
 	else
 	{
-		Value=(s32)((Data*vc.ADSR.Value)>>48); //32bit ADSR + convert to 16bit
-		vc.OutX=Value;
+		s32 z0 = vc.PV3 - vc.PV4 + vc.PV1 - vc.PV2;
+		s32 z1 = (vc.PV4 - vc.PV3 - z0);
+		s32 z2 = (vc.PV2 - vc.PV4);
+
+		s32 mu = vc.SPc;
+
+		s32 val = (z0 * mu) >> 12;
+		val = ((val + z1) * mu) >> 12;
+		val = ((val + z2) * mu) >> 12;
+		val += vc.PV2;
+
+		/*
+		s64 a0 = vc.PV1 - vc.PV2 - vc.PV4 + vc.PV3;
+		s64 a1 = vc.PV4 - vc.PV3 - a0;
+		s64 a2 = vc.PV1 - vc.PV4;
+		s64 a3 = vc.PV2;
+		s64 mu = 4096+vc.SP;
+
+		s64 t0 = ((a0   )*mu)>>12;
+		s64 t1 = ((t0-a1)*mu)>>12;
+		s64 t2 = ((t1-a2)*mu)>>12;
+		s64 t3 = ((t2-a3));*/
+
+		Value = MulShr32su( val, vc.ADSR.Value>>3 );
 	}
+
+	//Value=(s32)((Data*vc.ADSR.Value)>>40); //32bit ADSR + convert to 16bit
+}
+
+// [Air]: Noise values need to be mixed without going through interpolation, since it
+//    can wreak havoc on the noise (causing muffling or popping).
+static void __forceinline __fastcall GetNoiseValues(V_Core& thiscore, V_Voice& vc, s32& Data)
+{
+	while(vc.SP>=4096) 
+	{
+		GetNoiseValues( Data );
+		vc.SP-=4096;
+	}
+
+	// GetNoiseValues can't set the phase to zero on us unexpectedly
+	// like GetVoiceValues can.
+	jASSUME( vc.ADSR.Phase != 0 );	
+
+	CalculateADSR( vc );
 }

 /////////////////////////////////////////////////////////////////////////////////////////
@ -619,10 +663,12 @@ void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR)
 					{
 						FileLog("[%10d] AutoDMA%c block end.\n",Cycles, (core==0)?'4':'7');

+						#ifndef PUBLIC
 						if(thiscore.InputDataLeft>0)
 						{
 							if(MsgAutoDMA) ConLog("WARNING: adma buffer didn't finish with a whole block!!\n");
 						}
+						#endif
 						thiscore.InputDataLeft=0;
 						thiscore.DMAICounter=1;
 					}
@ -657,10 +703,12 @@ void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR)
 					{
 						FileLog("[%10d] Spdif AutoDMA%c block end.\n",Cycles, (core==0)?'4':'7');

+						#ifndef PUBLIC
 						if(thiscore.InputDataLeft>0)
 						{
 							if(MsgAutoDMA) ConLog("WARNING: adma buffer didn't finish with a whole block!!\n");
 						}
+						#endif
 						thiscore.InputDataLeft=0;
 						thiscore.DMAICounter=1;
 					}
@ -709,10 +757,12 @@ void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR)

 						thiscore.AutoDMACtrl |=~3;

+						#ifndef PUBLIC
 						if(thiscore.InputDataLeft>0)
 						{
 							if(MsgAutoDMA) ConLog("WARNING: adma buffer didn't finish with a whole block!!\n");
 						}
+						#endif
 						thiscore.InputDataLeft=0;
 						thiscore.DMAICounter=1;
 					}
@ -756,60 +806,79 @@ void __fastcall ReadInputPV(V_Core& thiscore, s32& ValL,s32& ValR)
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //

-static void __forceinline UpdateVolume(V_Volume& Vol)
-{
-	s32 NVal;
+#define VOLFLAG_REVERSE_PHASE	(1ul<<0)
+#define VOLFLAG_DECREMENT		(1ul<<1)
+#define VOLFLAG_EXPONENTIAL		(1ul<<2)
+#define VOLFLAG_SLIDE_ENABLE	(1ul<<3)

+static void __fastcall UpdateVolume(V_Volume& Vol)
+{
 	// TIMINGS ARE FAKE!!! Need to investigate.

-	int reverse_phase = Vol.Mode&1;
-	int exponential   = Vol.Mode&4;
-	int decrement     = Vol.Mode&2;
-	int slide_enable  = Vol.Mode&8;
-	
-	if (!slide_enable) return;
+	// [Air]: Cleaned up this code... may have broken it.  Can't really
+	//   test it here since none of my games seem to use it.  If anything's
+	//   not sounding right, we should revert the code in this method first.

-	NVal=Vol.Value;
-	if(reverse_phase) NVal = -NVal;
+	// [Air] Reverse phasing?
+	//   Invert our value so that exponential mathematics are applied
+	//   as if the volume were sliding the other direction.  This makes
+	//   a lot more sense than the old method's tendency to chop off
+	//   sound volumes to zero abruptly.

-	if (decrement) { // Decrement
-
-		if(exponential)
-		{
-			NVal=NVal * Vol.Increment >> 7;
-		}
-		else
-		{
-			NVal-=Vol.Increment;
-		}
-		NVal-=((32768*5)>>(Vol.Increment));
-		if (NVal<0) {
-			Vol.Value=0;
-			Vol.Mode=0;
-		}
-		else Vol.Value=NVal & 0xffff;
-	}
-	else { // Increment
-		if(exponential)
-		{
-			int T = Vol.Increment>>(NVal>>12);
-			NVal+=T;
-		}
-		else
-		{
-			NVal+=Vol.Increment;
-		}
-	}
-
-	if((NVal<0)||(NVal>0x7fff))
+	if(Vol.Mode & VOLFLAG_REVERSE_PHASE)
 	{
-		NVal=decrement?0:0x7fff;
-		Vol.Mode=0; // disable slide
+		ConLog( " *** SPU2 > Reverse Phase in progress!\n" );
+		Vol.Value = 0x7fff - Vol.Value;
 	}

-	if(reverse_phase) NVal = -NVal;
+	if (Vol.Mode & VOLFLAG_DECREMENT)
+	{
+		// Decrement

-	Vol.Value=NVal;
+		if(Vol.Mode & VOLFLAG_EXPONENTIAL)
+		{
+			//ConLog( " *** SPU2 > Exponential Volume Slide Down!\n" );
+			Vol.Value *= Vol.Increment >> 7;
+			Vol.Value-=((32768*5)>>(Vol.Increment));
+		}
+		else
+		{
+			Vol.Value-=Vol.Increment;
+		}
+
+		if (Vol.Value<0)
+		{
+			Vol.Value = 0;
+			Vol.Mode=0;	// disable slide
+		}
+	}
+	else
+	{
+		//ConLog( " *** SPU2 > Volflag > Increment!\n" );
+		// Increment
+		if(Vol.Mode & VOLFLAG_EXPONENTIAL)
+		{
+			//ConLog( " *** SPU2 > Exponential Volume Slide Up!\n" );
+			int T = Vol.Increment>>(Vol.Value>>12);
+			Vol.Value+=T;
+		}
+		else
+		{
+			Vol.Value+=Vol.Increment;
+		}
+
+		if( Vol.Value > 0x7fff )
+		{
+			Vol.Value = 0x7fff;
+			Vol.Mode=0; // disable slide
+		}
+	}
+
+	// Reverse phasing
+	//  Invert the value back into output form:
+	if(Vol.Mode & VOLFLAG_REVERSE_PHASE) Vol.Value = 0x7fff-Vol.Value;
+
+	//Vol.Value=NVal;
 }

 /////////////////////////////////////////////////////////////////////////////////////////
@ -923,36 +992,62 @@ static s32 __forceinline ApplyVolume(s32 data, s32 volume)
 	return (volume * data);
 }

-static void __forceinline MixVoice(V_Voice& vc, s32& VValL, s32& VValR)
+// writes a signed value to the SPU2 ram
+// Performs no cache invalidation -- use only for dynamic memory ranges
+// of the SPU2 (between 0x0000 and SPU2_DYN_MEMLINE)
+static void __forceinline spu2M_WriteFast( u32 addr, s16 value )
+{
+	// throw an assertion if the memory range is invalid:
+	jASSUME( addr < SPU2_DYN_MEMLINE );
+	*GetMemPtr( addr ) = value;
+}
+
+
+static void __forceinline MixVoice( V_Core& thiscore, V_Voice& vc, s32& VValL, s32& VValR )
 {
 	s32 Value=0;

 	VValL=0;
 	VValR=0;

-	UpdateVolume(vc.VolumeL);
-	UpdateVolume(vc.VolumeR);
+	// [Air] : Most games don't use much volume slide effects.  So only
+	//   call the UpdateVolume methods when needed by checking the flag
+	//   outside the method here...
+
+	if( vc.VolumeL.Mode & VOLFLAG_SLIDE_ENABLE ) UpdateVolume( vc.VolumeL );
+	if( vc.VolumeR.Mode & VOLFLAG_SLIDE_ENABLE ) UpdateVolume( vc.VolumeR );

 	if (vc.ADSR.Phase>0) 
 	{
-		if( vc.Noise )
-			GetNoiseValues( Cores[core], vc, Value );
-		else
-			GetVoiceValues( Cores[core], vc, Value );
+		UpdatePitch( vc );

-		#ifdef _DEBUG
-		vc.displayPeak = max(vc.displayPeak,abs(Value));
+		if( vc.Noise )
+			GetNoiseValues( thiscore, vc, Value );
+		else
+		{
+			if( Interpolation == 2 )
+				GetVoiceValues_Cubic( thiscore, vc, Value );
+			else
+				GetVoiceValues_Linear( thiscore, vc, Value );
+		}
+
+		// Record the output (used for modulation effects)
+		vc.OutX = Value;
+
+		#ifndef PUBLIC
+		DebugCores[core].Voices[voice].displayPeak = max(DebugCores[core].Voices[voice].displayPeak,abs(Value));
 		#endif

 		VValL=ApplyVolume(Value,(vc.VolumeL.Value));
 		VValR=ApplyVolume(Value,(vc.VolumeR.Value));
 	}

-	if (voice==1)      spu2Ms16(0x400 + (core<<12) + OutPos)=(s16)((Value));
-	else if (voice==3) spu2Ms16(0x600 + (core<<12) + OutPos)=(s16)((Value));
+	if (voice==1)      spu2M_WriteFast( 0x400 + (core<<12) + OutPos, (s16)Value );
+	else if (voice==3) spu2M_WriteFast( 0x600 + (core<<12) + OutPos, (s16)Value );

 }

+
 static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 {
 	s32 InpL=0, InpR=0;
@ -964,8 +1059,8 @@ static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 	TDL=TDR=TWL=TWR=(s32)0;

 	if (core == 1) { //Core 0 doesn't have External input
-		spu2Ms16(0x800 + OutPos)=(s16)(ExtL>>16);
-		spu2Ms16(0xA00 + OutPos)=(s16)(ExtR>>16);
+		spu2M_WriteFast( 0x800 + OutPos, (s16)(ExtL>>16) );
+		spu2M_WriteFast( 0xA00 + OutPos, (s16)(ExtR>>16) );
 	}
 	
 	V_Core& thiscore( Cores[core] );
@ -979,14 +1074,30 @@ static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 		ReadInputPV(thiscore, InpL,InpR);	// get input data from input buffers
 	}

+	#ifndef PUBLIC
 	s32 InputPeak = max(abs(InpL),abs(InpR));
-	if(thiscore.AutoDMAPeak<InputPeak) thiscore.AutoDMAPeak=InputPeak;
+	if(DebugCores[core].AutoDMAPeak<InputPeak) DebugCores[core].AutoDMAPeak=InputPeak;
+	#endif
 	
-	InpL = MulDiv(InpL,(thiscore.InpL),1<<1);
-	InpR = MulDiv(InpR,(thiscore.InpR),1<<1);
+	//MulShr32( InpL, thiscore.InpL, 1 );
+	//MulShr32( InpR, thiscore.InpR, 1 );

-	ExtL = MulDiv(ExtL,(thiscore.ExtL),1<<12);
-	ExtR = MulDiv(ExtR,(thiscore.ExtR),1<<12);
+	// [Air] : InpL and InpR don't need 64 bit muls.
+
+	InpL *= thiscore.InpL;
+	InpR *= thiscore.InpR;
+	InpL >>= 1;
+	InpR >>= 1;
+
+	// shift inputs by 20 collectively, so that the result is
+	// effectively downshifted by 12:
+	ExtL = MulShr32su( ExtL<<3, ((int)thiscore.ExtL)<<16);
+	ExtR = MulShr32su( ExtR<<3, ((int)thiscore.ExtR)<<16);
+
+	//InpL = MulDiv(InpL,(thiscore.InpL),1<<1);
+	//InpR = MulDiv(InpR,(thiscore.InpR),1<<1);
+	//ExtL = MulDiv(ExtL,(thiscore.ExtL),1<<12);
+	//ExtR = MulDiv(ExtR,(thiscore.ExtR),1<<12);

 	SDL=SDR=SWL=SWR=(s32)0;

@ -995,7 +1106,7 @@ static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 		s32 VValL,VValR;

 		V_Voice& vc( thiscore.Voices[voice] );
-		MixVoice( vc,VValL,VValR );
+		MixVoice( thiscore, vc, VValL, VValR );

 		SDL += VValL * vc.DryL;
 		SDR += VValR * vc.DryR;
@ -1004,10 +1115,10 @@ static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 	}

 	//Write To Output Area
-	spu2Ms16(0x1000 + (core<<12) + OutPos)=(s16)(SDL>>16);
-	spu2Ms16(0x1200 + (core<<12) + OutPos)=(s16)(SDR>>16);
-	spu2Ms16(0x1400 + (core<<12) + OutPos)=(s16)(SWL>>16);
-	spu2Ms16(0x1600 + (core<<12) + OutPos)=(s16)(SWR>>16);
+	spu2M_WriteFast( 0x1000 + (core<<12) + OutPos, (s16)(SDL>>16) );
+	spu2M_WriteFast( 0x1200 + (core<<12) + OutPos, (s16)(SDR>>16) );
+	spu2M_WriteFast( 0x1400 + (core<<12) + OutPos, (s16)(SWL>>16) );
+	spu2M_WriteFast( 0x1600 + (core<<12) + OutPos, (s16)(SWR>>16) );

 	// Mix in the Voice data
 	TDL += SDL * thiscore.SndDryL;
@ -1046,12 +1157,13 @@ static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 	OutR=(TDR + TWR);

 	//Apply Master Volume
-	UpdateVolume(thiscore.MasterL);
-	UpdateVolume(thiscore.MasterR);
+	if( thiscore.MasterL.Mode & VOLFLAG_SLIDE_ENABLE )  UpdateVolume(thiscore.MasterL);
+	if( thiscore.MasterR.Mode & VOLFLAG_SLIDE_ENABLE )  UpdateVolume(thiscore.MasterR);

-	if (thiscore.Mute==0) {
-		OutL=MulDiv(OutL,thiscore.MasterL.Value,1<<16);
-		OutR=MulDiv(OutR,thiscore.MasterR.Value,1<<16);
+	if (thiscore.Mute==0)
+	{
+		OutL = MulShr32( OutL, ((s32)thiscore.MasterL.Value)<<16 );
+		OutR = MulShr32( OutR, ((s32)thiscore.MasterR.Value)<<16 );
 	}
 	else 
 	{
@ -1071,24 +1183,32 @@ static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 	}
 }

+// used to throttle the output rate of cache stat reports
+static int p_cachestat_counter=0;
+
 void __fastcall Mix() 
 {
 	s32 ExtL=0, ExtR=0, OutL, OutR;

-	static s32 Peak0,Peak1;
-	static s32 PCount;
-
 	core=0;
 	MixCore(ExtL,ExtR,0,0);

 	core=1;
 	MixCore(OutL,OutR,ExtL,ExtR);

-#ifdef _DEBUG
+#ifndef PUBLIC
+	static s32 Peak0,Peak1;
+	static s32 PCount;
+
 	Peak0 = max(Peak0,max(ExtL,ExtR));
 	Peak1 = max(Peak1,max(OutL,OutR));
 #endif

+	// [Air] [TODO] : Replace this with MulShr32.
+	//   I haven't done it yet because it would require making the
+	//   VolumeDivisor a constant .. which it should be anyway.  The presence
+	//   of VolumeMultiplier more or less negates the need for a variable divisor.
+
 	ExtL=MulDiv(OutL,VolumeMultiplier,VolumeDivisor<<6);
 	ExtR=MulDiv(OutR,VolumeMultiplier,VolumeDivisor<<6);

@ -1102,6 +1222,23 @@ void __fastcall Mix()
 	SndWrite(ExtL,ExtR);
 	OutPos++;
 	if (OutPos>=0x200) OutPos=0;
+
+#ifndef PUBLIC
+	// [TODO]: Create an INI option to enable/disable this particular log.
+	p_cachestat_counter++;
+	if(p_cachestat_counter > (48000*6) )
+	{
+		p_cachestat_counter = 0;
+		ConLog( " * SPU2 > CacheStatistics > Hits: %d  Misses: %d  Ignores: %d\n",
+			g_counter_cache_hits,
+			g_counter_cache_misses,
+			g_counter_cache_ignores );
+
+		g_counter_cache_hits = 
+		g_counter_cache_misses =
+		g_counter_cache_ignores = 0;
+	}
+#endif
 }

 /////////////////////////////////////////////////////////////////////////////////////////
--- a/plugins/spu2ghz/sndout.cpp
+++ b/plugins/spu2ghz/sndout.cpp
@ -142,6 +142,32 @@ public:
 		}

 		// either pw=false or free>nSamples
+
+		// Problem:
+		//  If the SPU2 gets out of sync with the SndOut device, the writepos of the
+		//  circular buffer will overtake the readpos, leading to a prolonged period
+		//  of hopscotching read/write accesses (ie, lots of staticy crap sound for
+		//  several seconds).
+		//
+		// Compromise:
+		//  Same as with underruns below, an overrun can be handled by aborting
+		//  the write operation before the writepos goes past the readpos, and then
+		//  ignoring the rest of the incoming data.  The resultant sound will have
+		//  a single hiccup when an overflow occurs, instead of getting crapped out
+		//  for several seconds (or in many cases, until the SPU sndout driver was
+		//  manually reset.. grr!).
+		
+#ifndef DYNAMIC_BUFFER_LIMITING
+		while(data<size && nSamples>0)
+		{
+			buffer[wpos] = *(bData++);
+			wpos=(wpos+1)%size;
+			data++;
+			nSamples--;
+		}
+
+#elif defined( DYNAMIC_BUFFER_LIMITING )
+
 		while(nSamples>0)
 		{
 			buffer[wpos] = *(bData++);
@ -156,10 +182,9 @@ public:
 				data-=size;
 			}
 			while(data>size);
-#ifdef DYNAMIC_BUFFER_LIMITING
 			overflows++;
-#endif
 		}
+#endif

 		
 		LeaveCriticalSection(&cs);
@ -167,9 +192,70 @@ public:

 	virtual void ReadSamples (s32 *bData, s32 nSamples)
 	{
+		static bool underrun_freeze = false;
+
 		EnterCriticalSection(&cs);
 		dataread+=nSamples;
 		
+		// Problem:
+		//  If the SPU2 gets even the least bit out of sync with the SndOut device,
+		//  the readpos of the circular buffer will overtake the writepos,
+		//  leading to a prolonged period of hopscotching read/write accesses (ie,
+		//  lots of staticy crap sound for several seconds).
+		//
+		// Fix:
+		//  If the read position overtakes the write position, abort the
+		//  transfer immediately and force the SndOut driver to wait until
+		//  the read buffer has filled up again before proceeding.
+		//  This will cause one brief hiccup that can never exceed the user's
+		//  set buffer length in duration.
+
+#ifndef DYNAMIC_BUFFER_LIMITING
+		if( underrun_freeze )
+		{
+			if( data < (int)(size * 0.85) )
+			{
+				while( nSamples>0 )
+				{
+					*(bData++) = 0;
+					nSamples--;
+				}
+				LeaveCriticalSection(&cs);
+				return;
+			}
+
+			underrun_freeze = false;
+			//ConLog( " * SPU2 > Underrun Freeze Finished!\n" );
+		}
+
+		while(data>0 && nSamples>0)
+		{
+			*(bData++) = buffer[rpos];
+			rpos=(rpos+1)%size;
+			data--;
+			nSamples--;
+		}
+
+		while( nSamples>0 )
+		{
+			// buffer underrun code:
+			// the contents of this loop only get run if data reached zero
+			// before nSamples.
+			// Let's just dull out some silence, because that's usually the least
+			// painful way of dealing with underruns.
+
+			*(bData++) = 0;
+			nSamples--;
+		}
+
+		if( data == 0 && !pw )
+		{
+			ConLog( " * SPU2 > Underrun compensation\n" );
+			underrun_freeze = true;
+		}
+
+#elif defined( DYNAMIC_BUFFER_LIMITING )
+
 		while(nSamples>0)
 		{
 			*(bData++) = buffer[rpos];
@ -178,7 +264,6 @@ public:
 			nSamples--;
 		}

-#ifdef DYNAMIC_BUFFER_LIMITING
 		if(data<0)
 		{
 			do
@ -195,16 +280,9 @@ public:
 			data+=size;
 			uflow = true;
 		}
-
-		//if( uflow )
-			//ConLog( " * SPU2 : Data Underflow detected!\n" );
-
 #endif

-		//if(isWaiting)
-		{
-			PulseEvent(hSyncEvent);
-		}
+		PulseEvent(hSyncEvent);

 #ifdef DYNAMIC_BUFFER_LIMITING

@ -318,7 +396,6 @@ void UpdateTempoChange()
 
 	s32 bufferUsage = sndBuffer->GetBufferUsage();
 	s32 bufferSize  = sndBuffer->GetBufferSize();
-
 //Emergency stretch to compensate for FPS fluctuations and keep the buffers happy
 	bool a=(bufferUsage < CurBufferSize * 4);
 	bool b=(bufferUsage >= (bufferSize - CurBufferSize * 4));
@ -438,12 +515,18 @@ void SndClose()

+// Applies the current LimitMode value: non-zero turns timestretching on,
+// zero turns it off.  The new state is reported with printf.
 void SndUpdateLimitMode()
 {
-	sndBuffer->PauseOnWrite(LimitMode!=0);
+	// The limiter switch no longer pauses on write; the old call is kept
+	// here, commented out, for reference.
+	//sndBuffer->PauseOnWrite(LimitMode!=0);

-	if(LimitMode!=0)
-		printf(" * SPU2 limiter is now ON.\n");
-	else
-		printf(" * SPU2 limiter is now OFF.\n");
+	if(LimitMode!=0) {
+		timeStretchEnabled = true;
+		//printf(" * SPU2 limiter is now ON.\n");
+		printf(" * SPU2 timestretch is now ON.\n");
+	}
+	else {
+		//printf(" * SPU2 limiter is now OFF.\n");
+		printf(" * SPU2 timestretch is now OFF.\n");
+		timeStretchEnabled = false;
+	}

 }

--- a/plugins/spu2ghz/spu2.cpp
+++ b/plugins/spu2ghz/spu2.cpp
@ -34,10 +34,11 @@ const unsigned char build	 = 9;	// increase that with each version

 static char *libraryName	  = "GiGaHeRz's SPU2 (" 
 #ifdef _DEBUG
-	"Playground Debug "
-#endif
-#ifdef PUBLIC
+	"Playground Debug"
+#elif defined( PUBLIC )
 	"Playground Mod"
+#else
+	"Playground Dev"
 #endif
 ")";

@ -50,9 +51,18 @@ const char *AddressNames[6]={"SSAH","SSAL","LSAH","LSAL","NAXH","NAXL"};
 double opitch;
 int osps;

-int Log = 1;
+// [Air]: Adding the spu2init boolean wasn't necessary except to help me in
+//   debugging the spu2 suspend/resume behavior (when user hits escape).
+static bool spu2open=false;	// has spu2open plugin interface been called?
+static bool spu2init=false;	// has spu2init plugin interface been called?

-s8 spu2open=0;
+// [Air]: fixed the hacky part of UpdateTimer with this:
+static bool resetClock = true;
+
+// Used to make spu2 more robust at loading incompatible saves.
+// You won't get any sound but it won't cause mass instability either.
+// (should allow players to get to their next save point more easily)
+bool disableEverything=false;

 void (* _irqcallback)();
 void (* dma4callback)();
@ -69,6 +79,9 @@ u32	ThreadFuncID;

 char fname[]="01234567890123456789012345";

+#ifndef PUBLIC
+V_CoreDebug DebugCores[2];
+#endif
 V_Core Cores[2];
 V_SPDIF Spdif;

@ -139,12 +152,40 @@ void SysMessage(char *fmt, ...)
 	MessageBox(0, tmp, "SPU2ghz Msg", 0);
 }

-s16 __forceinline *GetMemPtr(u32 addr)
+// Returns a pointer to the 16-bit word at offset `addr` inside _spu2mem.
+// The caller must pass an already-wrapped address: addr is only checked by
+// the assert below (addr < 0x100000), it is not masked here.
+s16 __forceinline * __fastcall GetMemPtr(u32 addr)
 {
 	assert(addr<0x100000);
 	return (_spu2mem+addr);
 }

+// Reads one signed 16-bit word from SPU2 RAM.
+// The address is wrapped to 0xfffff before the lookup.
+s16 __forceinline __fastcall spu2M_Read( u32 addr )
+{
+	const u32 wrapped = addr & 0xfffff;
+	const s16 *src = GetMemPtr( wrapped );
+	return *src;
+}
+
+// Writes a signed 16-bit value to the SPU2 ram and invalidates the ADPCM
+// cache flag of the block that contains the written word.
+//   addr  - WORD address (not a byte address); wrapped to 0xfffff.
+//   value - word to store.
+// Optimization note: don't use __forceinline because the footprint of this
+// function is a little too heavy now.  Better to let the compiler decide.
+void __inline __fastcall spu2M_Write( u32 addr, s16 value )
+{
+	// Wrap the address up front so the cache flag index and the memory
+	// write always refer to the same word.  Without this an addr of
+	// 0x100000 or more would index past the end of pcm_cache_flags and
+	// leave the block that is actually written marked as cached.
+	addr &= 0xfffff;
+
+	// Make sure the cache is invalidated:
+
+	const u32 nexta = addr >> 3;		// 8 words per encoded block.
+	const u32 flagbitmask = 1ul<<(nexta & 31);  // 32 flags per array entry
+	pcm_cache_flags[nexta>>5] &= ~flagbitmask;
+
+	*GetMemPtr( addr ) = value;
+}
+
+// Unsigned overload: stores a u16 in the SPU2 ram by forwarding to the
+// signed spu2M_Write with the value cast to s16.
+void __inline __fastcall spu2M_Write( u32 addr, u16 value )
+{
+	const s16 asSigned = (s16)value;
+	spu2M_Write( addr, asSigned );
+}
+
+
 void CoreReset(int c)
 {
 	int v=0;
@ -203,7 +244,10 @@ void CoreReset(int c)
 		Cores[c].Voices[v].NextA=2800;
 		Cores[c].Voices[v].StartA=2800;
 		Cores[c].Voices[v].LoopStartA=2800;
-		Cores[c].Voices[v].lastSetStartA=2800;
+		Cores[c].Voices[v].SBuffer=pcm_cache_data;
+		#ifndef PUBLIC
+		DebugCores[c].Voices[v].lastSetStartA=2800;
+		#endif
 	}
 	Cores[c].DMAICounter=0;
 	Cores[c].AdmaInProgress=0;
@ -226,6 +270,7 @@ s32 CALLBACK SPU2init()
 	s32 c=0,v=0;
 	ReadSettings();
 	acumCycles=0;
+
 #ifdef SPU2_LOG
 	if(AccessLog) 
 	{
@ -235,12 +280,34 @@ s32 CALLBACK SPU2init()
 	}
 #endif
 	srand((unsigned)time(NULL));
-	if (spu2open) return 0;
+
+	disableEverything=false;
+
+	if (spu2init)
+	{
+		ConLog( " * SPU2: Already initialized - Ignoring SPU2init signal." );
+		return 0;
+	}
+
+	spu2init=true;
+
 	spu2regs  = (short*)malloc(0x010000);
 	_spu2mem  = (short*)malloc(0x200000);
-	if ((spu2regs == NULL) || (_spu2mem == NULL)) 
+
+	// adpcm decoder cache:
+	//  the cache data size is determined by taking the number of adpcm blocks
+	//  (2MB / 16) and multiplying it by the decoded block size (28 samples).
+	//  Thus: pcm_cache_data = 7,340,032 bytes (ouch!)
+	//  Expanded: 16 bytes expands to 56 bytes [3.5:1 ratio]
+	//    Resulting in 2MB * 3.5.
+
+	pcm_cache_flags = (u32*)calloc( 0x200000 / (16*32), 4 );
+	pcm_cache_data = (s16*)calloc( (0x200000 / 16) * 28, 2 );
+
+	if( (spu2regs == NULL) || (_spu2mem == NULL) ||
+		(pcm_cache_data == NULL) || (pcm_cache_flags == NULL) )
 	{
-		SysMessage("Error allocating Memory\n"); return -1;
+		SysMessage("SPU2: Error allocating Memory\n"); return -1;
 	}

 	for(int mem=0;mem<0x800;mem++)
@ -273,7 +340,6 @@ s32 CALLBACK SPU2init()
 	}

 	LowPassFilterInit();
-
 	InitADSR();

 #ifdef STREAM_DUMP
@ -329,7 +395,10 @@ BOOL CALLBACK DebugProc(HWND hWnd,UINT uMsg,WPARAM wParam,LPARAM lParam)
 	return TRUE;
 }

-s32 CALLBACK SPU2open(void *pDsp) {
+s32 CALLBACK SPU2open(void *pDsp)
+{
+	if( spu2open ) return 0;
+
 	FileLog("[%10d] SPU2 Open\n",Cycles);

 	/*if(debugDialogOpen==0)
@ -359,19 +428,23 @@ s32 CALLBACK SPU2open(void *pDsp) {

 void CALLBACK SPU2close() 
 {
+	// Ignore the call unless the plugin is currently flagged as open.
+	if( !spu2open ) return;
 	FileLog("[%10d] SPU2 Close\n",Cycles);
-	spu2open=0;

 	DspCloseLibrary();
-
 	spdif_shutdown();
-
 	SndClose();
+
+	// Clear the open flag only after the subsystems above have been closed.
+	spu2open = false;
 }

 void CALLBACK SPU2shutdown() 
 {
-	if(spu2open) SPU2close();
+	if(!spu2init) return;
+
+	ConLog( " * SPU2: Shutting down.\n" );
+
+	SPU2close();

 #ifdef S2R_ENABLE
 	if(!replay_mode)
@ -390,8 +463,20 @@ void CALLBACK SPU2shutdown()
 	if(WaveLog && wavedump_ok) wavedump_close();

 	DMALogClose();
+
+	spu2init = false;
+
 	free(spu2regs);
 	free(_spu2mem);
+
+	free( pcm_cache_flags );
+	free( pcm_cache_data );
+
+	spu2regs = NULL;
+	_spu2mem = NULL;
+	pcm_cache_flags = NULL;
+	pcm_cache_data = NULL;
+
 #ifdef SPU2_LOG
 	if(!AccessLog) return;
 	FileLog("[%10d] SPU2shutdown\n",Cycles);
@ -427,8 +512,8 @@ BOOL DrawRectangle(HDC dc, int left, int top, int width, int height)
 	return Polyline(dc, p, 5);
 }

+#ifndef PUBLIC
 HFONT hf = NULL;
-
 int lCount=0;
 void UpdateDebugDialog()
 {
@ -455,6 +540,7 @@ void UpdateDebugDialog()
 				int IX = 8+256*c;
 				int IY = 8+ 32*v;
 				V_Voice& vc(Cores[c].Voices[v]);
+				V_VoiceDebug& vcd( DebugCores[c].Voices[v] );

 				SetDCBrushColor(hdc,RGB(  0,  0,  0));
 				if((vc.ADSR.Phase>0)&&(vc.ADSR.Phase<6))
@ -463,11 +549,11 @@ void UpdateDebugDialog()
 				}
 				else
 				{
-					if(vc.lastStopReason==1)
+					if(vcd.lastStopReason==1)
 					{
 						SetDCBrushColor(hdc,RGB(128,  0,  0));
 					}
-					if(vc.lastStopReason==2)
+					if(vcd.lastStopReason==2)
 					{
 						SetDCBrushColor(hdc,RGB(  0,128,  0));
 					}
@ -491,7 +577,7 @@ void UpdateDebugDialog()

 				FillRectangle(hdc,IX+48,IY+26 - adsr, 4, adsr);

-				int peak = vc.displayPeak * 24 / 32768;
+				int peak = vcd.displayPeak * 24 / 32768;

 				FillRectangle(hdc,IX+56,IY+26 - peak, 4, peak);

@ -509,13 +595,13 @@ void UpdateDebugDialog()
 				sprintf(t,"%06x",vc.LoopStartA);
 				TextOut(hdc,IX+4,IY+21,t,6);

-				vc.displayPeak = 0;
+				vcd.displayPeak = 0;

-				if(vc.lastSetStartA != vc.StartA)
+				if(vcd.lastSetStartA != vc.StartA)
 				{
 					printf(" *** Warning! Core %d Voice %d: StartA should be %06x, and is %06x.\n",
-						c,v,vc.lastSetStartA,vc.StartA);
-					vc.lastSetStartA = vc.lastSetStartA;
+						c,v,vcd.lastSetStartA,vc.StartA);
+					vcd.lastSetStartA = vcd.lastSetStartA;
 				}
 			}
 		}
@ -530,6 +616,7 @@ void UpdateDebugDialog()
 		DispatchMessage(&msg);
 	}
 }
+#endif

 //SHOULD be 768, but 751/752 seems to get better results
 #define TickInterval 768
@ -558,12 +645,20 @@ DWORD CALLBACK TimeThread(PVOID /* unused param */)
 	return 0;
 }

-void CALLBACK TimeUpdate(u32 cClocks, u32 syncType)
+void __fastcall TimeUpdate(u32 cClocks, u32 syncType)
 {
 	u32 dClocks = cClocks-lClocks;

-	// HACKY but should work anyway.
-	if(lClocks==0) lClocks = cClocks;
+	// [Air]: Sanity Check
+	//  If for some reason our clock value seems way off base, just mix
+	//  out a little bit, skip the rest, and hope the ship "rights" itself later on.
+
+	if( dClocks > TickInterval*32 )
+	{
+		ConLog( " * SPU2 > TimeUpdate > Sanity Check Failed: %d (cc: %d)\n", dClocks/TickInterval, cClocks/TickInterval );
+		dClocks = TickInterval*32;
+		lClocks = cClocks-dClocks;
+	}

 	//Update Mixing Progress
 	while(dClocks>=TickInterval)
@ -636,12 +731,15 @@ void CALLBACK TimeUpdate(u32 cClocks, u32 syncType)

 bool numpad_minus_old=false;
 bool numpad_minus = false;
-u32 timer=0,time1=0,time2=0;
+
 void CALLBACK SPU2async(u32 cycles) 
 {
-	u32 oldClocks = lClocks;
-	timer++;
+	if( disableEverything ) return;

+#ifndef PUBLIC
+	u32 oldClocks = lClocks;
+	static u32 timer=0,time1=0,time2=0;
+	timer++;
 	if (timer == 1){
 		time1=timeGetTime();
 	}
@ -649,6 +747,8 @@ void CALLBACK SPU2async(u32 cycles)
 		time2 = timeGetTime()-time1 ;
 		timer=0;
 	}
+#endif
+
 	DspUpdate();

 	if(LimiterToggleEnabled)
@ -765,6 +865,8 @@ void CALLBACK SPU_ps1_write(u32 mem, u16 value)
 				Cores[0].Voices[voice].ADSR.Reg_ADSR2 = value;	break;
 			case 6:	Cores[0].Voices[voice].ADSR.Value=value;	break;
 			case 7:	Cores[0].Voices[voice].LoopStartA=(u32)value <<8;	break;
+
+			jNO_DEFAULT;
 		}
 	}
 	else switch(reg)
@ -890,6 +992,8 @@ u16 CALLBACK SPU_ps1_read(u32 mem)
 			case 5: value=Cores[0].Voices[voice].ADSR.Reg_ADSR2;	break;
 			case 6:	value=Cores[0].Voices[voice].ADSR.Value;	break;
 			case 7:	value=Cores[0].Voices[voice].LoopStartA;	break;
+
+			jNO_DEFAULT;
 		}
 	}
 	else switch(reg)
@ -1147,6 +1251,8 @@ void CALLBACK SPU2writeLog(u32 rmem, u16 value)

 void CALLBACK SPU2write(u32 rmem, u16 value) 
 {
+	if( disableEverything ) return;
+
 #ifdef S2R_ENABLE
 	if(!replay_mode)
 		s2r_writereg(Cycles,rmem,value);
@ -1160,7 +1266,7 @@ void CALLBACK SPU2write(u32 rmem, u16 value)
 			Spdif.Info=4;
 			SetIrqCall();
 		}
-		spu2Mu16(Cores[0].TSA++)=value;
+		spu2M_Write( Cores[0].TSA++, value );
 		Cores[0].TSA&=0xfffff;

 		return;
@ -1173,7 +1279,7 @@ void CALLBACK SPU2write(u32 rmem, u16 value)
 			Spdif.Info=4;
 			SetIrqCall();
 		}
-		spu2Mu16(Cores[1].TSA++)=value;
+		spu2M_Write( Cores[1].TSA++, value );
 		Cores[1].TSA&=0xfffff;

 		return;
@ -1240,6 +1346,8 @@ void CALLBACK SPU2write(u32 rmem, u16 value)
 			case 5:	Cores[core].Voices[voice].ADSR.Value=value;		break;
 			case 6:	Cores[core].Voices[voice].VolumeL.Value=value;	break;
 			case 7:	Cores[core].Voices[voice].VolumeR.Value=value;	break;
+
+			jNO_DEFAULT;
 		}
 	}
 	else if ((omem >= 0x01C0) && (omem < 0x02DE)) {
@ -1249,10 +1357,14 @@ void CALLBACK SPU2write(u32 rmem, u16 value)
 		
 		switch (address) {
 			case 0:	Cores[core].Voices[voice].StartA=((value & 0x0F) << 16) | (Cores[core].Voices[voice].StartA & 0xFFF8); 
-					Cores[core].Voices[voice].lastSetStartA = Cores[core].Voices[voice].StartA; 
+					#ifndef PUBLIC
+					DebugCores[core].Voices[voice].lastSetStartA = Cores[core].Voices[voice].StartA; 
+					#endif
 					break;
 			case 1:	Cores[core].Voices[voice].StartA=(Cores[core].Voices[voice].StartA & 0x0F0000) | (value & 0xFFF8); 
-					Cores[core].Voices[voice].lastSetStartA = Cores[core].Voices[voice].StartA; 
+					#ifndef PUBLIC
+					DebugCores[core].Voices[voice].lastSetStartA = Cores[core].Voices[voice].StartA; 
+					#endif
 					//if(core==1) printf(" *** StartA for C%dV%02d set to 0x%05x\n",core,voice,Cores[core].Voices[voice].StartA);
 					break;
 			case 2:	Cores[core].Voices[voice].LoopStartA=((value & 0x0F) << 16) | (Cores[core].Voices[voice].LoopStartA & 0xFFF8);
@ -1471,6 +1583,7 @@ void CALLBACK SPU2write(u32 rmem, u16 value)

 u16  CALLBACK SPU2read(u32 rmem) 
 {
+	if( disableEverything ) return 0;

 //	if(!replay_mode)
 //		s2r_readreg(Cycles,rmem);
@ -1516,35 +1629,22 @@ s32 CALLBACK SPU2test() {
 	return SndTest();
 }

+// Number of 16-byte ADPCM blocks in the 2MB (0x200000) of SPU2 memory.
+#define PCM_CACHE_BLOCK_COUNT ( 0x200000 / 16 )
+
+// Savestate image of the ADPCM decoder cache.
+struct cacheFreezeData
+{
+	// One bit per cache block, packed 32 to a word (copy of pcm_cache_flags).
+	u32 flags[PCM_CACHE_BLOCK_COUNT/32];
+	// First sample of the saved block data.  SPU2freeze takes the address of
+	// this member and reads/writes 28 samples from there for every block
+	// whose flag is set, so the data extends past the end of this struct.
+	s16 startData;
+};
+
 typedef struct 
 {
-	// compatibility with zerospu2
-    u32 version;
+	// compatibility with zerospu2 removed...
+
+	u32 version;
 	u8 unkregs[0x10000];
 	u8 mem[0x200000];
-    u16 interrupt;
-    int nSpuIrq[2];
-    u32 dwNewChannel2[2], dwEndChannel2[2];
-    u32 dwNoiseVal;
-    int iFMod[48];
-    u32 MemAddr[2];

-	struct ADMA
-	{
-		unsigned short * MemAddr;
-		int			  Index;
-		int			  AmountLeft;
-		int			  Enabled;
-	} adma[2];
-    u32 Adma4MemAddr, Adma7MemAddr;
-
-    int SPUCycles, SPUWorkerCycles;
-    int SPUStartCycle[2];
-    int SPUTargetCycle[2];
-
-    int voicesize;
-
-	// compatibility with zerospu2
 	u32 id;
 	V_Core Cores[2];
 	V_SPDIF Spdif;
@ -1560,34 +1660,89 @@ typedef struct

 	int lClocks;

+	cacheFreezeData cacheData;
+
 } SPU2freezeData;

-#define ZEROSPU_VERSION 0x70000001
+// No more ZeroSPU compatibility...
+//#define ZEROSPU_VERSION 0x70000001
+
 #define SAVE_ID 0x73326701

-s32  CALLBACK SPU2freeze(int mode, freezeData *data){
+// versioning for saves.
+// Increment this if changes to V_Core or V_Voice structs are made.
+// Chances are we'll never explicitly support older save versions,
+// but might as well version them anyway.  Could come in handy someday!
+#define SAVE_VERSION 0x0100

-	SPU2freezeData *spud;
+// Returns the number of bytes reported for FREEZE_SIZE: the fixed
+// SPU2freezeData struct plus 28 samples (56 bytes) for every ADPCM cache
+// block whose flag is set in pcm_cache_flags.
+static int getFreezeSize()
+{
+	// Disabled state: the save path strcpy()s the id "invalid" into the
+	// buffer, which writes the 7 characters plus the terminating NUL.
+	// NOTE(review): assumes the host sizes the save buffer from this value.
+	if( disableEverything ) return 8;

-	if (mode == FREEZE_LOAD) {
+	int size = sizeof(SPU2freezeData);

-		spud = (SPU2freezeData*)data->data;
+	// calculate the amount of memory consumed by our cache:

-		if(spud->id!=SAVE_ID)
+	//size += PCM_CACHE_BLOCK_COUNT / 8;
+
+	for( int bidx=0; bidx<PCM_CACHE_BLOCK_COUNT; bidx++ )
+	{
+		const u32 flagmask = 1ul << (bidx & 31);
+		if( pcm_cache_flags[bidx>>5] & flagmask )
 		{
-			printf("SPU2Ghz Warning:\n");
+			size += 28*2;
+		}
+	}
+	return size;
+}
+
+
+s32 CALLBACK SPU2freeze(int mode, freezeData *data)
+{
+	if (mode == FREEZE_LOAD)
+	{
+		const SPU2freezeData *spud = (SPU2freezeData*)data->data;
+
+		if( spud->id != SAVE_ID || spud->version != SAVE_VERSION )
+		{
+			// [Air]: Running the SPU2 from an "empty" state this way is pretty unreliable.
+			//  It usually didn't crash at least, but it never output sound anyway and would
+			//  confuse the new cache system.
+			//
+			//  To fix it I introduced a new global flag that disables the SPU2 logic completely.
+			//  This is the safest way to recover from an unsupported SPU2 save, since it pretty
+			//  well guarantees the user will have a stable enough environment to reach a save spot.
+
+			printf("\n*** SPU2Ghz Warning:\n");
 			printf("The savestate you are trying to load was not made with this plugin.\n");
-			printf("Let it try to load a while, it could take up to one minute\n");
-			printf("If it loads ok try to reach the next memorycard savespot, save your game and continue from there.\n");
+			printf("Sound will be disabled until the emulator is reset.\n");
+			printf("Find a memorycard savespot to save your game, reset, and then continue from there.\n\n");
+
+			// Clear stuff, not that it matters:
+
+			disableEverything=true;
 			lClocks = 0;
+			resetClock = true;
+
+			// Reset the cores.
+
+			CoreReset( 0 );
+			CoreReset( 1 );
+
+			// adpcm cache : Just clear all the cache flags, which forces the mixer
+			//   to re-decode everything.
+
+			memset( pcm_cache_flags, 0, (0x200000 / (16*32)) * 4 );
+			memset( pcm_cache_data, 0, (0x200000 / 16) * 28 * 2 );
 		}
 		else
 		{
+			disableEverything=false;
+
 			// base stuff
 			memcpy(spu2regs, spud->unkregs, 0x010000);
 			memcpy(_spu2mem, spud->mem,     0x200000);

-			
 			memcpy(Cores, spud->Cores, sizeof(Cores));
 			memcpy(&Spdif, &spud->Spdif, sizeof(Spdif));
 			OutPos=spud->OutPos;
@ -1599,21 +1754,62 @@ s32  CALLBACK SPU2freeze(int mode, freezeData *data){
 			opitch=spud->opitch;
 			osps=spud->osps;
 			PlayMode=spud->PlayMode;
-			lClocks = spud->lClocks;	
+			lClocks = spud->lClocks;
+
+			// Load the ADPCM cache:
+
+			const cacheFreezeData &cfd = spud->cacheData;
+			const s16* pcmSrc = &cfd.startData;
+
+			memcpy( pcm_cache_flags, cfd.flags, PCM_CACHE_BLOCK_COUNT / 8 );
+
+			int blksLoaded=0;
+
+			for( int bidx=0; bidx<PCM_CACHE_BLOCK_COUNT; bidx++ )
+			{
+				const u32 flagmask = 1ul << (bidx & 31);
+				if( cfd.flags[bidx>>5] & flagmask )
+				{
+					// load a cache block!
+					memcpy( &pcm_cache_data[bidx*28], pcmSrc, 28*2 );
+					pcmSrc += 28;
+					blksLoaded++;
+				}
+			}
+
+			// Go through the V_Voice structs and replace the SBuffer pointer
+			// with an absolute address into our cache buffer this session.
+
+			for( int c=0; c<2; c++ )
+			{
+				for( int v=0; v<24; v++ )
+				{
+					Cores[c].Voices[v].SBuffer = (s16*) ((u64)spud->Cores[c].Voices[v].SBuffer + (u64)pcm_cache_data );
+				}
+			}
+
+			//printf( " * SPU2 > FreezeLoad > Loaded %d cache blocks.\n", blksLoaded++ );
 		}

-	} else if (mode == FREEZE_SAVE) {
-
-		data->size = sizeof(SPU2freezeData);
-
-		data->data = (s8*)malloc(data->size);
-
+	} else if (mode == FREEZE_SAVE)
+	{
 		if (data->data == NULL) return -1;

-		spud = (SPU2freezeData*)data->data;
+		if( disableEverything )
+		{
+			// No point in making a save state since the SPU2
+			// state is completely bogus anyway... Let's just
+			// give this some random ID that no one will recognize.
+
+			strcpy( data->data, "invalid" );
+			return 0;
+		}
+
+
+		SPU2freezeData *spud = (SPU2freezeData*)data->data;

 		spud->id=SAVE_ID;
-		spud->version=SAVE_ID;//ZEROSPU_VERSION; //Zero compat working bad, better not save that
+		spud->version=SAVE_VERSION;//ZEROSPU_VERSION; //Zero compat working bad, better not save that

 		memcpy(spud->unkregs, spu2regs, 0x010000);
 		memcpy(spud->mem,     _spu2mem, 0x200000);
@ -1630,10 +1826,52 @@ s32  CALLBACK SPU2freeze(int mode, freezeData *data){
 		spud->PlayMode=PlayMode;
 		spud->lClocks = lClocks;

-	} else if (mode == FREEZE_SIZE) {
-		data->size = sizeof(SPU2freezeData);
-	}
+		// Save our cache:
+		//   We could just force the user to rebuild the cache when loading
+		//   from savestates, but for most games the cache is pretty
+		//   small and compresses well.
+		//
+		// Potential Alternative:
+		//   If the cache is not saved then it is necessary to save the
+		//   decoded blocks currently in use by active voices.  This allows
+		//   voices to resume seamlessly on load.

+		cacheFreezeData &cfd = spud->cacheData;
+		s16* pcmDst = &cfd.startData;
+
+		memcpy( cfd.flags, pcm_cache_flags, sizeof(cfd.flags) );
+
+		int blksSaved=0;
+		for( int bidx=0; bidx<PCM_CACHE_BLOCK_COUNT; bidx++ )
+		{
+			const u32 flagmask = 1ul << (bidx & 31);
+			if( cfd.flags[bidx>>5] & flagmask )
+			{
+				// save a cache block!
+				memcpy( pcmDst, &pcm_cache_data[bidx*28], 28*2 );
+				pcmDst += 28;
+				blksSaved++;
+			}
+		}
+
+		// Time to go through the V_Voice structs and replace the SBuffer pointer
+		// with a relative address that can be applied later on when the state is loaded.
+
+		for( int c=0; c<2; c++ )
+		{
+			for( int v=0; v<24; v++ )
+			{
+				spud->Cores[c].Voices[v].SBuffer = 
+					(s16*) ((u64)spud->Cores[c].Voices[v].SBuffer - (u64)pcm_cache_data );
+			}
+		}
+		//printf( " * SPU2 > FreezeSave > Saved %d cache blocks.\n", blksSaved++ );
+
+	}
+	else if (mode == FREEZE_SIZE)
+	{
+		data->size = getFreezeSize();
+	}
 	return 0;

 }
@ -1654,25 +1892,23 @@ void VoiceStart(int core,int vc)
 		Cores[core].Voices[vc].PlayCycle=Cycles;
 		Cores[core].Voices[vc].SCurrent=28;
 		Cores[core].Voices[vc].LoopMode=0;
-		Cores[core].Voices[vc].Loop=0;
-		Cores[core].Voices[vc].LoopStart=0;
-		Cores[core].Voices[vc].LoopEnd=0;
+		Cores[core].Voices[vc].LoopFlags=0;
 		Cores[core].Voices[vc].LoopStartA=Cores[core].Voices[vc].StartA;
 		Cores[core].Voices[vc].NextA=Cores[core].Voices[vc].StartA;
-		Cores[core].Voices[vc].FirstBlock=1;
 		Cores[core].Voices[vc].Prev1=0;
 		Cores[core].Voices[vc].Prev2=0;

 		// [Air]: Don't wipe interpolation values on VoiceStart.
-		//   There'll be less popping/clicking if we just interpolate from the
-		//   old sample and the new sample.
+		//   There should be less popping/clicking if we just interpolate from the
+		//   old sample into the new sample.

-		//Cores[core].Voices[vc].PV1=Cores[core].Voices[vc].PV2=0;
-		//Cores[core].Voices[vc].PV3=Cores[core].Voices[vc].PV4=0;
+		Cores[core].Voices[vc].PV1=Cores[core].Voices[vc].PV2=0;
+		Cores[core].Voices[vc].PV3=Cores[core].Voices[vc].PV4=0;

 		Cores[core].Regs.ENDX&=~(1<<vc);

-
+		#ifndef PUBLIC
+		DebugCores[core].Voices[vc].FirstBlock=1;
 		if(core==1)
 		{
 			if(MsgKeyOnOff) ConLog(" * SPU2: KeyOn: C%dV%02d: SSA: %8x; M: %s%s%s%s; H: %02x%02x; P: %04x V: %04x/%04x; ADSR: %04x%04x\n",
@ -1684,6 +1920,7 @@ void VoiceStart(int core,int vc)
 						Cores[core].Voices[vc].VolumeL.Value,Cores[core].Voices[vc].VolumeR.Value,
 						Cores[core].Voices[vc].ADSR.Reg_ADSR1,Cores[core].Voices[vc].ADSR.Reg_ADSR2);
 		}
+		#endif
 	}
 	else
 	{
@ -1699,8 +1936,8 @@ void VoiceStop(int core,int vc)
 	// [Air]: Wipe the interpolation values here, since stopped voices
 	//   are essentially silence (and any new voices shold thusly interpolate up from
 	//   such silence)
-	Cores[core].Voices[vc].PV1=Cores[core].Voices[vc].PV2=0;
-	Cores[core].Voices[vc].PV3=Cores[core].Voices[vc].PV4=0;
+	//Cores[core].Voices[vc].PV1=Cores[core].Voices[vc].PV2=0;
+	//Cores[core].Voices[vc].PV3=Cores[core].Voices[vc].PV4=0;

 	//Cores[core].Regs.ENDX|=(1<<vc);
 }
@ -1734,6 +1971,8 @@ void StopVoices(int core, u32 value)
 // for now, pData is not used
 int CALLBACK SPU2setupRecording(int start, void* pData)
 {
+	if( disableEverything ) return 0;
+
 	if(start==0)
 	{
 		//stop recording
--- a/plugins/spu2ghz/spu2.h
+++ b/plugins/spu2ghz/spu2.h
@ -93,13 +93,19 @@ extern void spdif_get_samples(s32 *samples); // fills the buffer with [l,r,c,lfe
 extern short *spu2regs;
 extern short *_spu2mem;

-extern s16 __forceinline *GetMemPtr(u32 addr);
+extern u32 *pcm_cache_flags;
+extern s16 *pcm_cache_data;
+
+extern s16 __forceinline * __fastcall GetMemPtr(u32 addr);
+extern s16 __forceinline __fastcall spu2M_Read( u32 addr );
+extern void __inline __fastcall spu2M_Write( u32 addr, s16 value );
+extern void __inline __fastcall spu2M_Write( u32 addr, u16 value );

 #define spu2Rs16(mmem)	(*(s16 *)((s8 *)spu2regs + ((mmem) & 0x1fff)))
 #define spu2Ru16(mmem)	(*(u16 *)((s8 *)spu2regs + ((mmem) & 0x1fff)))

-#define spu2Ms16(mmem)	(*GetMemPtr((mmem) & 0xfffff))
-#define spu2Mu16(mmem)	(*(u16*)GetMemPtr((mmem) & 0xfffff))
+//#define spu2Ms16(mmem)	(*GetMemPtr((mmem) & 0xfffff))
+//#define spu2Mu16(mmem)	(*(u16*)GetMemPtr((mmem) & 0xfffff))

 void SysMessage(char *fmt, ...);

@ -135,7 +141,9 @@ extern u32 lClocks;
 extern u32* cPtr;
 extern bool hasPtr;

-void CALLBACK TimeUpdate(u32 cClocks, u32 syncType);
+extern bool disableEverything;
+
+void __fastcall TimeUpdate(u32 cClocks, u32 syncType);

 void TimestretchUpdate(int bufferusage,int buffersize);

--- a/plugins/spu2ghz/waveout.cpp
+++ b/plugins/spu2ghz/waveout.cpp
@ -104,6 +104,9 @@ public:
 		wformat.nAvgBytesPerSec=(wformat.nSamplesPerSec * wformat.nBlockAlign);
 		wformat.cbSize=0;
 		
+		qbuffer=new s16[BufferSize*MAX_BUFFER_COUNT];
+		tbuffer=new s32[BufferSize];
+
 		woores = waveOutOpen(&hwodevice,WAVE_MAPPER,&wformat,0,0,0);
 		if (woores != MMSYSERR_NOERROR)
 		{
@ -112,9 +115,6 @@ public:
 			return -1;
 		}

-		qbuffer=new s16[BufferSize*MAX_BUFFER_COUNT];
-		tbuffer=new s32[BufferSize];
-
 		for(int i=0;i<MAX_BUFFER_COUNT;i++)
 		{
 			whbuffer[i].dwBufferLength=BufferSizeBytes;
@ -127,6 +127,12 @@ public:
 			whbuffer[i].reserved=0;
 			waveOutPrepareHeader(hwodevice,whbuffer+i,sizeof(WAVEHDR));
 			whbuffer[i].dwFlags|=WHDR_DONE; //avoid deadlock
+
+			// Feed blocks into the device.
+			// It'll all be empty samples, but it helps reduce some of the pop-on-init.
+
+			//whbuffer[i].dwFlags&=~WHDR_DONE;
+			//waveOutWrite(hwodevice,&whbuffer[i],sizeof(WAVEHDR));
 		}

 		// Start Thread