Using this as a base now, it works very well for me. Thanks to Jake.Stine's patch the plugin got a bit faster ;)

I made a small mistake in the last 2 commits; it's fine again. git-svn-id: http://pcsx2-playground.googlecode.com/svn/trunk@204 a6443dda-0b58-4228-96e9-037be469359c
2008-10-16 14:12:40 +00:00 · 2008-10-16 14:12:40 +00:00 · e1381992e5
parent a6eba17dfa
commit e1381992e5
9 changed files with 533 additions and 318 deletions
--- a/plugins/spu2ghz/decoder.cpp
+++ b/plugins/spu2ghz/decoder.cpp
@ -58,7 +58,7 @@ int state=0;
 FILE *fSpdifDump;

 extern u32 core;
-void __fastcall ReadInput(s32& PDataL,s32& PDataR);
+void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR);

 union spdif_frame { // total size: 32bits
 	struct {
@ -134,10 +134,11 @@ void spdif_update()
 {
 	s32 Data,Zero;

+	core=0;
+	V_Core& thiscore( Cores[core] );
 	for(int i=0;i<data_rate;i++)
 	{
-		core=0;
-		ReadInput(Data,Zero);
+		ReadInput(thiscore, Data,Zero);
 		
 		if(fSpdifDump)
 		{
--- a/plugins/spu2ghz/defs.h
+++ b/plugins/spu2ghz/defs.h
@ -55,6 +55,7 @@ typedef struct {
 	u8 Releasing;
 } V_ADSR;

+
 typedef struct {
 // SPU2 cycle where the Playing started
 	u32 PlayCycle;
@ -107,7 +108,12 @@ typedef struct {
 	s32 PeakX;
 	s32 SampleData;

-	s32 SBuffer[32];
+	// [Air]: Changed SBuffer from 32-bit to 16-bit. (this breaks old savestates)
+	//   Everything stored in SBuffer is 16-bit values, and on modern CPUs the benefit
+	//   of reduced data cache clutter out-weighs the benefit of using 'cpu native' 32-bit
+	//   values. (doesn't apply to SIMD of course, but no SIMD here anyway)
+	//   Because this breaks savestates it might not be worth the bother though.
+	s16 SBuffer[32];
 	s32 SCurrent;

 	s32 displayPeak;
--- a/plugins/spu2ghz/dsoundout.cpp
+++ b/plugins/spu2ghz/dsoundout.cpp
@ -119,6 +119,7 @@ private:

 	DWORD CALLBACK Thread()
 	{
+
 		while( dsound_running )
 		{
 			u32 rv = WaitForMultipleObjects(MAX_BUFFER_COUNT,buffer_events,FALSE,400);
@ -126,25 +127,31 @@ private:
 			LPVOID p1,p2;
 			DWORD s1,s2;
 	 
-			for(int i=0;i<MAX_BUFFER_COUNT;i++)
+			u32 poffset=BufferSizeBytes * rv;
+
+		    //DWORD          play, write;
+			//buffer->GetCurrentPosition( &play, &write );
+			//ConLog( " * SPU2 > Play: %d   Write: %d  poffset: %d\n", play, write, poffset );
+
+			buff->ReadSamples(tbuffer,BufferSize);
+
+			verifyc(buffer->Lock(poffset,BufferSizeBytes,&p1,&s1,&p2,&s2,0));
+			
 			{
-				if (rv==WAIT_OBJECT_0+i)
+				s16 *t = (s16*)p1;
+				s32 *s = (s32*)tbuffer;
+				for(int j=0;j<BufferSize;j++)
 				{
-					u32 poffset=BufferSizeBytes * i;
-
-					buff->ReadSamples(tbuffer,BufferSize);
-
-					verifyc(buffer->Lock(poffset,BufferSizeBytes,&p1,&s1,&p2,&s2,0));
-					s16 *t = (s16*)p1;
-					s32 *s = (s32*)tbuffer;
-					for(int j=0;j<BufferSize;j++)
-					{
-						*(t++) = (s16)((*(s++))>>8);
-					}
-					verifyc(buffer->Unlock(p1,s1,p2,s2));
-
+					*(t++) = (s16)((*(s++))>>8);
 				}
 			}
+
+			/*if( p2 != NULL )
+			{
+				ConLog( " * SPU2 > DSound Driver Loop-Around Occured.  Length: %d", s2 );
+			}*/
+
+			verifyc(buffer->Unlock(p1,s1,p2,s2));
 		}
 		return 0;
 	}
--- a/plugins/spu2ghz/mixer.cpp
+++ b/plugins/spu2ghz/mixer.cpp
@ -15,6 +15,14 @@
 //License along with this library; if not, write to the Free Software
 //Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 //
+
+// [Air] Notes ----->
+//  Adding 'static' to the __forceinline methods hints to the linker that it need not
+//  actually include procedural versions of the methods in the DLL.  Under normal circumstances
+//  the compiler will still generate the procedures even though they are never used (the inline
+//  code is used instead).  Using static reduced the size of my generated .DLL by a few KB.
+//   (doesn't really make anything faster, but eh... whatever :)
+//
 #include "spu2.h"

 #include <assert.h>
@ -42,6 +50,7 @@ double srate_pv=1.0;

 extern u32 PsxRates[160];

+
 void InitADSR()                                    // INIT ADSR
 {
 	for (int i=0; i<(32+128); i++)
@ -71,18 +80,18 @@ const s32 f[5][2] ={{    0,   0 },
 					{   98, -55 },
 					{  122, -60 }};

-s32 __forceinline XA_decode(s32 pred1, s32 pred2, s32 shift, s32& prev1, s32& prev2, s32 data)
+static s16 __forceinline XA_decode(s32 pred1, s32 pred2, s32 shift, s32& prev1, s32& prev2, s32 data)
 {
-	s32 pcm =data>>shift;
+	s32 pcm = data>>shift;
 	pcm+=((pred1*prev1)+(pred2*prev2))>>6;
 	if(pcm> 32767) pcm= 32767;
 	if(pcm<-32768) pcm=-32768;
 	prev2=prev1;
 	prev1=pcm;
-	return pcm;
+	return (s16)pcm;
 }

-s32 __forceinline XA_decode_block(s32* buffer, s16* block, s32& prev1, s32& prev2)
+static s16 __forceinline XA_decode_block(s16* buffer, const s16* block, s32& prev1, s32& prev2)
 {
 	s32 data=*block;
 	s32 Shift	 =  ((data>> 0)&0xF)+16;
@ -102,11 +111,84 @@ s32 __forceinline XA_decode_block(s32* buffer, s16* block, s32& prev1, s32& prev
 	return data;
 }

-void __forceinline IncrementNextA()
+static s16 __forceinline XA_decode_block_fast(s16* buffer, const s16* block, s32& prev1, s32& prev2)
 {
-	V_Voice& vc(Cores[core].Voices[voice]);
+	s32 header = *block;
+	s32 shift =  ((header>> 0)&0xF)+16;
+	s32 pred1 = f[(header>> 4)&0xF][0];
+	s32 pred2 = f[(header>> 4)&0xF][1];

-	if((vc.NextA==Cores[core].IRQA)&&(Cores[core].IRQEnable)) { 
+	const s8* blockbytes = (s8*)&block[1];
+
+	for(int i=0; i<14; i++, blockbytes++)
+	{
+		s32 pcm, pcm2;
+		{
+			s32 data = ((*blockbytes)<<28) & 0xF0000000;
+			pcm = data>>shift;
+			pcm+=((pred1*prev1)+(pred2*prev2))>>6;
+			if(pcm> 32767) pcm= 32767;
+			if(pcm<-32768) pcm=-32768;
+			*(buffer++) = pcm;
+		}
+
+		//prev2=prev1;
+		//prev1=pcm;
+
+		{
+			s32 data = ((*blockbytes)<<24) & 0xF0000000;
+			pcm2 = data>>shift;
+			pcm2+=((pred1*pcm)+(pred2*prev1))>>6;
+			if(pcm2> 32767) pcm2= 32767;
+			if(pcm2<-32768) pcm2=-32768;
+			*(buffer++) = pcm2;
+		}
+
+		prev2=pcm;
+		prev1=pcm2;
+	}
+
+	return header;
+}
+
+static s16 __forceinline XA_decode_block_unsaturated(s16* buffer, const s16* block, s32& prev1, s32& prev2)
+{
+	s32 header = *block;
+	s32 shift =  ((header>> 0)&0xF)+16;
+	s32 pred1 = f[(header>> 4)&0xF][0];
+	s32 pred2 = f[(header>> 4)&0xF][1];
+
+	const s8* blockbytes = (s8*)&block[1];
+
+	for(int i=0; i<14; i++, blockbytes++)
+	{
+		s32 pcm, pcm2;
+		{
+			s32 data = ((*blockbytes)<<28) & 0xF0000000;
+			pcm = data>>shift;
+			pcm+=((pred1*prev1)+(pred2*prev2))>>6;
+			// [Air] : Fast method, no saturation is performed.
+			*(buffer++) = pcm;
+		}
+
+		{
+			s32 data = ((*blockbytes)<<24) & 0xF0000000;
+			pcm2 = data>>shift;
+			pcm2+=((pred1*pcm)+(pred2*prev1))>>6;
+			// [Air] : Fast method, no saturation is performed.
+			*(buffer++) = pcm2;
+		}
+
+		prev2=pcm;
+		prev1=pcm2;
+	}
+
+	return header;
+}
+
+static void __forceinline IncrementNextA( const V_Core& thiscore, V_Voice& vc )
+{
+	if((vc.NextA==thiscore.IRQA)&&(thiscore.IRQEnable)) { 
 		ConLog(" * SPU2: IRQ Called (IRQ passed).\n"); 
 		Spdif.Info=4<<core;
 		SetIrqCall();
@ -116,12 +198,11 @@ void __forceinline IncrementNextA()
 	vc.NextA&=0xFFFFF;
 }

-void __fastcall GetNextDataBuffered(s32& Data) 
-{
-	static s32 pcm=0;
-	static s32 data=0;

-	V_Voice& vc(Cores[core].Voices[voice]);
+static void __fastcall GetNextDataBuffered( V_Core& thiscore, V_Voice& vc, s32& Data) 
+{
+	//static s32 pcm=0;
+	s16 data=0;

 	if (vc.SCurrent>=28) 
 	{
@ -135,31 +216,48 @@ void __fastcall GetNextDataBuffered(s32& Data)
 			{
 				if(MsgVoiceOff) ConLog(" * SPU2: Voice Off by EndPoint: %d \n", voice);
 				VoiceStop(core,voice);
-				Cores[core].Regs.ENDX|=1<<voice;
+				thiscore.Regs.ENDX|=1<<voice;
 				vc.lastStopReason = 1;
 			}
 		}

-		data = XA_decode_block(vc.SBuffer,GetMemPtr(vc.NextA&0xFFFFF), vc.Prev1, vc.Prev2);
+		// [Air]: Original ADPCM decoder.
+		//data = XA_decode_block(vc.SBuffer,GetMemPtr(vc.NextA&0xFFFFF), vc.Prev1, vc.Prev2);
+
+		// [Air]: Testing of a new saturated decoder. (benchmark needed)
+		//   My gut tells me that this should be faster, but you never can tell with these types
+		//   of things.  Benchmark it against the original and see what you think.
+
+		//data = XA_decode_block_fast(vc.SBuffer,GetMemPtr(vc.NextA&0xFFFFF), vc.Prev1, vc.Prev2);
+
+		// [Air]: Testing use of a new unsaturated decoder. (benchmark needed)
+		//   Chances are the saturation isn't needed, but for a very few exception games.
+		//   This is definitely faster than either of the above versions, but the question is by how
+		//   much (biggest impact will be on games like Xenosaga2, which use lots of SPU2 voices).
+		//   If the speed boost is worth it then maybe it should be added as a speedhack option
+		//   in the spu2ghz config.
+
+		data = XA_decode_block_unsaturated(vc.SBuffer,GetMemPtr(vc.NextA&0xFFFFF), vc.Prev1, vc.Prev2);
+
 		vc.LoopEnd  =   (data>> 8)&1;
 		vc.Loop     =   (data>> 9)&1;
 		vc.LoopStart=   (data>>10)&1;
-		vc.FirstBlock=0;
 		vc.SCurrent = 0;
+		vc.FirstBlock = 0;

-		if (vc.LoopStart&&!vc.LoopMode)
+		if( vc.LoopStart && !vc.LoopMode )
 		{
 			vc.LoopStartA=vc.NextA; 
 		}

-		IncrementNextA();
+		IncrementNextA( thiscore, vc );
 	}

 	Data=vc.SBuffer[vc.SCurrent];

 	if((vc.SCurrent&3)==3)
 	{
-		IncrementNextA();
+		IncrementNextA( thiscore, vc );
 	}
 	vc.SCurrent++;
 }
@ -170,9 +268,9 @@ void __fastcall GetNextDataBuffered(s32& Data)

 const int InvExpOffsets[] = { 0,4,6,8,9,10,11,12 };

-void __forceinline CalculateADSR() 
+static void __forceinline CalculateADSR( V_Voice& vc ) 
 {
-	V_ADSR& env(Cores[core].Voices[voice].ADSR);
+	V_ADSR& env(vc.ADSR);
 	u32 SLevel=((u32)env.Sl)<<27;
 	u32 off=InvExpOffsets[(env.Value>>28)&7];

@ -293,6 +391,8 @@ void __forceinline CalculateADSR()
 		case 6: // release end
 			env.Value=0;
 			break;
+
+		//jNO_DEFAULT
 	}

 	if (env.Phase==6) {
@ -300,17 +400,17 @@ void __forceinline CalculateADSR()
 		VoiceStop(core,voice);
 		Cores[core].Regs.ENDX|=(1<<voice);
 		env.Phase=0;
-		Cores[core].Voices[voice].lastStopReason = 2;
+		vc.lastStopReason = 2;
 	}
 }

 /////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //
-s32 Seed = 0x41595321;
-
-void __forceinline GetNoiseValues(s32& VD) 
+static void __forceinline GetNoiseValues(s32& VD) 
 {
+	static s32 Seed = 0x41595321;
+
 	if(Seed&0x100) VD =(s32)((Seed&0xff)<<8);
 	else if(!(Seed&0xffff)) VD = (s32)0x8000;
 	else VD = (s32)0x7fff;
@ -348,27 +448,31 @@ void LowPass(s32& VL, s32& VR)
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //

-void GetVoiceValues(s32& Value) {
+static void __fastcall GetVoiceValues(V_Core& thiscore, V_Voice& vc, s32& Value)
+{
 	s64 Data=0;
 	s32 DT=0;
-	s32 pitch;

-	V_Voice& vc(Cores[core].Voices[voice]);
+	// [Air] : Put a scope on the pitch variable, which should help it get optimized to a
+	//   register.
+	{
+		s32 pitch;

-	if((voice==0)||(vc.Modulated==0))
-		pitch=vc.Pitch;
-	else
-		pitch=(vc.Pitch*(32768 + Cores[core].Voices[voice-1].OutX))>>15;
+		// [Air] : re-ordered comparisons: Modulated is much more likely to be zero than voice,
+		//   and so the way it was before it'd have to check both voice and modulated values
+		//   most of the time.  Now it'll just check Modulated and short-circuit past the voice
+		//   check (not that it amounts to much, but eh every little bit helps).
+		if( (vc.Modulated==0) || (voice==0) )
+			pitch=vc.Pitch;
+		else
+			pitch=(vc.Pitch*(32768 + abs(thiscore.Voices[voice-1].OutX)))>>15;
+		
+		vc.SP+=pitch;
+	}

-	vc.SP+=pitch;
 	while(vc.SP>=4096) 
 	{
-		DT=0;
-
-		if(vc.Noise) 
-			GetNoiseValues(DT);
-		else
-			GetNextDataBuffered(DT);
+		GetNextDataBuffered( thiscore, vc, DT );

 		vc.PV4=vc.PV3;
 		vc.PV3=vc.PV2;
@ -378,8 +482,7 @@ void GetVoiceValues(s32& Value) {
 		vc.SP-=4096;
 	}

-	CalculateADSR();
-//	CalculateADSR();
+	CalculateADSR( vc );

 	if(vc.ADSR.Phase==0)
 	{
@ -388,26 +491,33 @@ void GetVoiceValues(s32& Value) {
 	}
 	else
 	{
-		if(Interpolation==0) {
+		// [Air]: if SP is zero then we landed perfectly on a sample source, no
+		// interpolation necessary (besides being a little faster this is important
+		// too, since the interpolator will pick the wrong sample to mix otherwise).
+
+		if(Interpolation==0 || vc.SP == 0)
+		{
 			Data = vc.PV1;
 		} 
 		else if(Interpolation==1) //linear
 		{
-			s64 t0 = vc.PV1 - vc.PV2;
+			// [Air]: Inverted the interpolation delta.  The old way was generating
+			// inverted waveforms.
+			s64 t0 = vc.PV2 - vc.PV1;
 			s64 t1 = vc.PV1;
 			Data = (((t0*vc.SP)>>12) + t1);
 		}
-		else if(Interpolation==2) //cubic
+		else // if(Interpolation==2) //must be cubic
 		{
 			s64 a0 = vc.PV1 - vc.PV2 - vc.PV4 + vc.PV3;
 			s64 a1 = vc.PV4 - vc.PV3 - a0;
 			s64 a2 = vc.PV1 - vc.PV4;
 			s64 a3 = vc.PV2;
-			s64 mu = vc.SP;
+			s64 mu = 4096-vc.SP;

-			s64 t0 = ((a0   )*mu)>>12;
-			s64 t1 = ((t0+a1)*mu)>>12;
-			s64 t2 = ((t1+a2)*mu)>>12;
+			s64 t0 = ((a0   )*mu)>>18;
+			s64 t1 = ((t0+a1)*mu)>>18;
+			s64 t2 = ((t1+a2)*mu)>>18;
 			s64 t3 = ((t2+a3));

 			Data = t3;
@ -415,32 +525,73 @@ void GetVoiceValues(s32& Value) {

 		Value=(s32)((Data*vc.ADSR.Value)>>48); //32bit ADSR + convert to 16bit

-		vc.OutX=abs(Value);
+		// [Air]: Moved abs() to the modulation code above, so that the abs conditionals are
+		//   only run in select cases where modulation is active.
+		vc.OutX=Value;
+	}
+}
+
+// [Air]: Noise values need to be mixed without going through interpolation, since it
+//    can wreak havoc on the noise (causing muffling or popping)
+static void __fastcall GetNoiseValues(V_Core& thiscore, V_Voice& vc, s32& Value)
+{
+	s64 Data=0;
+	s32 DT=0;
+
+	{
+		s32 pitch;
+
+		if( (vc.Modulated==0) || (voice==0) )
+			pitch=vc.Pitch;
+		else
+			pitch=(vc.Pitch*(32768 + abs(thiscore.Voices[voice-1].OutX)))>>15;
+		
+		vc.SP+=pitch;
+	}
+
+	while(vc.SP>=4096) 
+	{
+		GetNoiseValues(DT);
+		vc.SP-=4096;
+	}
+
+	Data = DT<<16; //32bit processing
+
+	CalculateADSR( vc );
+
+	if(vc.ADSR.Phase==0)
+	{
+		Value=0;
+		vc.OutX=0;
+	}
+	else
+	{
+		Value=(s32)((Data*vc.ADSR.Value)>>48); //32bit ADSR + convert to 16bit
+		vc.OutX=Value;
 	}
-	if(vc.PeakX<vc.OutX) vc.PeakX=vc.OutX;
 }

 /////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //

-void __fastcall ReadInput(s32& PDataL,s32& PDataR) 
+void __fastcall ReadInput(V_Core& thiscore, s32& PDataL,s32& PDataR) 
 {
-	if((Cores[core].AutoDMACtrl&(core+1))==(core+1))
+	if((thiscore.AutoDMACtrl&(core+1))==(core+1))
 	{
 		s32 tl,tr;

 		if((core==1)&&((PlayMode&8)==8))
 		{
-			Cores[core].InputPos&=~1;
+			thiscore.InputPos&=~1;

 			//CDDA mode
 #ifdef PCM24_S1_INTERLEAVE
-			*PDataL=*(((s32*)(Cores[core].ADMATempBuffer+(Cores[core].InputPos<<1))));
-			*PDataR=*(((s32*)(Cores[core].ADMATempBuffer+(Cores[core].InputPos<<1)+2)));
+			*PDataL=*(((s32*)(thiscore.ADMATempBuffer+(thiscore.InputPos<<1))));
+			*PDataR=*(((s32*)(thiscore.ADMATempBuffer+(thiscore.InputPos<<1)+2)));
 #else
-			s32 *pl=(s32*)&(Cores[core].ADMATempBuffer[Cores[core].InputPos]);
-			s32 *pr=(s32*)&(Cores[core].ADMATempBuffer[Cores[core].InputPos+0x200]);
+			s32 *pl=(s32*)&(thiscore.ADMATempBuffer[thiscore.InputPos]);
+			s32 *pr=(s32*)&(thiscore.ADMATempBuffer[thiscore.InputPos+0x200]);
 			PDataL=*pl;
 			PDataR=*pr;
 #endif
@ -448,73 +599,73 @@ void __fastcall ReadInput(s32& PDataL,s32& PDataR)
 			PDataL>>=4; //give 16.8 data
 			PDataR>>=4;

-			Cores[core].InputPos+=2;
-			if((Cores[core].InputPos==0x100)||(Cores[core].InputPos>=0x200)) {
-				Cores[core].AdmaInProgress=0;
-				if(Cores[core].InputDataLeft>=0x200)
+			thiscore.InputPos+=2;
+			if((thiscore.InputPos==0x100)||(thiscore.InputPos>=0x200)) {
+				thiscore.AdmaInProgress=0;
+				if(thiscore.InputDataLeft>=0x200)
 				{
-					u8 k=Cores[core].InputDataLeft>=Cores[core].InputDataProgress;
+					u8 k=thiscore.InputDataLeft>=thiscore.InputDataProgress;

 #ifdef PCM24_S1_INTERLEAVE
 					AutoDMAReadBuffer(core,1);
 #else
 					AutoDMAReadBuffer(core,0);
 #endif
-					Cores[core].AdmaInProgress=1;
+					thiscore.AdmaInProgress=1;

-					Cores[core].TSA=(core<<10)+Cores[core].InputPos;
+					thiscore.TSA=(core<<10)+thiscore.InputPos;

-					if (Cores[core].InputDataLeft<0x200) 
+					if (thiscore.InputDataLeft<0x200) 
 					{
 						FileLog("[%10d] AutoDMA%c block end.\n",Cycles, (core==0)?'4':'7');

-						if(Cores[core].InputDataLeft>0)
+						if(thiscore.InputDataLeft>0)
 						{
 							if(MsgAutoDMA) ConLog("WARNING: adma buffer didn't finish with a whole block!!\n");
 						}
-						Cores[core].InputDataLeft=0;
-						Cores[core].DMAICounter=1;
+						thiscore.InputDataLeft=0;
+						thiscore.DMAICounter=1;
 					}
 				}
-				Cores[core].InputPos&=0x1ff;
+				thiscore.InputPos&=0x1ff;
 			}

 		}
 		else if((core==0)&&((PlayMode&4)==4))
 		{
-			Cores[core].InputPos&=~1;
+			thiscore.InputPos&=~1;

-			s32 *pl=(s32*)&(Cores[core].ADMATempBuffer[Cores[core].InputPos]);
-			s32 *pr=(s32*)&(Cores[core].ADMATempBuffer[Cores[core].InputPos+0x200]);
+			s32 *pl=(s32*)&(thiscore.ADMATempBuffer[thiscore.InputPos]);
+			s32 *pr=(s32*)&(thiscore.ADMATempBuffer[thiscore.InputPos+0x200]);
 			PDataL=*pl;
 			PDataR=*pr;

-			Cores[core].InputPos+=2;
-			if(Cores[core].InputPos>=0x200) {
-				Cores[core].AdmaInProgress=0;
-				if(Cores[core].InputDataLeft>=0x200)
+			thiscore.InputPos+=2;
+			if(thiscore.InputPos>=0x200) {
+				thiscore.AdmaInProgress=0;
+				if(thiscore.InputDataLeft>=0x200)
 				{
-					u8 k=Cores[core].InputDataLeft>=Cores[core].InputDataProgress;
+					u8 k=thiscore.InputDataLeft>=thiscore.InputDataProgress;

 					AutoDMAReadBuffer(core,0);

-					Cores[core].AdmaInProgress=1;
+					thiscore.AdmaInProgress=1;

-					Cores[core].TSA=(core<<10)+Cores[core].InputPos;
+					thiscore.TSA=(core<<10)+thiscore.InputPos;

-					if (Cores[core].InputDataLeft<0x200) 
+					if (thiscore.InputDataLeft<0x200) 
 					{
 						FileLog("[%10d] Spdif AutoDMA%c block end.\n",Cycles, (core==0)?'4':'7');

-						if(Cores[core].InputDataLeft>0)
+						if(thiscore.InputDataLeft>0)
 						{
 							if(MsgAutoDMA) ConLog("WARNING: adma buffer didn't finish with a whole block!!\n");
 						}
-						Cores[core].InputDataLeft=0;
-						Cores[core].DMAICounter=1;
+						thiscore.InputDataLeft=0;
+						thiscore.DMAICounter=1;
 					}
 				}
-				Cores[core].InputPos&=0x1ff;
+				thiscore.InputPos&=0x1ff;
 			}

 		}
@ -528,45 +679,45 @@ void __fastcall ReadInput(s32& PDataL,s32& PDataR)
 			else
 			{
 				// Using the temporary buffer because this area gets overwritten by some other code.
-				//*PDataL=(s32)*(s16*)(spu2mem+0x2000+(core<<10)+Cores[core].InputPos);
-				//*PDataR=(s32)*(s16*)(spu2mem+0x2200+(core<<10)+Cores[core].InputPos);
+				//*PDataL=(s32)*(s16*)(spu2mem+0x2000+(core<<10)+thiscore.InputPos);
+				//*PDataR=(s32)*(s16*)(spu2mem+0x2200+(core<<10)+thiscore.InputPos);

-				tl=(s32)Cores[core].ADMATempBuffer[Cores[core].InputPos];
-				tr=(s32)Cores[core].ADMATempBuffer[Cores[core].InputPos+0x200];
+				tl=(s32)thiscore.ADMATempBuffer[thiscore.InputPos];
+				tr=(s32)thiscore.ADMATempBuffer[thiscore.InputPos+0x200];

 			}

 			PDataL=tl;
 			PDataR=tr;

-			Cores[core].InputPos++;
-			if((Cores[core].InputPos==0x100)||(Cores[core].InputPos>=0x200)) {
-				Cores[core].AdmaInProgress=0;
-				if(Cores[core].InputDataLeft>=0x200)
+			thiscore.InputPos++;
+			if((thiscore.InputPos==0x100)||(thiscore.InputPos>=0x200)) {
+				thiscore.AdmaInProgress=0;
+				if(thiscore.InputDataLeft>=0x200)
 				{
-					u8 k=Cores[core].InputDataLeft>=Cores[core].InputDataProgress;
+					u8 k=thiscore.InputDataLeft>=thiscore.InputDataProgress;

 					AutoDMAReadBuffer(core,0);

-					Cores[core].AdmaInProgress=1;
+					thiscore.AdmaInProgress=1;

-					Cores[core].TSA=(core<<10)+Cores[core].InputPos;
+					thiscore.TSA=(core<<10)+thiscore.InputPos;

-					if (Cores[core].InputDataLeft<0x200) 
+					if (thiscore.InputDataLeft<0x200) 
 					{
 						FileLog("[%10d] AutoDMA%c block end.\n",Cycles, (core==0)?'4':'7');

-						Cores[core].AutoDMACtrl |=~3;
+						thiscore.AutoDMACtrl |=~3;

-						if(Cores[core].InputDataLeft>0)
+						if(thiscore.InputDataLeft>0)
 						{
 							if(MsgAutoDMA) ConLog("WARNING: adma buffer didn't finish with a whole block!!\n");
 						}
-						Cores[core].InputDataLeft=0;
-						Cores[core].DMAICounter=1;
+						thiscore.InputDataLeft=0;
+						thiscore.DMAICounter=1;
 					}
 				}
-				Cores[core].InputPos&=0x1ff;
+				thiscore.InputPos&=0x1ff;
 			}
 		}
 	}
@ -580,7 +731,7 @@ void __fastcall ReadInput(s32& PDataL,s32& PDataR)
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //

-void ReadInputPV(s32& ValL,s32& ValR) 
+void __fastcall ReadInputPV(V_Core& thiscore, s32& ValL,s32& ValR) 
 {
 	s32 DL=0, DR=0;

@ -588,24 +739,25 @@ void ReadInputPV(s32& ValL,s32& ValR)

 	if(pitch==0) pitch=48000;
 	
-	Cores[core].ADMAPV+=pitch;
-	while(Cores[core].ADMAPV>=48000) 
+	thiscore.ADMAPV+=pitch;
+	while(thiscore.ADMAPV>=48000) 
 	{
-		ReadInput(DL,DR);
-		Cores[core].ADMAPV-=48000;
-		Cores[core].ADMAPL=DL;
-		Cores[core].ADMAPR=DR;
+		ReadInput(thiscore, DL,DR);
+		thiscore.ADMAPV-=48000;
+		thiscore.ADMAPL=DL;
+		thiscore.ADMAPR=DR;
 	}

-	ValL=Cores[core].ADMAPL;
-	ValR=Cores[core].ADMAPR;
+	ValL=thiscore.ADMAPL;
+	ValR=thiscore.ADMAPR;
 }

 /////////////////////////////////////////////////////////////////////////////////////////
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //

-void __forceinline UpdateVolume(V_Volume& Vol) {
+static void __forceinline UpdateVolume(V_Volume& Vol)
+{
 	s32 NVal;

 	// TIMINGS ARE FAKE!!! Need to investigate.
@ -664,7 +816,7 @@ void __forceinline UpdateVolume(V_Volume& Vol) {
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //

-s32 __forceinline clamp(s32 x)
+static s32 __forceinline clamp(s32 x)
 {
 	if (x>0x00ffffff)  return 0x00ffffff;
 	if (x<0xff000000)  return 0xff000000;
@ -675,12 +827,12 @@ s32 __forceinline clamp(s32 x)
 /////////////////////////////////////////////////////////////////////////////////////////
 //                                                                                     //

-void DoReverb(s32& OutL, s32& OutR, s32 InL, s32 InR)
+static void DoReverb( V_Core& thiscore, s32& OutL, s32& OutR, s32 InL, s32 InR)
 {
 	static s32 INPUT_SAMPLE_L,INPUT_SAMPLE_R;
 	static s32 OUTPUT_SAMPLE_L,OUTPUT_SAMPLE_R;

-	if(!(Cores[core].FxEnable&&EffectsEnabled))
+	if(!(thiscore.FxEnable&&EffectsEnabled))
 	{
 		OUTPUT_SAMPLE_L=0;
 		OUTPUT_SAMPLE_R=0;
@ -698,14 +850,14 @@ void DoReverb(s32& OutL, s32& OutR, s32 InL, s32 InR)
 		s32 IIR_INPUT_A0,IIR_INPUT_A1,IIR_INPUT_B0,IIR_INPUT_B1;
 		s32 ACC0,ACC1;
 		s32 FB_A0,FB_A1,FB_B0,FB_B1;
-		s32 buffsize=Cores[core].EffectsEndA-Cores[core].EffectsStartA+1;
+		s32 buffsize=thiscore.EffectsEndA-thiscore.EffectsStartA+1;

 		if(buffsize<0)
 		{
-			buffsize = Cores[core].EffectsEndA;
-			Cores[core].EffectsEndA=Cores[core].EffectsStartA;
-			Cores[core].EffectsStartA=buffsize;
-			buffsize=Cores[core].EffectsEndA-Cores[core].EffectsStartA+1;
+			buffsize = thiscore.EffectsEndA;
+			thiscore.EffectsEndA=thiscore.EffectsStartA;
+			thiscore.EffectsStartA=buffsize;
+			buffsize=thiscore.EffectsEndA-thiscore.EffectsStartA+1;
 		}

 		//filter the 2 samples (prev then current)
@ -715,42 +867,42 @@ void DoReverb(s32& OutL, s32& OutR, s32 InL, s32 InR)
 		INPUT_SAMPLE_L=(INPUT_SAMPLE_L+InL)>>9;
 		INPUT_SAMPLE_R=(INPUT_SAMPLE_R+InR)>>9;

-#define BUFFER(x)	((s32)(*GetMemPtr(Cores[core].EffectsStartA + ((Cores[core].ReverbX + buffsize-((x)<<2))%buffsize))))
-#define SBUFFER(x)	(*GetMemPtr(Cores[core].EffectsStartA + ((Cores[core].ReverbX + buffsize-((x)<<2))%buffsize)))
+#define BUFFER(x)	((s32)(*GetMemPtr(thiscore.EffectsStartA + ((thiscore.ReverbX + buffsize-((x)<<2))%buffsize))))
+#define SBUFFER(x)	(*GetMemPtr(thiscore.EffectsStartA + ((thiscore.ReverbX + buffsize-((x)<<2))%buffsize)))
 		
-		Cores[core].ReverbX=((Cores[core].ReverbX + 4)%buffsize);
+		thiscore.ReverbX=((thiscore.ReverbX + 4)%buffsize);

-		IIR_INPUT_A0 = (BUFFER(Cores[core].Revb.IIR_SRC_A0) * Cores[core].Revb.IIR_COEF + INPUT_SAMPLE_L * Cores[core].Revb.IN_COEF_L)>>16;
-		IIR_INPUT_A1 = (BUFFER(Cores[core].Revb.IIR_SRC_A1) * Cores[core].Revb.IIR_COEF + INPUT_SAMPLE_R * Cores[core].Revb.IN_COEF_R)>>16;
-		IIR_INPUT_B0 = (BUFFER(Cores[core].Revb.IIR_SRC_B0) * Cores[core].Revb.IIR_COEF + INPUT_SAMPLE_L * Cores[core].Revb.IN_COEF_L)>>16;
-		IIR_INPUT_B1 = (BUFFER(Cores[core].Revb.IIR_SRC_B1) * Cores[core].Revb.IIR_COEF + INPUT_SAMPLE_R * Cores[core].Revb.IN_COEF_R)>>16;
+		IIR_INPUT_A0 = (BUFFER(thiscore.Revb.IIR_SRC_A0) * thiscore.Revb.IIR_COEF + INPUT_SAMPLE_L * thiscore.Revb.IN_COEF_L)>>16;
+		IIR_INPUT_A1 = (BUFFER(thiscore.Revb.IIR_SRC_A1) * thiscore.Revb.IIR_COEF + INPUT_SAMPLE_R * thiscore.Revb.IN_COEF_R)>>16;
+		IIR_INPUT_B0 = (BUFFER(thiscore.Revb.IIR_SRC_B0) * thiscore.Revb.IIR_COEF + INPUT_SAMPLE_L * thiscore.Revb.IN_COEF_L)>>16;
+		IIR_INPUT_B1 = (BUFFER(thiscore.Revb.IIR_SRC_B1) * thiscore.Revb.IIR_COEF + INPUT_SAMPLE_R * thiscore.Revb.IN_COEF_R)>>16;

-		SBUFFER(Cores[core].Revb.IIR_DEST_A0 + 4) = clamp((IIR_INPUT_A0 * Cores[core].Revb.IIR_ALPHA + BUFFER(Cores[core].Revb.IIR_DEST_A0) * (65535 - Cores[core].Revb.IIR_ALPHA))>>16);
-		SBUFFER(Cores[core].Revb.IIR_DEST_A1 + 4) = clamp((IIR_INPUT_A1 * Cores[core].Revb.IIR_ALPHA + BUFFER(Cores[core].Revb.IIR_DEST_A1) * (65535 - Cores[core].Revb.IIR_ALPHA))>>16);
-		SBUFFER(Cores[core].Revb.IIR_DEST_B0 + 4) = clamp((IIR_INPUT_B0 * Cores[core].Revb.IIR_ALPHA + BUFFER(Cores[core].Revb.IIR_DEST_B0) * (65535 - Cores[core].Revb.IIR_ALPHA))>>16);
-		SBUFFER(Cores[core].Revb.IIR_DEST_B1 + 4) = clamp((IIR_INPUT_B1 * Cores[core].Revb.IIR_ALPHA + BUFFER(Cores[core].Revb.IIR_DEST_B1) * (65535 - Cores[core].Revb.IIR_ALPHA))>>16);
+		SBUFFER(thiscore.Revb.IIR_DEST_A0 + 4) = clamp((IIR_INPUT_A0 * thiscore.Revb.IIR_ALPHA + BUFFER(thiscore.Revb.IIR_DEST_A0) * (65535 - thiscore.Revb.IIR_ALPHA))>>16);
+		SBUFFER(thiscore.Revb.IIR_DEST_A1 + 4) = clamp((IIR_INPUT_A1 * thiscore.Revb.IIR_ALPHA + BUFFER(thiscore.Revb.IIR_DEST_A1) * (65535 - thiscore.Revb.IIR_ALPHA))>>16);
+		SBUFFER(thiscore.Revb.IIR_DEST_B0 + 4) = clamp((IIR_INPUT_B0 * thiscore.Revb.IIR_ALPHA + BUFFER(thiscore.Revb.IIR_DEST_B0) * (65535 - thiscore.Revb.IIR_ALPHA))>>16);
+		SBUFFER(thiscore.Revb.IIR_DEST_B1 + 4) = clamp((IIR_INPUT_B1 * thiscore.Revb.IIR_ALPHA + BUFFER(thiscore.Revb.IIR_DEST_B1) * (65535 - thiscore.Revb.IIR_ALPHA))>>16);

-		ACC0 = (s32)(BUFFER(Cores[core].Revb.ACC_SRC_A0) * Cores[core].Revb.ACC_COEF_A +
-				BUFFER(Cores[core].Revb.ACC_SRC_B0) * Cores[core].Revb.ACC_COEF_B +
-				BUFFER(Cores[core].Revb.ACC_SRC_C0) * Cores[core].Revb.ACC_COEF_C +
-				BUFFER(Cores[core].Revb.ACC_SRC_D0) * Cores[core].Revb.ACC_COEF_D)>>16;
-		ACC1 = (s32)(BUFFER(Cores[core].Revb.ACC_SRC_A1) * Cores[core].Revb.ACC_COEF_A +
-				BUFFER(Cores[core].Revb.ACC_SRC_B1) * Cores[core].Revb.ACC_COEF_B +
-				BUFFER(Cores[core].Revb.ACC_SRC_C1) * Cores[core].Revb.ACC_COEF_C +
-				BUFFER(Cores[core].Revb.ACC_SRC_D1) * Cores[core].Revb.ACC_COEF_D)>>16;
+		ACC0 = (s32)(BUFFER(thiscore.Revb.ACC_SRC_A0) * thiscore.Revb.ACC_COEF_A +
+				BUFFER(thiscore.Revb.ACC_SRC_B0) * thiscore.Revb.ACC_COEF_B +
+				BUFFER(thiscore.Revb.ACC_SRC_C0) * thiscore.Revb.ACC_COEF_C +
+				BUFFER(thiscore.Revb.ACC_SRC_D0) * thiscore.Revb.ACC_COEF_D)>>16;
+		ACC1 = (s32)(BUFFER(thiscore.Revb.ACC_SRC_A1) * thiscore.Revb.ACC_COEF_A +
+				BUFFER(thiscore.Revb.ACC_SRC_B1) * thiscore.Revb.ACC_COEF_B +
+				BUFFER(thiscore.Revb.ACC_SRC_C1) * thiscore.Revb.ACC_COEF_C +
+				BUFFER(thiscore.Revb.ACC_SRC_D1) * thiscore.Revb.ACC_COEF_D)>>16;

-		FB_A0 = BUFFER(Cores[core].Revb.MIX_DEST_A0 - Cores[core].Revb.FB_SRC_A);
-		FB_A1 = BUFFER(Cores[core].Revb.MIX_DEST_A1 - Cores[core].Revb.FB_SRC_A);
-		FB_B0 = BUFFER(Cores[core].Revb.MIX_DEST_B0 - Cores[core].Revb.FB_SRC_B);
-		FB_B1 = BUFFER(Cores[core].Revb.MIX_DEST_B1 - Cores[core].Revb.FB_SRC_B);
+		FB_A0 = BUFFER(thiscore.Revb.MIX_DEST_A0 - thiscore.Revb.FB_SRC_A);
+		FB_A1 = BUFFER(thiscore.Revb.MIX_DEST_A1 - thiscore.Revb.FB_SRC_A);
+		FB_B0 = BUFFER(thiscore.Revb.MIX_DEST_B0 - thiscore.Revb.FB_SRC_B);
+		FB_B1 = BUFFER(thiscore.Revb.MIX_DEST_B1 - thiscore.Revb.FB_SRC_B);

-		SBUFFER(Cores[core].Revb.MIX_DEST_A0) = clamp((ACC0 - FB_A0 * Cores[core].Revb.FB_ALPHA)>>16);
-		SBUFFER(Cores[core].Revb.MIX_DEST_A1) = clamp((ACC1 - FB_A1 * Cores[core].Revb.FB_ALPHA)>>16);
-		SBUFFER(Cores[core].Revb.MIX_DEST_B0) = clamp(((Cores[core].Revb.FB_ALPHA * ACC0) - FB_A0 * (65535 - Cores[core].Revb.FB_ALPHA) - FB_B0 * Cores[core].Revb.FB_X)>>16);
-		SBUFFER(Cores[core].Revb.MIX_DEST_B1) = clamp(((Cores[core].Revb.FB_ALPHA * ACC1) - FB_A1 * (65535 - Cores[core].Revb.FB_ALPHA) - FB_B1 * Cores[core].Revb.FB_X)>>16);
+		SBUFFER(thiscore.Revb.MIX_DEST_A0) = clamp((ACC0 - FB_A0 * thiscore.Revb.FB_ALPHA)>>16);
+		SBUFFER(thiscore.Revb.MIX_DEST_A1) = clamp((ACC1 - FB_A1 * thiscore.Revb.FB_ALPHA)>>16);
+		SBUFFER(thiscore.Revb.MIX_DEST_B0) = clamp(((thiscore.Revb.FB_ALPHA * ACC0) - FB_A0 * (65535 - thiscore.Revb.FB_ALPHA) - FB_B0 * thiscore.Revb.FB_X)>>16);
+		SBUFFER(thiscore.Revb.MIX_DEST_B1) = clamp(((thiscore.Revb.FB_ALPHA * ACC1) - FB_A1 * (65535 - thiscore.Revb.FB_ALPHA) - FB_B1 * thiscore.Revb.FB_X)>>16);

-		OUTPUT_SAMPLE_L=clamp((BUFFER(Cores[core].Revb.MIX_DEST_A0)+BUFFER(Cores[core].Revb.MIX_DEST_B0))>>2);
-		OUTPUT_SAMPLE_R=clamp((BUFFER(Cores[core].Revb.MIX_DEST_B1)+BUFFER(Cores[core].Revb.MIX_DEST_B1))>>2);
+		OUTPUT_SAMPLE_L=clamp((BUFFER(thiscore.Revb.MIX_DEST_A0)+BUFFER(thiscore.Revb.MIX_DEST_B0))>>2);
+		OUTPUT_SAMPLE_R=clamp((BUFFER(thiscore.Revb.MIX_DEST_B1)+BUFFER(thiscore.Revb.MIX_DEST_B1))>>2);
 	} 
 	OutL=OUTPUT_SAMPLE_L;
 	OutR=OUTPUT_SAMPLE_R;
@ -766,15 +918,13 @@ double rfactor=1;
 double cfactor=1;
 double diff=0;

-s32 __forceinline ApplyVolume(s32 data, s32 volume)
+static s32 __forceinline ApplyVolume(s32 data, s32 volume)
 {
 	return (volume * data);
 }

-void __forceinline MixVoice(s32& VValL, s32& VValR)
+static void __forceinline MixVoice(V_Voice& vc, s32& VValL, s32& VValR)
 {
-	V_Voice& vc(Cores[core].Voices[voice]);
-
 	s32 Value=0;

 	VValL=0;
@ -783,11 +933,16 @@ void __forceinline MixVoice(s32& VValL, s32& VValR)
 	UpdateVolume(vc.VolumeL);
 	UpdateVolume(vc.VolumeR);

-	if (Cores[core].Voices[voice].ADSR.Phase>0) 
+	if (vc.ADSR.Phase>0) 
 	{
-		GetVoiceValues(Value);
+		if( vc.Noise )
+			GetNoiseValues( Cores[core], vc, Value );
+		else
+			GetVoiceValues( Cores[core], vc, Value );

+		#ifdef _DEBUG
 		vc.displayPeak = max(vc.displayPeak,abs(Value));
+		#endif

 		VValL=ApplyVolume(Value,(vc.VolumeL.Value));
 		VValR=ApplyVolume(Value,(vc.VolumeR.Value));
@ -798,7 +953,7 @@ void __forceinline MixVoice(s32& VValL, s32& VValR)

 }

-__forceinline void MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
+static void __fastcall MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 {
 	s32 InpL=0, InpR=0;

@ -813,23 +968,25 @@ __forceinline void MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 		spu2Ms16(0xA00 + OutPos)=(s16)(ExtR>>16);
 	}
 	
+	V_Core& thiscore( Cores[core] );
+
 	if((core==0)&&((PlayMode&4)!=4))
 	{
-		ReadInputPV(InpL,InpR);	// get input data from input buffers
+		ReadInputPV(thiscore, InpL,InpR);	// get input data from input buffers
 	}
 	if((core==1)&&((PlayMode&8)!=8))
 	{
-		ReadInputPV(InpL,InpR);	// get input data from input buffers
+		ReadInputPV(thiscore, InpL,InpR);	// get input data from input buffers
 	}

 	s32 InputPeak = max(abs(InpL),abs(InpR));
-	if(Cores[core].AutoDMAPeak<InputPeak) Cores[core].AutoDMAPeak=InputPeak;
+	if(thiscore.AutoDMAPeak<InputPeak) thiscore.AutoDMAPeak=InputPeak;
 	
-	InpL = MulDiv(InpL,(Cores[core].InpL),1<<1);
-	InpR = MulDiv(InpR,(Cores[core].InpR),1<<1);
+	InpL = MulDiv(InpL,(thiscore.InpL),1<<1);
+	InpR = MulDiv(InpR,(thiscore.InpR),1<<1);

-	ExtL = MulDiv(ExtL,(Cores[core].ExtL),1<<12);
-	ExtR = MulDiv(ExtR,(Cores[core].ExtR),1<<12);
+	ExtL = MulDiv(ExtL,(thiscore.ExtL),1<<12);
+	ExtR = MulDiv(ExtR,(thiscore.ExtR),1<<12);

 	SDL=SDR=SWL=SWR=(s32)0;

@ -837,12 +994,13 @@ __forceinline void MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 	{
 		s32 VValL,VValR;

-		MixVoice(VValL,VValR);
+		V_Voice& vc( thiscore.Voices[voice] );
+		MixVoice( vc,VValL,VValR );

-		SDL += VValL * Cores[core].Voices[voice].DryL;
-		SDR += VValR * Cores[core].Voices[voice].DryR;
-		SWL += VValL * Cores[core].Voices[voice].WetL;
-		SWR += VValR * Cores[core].Voices[voice].WetR;
+		SDL += VValL * vc.DryL;
+		SDR += VValR * vc.DryR;
+		SWL += VValL * vc.WetL;
+		SWR += VValR * vc.WetR;
 	}

 	//Write To Output Area
@ -852,30 +1010,30 @@ __forceinline void MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 	spu2Ms16(0x1600 + (core<<12) + OutPos)=(s16)(SWR>>16);

 	// Mix in the Voice data
-	TDL += SDL * Cores[core].SndDryL;
-	TDR += SDR * Cores[core].SndDryR;
-	TWL += SWL * Cores[core].SndWetL;
-	TWR += SWR * Cores[core].SndWetR;
+	TDL += SDL * thiscore.SndDryL;
+	TDR += SDR * thiscore.SndDryR;
+	TWL += SWL * thiscore.SndWetL;
+	TWR += SWR * thiscore.SndWetR;

 	// Mix in the Input data
-	TDL += InpL * Cores[core].InpDryL;
-	TDR += InpR * Cores[core].InpDryR;
-	TWL += InpL * Cores[core].InpWetL;
-	TWR += InpR * Cores[core].InpWetR;
+	TDL += InpL * thiscore.InpDryL;
+	TDR += InpR * thiscore.InpDryR;
+	TWL += InpL * thiscore.InpWetL;
+	TWR += InpR * thiscore.InpWetR;

 	// Mix in the External (nothing/core0) data
-	TDL += ExtL * Cores[core].ExtDryL;
-	TDR += ExtR * Cores[core].ExtDryR;
-	TWL += ExtL * Cores[core].ExtWetL; 
-	TWR += ExtR * Cores[core].ExtWetR;
+	TDL += ExtL * thiscore.ExtDryL;
+	TDR += ExtR * thiscore.ExtDryR;
+	TWL += ExtL * thiscore.ExtWetL; 
+	TWR += ExtR * thiscore.ExtWetR;
 	
 	if(EffectsEnabled)
 	{
 		//Apply Effects
-		DoReverb(RVL,RVR,TWL>>16,TWR>>16);
+		DoReverb( thiscore, RVL,RVR,TWL>>16,TWR>>16);

-		TWL=ApplyVolume(RVL,VOL(Cores[core].FxL));
-		TWR=ApplyVolume(RVR,VOL(Cores[core].FxR));
+		TWL=ApplyVolume(RVL,VOL(thiscore.FxL));
+		TWR=ApplyVolume(RVR,VOL(thiscore.FxR));
 	}
 	else
 	{
@ -888,12 +1046,12 @@ __forceinline void MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)
 	OutR=(TDR + TWR);

 	//Apply Master Volume
-	UpdateVolume(Cores[core].MasterL);
-	UpdateVolume(Cores[core].MasterR);
+	UpdateVolume(thiscore.MasterL);
+	UpdateVolume(thiscore.MasterR);

-	if (Cores[core].Mute==0) {
-		OutL=MulDiv(OutL,Cores[core].MasterL.Value,1<<16);
-		OutR=MulDiv(OutR,Cores[core].MasterR.Value,1<<16);
+	if (thiscore.Mute==0) {
+		OutL=MulDiv(OutL,thiscore.MasterL.Value,1<<16);
+		OutR=MulDiv(OutR,thiscore.MasterR.Value,1<<16);
 	}
 	else 
 	{
@ -903,7 +1061,7 @@ __forceinline void MixCore(s32& OutL, s32& OutR, s32 ExtL, s32 ExtR)

 	if((core==1)&&(PlayMode&8))
 	{
-		ReadInput(OutL,OutR);
+		ReadInput(thiscore, OutL,OutR);
 	}

 	if((core==0)&&(PlayMode&4))
@ -923,12 +1081,13 @@ void __fastcall Mix()
 	core=0;
 	MixCore(ExtL,ExtR,0,0);

-	Peak0 = max(Peak0,max(ExtL,ExtR));
-
 	core=1;
 	MixCore(OutL,OutR,ExtL,ExtR);

+#ifdef _DEBUG
+	Peak0 = max(Peak0,max(ExtL,ExtR));
 	Peak1 = max(Peak1,max(OutL,OutR));
+#endif

 	ExtL=MulDiv(OutL,VolumeMultiplier,VolumeDivisor<<6);
 	ExtR=MulDiv(OutR,VolumeMultiplier,VolumeDivisor<<6);
--- a/plugins/spu2ghz/mixer.h
+++ b/plugins/spu2ghz/mixer.h
@ -23,5 +23,4 @@ void __fastcall Mix();
 void __fastcall LogVolInit();
 void __fastcall LogVolClose();

-
 #endif // MIXER_H_INCLUDED //
--- a/plugins/spu2ghz/sndout.cpp
+++ b/plugins/spu2ghz/sndout.cpp
@ -126,16 +126,18 @@ public:
 #endif
 			while((free<nSamples)&&(pw)) 
 			{
-				isWaiting=true;
+				//isWaiting=true;
 				LeaveCriticalSection(&cs);
+				ConLog( " * SPU2 : Waiting for object... " );
 				WaitForSingleObject(hSyncEvent,1000);
+				ConLog( " Signaled! \n" );
 				EnterCriticalSection(&cs);
 #ifdef DYNAMIC_BUFFER_LIMITING
 				free = buffer_limit-data;
 #else
 				free = size-data;
 #endif
-				isWaiting=false;
+				//isWaiting=false;
 			}
 		}

@ -187,13 +189,19 @@ public:
 			underflows++;
 		}
 #else
+		bool uflow = false;
 		while(data<0)
 		{
 			data+=size;
+			uflow = true;
 		}
+
+		//if( uflow )
+			//ConLog( " * SPU2 : Data Underflow detected!\n" );
+
 #endif

-		if(isWaiting)
+		//if(isWaiting)
 		{
 			PulseEvent(hSyncEvent);
 		}
--- a/plugins/spu2ghz/spu2.cpp
+++ b/plugins/spu2ghz/spu2.cpp
@ -565,75 +565,72 @@ void CALLBACK TimeUpdate(u32 cClocks, u32 syncType)
 	// HACKY but should work anyway.
 	if(lClocks==0) lClocks = cClocks;

-	if(dClocks>=TickInterval)
+	//Update Mixing Progress
+	while(dClocks>=TickInterval)
 	{
-		//Update Mixing Progress
-		while(dClocks>=TickInterval)
+		
+		//UpdateDebugDialog();
+
+		if(has_to_call_irq)
 		{
-			
-			//UpdateDebugDialog();
-
-			if(has_to_call_irq)
-			{
-				ConLog(" * SPU2: Irq Called (%04x).\n",Spdif.Info);
-				has_to_call_irq=false;
-				if(_irqcallback) _irqcallback();
-			}
-
-			if(Cores[0].InitDelay>0)
-			{
-				Cores[0].InitDelay--;
-				if(Cores[0].InitDelay==0)
-				{
-					CoreReset(0);
-				}
-			}
-
-			if(Cores[1].InitDelay>0)
-			{
-				Cores[1].InitDelay--;
-				if(Cores[1].InitDelay==0)
-				{
-					CoreReset(1);
-				}
-			}
-
-			//Update DMA4 interrupt delay counter
-			if(Cores[0].DMAICounter>0) 
-			{
-				Cores[0].DMAICounter-=TickInterval;
-				if(Cores[0].DMAICounter<=0)
-				{
-					Cores[0].MADR=Cores[0].TADR;
-					Cores[0].DMAICounter=0;
-					if(dma4callback) dma4callback();
-				}
-				else {
-					Cores[0].MADR+=TickInterval<<1;
-				}
-			}
-
-			//Update DMA7 interrupt delay counter
-			if(Cores[1].DMAICounter>0) 
-			{
-				Cores[1].DMAICounter-=TickInterval;
-				if(Cores[1].DMAICounter<=0)
-				{
-					Cores[1].MADR=Cores[1].TADR;
-					Cores[1].DMAICounter=0;
-					if(dma7callback) dma7callback();
-				}
-				else {
-					Cores[1].MADR+=TickInterval<<1;
-				}
-			}
-
-			dClocks-=TickInterval;
-			lClocks+=TickInterval;
-			Cycles++;
-
-			Mix();
+			ConLog(" * SPU2: Irq Called (%04x).\n",Spdif.Info);
+			has_to_call_irq=false;
+			if(_irqcallback) _irqcallback();
 		}
+
+		if(Cores[0].InitDelay>0)
+		{
+			Cores[0].InitDelay--;
+			if(Cores[0].InitDelay==0)
+			{
+				CoreReset(0);
+			}
+		}
+
+		if(Cores[1].InitDelay>0)
+		{
+			Cores[1].InitDelay--;
+			if(Cores[1].InitDelay==0)
+			{
+				CoreReset(1);
+			}
+		}
+
+		//Update DMA4 interrupt delay counter
+		if(Cores[0].DMAICounter>0) 
+		{
+			Cores[0].DMAICounter-=TickInterval;
+			if(Cores[0].DMAICounter<=0)
+			{
+				Cores[0].MADR=Cores[0].TADR;
+				Cores[0].DMAICounter=0;
+				if(dma4callback) dma4callback();
+			}
+			else {
+				Cores[0].MADR+=TickInterval<<1;
+			}
+		}
+
+		//Update DMA7 interrupt delay counter
+		if(Cores[1].DMAICounter>0) 
+		{
+			Cores[1].DMAICounter-=TickInterval;
+			if(Cores[1].DMAICounter<=0)
+			{
+				Cores[1].MADR=Cores[1].TADR;
+				Cores[1].DMAICounter=0;
+				if(dma7callback) dma7callback();
+			}
+			else {
+				Cores[1].MADR+=TickInterval<<1;
+			}
+		}
+
+		dClocks-=TickInterval;
+		lClocks+=TickInterval;
+		Cycles++;
+
+		Mix();
 	}
 }

@ -1713,8 +1710,12 @@ void VoiceStart(int core,int vc)
 		Cores[core].Voices[vc].Prev1=0;
 		Cores[core].Voices[vc].Prev2=0;

-		Cores[core].Voices[vc].PV1=Cores[core].Voices[vc].PV2=0;
-		Cores[core].Voices[vc].PV3=Cores[core].Voices[vc].PV4=0;
+		// [Air]: Don't wipe interpolation values on VoiceStart.
+		//   There'll be less popping/clicking if we just interpolate between the
+		//   old sample and the new sample.
+
+		//Cores[core].Voices[vc].PV1=Cores[core].Voices[vc].PV2=0;
+		//Cores[core].Voices[vc].PV3=Cores[core].Voices[vc].PV4=0;

 		Cores[core].Regs.ENDX&=~(1<<vc);

@ -1741,6 +1742,13 @@ void VoiceStop(int core,int vc)
 {
 	Cores[core].Voices[vc].ADSR.Value=0;
 	Cores[core].Voices[vc].ADSR.Phase=0;
+
+	// [Air]: Wipe the interpolation values here, since stopped voices
+	//   are essentially silence (and any new voices should thus interpolate up from
+	//   such silence)
+	Cores[core].Voices[vc].PV1=Cores[core].Voices[vc].PV2=0;
+	Cores[core].Voices[vc].PV3=Cores[core].Voices[vc].PV4=0;
+
 	//Cores[core].Regs.ENDX|=(1<<vc);
 }

--- a/plugins/spu2ghz/spu2.h
+++ b/plugins/spu2ghz/spu2.h
@ -53,6 +53,37 @@

 #include "debug.h"

+// [Air] : jASSUME / jBREAKPOINT -- optimizer hints that double as debug-build checks.
+//  With NDEBUG defined, jASSUME(exp) is __assume(exp) on MSVC (elsewhere a sizeof that never evaluates exp) and jBREAKPOINT() is a no-op.
+//  Without NDEBUG, jASSUME(exp) invokes jBREAKPOINT() when exp is false.  Primarily useful for jNO_DEFAULT, which enables VC to generate more compact switches.
+
+#ifdef NDEBUG
+#	define jBREAKPOINT() ((void) 0)
+#	ifdef _MSC_VER
+#		define jASSUME(exp) (__assume(exp))
+#	else
+#		define jASSUME(exp) ((void) sizeof(exp))
+#	endif
+#else	// NDEBUG not defined: jBREAKPOINT() is "int 3" on MSVC, otherwise a volatile read from address 0 (presumably meant to fault)
+#	if defined(_MSC_VER)
+#		define jBREAKPOINT() do { __asm int 3 } while(0)
+#	else
+#		define jBREAKPOINT() ((void) *(volatile char *) 0)
+#	endif
+#	define jASSUME(exp) if(exp) ; else jBREAKPOINT()
+#endif	// NDEBUG
+
+// jNO_DEFAULT: expands to a braced "break; default: jASSUME(0); break;" -- it ends the preceding case and declares the default case of a switch unreachable.
+#define jNO_DEFAULT \
+{ \
+	break; \
+	\
+default: \
+	jASSUME(0); \
+	break; \
+}
+
+
 extern void spdif_set51(u32 is_5_1_out);
 extern u32  spdif_init();
 extern void spdif_shutdown();
--- a/plugins/spu2ghz/waveout.cpp
+++ b/plugins/spu2ghz/waveout.cpp
@ -54,40 +54,33 @@ private:
 	{
 		while( waveout_running )
 		{
-			int free=0;
-			int first=-1;
-			do
+			bool didsomething = false;
+			for(int i=0;i<MAX_BUFFER_COUNT;i++)
 			{
-				free=0;
-				for(int i=0;i<MAX_BUFFER_COUNT;i++)
+				if(!(whbuffer[i].dwFlags & WHDR_DONE) ) continue;
+
+				WAVEHDR *buf=whbuffer+i;
+
+				buf->dwBytesRecorded = buf->dwBufferLength;
+
+				buff->ReadSamples(tbuffer,BufferSize);
+				s16 *t = (s16*)buf->lpData;
+				s32 *s = (s32*)tbuffer;
+
+				for(int bleh=0;bleh<BufferSize;bleh++)
 				{
-					if(whbuffer[i].dwFlags & WHDR_DONE) 
-					{
-						whbuffer[i].dwFlags&=~WHDR_DONE;
-						first=i;
-						free=1;
-						break;
-					}
+					*(t++) = (s16)((*(s++))>>8);
 				}
-				if(free)
-					break;
-				else
-					Sleep(1);
-			} while(free==0);

-			WAVEHDR *buf=whbuffer+first;
-
-			buf->dwBytesRecorded= buf->dwBufferLength;
-
-			buff->ReadSamples(tbuffer,BufferSize);
-			s16 *t = (s16*)buf->lpData;
-			s32 *s = (s32*)tbuffer;
-			for(int i=0;i<BufferSize;i++)
-			{
-				*(t++) = (s16)((*(s++))>>8);
+				whbuffer[i].dwFlags&=~WHDR_DONE;
+				waveOutWrite(hwodevice,buf,sizeof(WAVEHDR));
+				didsomething = true;
 			}

-			waveOutWrite(hwodevice,buf,sizeof(WAVEHDR));
+			if( didsomething )
+				Sleep(1);
+			else
+				Sleep(0);
 		}
 		return 0;
 	}
@ -137,9 +130,12 @@ public:
 		}

 		// Start Thread
+		// [Air]: The waveout code does not use wait objects, so setting a time critical
+		// priority level is a bad idea.  Standard priority will do fine.  The buffer will get the
+		// love it needs and won't suck resources idling pointlessly.
 		waveout_running=true;
-			thread=CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)RThread,this,0,&tid);
-		SetThreadPriority(thread,THREAD_PRIORITY_TIME_CRITICAL);
+		thread=CreateThread(NULL,0,(LPTHREAD_START_ROUTINE)RThread,this,0,&tid);
+		//SetThreadPriority( thread, THREAD_PRIORITY_TIME_CRITICAL );

 		return 0;
 	}