add spu synch method "P" featuring very refined code from PCSX2 which tempo-shifts intelligently.

2009-10-23 06:14:46 +00:00 · 2009-10-23 06:14:46 +00:00 · 44a36b7950
parent a3fd06019e
commit 44a36b7950
45 changed files with 9560 additions and 599 deletions
--- a/desmume/src/Makefile.am
+++ b/desmume/src/Makefile.am
@ -47,6 +47,7 @@ libdesmume_a_SOURCES = \
 	addons/compactFlash.cpp addons/gbagame.cpp addons/none.cpp addons/rumblepak.cpp addons/guitarGrip.cpp addons/expMemory.cpp fs.h \
 	cheatSystem.cpp cheatSystem.h \
 	texcache.cpp texcache.h rasterize.cpp rasterize.h \
+	metaspu/metaspu.cpp \
 	version.h

 if HAVE_ALSA
--- a/desmume/src/SPU.cpp
+++ b/desmume/src/SPU.cpp
@ -43,425 +43,25 @@
 #include "NDSSystem.h"
 #include "matrix.h"

+#include "metaspu/metaspu.h"
+
 #define K_ADPCM_LOOPING_RECOVERY_INDEX 99999

 //#undef FORCEINLINE
 //#define FORCEINLINE

-class ISynchronizingAudioBuffer
-{
-public:
-	virtual void enqueue_samples(s16* buf, int samples_provided) = 0;
-
-	//returns the number of samples actually supplied, which may not match the number requested
-	virtual int output_samples(s16* buf, int samples_requested) = 0;
-};
-
-template<typename T> inline T _abs(T val)
-{
-	if(val<0) return -val;
-	else return val;
-}
-
-template<typename T> inline T moveValueTowards(T val, T target, T incr)
-{
-	incr = _abs(incr);
-	T delta = _abs(target-val);
-	if(val<target) val += incr;
-	else if(val>target) val -= incr;
-	T newDelta = _abs(target-val);
-	if(newDelta >= delta)
-		val = target;
-	return val;
-}


-class ZeromusSynchronizer : public ISynchronizingAudioBuffer
-{
-public:
-	ZeromusSynchronizer()
-		: mixqueue_go(false)
-		,
-		#ifdef NDEBUG
-		adjustobuf(200,1000)
-		#else
-		adjustobuf(22000,44000)
-		#endif
-	{
-
-	}
-
-	bool mixqueue_go;
-
-	virtual void enqueue_samples(s16* buf, int samples_provided)
-	{
-		for(int i=0;i<samples_provided;i++) {
-			s16 left = *buf++;
-			s16 right = *buf++;
-			adjustobuf.enqueue(left,right);
-		}
-	}
-
-	//returns the number of samples actually supplied, which may not match the number requested
-	virtual int output_samples(s16* buf, int samples_requested)
-	{
-		int done = 0;
-		if(!mixqueue_go) {
-			if(adjustobuf.size > 200)
-				mixqueue_go = true;
-		}
-		else
-		{
-			for(int i=0;i<samples_requested;i++) {
-				if(adjustobuf.size==0) {
-					mixqueue_go = false;
-					break;
-				}
-				done++;
-				s16 left, right;
-				adjustobuf.dequeue(left,right);
-				*buf++ = left;
-				*buf++ = right;
-			}
-		}
-		
-		return done;
-	}
-
-private:
-	class Adjustobuf
-	{
-	public:
-		Adjustobuf(int _minLatency, int _maxLatency)
-			: size(0)
-			, minLatency(_minLatency)
-			, maxLatency(_maxLatency)
-		{
-			rollingTotalSize = 0;
-			targetLatency = (maxLatency + minLatency)/2;
-			rate = 1.0f;
-			cursor = 0.0f;
-			curr[0] = curr[1] = 0;
-			kAverageSize = 80000;
-		}
-
-		float rate, cursor;
-		int minLatency, targetLatency, maxLatency;
-		std::queue<s16> buffer;
-		int size;
-		s16 curr[2];
-
-		std::queue<int> statsHistory;
-
-		void enqueue(s16 left, s16 right) 
-		{
-			buffer.push(left);
-			buffer.push(right);
-			size++;
-		}
-
-		s64 rollingTotalSize;
-
-		u32 kAverageSize;
-
-		void addStatistic()
-		{
-			statsHistory.push(size);
-			rollingTotalSize += size;
-			if(statsHistory.size()>kAverageSize)
-			{
-				rollingTotalSize -= statsHistory.front();
-				statsHistory.pop();
-
-				float averageSize = (float)(rollingTotalSize / kAverageSize);
-				//static int ctr=0;  ctr++; if((ctr&127)==0) printf("avg size: %f curr size: %d rate: %f\n",averageSize,size,rate);
-				{
-					float targetRate;
-					if(averageSize < targetLatency)
-					{
-						targetRate = 1.0f - (targetLatency-averageSize)/kAverageSize;
-					}
-					else if(averageSize > targetLatency) {
-						targetRate = 1.0f + (averageSize-targetLatency)/kAverageSize;
-					} else targetRate = 1.0f;
-				
-					//rate = moveValueTowards(rate,targetRate,0.001f);
-					rate = targetRate;
-				}
-
-			}
-
-
-		}
-
-		void dequeue(s16& left, s16& right)
-		{
-			left = right = 0; 
-			addStatistic();
-			if(size==0) { return; }
-			cursor += rate;
-			while(cursor>1.0f) {
-				cursor -= 1.0f;
-				if(size>0) {
-					curr[0] = buffer.front(); buffer.pop();
-					curr[1] = buffer.front(); buffer.pop();
-					size--;
-				}
-			}
-			left = curr[0]; 
-			right = curr[1];
-		}
-	} adjustobuf;
-};
-
-class NitsujaSynchronizer : public ISynchronizingAudioBuffer
-{
-private:
-	struct ssamp
-	{
-		s16 l, r;
-		ssamp() {}
-		ssamp(s16 ll, s16 rr) : l(ll), r(rr) {}
-	};
-
-	std::vector<ssamp> sampleQueue;
-
-	// returns values going between 0 and y-1 in a saw wave pattern, based on x
-	static FORCEINLINE int pingpong(int x, int y)
-	{
-		x %= 2*y;
-		if(x >= y)
-			x = 2*y - x - 1;
-		return x;
-
-		// in case we want to switch to odd buffer sizes for more sharpness
-		//x %= 2*(y-1);
-		//if(x >= y)
-		//	x = 2*(y-1) - x;
-		//return x;
-	}
-
-	static FORCEINLINE ssamp crossfade (ssamp lhs, ssamp rhs,  int cur, int start, int end)
-	{
-		if(cur <= start)
-			return lhs;
-		if(cur >= end)
-			return rhs;
-
-		// in case we want sine wave interpolation instead of linear here
-		//float ang = 3.14159f * (float)(cur - start) / (float)(end - start);
-		//cur = start + (int)((1-cosf(ang))*0.5f * (end - start));
-
-		int inNum = cur - start;
-		int outNum = end - cur;
-		int denom = end - start;
-
-		int lrv = ((int)lhs.l * outNum + (int)rhs.l * inNum) / denom;
-		int rrv = ((int)lhs.r * outNum + (int)rhs.r * inNum) / denom;
-
-		return ssamp(lrv,rrv);
-	}
-
-	static FORCEINLINE void emit_sample(s16*& outbuf, ssamp sample)
-	{
-		*outbuf++ = sample.l;
-		*outbuf++ = sample.r;
-	}
-
-	static FORCEINLINE void emit_samples(s16*& outbuf, const ssamp* samplebuf, int samples)
-	{
-		for(int i=0;i<samples;i++)
-			emit_sample(outbuf,samplebuf[i]);
-	}
-
-public:
-	NitsujaSynchronizer()
-	{}
-
-	virtual void enqueue_samples(s16* buf, int samples_provided)
-	{
-		for(int i=0;i<samples_provided;i++)
-		{
-			sampleQueue.push_back(ssamp(buf[0],buf[1]));
-			buf += 2;
-		}
-	}
-
-	virtual int output_samples(s16* buf, int samples_requested)
-	{
-		int audiosize = samples_requested;
-		int queued = sampleQueue.size();
-
-		// truncate input and output sizes to multiples of 8 because I am too lazy to deal with odd numbers
-		audiosize &= ~7;
-		queued &= ~7;
-
-		if(queued > 0x200 && audiosize > 0) // is there any work to do?
-		{
-			// are we going at normal speed?
-			// or more precisely, are the input and output queues/buffers of similar size?
-			if(queued > 900 || audiosize > queued * 2)
-			{
-				// not normal speed. we have to resample it somehow in this case.
-				if(audiosize <= queued)
-				{
-					// fast forward speed
-					// this is the easy case, just crossfade it and it sounds ok
-					for(int i = 0; i < audiosize; i++)
-					{
-						int j = i + queued - audiosize;
-						ssamp outsamp = crossfade(sampleQueue[i],sampleQueue[j], i,0,audiosize);
-						emit_sample(buf,outsamp);
-					}
-				}
-				else
-				{
-					// slow motion speed
-					// here we take a very different approach,
-					// instead of crossfading it, we select a single sample from the queue
-					// and make sure that the index we use to select a sample is constantly moving
-					// and that it starts at the first sample in the queue and ends on the last one.
-					//
-					// hopefully the index doesn't move discontinuously or we'll get slight crackling
-					// (there might still be a minor bug here that causes this occasionally)
-					//
-					// here's a diagram of how the index we sample from moves:
-					//
-					// queued (this axis represents the index we sample from. the top means the end of the queue)
-					// ^
-					// |   --> audiosize (this axis represents the output index we write to, right meaning forward in output time/position)
-					// |   A           C       C  end
-					//    A A     B   C C     C
-					//   A   A   A B C   C   C
-					//  A     A A   B     C C
-					// A       A           C
-					// start
-					//
-					// yes, this means we are spending some stretches of time playing the sound backwards,
-					// but the stretches are short enough that this doesn't sound weird.
-					// this lets us avoid most crackling problems due to the endpoints matching up.
-					// TODO: it might help to calculate the approximate fundamental frequency
-					// and reduce either buffer size such that the reflections line up with it.
-
-					int midpointX = audiosize >> 1;
-					int midpointY = queued >> 1;
-
-					// all we need to do here is calculate the X position of the leftmost "B" in the above diagram.
-					// TODO: we should calculate it with a simple equation like
-					//   midpointXOffset = min(something,somethingElse);
-					// but it's a little difficult to work it out exactly
-					// so here's a stupid search for the value for now:
-
-					int prevA = 999999;
-					int midpointXOffset = queued/2;
-					while(true)
-					{
-						int a = abs(pingpong(midpointX - midpointXOffset, queued) - midpointY) - midpointXOffset;
-						if(((a > 0) != (prevA > 0) || (a < 0) != (prevA < 0)) && prevA != 999999)
-						{
-							if((a + prevA)&1) // there's some sort of off-by-one problem with this search since we're moving diagonally...
-								midpointXOffset++; // but this fixes it most of the time...
-							break; // found it
-						}
-						prevA = a;
-						midpointXOffset--;
-						if(midpointXOffset < 0)
-						{
-							midpointXOffset = 0;
-							break; // failed to find it. the two sides probably meet exactly in the center.
-						}
-					}
-
-					int leftMidpointX = midpointX - midpointXOffset;
-					int rightMidpointX = midpointX + midpointXOffset;
-					int leftMidpointY = pingpong(leftMidpointX, queued);
-					int rightMidpointY = (queued-1) - pingpong((int)audiosize-1 - rightMidpointX + queued*2, queued);
-
-					// output the left almost-half of the sound (section "A")
-					for(int x = 0; x < leftMidpointX; x++)
-					{
-						int i = pingpong(x, queued);
-						emit_sample(buf,sampleQueue[i]);
-					}
-
-					// output the middle stretch (section "B")
-					int y = leftMidpointY;
-					int dyMidLeft  = (leftMidpointY  < midpointY) ? 1 : -1;
-					int dyMidRight = (rightMidpointY > midpointY) ? 1 : -1;
-					for(int x = leftMidpointX; x < midpointX; x++, y+=dyMidLeft)
-						emit_sample(buf,sampleQueue[y]);
-					for(int x = midpointX; x < rightMidpointX; x++, y+=dyMidRight)
-						emit_sample(buf,sampleQueue[y]);
-
-					// output the end of the queued sound (section "C")
-					for(int x = rightMidpointX; x < audiosize; x++)
-					{
-						int i = (queued-1) - pingpong((int)audiosize-1 - x + queued*2, queued);
-						emit_sample(buf,sampleQueue[i]);
-					}
-				} //end else
-
-				sampleQueue.erase(sampleQueue.begin(), sampleQueue.begin() + queued);
-				return audiosize;
-			}
-			else
-			{
-				// normal speed
-				// just output the samples straightforwardly.
-				//
-				// at almost-full speeds (like 50/60 FPS)
-				// what will happen is that we rapidly fluctuate between entering this branch
-				// and entering the "slow motion speed" branch above.
-				// but that's ok! because all of these branches sound similar enough that we can get away with it.
-				// so the two cases actually complement each other.
-
-				if(audiosize >= queued)
-				{
-					emit_samples(buf,&sampleQueue[0],queued);
-					sampleQueue.erase(sampleQueue.begin(), sampleQueue.begin() + queued);
-					return queued;
-				}
-				else
-				{
-					emit_samples(buf,&sampleQueue[0],audiosize);
-					sampleQueue.erase(sampleQueue.begin(), sampleQueue.begin()+audiosize);
-					return audiosize;
-				}
-
-			} //end normal speed
-
-		} //end if there is any work to do
-		else
-		{
-			return 0;
-		}
-
-	} //output_samples
-
-private:
-
-}; //NitsujaSynchronizer
-
-//static ISynchronizingAudioBuffer* synchronizer = new ZeromusSynchronizer();
-static ISynchronizingAudioBuffer* synchronizer = new NitsujaSynchronizer();
+//static ISynchronizingAudioBuffer* synchronizer = metaspu_construct(ESynchMethod_Z);
+static ISynchronizingAudioBuffer* synchronizer = metaspu_construct(ESynchMethod_N);

 SPU_struct *SPU_core = 0;
 SPU_struct *SPU_user = 0;
 int SPU_currentCoreNum = SNDCORE_DUMMY;
 static int volume = 100;

-enum ESynchMode
-{
-	ESynchMode_DualSynchAsynch,
-	ESynchMode_Synchronous
-};
-static ESynchMode synchmode = ESynchMode_DualSynchAsynch;

-enum ESynchMethod
-{
-	ESynchMethod_N, //nitsuja's
-	ESynchMethod_Z //zero's
-};
+static ESynchMode synchmode = ESynchMode_DualSynchAsynch;
 static ESynchMethod synchmethod = ESynchMethod_N;

 static SoundInterface_struct *SNDCore=NULL;
@ -625,9 +225,7 @@ void SPU_SetSynchMode(int mode, int method)
 		synchmethod = (ESynchMethod)method;
 		delete synchronizer;
 		//grr does this need to be locked? spu might need a lock method
-		if(synchmethod == ESynchMethod_N)
-			synchronizer = new NitsujaSynchronizer();
-		else synchronizer = new ZeromusSynchronizer();
+		synchronizer = metaspu_construct(synchmethod);
 	}
 }

--- a/desmume/src/metaspu/README
+++ b/desmume/src/metaspu/README
@ -0,0 +1,3 @@
+Much of the content of this directory was taken from the SPU2-X plugin of PCSX2 (LGPL v3 or later, per the headers of the imported files)
+This in turn makes use of the SoundTouch library (LGPL)
+The main metaspu files were written for DeSmuME (GPL) and ported from there to other emulators.
--- a/desmume/src/metaspu/SndOut.cpp
+++ b/desmume/src/metaspu/SndOut.cpp
@ -0,0 +1,388 @@
+/* SPU2-X, A plugin for Emulating the Sound Processing Unit of the Playstation 2
+ * Developed and maintained by the Pcsx2 Development Team.
+ * 
+ * Original portions from SPU2ghz are (c) 2008 by David Quintana [gigaherz]
+ *
+ * SPU2-X is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * SPU2-X is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE.  See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with SPU2-X.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+//#include "Global.h"
+#include "types.h"
+#include "SndOut.h"
+#include <assert.h>
+
+//----------------
+// Requested output latency in milliseconds; SndBuffer::Init() derives the
+// ring buffer size from it.
+int SndOutLatencyMS = 160;
+// When true, full packets bypass the time stretcher and are written straight
+// into the ring buffer (see SndBuffer::Write).
+bool timeStretchDisabled = false;
+//----------------
+
+// Shared all-zero stereo frame.
+StereoOut32 StereoOut32::Empty( 0, 0 );
+
+// Widens a 16-bit stereo frame to 32 bits.  The channel values are copied
+// unchanged; no volume shift is applied here.
+StereoOut32::StereoOut32( const StereoOut16& src )
+	: Left( src.Left )
+	, Right( src.Right )
+{
+}
+
+// Scales a normalized float sample to a full-scale s32.
+//
+// 2147483647.0f is not representable as a float and rounds up to 2^31, so an
+// input of 1.0f (or anything beyond full scale) yields a product outside the
+// s32 range, and converting such a float to an integer is undefined
+// behaviour.  Out-of-range products are therefore saturated; NaN maps to 0.
+static s32 FloatToFullScaleS32( float value )
+{
+	const float scaled = value * 2147483647.0f;
+	if( scaled >= 2147483648.0f )  return 2147483647;
+	if( scaled <= -2147483648.0f ) return -2147483647 - 1;
+	if( scaled != scaled )         return 0;	// NaN never compares equal to itself
+	return (s32)scaled;
+}
+
+// Converts a float stereo frame to 32-bit integer samples, saturating at
+// full scale instead of overflowing.
+StereoOut32::StereoOut32( const StereoOutFloat& src ) :
+	Left( FloatToFullScaleS32( src.Left ) ),
+	Right( FloatToFullScaleS32( src.Right ) )
+{
+}
+
+// Produces the 16-bit version of this frame: each channel is shifted right
+// by SndOutVolumeShift and then narrowed to s16.
+StereoOut16 StereoOut32::DownSample() const
+{
+	const s16 left16 = Left >> SndOutVolumeShift;
+	const s16 right16 = Right >> SndOutVolumeShift;
+	return StereoOut16( left16, right16 );
+}
+
+// Produces the 32-bit version of this frame: each channel is shifted left
+// by SndOutVolumeShift (the inverse of StereoOut32::DownSample).
+StereoOut32 StereoOut16::UpSample() const
+{
+	const s32 left32 = Left << SndOutVolumeShift;
+	const s32 right32 = Right << SndOutVolumeShift;
+	return StereoOut32( left32, right32 );
+}
+
+
+//class NullOutModule: public SndOutModule
+//{
+//public:
+//	s32  Init()  { return 0; }
+//	void Close() { }
+//	s32  Test() const { return 0; }
+//	void Configure(uptr parent)  { }
+//	bool Is51Out() const { return false; }
+//	int GetEmptySampleCount() const { return 0; }
+//	
+//	const wchar_t* GetIdent() const
+//	{
+//		return L"nullout";
+//	}
+//
+//	const wchar_t* GetLongName() const
+//	{
+//		return L"No Sound (Emulate SPU2 only)";
+//	}
+//
+//	void ReadSettings()
+//	{
+//	}
+//
+//	void WriteSettings() const
+//	{
+//	}
+//
+//} NullOut;
+//
+//SndOutModule* mods[]=
+//{
+//	&NullOut,
+//#ifdef _MSC_VER
+//	XAudio2Out,
+//	DSoundOut,
+//	WaveOut,
+//#endif
+//	NULL		// signals the end of our list
+//};
+//
+//int FindOutputModuleById( const wchar_t* omodid )
+//{
+//	int modcnt = 0;
+//	while( mods[modcnt] != NULL )
+//	{
+//		if( wcscmp( mods[modcnt]->GetIdent(), omodid ) == 0 )
+//			break;
+//		++modcnt;
+//	}
+//	return modcnt;
+//}
+
+// Storage for SndBuffer's static members.
+// Ring buffer state: m_buffer holds m_size samples, m_rpos / m_wpos are the
+// read and write positions, and m_data is the number of samples currently
+// buffered (see _WriteSamples).
+StereoOut32 *SndBuffer::m_buffer;
+s32 SndBuffer::m_size;
+s32 SndBuffer::m_rpos;
+s32 SndBuffer::m_wpos;
+s32 SndBuffer::m_data;
+
+// Set by CheckUnderrunStatus() when a read finds too little data; while set,
+// reads are refused until the buffer has refilled.
+bool SndBuffer::m_underrun_freeze;
+// Packet staging buffers allocated in Init(); Write() fills sndTempBuffer and
+// sndTempProgress counts the samples staged so far.
+StereoOut32* SndBuffer::sndTempBuffer = NULL;
+StereoOut16* SndBuffer::sndTempBuffer16 = NULL;
+int SndBuffer::sndTempProgress = 0;
+
+// Rounds comp up to the next multiple of SndOutPacketSize.
+// The mask arithmetic only works because SndOutPacketSize is a power of two.
+int GetAlignedBufferSize( int comp )
+{
+	const int packetMask = SndOutPacketSize - 1;
+	return (comp + packetMask) & ~packetMask;
+}
+
+// Decides whether a read of nSamples samples may proceed.
+//
+//  nSamples          in:  number of samples the caller wants to read.
+//                    out: number of samples that may actually be read
+//                         (reduced when the buffer holds fewer).
+//  quietSampleCount  out: number of samples the caller should fill with
+//                         silence to make up the request.
+//
+// Returns true if there is data to be output, or false if no data is
+// available to be copied.
+//
+// While m_underrun_freeze is set, reads are refused until the buffer has
+// refilled to a fraction of its size (50% without time stretching, 10% with).
+bool SndBuffer::CheckUnderrunStatus( int& nSamples, int& quietSampleCount )
+{
+	quietSampleCount = 0;
+	if( m_underrun_freeze )
+	{
+		int toFill = (int)(m_size * ( timeStretchDisabled ? 0.50f : 0.1f ) );
+		toFill = GetAlignedBufferSize( toFill );
+
+		// toFill is now aligned to a SndOutPacket
+
+		if( m_data < toFill )
+		{
+			// Still refilling: the whole request is silence.
+			quietSampleCount = nSamples;
+			return false;
+		}
+
+		m_underrun_freeze = false;
+		//TODO
+		//if( MsgOverruns() )
+			printf(" * SPU2 > Underrun compensation (%d packets buffered)\n", toFill / SndOutPacketSize );
+		lastPct = 0.0;		// normalize timestretcher
+	}
+	else if( m_data < nSamples )
+	{
+		// Underrun: hand out what is left and pad the remainder of the
+		// request with silence.  The padding is measured against the
+		// requested count rather than SndOutPacketSize, so it stays correct
+		// (and non-negative) for requests of any size; for a request of
+		// exactly one packet the result is unchanged.
+		quietSampleCount = nSamples - m_data;
+		nSamples = m_data;
+		m_underrun_freeze = true;
+
+		if( !timeStretchDisabled )
+			timeStretchUnderrun();
+
+		return nSamples != 0;
+	}
+
+	return true;
+}
+
+// Failure hook called by Init() when buffer allocation fails.  The fallback
+// to the "no sound" output module is commented out below, so this function
+// currently does nothing.
+void SndBuffer::_InitFail()
+{
+	// If a failure occurs, just initialize the NoSound driver.  This'll allow
+	// the game to emulate properly (hopefully), albeit without sound.
+	//OutputModule = FindOutputModuleById( NullOut.GetIdent() );
+	//mods[OutputModule]->Init();
+}
+
+// Appends nSamples samples from bData to the ring buffer at m_wpos,
+// wrapping around the end of m_buffer when necessary.  If there is not
+// enough free space, older buffered samples are discarded first (by
+// advancing m_rpos) so the incoming data always gets written.
+// Also resets m_predictData, and resets lastPct after an overrun.
+// NOTE(review): nothing here guards against nSamples exceeding m_size, or
+// against the discarded amount still being too small -- confirm that callers
+// only ever write packet-sized chunks.
+void SndBuffer::_WriteSamples(StereoOut32 *bData, int nSamples)
+{
+	int free = m_size-m_data;
+	m_predictData = 0;
+
+	assert( m_data <= m_size );
+
+	// Problem:
+	//  If the SPU2 gets out of sync with the SndOut device, the writepos of the
+	//  circular buffer will overtake the readpos, leading to a prolonged period
+	//  of hopscotching read/write accesses (ie, lots of staticy crap sound for
+	//  several seconds).
+	//
+	// Compromise:
+	//  When an overrun occurs, we adapt by discarding a portion of the buffer.
+	//  The older portion of the buffer is discarded rather than incoming data,
+	//  so that the overall audio synchronization is better.
+
+	if( free < nSamples )
+	{
+		// Buffer overrun!
+		// Dump samples from the read portion of the buffer instead of dropping
+		// the newly written stuff.
+
+		s32 comp;
+
+		if( !timeStretchDisabled )
+		{
+			comp = timeStretchOverrun();
+		}
+		else
+		{
+			// Toss half the buffer plus whatever's being written anew:
+			comp = GetAlignedBufferSize( (m_size + nSamples ) / 2 );
+			if( comp > (m_size-SndOutPacketSize) ) comp = m_size-SndOutPacketSize;
+		}
+
+		// Drop the oldest comp samples by moving the read position forward.
+		m_data -= comp;
+		m_rpos = (m_rpos+comp) % m_size;
+		//TODO
+		//if( MsgOverruns() )
+			printf(" * SPU2 > Overrun Compensation (%d packets tossed)\n", comp / SndOutPacketSize );
+		lastPct = 0.0;		// normalize the timestretcher
+	}
+
+	// copy in two phases, since there's a chance the packet
+	// wraps around the buffer (it'd be nice to deal in packets only, but
+	// the timestretcher and DSP options require flexibility).
+
+	const int endPos = m_wpos + nSamples;
+	const int secondCopyLen = endPos - m_size;	// > 0 when the write wraps
+	StereoOut32* wposbuffer = &m_buffer[m_wpos];
+
+	m_data += nSamples;
+	if( secondCopyLen > 0 )
+	{
+		// Wrapped part: the tail of bData goes to the start of the buffer.
+		nSamples -= secondCopyLen;
+		memcpy( m_buffer, &bData[nSamples], secondCopyLen * sizeof( *bData ) );
+		m_wpos = secondCopyLen;
+	}
+	else
+		m_wpos += nSamples;
+
+	// First (or only) part: from the old write position onwards.
+	memcpy( wposbuffer, bData, nSamples * sizeof( *bData ) );
+}
+
+// Resets the ring buffer state, allocates the ring buffer and the two packet
+// staging buffers, and initializes the time stretcher.
+//
+// If any allocation fails, everything this call allocated is released again,
+// all three buffer pointers are left NULL, _InitFail() is called, and the
+// function returns without initializing the time stretcher.
+void SndBuffer::Init()
+{
+	//if( mods[OutputModule] == NULL )
+	//{
+	//	_InitFail();
+	//	return;
+	//}
+
+	// initialize sound buffer
+	// Buffer actually attempts to run ~50%, so allocate near double what
+	// the requested latency is:
+
+
+	m_rpos = 0;
+	m_wpos = 0;
+	m_data = 0;
+
+	// Forget any stale pointer values first, so the failure path below
+	// releases exactly what this call allocated and nothing else.
+	m_buffer = NULL;
+	sndTempBuffer = NULL;
+	sndTempBuffer16 = NULL;
+
+	try
+	{
+		const float latencyMS = SndOutLatencyMS * (timeStretchDisabled ? 1.5f : 2.0f );
+		m_size = GetAlignedBufferSize( (int)(latencyMS * SampleRate / 1000.0f ) );
+		m_buffer = new StereoOut32[m_size];
+		m_underrun_freeze = false;
+
+		sndTempBuffer = new StereoOut32[SndOutPacketSize];
+		sndTempBuffer16 = new StereoOut16[SndOutPacketSize];
+	}
+	catch( std::bad_alloc& )
+	{
+		// out of memory exception (most likely)
+
+		printf( "Out of memory error occurred while initializing SPU2." );
+
+		// Undo a partial allocation: without this the buffers allocated
+		// before the failing one would leak.  delete[] on NULL is a no-op.
+		delete[] m_buffer;
+		delete[] sndTempBuffer;
+		delete[] sndTempBuffer16;
+		m_buffer = NULL;
+		sndTempBuffer = NULL;
+		sndTempBuffer16 = NULL;
+
+		_InitFail();
+		return;
+	}
+
+	// clear buffers!
+	// Fixes loopy sounds on emu resets.
+	memset( sndTempBuffer, 0, sizeof(StereoOut32) * SndOutPacketSize );
+	memset( sndTempBuffer16, 0, sizeof(StereoOut16) * SndOutPacketSize );
+
+	sndTempProgress = 0;
+
+	soundtouchInit();		// initializes the timestretching
+
+	// some crap
+	//spdif_set51(mods[OutputModule]->Is51Out());
+
+	// initialize module
+	//if( mods[OutputModule]->Init() == -1 ) _InitFail();
+}
+
+// Shuts down the time stretcher and releases the buffers allocated by Init().
+//
+// The pointers are reset to NULL after being freed (which is what the
+// safe_delete_array calls this code replaced used to do), so calling
+// Cleanup() twice, or after a failed Init(), does not free the same memory
+// twice.
+void SndBuffer::Cleanup()
+{
+	//mods[OutputModule]->Close();
+
+	soundtouchCleanup();
+
+	//safe_delete_array( m_buffer );
+	//safe_delete_array( sndTempBuffer );
+	//safe_delete_array( sndTempBuffer16 );
+	delete[] m_buffer;
+	m_buffer = NULL;
+
+	delete[] sndTempBuffer;
+	sndTempBuffer = NULL;
+
+	delete[] sndTempBuffer16;
+	sndTempBuffer16 = NULL;
+}
+
+// Only referenced by the DSP-plugin path that is commented out in Write().
+int SndBuffer::m_dsp_progress = 0;
+
+int SndBuffer::m_timestretch_progress = 0;
+// Set to 30 by ClearContents(); the countdown that consumed it in Write()
+// is commented out.
+int SndBuffer::ssFreeze = 0;
+
+// Flushes the time stretcher and arms the ssFreeze counter (30 packets),
+// which was intended to delay sound output for about half a second.
+// NOTE: the countdown in Write() that consumes ssFreeze is commented out,
+// so at present only the flush has a visible effect in this file.
+void SndBuffer::ClearContents()
+{
+	soundtouchClearContents();
+	ssFreeze = 30;
+}
+
+// Stages one stereo sample.  Once SndOutPacketSize samples have been
+// collected, the staged packet is handed to the time stretcher, or written
+// directly to the ring buffer when time stretching is disabled.
+void SndBuffer::Write( const StereoOut32& Sample )
+{
+	// Log final output to wavefile.
+	//WaveDump::WriteCore( 1, CoreSrc_External, Sample.DownSample() );
+
+	//RecordWrite( Sample.DownSample() );
+
+	//if(mods[OutputModule] == &NullOut) // null output doesn't need buffering or stretching! :p
+	//	return;
+
+	sndTempBuffer[sndTempProgress] = Sample;
+	++sndTempProgress;
+
+	// Nothing more to do until a whole packet has been staged.
+	if( sndTempProgress < SndOutPacketSize )
+		return;
+
+	sndTempProgress = 0;
+
+	// Disabled upstream code (savestate mute countdown and winamp DSP path),
+	// kept for reference:
+	//Don't play anything directly after loading a savestate, avoids static killing your speakers.
+//	if ( ssFreeze > 0 )
+//	{	
+//		ssFreeze--;
+//		return;
+//	}
+//#ifndef __LINUX__
+//	else if( dspPluginEnabled )
+//	{
+//		// Convert in, send to winamp DSP, and convert out.
+//
+//		for( int i=0; i<SndOutPacketSize; ++i ) { sndTempBuffer16[i] = sndTempBuffer[i].DownSample(); }
+//		m_dsp_progress += DspProcess( (s16*)sndTempBuffer16, SndOutPacketSize );
+//
+//		// Some ugly code to ensure full packet handling:
+//		int ei = 0;
+//		while( m_dsp_progress >= SndOutPacketSize )
+//		{
+//			for( int i=0; i<SndOutPacketSize; ++i, ++ei ) { sndTempBuffer[i] = sndTempBuffer16[ei].UpSample(); }
+//
+//			if( !timeStretchDisabled )
+//				timeStretchWrite();
+//			else
+//				_WriteSamples(sndTempBuffer, sndTempProgress);
+//
+//			m_dsp_progress -= SndOutPacketSize;
+//		}
+//		
+//		// copy any leftovers to the front of the dsp buffer.
+//		if( m_dsp_progress > 0 )
+//		{
+//			memcpy( &sndTempBuffer16[ei], sndTempBuffer16,
+//				sizeof(sndTempBuffer16[0]) * m_dsp_progress
+//			);
+//		}
+//	}
+//#endif
+//	else
+
+	// Dispatch the completed packet.
+	if( timeStretchDisabled )
+		_WriteSamples( sndTempBuffer, SndOutPacketSize );
+	else
+		timeStretchWrite();
+}
+
+// Always returns 0.  The output-module check this function used to forward
+// to is commented out below.
+s32 SndBuffer::Test()
+{
+	//if( mods[OutputModule] == NULL )
+	//	return -1;
+
+	//return mods[OutputModule]->Test();
+	return 0;
+}
--- a/desmume/src/metaspu/SndOut.h
+++ b/desmume/src/metaspu/SndOut.h
@ -0,0 +1,553 @@
+/* SPU2-X, A plugin for Emulating the Sound Processing Unit of the Playstation 2
+ * Developed and maintained by the Pcsx2 Development Team.
+ * 
+ * Original portions from SPU2ghz are (c) 2008 by David Quintana [gigaherz]
+ *
+ * SPU2-X is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * SPU2-X is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE.  See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with SPU2-X.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#include <algorithm>
+
+struct StereoOut16;
+struct StereoOut32;
+struct StereoOutFloat;
+
+// Stereo sample frame with 32-bit channels.
+struct StereoOut32
+{
+	// Shared all-zero frame (defined in SndOut.cpp).
+	static StereoOut32 Empty;
+
+	s32 Left;
+	s32 Right;
+
+	// Default: silence.
+	StereoOut32() : Left( 0 ), Right( 0 ) {}
+
+	StereoOut32( s32 left, s32 right ) : Left( left ), Right( right ) {}
+
+	// Conversions from the other frame formats (defined in SndOut.cpp).
+	StereoOut32( const StereoOut16& src );
+	explicit StereoOut32( const StereoOutFloat& src );
+
+	// 16-bit version of this frame (defined in SndOut.cpp).
+	StereoOut16 DownSample() const;
+
+	// Channel-wise sum of two frames.
+	StereoOut32 operator+( const StereoOut32& right ) const
+	{
+		StereoOut32 sum( *this );
+		sum.Left += right.Left;
+		sum.Right += right.Right;
+		return sum;
+	}
+
+	// Channel-wise integer division by src.
+	StereoOut32 operator/( int src ) const
+	{
+		StereoOut32 quotient( *this );
+		quotient.Left /= src;
+		quotient.Right /= src;
+		return quotient;
+	}
+};
+
+
+// Number of stereo samples per SndOut block.
+// All drivers must work in units of this size when communicating with
+// SndOut.  GetAlignedBufferSize() relies on this being a power of two.
+static const int SndOutPacketSize = 512;
+
+// Overall master volume shift.
+// Converts the mixer's 32 bit value into a 16 bit value.
+// Upstream SPU2-X value, kept for reference:
+//static const int SndOutVolumeShift = 13;
+
+//edit - zeromus 23-oct-2009
+//this is hardcoded differently for metaspu: with a shift of 0 the
+//DownSample/UpSample conversions pass sample values through unchanged.
+static const int SndOutVolumeShift = 0;
+
+// Samplerate of the SPU2. For accurate playback we need to match this
+// exactly.  Trying to scale samplerates and maintain SPU2's Ts timing accuracy
+// is too problematic. :)
+//edit - zeromus 23-oct-2009
+//this is hardcoded differently for metaspu; the upstream value was:
+//static const int SampleRate = 48000;
+static const int SampleRate = 44100;
+
+// NOTE(review): the only definition of this function visible in SndOut.cpp is
+// commented out -- confirm nothing calls it, or it will fail to link.
+extern int FindOutputModuleById( const wchar_t* omodid );
+
+// Stereo sample frame with 16-bit channels.
+struct StereoOut16
+{
+	s16 Left;
+	s16 Right;
+
+	// Default: silence.
+	StereoOut16() : Left( 0 ), Right( 0 ) {}
+
+	// Plain truncating conversion; unlike StereoOut32::DownSample() this does
+	// not apply SndOutVolumeShift.
+	StereoOut16( const StereoOut32& src )
+		: Left( (s16)src.Left )
+		, Right( (s16)src.Right )
+	{
+	}
+
+	StereoOut16( s16 left, s16 right ) : Left( left ), Right( right ) {}
+
+	// 32-bit version of this frame (defined in SndOut.cpp).
+	StereoOut32 UpSample() const;
+
+	// Loads this frame from a 32-bit one via StereoOut32::DownSample().
+	void ResampleFrom( const StereoOut32& src )
+	{
+		*this = src.DownSample();
+	}
+};
+
+// Stereo sample frame with float channels.  The integer conversions divide
+// by 2147483647.0f, i.e. they map a full-scale s32 to roughly +/-1.0.
+struct StereoOutFloat
+{
+	float Left;
+	float Right;
+
+	// Default: silence.
+	StereoOutFloat() : Left( 0 ), Right( 0 ) {}
+
+	// Normalizes a 32-bit integer frame.
+	explicit StereoOutFloat( const StereoOut32& src )
+		: Left( src.Left / 2147483647.0f )
+		, Right( src.Right / 2147483647.0f )
+	{
+	}
+
+	// Normalizes a pair of 32-bit integer samples.
+	explicit StereoOutFloat( s32 left, s32 right )
+		: Left( left / 2147483647.0f )
+		, Right( right / 2147483647.0f )
+	{
+	}
+
+	// Takes float samples as-is.
+	StereoOutFloat( float left, float right ) : Left( left ), Right( right ) {}
+};
+
+// 16-bit frame with Left, Right and LFE channels.
+struct Stereo21Out16
+{
+	s16 Left;
+	s16 Right;
+	s16 LFE;
+
+	// Fills the frame from a 32-bit stereo frame; LFE is the average of the
+	// two input channels.
+	void ResampleFrom( const StereoOut32& src )
+	{
+		const s32 inL = src.Left;
+		const s32 inR = src.Right;
+
+		Left  = inL >> SndOutVolumeShift;
+		Right = inR >> SndOutVolumeShift;
+		LFE   = (inL + inR) >> (SndOutVolumeShift + 1);
+	}
+};
+
+// 16-bit frame with front and back stereo pairs.
+struct StereoQuadOut16
+{
+	s16 Left;
+	s16 Right;
+	s16 LeftBack;
+	s16 RightBack;
+
+	// Fills the frame from a 32-bit stereo frame; the back pair duplicates
+	// the front pair.
+	void ResampleFrom( const StereoOut32& src )
+	{
+		const s16 left16 = src.Left >> SndOutVolumeShift;
+		const s16 right16 = src.Right >> SndOutVolumeShift;
+
+		Left = left16;
+		Right = right16;
+		LeftBack = left16;
+		RightBack = right16;
+	}
+};
+
+// 16-bit frame with front pair, LFE and back pair.
+struct Stereo41Out16
+{
+	s16 Left;
+	s16 Right;
+	s16 LFE;
+	s16 LeftBack;
+	s16 RightBack;
+
+	// Fills the frame from a 32-bit stereo frame: the back pair duplicates
+	// the front pair and LFE is the average of the two input channels.
+	void ResampleFrom( const StereoOut32& src )
+	{
+		const s16 left16 = src.Left >> SndOutVolumeShift;
+		const s16 right16 = src.Right >> SndOutVolumeShift;
+
+		Left = left16;
+		Right = right16;
+		LFE = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+		LeftBack = left16;
+		RightBack = right16;
+	}
+};
+
+// 16-bit frame with front pair, Center, LFE and back pair.
+struct Stereo51Out16
+{
+	s16 Left;
+	s16 Right;
+	s16 Center;
+	s16 LFE;
+	s16 LeftBack;
+	s16 RightBack;
+
+	// Implementation Note: Center and Subwoofer/LFE -->
+	// This method is simple and sounds nice.  It relies on the speaker/soundcard
+	// systems to do their own low pass / crossover.  Manual lowpass is wasted effort
+	// and can't match solid state results anyway.
+
+	// Fills the frame from a 32-bit stereo frame: Center and LFE both carry
+	// the average of the two input channels, the back pair duplicates the
+	// front pair.
+	void ResampleFrom( const StereoOut32& src )
+	{
+		const s16 left16 = src.Left >> SndOutVolumeShift;
+		const s16 right16 = src.Right >> SndOutVolumeShift;
+		const s16 mid16 = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+
+		Left = left16;
+		Right = right16;
+		Center = mid16;
+		LFE = mid16;
+		LeftBack = left16;
+		RightBack = right16;
+	}
+};
+
+// 16-bit 5.1 frame whose surround channels are derived from the stereo input
+// with a running left/right level estimate.
+struct Stereo51Out16DplII
+{
+	s16 Left;
+	s16 Right;
+	s16 Center;
+	s16 LFE;
+	s16 LeftBack;
+	s16 RightBack;
+
+	// Fills the frame from a 32-bit stereo frame.
+	//
+	// The level tracking state (Gfl/Gfr, LMax/RMax, LAccum/RAccum, ANum) lives
+	// in function-local statics, so it is shared by every instance of this
+	// struct and carries over from one call to the next.
+	void ResampleFrom( const StereoOut32& src )
+	{
+		static const u8 sLogTable[256] = {
+			0x00,0x3C,0x60,0x78,0x8C,0x9C,0xA8,0xB4,0xBE,0xC8,0xD0,0xD8,0xDE,0xE4,0xEA,0xF0,
+			0xF6,0xFA,0xFE,0x04,0x08,0x0C,0x10,0x14,0x16,0x1A,0x1E,0x20,0x24,0x26,0x2A,0x2C,
+			0x2E,0x32,0x34,0x36,0x38,0x3A,0x3E,0x40,0x42,0x44,0x46,0x48,0x4A,0x4C,0x4E,0x50,
+			0x50,0x52,0x54,0x56,0x58,0x5A,0x5A,0x5C,0x5E,0x60,0x60,0x62,0x64,0x66,0x66,0x68,
+			0x6A,0x6A,0x6C,0x6E,0x6E,0x70,0x70,0x72,0x74,0x74,0x76,0x76,0x78,0x7A,0x7A,0x7C,
+			0x7C,0x7E,0x7E,0x80,0x80,0x82,0x82,0x84,0x84,0x86,0x86,0x88,0x88,0x8A,0x8A,0x8C,
+			0x8C,0x8C,0x8E,0x8E,0x90,0x90,0x92,0x92,0x92,0x94,0x94,0x96,0x96,0x96,0x98,0x98,
+			0x9A,0x9A,0x9A,0x9C,0x9C,0x9C,0x9E,0x9E,0xA0,0xA0,0xA0,0xA2,0xA2,0xA2,0xA4,0xA4,
+			0xA4,0xA6,0xA6,0xA6,0xA8,0xA8,0xA8,0xAA,0xAA,0xAA,0xAC,0xAC,0xAC,0xAC,0xAE,0xAE,
+			0xAE,0xB0,0xB0,0xB0,0xB2,0xB2,0xB2,0xB2,0xB4,0xB4,0xB4,0xB6,0xB6,0xB6,0xB6,0xB8,
+			0xB8,0xB8,0xB8,0xBA,0xBA,0xBA,0xBC,0xBC,0xBC,0xBC,0xBE,0xBE,0xBE,0xBE,0xC0,0xC0,
+			0xC0,0xC0,0xC2,0xC2,0xC2,0xC2,0xC2,0xC4,0xC4,0xC4,0xC4,0xC6,0xC6,0xC6,0xC6,0xC8,
+			0xC8,0xC8,0xC8,0xC8,0xCA,0xCA,0xCA,0xCA,0xCC,0xCC,0xCC,0xCC,0xCC,0xCE,0xCE,0xCE,
+			0xCE,0xCE,0xD0,0xD0,0xD0,0xD0,0xD0,0xD2,0xD2,0xD2,0xD2,0xD2,0xD4,0xD4,0xD4,0xD4,
+			0xD4,0xD6,0xD6,0xD6,0xD6,0xD6,0xD8,0xD8,0xD8,0xD8,0xD8,0xD8,0xDA,0xDA,0xDA,0xDA,
+			0xDA,0xDC,0xDC,0xDC,0xDC,0xDC,0xDC,0xDE,0xDE,0xDE,0xDE,0xDE,0xDE,0xE0,0xE0,0xE0,
+		};
+
+		static s32 Gfl=0,Gfr=0;
+		static s32 LMax=0,RMax=0;
+
+		static s32 LAccum;
+		static s32 RAccum;
+		static s32 ANum;
+
+		// Bring the input to 16.8 fixed point.  The original expression,
+		// src >> (SndOutVolumeShift-8), becomes a shift by a negative amount
+		// (undefined behaviour) whenever SndOutVolumeShift is below 8 -- and
+		// metaspu sets it to 0.  Split it into a non-negative right shift and
+		// a multiplication by a power of two instead; for shifts of 8 or more
+		// this is identical to the original.
+		static const int kShiftDown = (SndOutVolumeShift > 8) ? (SndOutVolumeShift - 8) : 0;
+		static const int kScaleUp   = 1 << ((SndOutVolumeShift < 8) ? (8 - SndOutVolumeShift) : 0);
+
+		s32 ValL = (src.Left >> kShiftDown) * kScaleUp;
+		s32 ValR = (src.Right >> kShiftDown) * kScaleUp;
+
+		// Track the peak magnitude of each channel over the current window.
+		s32 XL = abs(ValL>>8);
+		s32 XR = abs(ValR>>8);
+
+		if(XL>LMax) LMax = XL;
+		if(XR>RMax) RMax = XR;
+
+		// Every 128 samples, fold the window peaks into the smoothed levels
+		// and update the smoothed gain factors Gfl/Gfr.
+		ANum++;
+		if(ANum>=128)
+		{
+			ANum=0;
+			// The "1+" keeps both accumulators at least 1, so the divisions
+			// below never divide by zero.
+			LAccum = 1+((LAccum * 224 + LMax * 31)>>8);
+			RAccum = 1+((RAccum * 224 + RMax * 31)>>8);
+
+			LMax = 0;
+			RMax = 0;
+
+			s32 Tfl=(RAccum)*255/(LAccum);
+			s32 Tfr=(LAccum)*255/(RAccum);
+
+			int gMax = std::max(Tfl,Tfr);
+			Tfl = Tfl*255/gMax;
+			Tfr = Tfr*255/gMax;
+
+			if(Tfl>255) Tfl=255;
+			if(Tfr>255) Tfr=255;
+			if(Tfl<1) Tfl=1;
+			if(Tfr<1) Tfr=1;
+
+			Gfl = (Gfl * 200 + Tfl * 56)>>8;
+			Gfr = (Gfr * 200 + Tfr * 56)>>8;
+
+		}
+
+		s32 L,R,C,SUB,SL,SR;
+
+		C=(ValL+ValR)>>1; //16.8
+
+		ValL-=C;//16.8
+		ValR-=C;//16.8
+
+		L=ValL>>8; //16.0
+		R=ValR>>8; //16.0
+		C=C>>8;    //16.0
+		SUB = C;
+
+		{
+			s32 Cfl = 1+sLogTable[Gfl];
+			s32 Cfr = 1+sLogTable[Gfr];
+
+			s32 VL=(ValL>>4) * Cfl; //16.12
+			s32 VR=(ValR>>4) * Cfr;
+
+			//s32 SC = (VL-VR)>>15;
+
+			SL = (((VR/148 - VL/209)>>4)*Cfr)>>8;
+			SR = (((VR/209 - VL/148)>>4)*Cfl)>>8;
+
+		}
+
+		// Random-ish values to get it to compile
+		int GainL = 200;
+		int GainR = 200;
+		int GainC = 180;
+		int GainSL = 230;
+		int GainSR = 230;
+		int GainLFE = 200;
+		int AddCLR = 55;
+
+		int AddCX  = (C * AddCLR)>>8;
+
+		Left	= (((L   * GainL  ))>>8) + AddCX;
+		Right	= (((R   * GainR  ))>>8) + AddCX;
+		Center	= (((C   * GainC  ))>>8);
+		LFE		= (((SUB * GainLFE))>>8);
+		LeftBack	= (((SL  * GainSL ))>>8);
+		RightBack	= (((SR  * GainSR ))>>8);
+	}
+};
+
+// 16-bit frame with front pair, Center, LFE, back pair and side pair.
+struct Stereo71Out16
+{
+	s16 Left;
+	s16 Right;
+	s16 Center;
+	s16 LFE;
+	s16 LeftBack;
+	s16 RightBack;
+	s16 LeftSide;
+	s16 RightSide;
+
+	// Fills the frame from a 32-bit stereo frame: Center and LFE carry the
+	// average of the two input channels, the back pair duplicates the front
+	// pair, and the side pair is the input shifted right by one extra bit.
+	void ResampleFrom( const StereoOut32& src )
+	{
+		const s16 left16 = src.Left >> SndOutVolumeShift;
+		const s16 right16 = src.Right >> SndOutVolumeShift;
+		const s16 mid16 = (src.Left + src.Right) >> (SndOutVolumeShift + 1);
+
+		Left = left16;
+		Right = right16;
+		Center = mid16;
+		LFE = mid16;
+		LeftBack = left16;
+		RightBack = right16;
+
+		LeftSide = src.Left >> (SndOutVolumeShift+1);
+		RightSide = src.Right >> (SndOutVolumeShift+1);
+	}
+};
+
+// Plain three-channel frame of s32 samples (Left, Right, LFE).
+// No conversion helper is defined for this layout here.
+struct Stereo21Out32
+{
+	s32 Left;
+	s32 Right;
+	s32 LFE;
+};
+
+// Plain five-channel frame of s32 samples (Left, Right, LFE and a back pair).
+// No conversion helper is defined for this layout here.
+struct Stereo41Out32
+{
+	s32 Left;
+	s32 Right;
+	s32 LFE;
+	s32 LeftBack;
+	s32 RightBack;
+};
+
+// Plain six-channel frame of s32 samples (Left, Right, Center, LFE and a
+// back pair). No conversion helper is defined for this layout here.
+struct Stereo51Out32
+{
+	s32 Left;
+	s32 Right;
+	s32 Center;
+	s32 LFE;
+	s32 LeftBack;
+	s32 RightBack;
+};
+
+// Developer Note: This is a static class only (all static members).
+class SndBuffer
+{
+private:
+	static bool m_underrun_freeze;
+	static s32 m_predictData;
+	static float lastPct;
+
+	static StereoOut32* sndTempBuffer;
+	static StereoOut16* sndTempBuffer16;
+
+	static int sndTempProgress;
+	static int m_dsp_progress;
+
+	static int m_timestretch_progress;
+	static int m_timestretch_writepos;
+
+	// Circular buffer state as used by ReadSamples(): m_buffer is indexed
+	// modulo m_size, m_rpos is the read index, and m_data is decreased by the
+	// number of samples each read consumes.
+	static StereoOut32 *m_buffer;
+	static s32 m_size;
+	static s32 m_rpos;
+	static s32 m_wpos;
+	static s32 m_data;
+
+	static float lastEmergencyAdj;
+	static float cTempo;
+	static float eTempo;
+	static int freezeTempo;
+	static int ssFreeze;
+
+	static void _InitFail();
+	static void _WriteSamples(StereoOut32* bData, int nSamples);
+	// Called by ReadSamples(): may lower nSamples, reports the number of
+	// samples to silence through quietSampleCount, and returns whether any
+	// samples should be copied at all (implementation not in this header).
+	static bool CheckUnderrunStatus( int& nSamples, int& quietSampleCount );
+
+	static void soundtouchInit();
+	static void soundtouchClearContents();
+	static void soundtouchCleanup();
+	static void timeStretchWrite();
+	static void timeStretchUnderrun();
+	static s32 timeStretchOverrun();
+
+	static void PredictDataWrite( int samples );
+	static float GetStatusPct();
+	static void UpdateTempoChange();
+
+public:
+	static void Init();
+	static void Cleanup();
+	static void Write( const StereoOut32& Sample );
+	static s32 Test();
+	static void ClearContents();
+
+	// Copies up to SndOutPacketSize samples out of the circular buffer into
+	// bData, converting each one with T::ResampleFrom( const StereoOut32& ).
+	//
+	// bData: destination array; presumably has to hold a full packet of
+	//        SndOutPacketSize elements -- confirm with the callers.
+	//
+	// Note: When using with 32 bit output buffers, the user of this function is responsible
+	// for shifting the values to where they need to be manually.  The fixed point depth of
+	// the sample output is determined by the SndOutVolumeShift, which is the number of bits
+	// to shift right to get a 16 bit result.
+	template< typename T >
+	static void ReadSamples( T* bData )
+	{
+		int nSamples = SndOutPacketSize;
+
+		// Problem:
+		//  If the SPU2 gets even the least bit out of sync with the SndOut device,
+		//  the readpos of the circular buffer will overtake the writepos,
+		//  leading to a prolonged period of hopscotching read/write accesses (ie,
+		//  lots of staticy crap sound for several seconds).
+		//
+		// Fix:
+		//  If the read position overtakes the write position, abort the
+		//  transfer immediately and force the SndOut driver to wait until
+		//  the read buffer has filled up again before proceeding.
+		//  This will cause one brief hiccup that can never exceed the user's
+		//  set buffer length in duration.
+
+		// Only ever assigned inside CheckUnderrunStatus (passed by reference).
+		int quietSamples;
+		if( CheckUnderrunStatus( nSamples, quietSamples ) )
+		{
+			assert( nSamples <= SndOutPacketSize );
+
+			// [Air] [TODO]: This loop is probably a candidate for SSE2 optimization.
+
+			// secondCopyLen > 0 means the read runs past the end of m_buffer;
+			// that many samples are taken from the start of the buffer instead.
+			// rposbuffer is captured before m_rpos is advanced below.
+			const int endPos = m_rpos + nSamples;
+			const int secondCopyLen = endPos - m_size;
+			const StereoOut32* rposbuffer = &m_buffer[m_rpos];
+
+			m_data -= nSamples;
+
+			if( secondCopyLen > 0 )
+			{
+				// Wrapped part: fills the tail of the output and leaves
+				// nSamples holding the length of the unwrapped first part.
+				nSamples -= secondCopyLen;
+				for( int i=0; i<secondCopyLen; i++ )
+					bData[nSamples+i].ResampleFrom( m_buffer[i] );
+				m_rpos = secondCopyLen;
+			}
+			else
+				m_rpos += nSamples;
+
+			// Unwrapped part, starting at the old read position.
+			for( int i=0; i<nSamples; i++ )
+				bData[i].ResampleFrom( rposbuffer[i] );
+		}
+
+		// If quietSamples != 0 it means we have an underrun...
+		// Let's just dull out some silence, because that's usually the least
+		// painful way of dealing with underruns:
+		//
+		// NOTE(review): this clears the FIRST quietSamples entries of bData, not
+		// the entries following the samples copied above. That is only harmless
+		// if CheckUnderrunStatus never reports copied and quiet samples for the
+		// same call, and it also relies on CheckUnderrunStatus always assigning
+		// quietSamples -- confirm both against its implementation.
+		memset( bData, 0, quietSamples * sizeof(T) );
+	}
+};
+
+//class SndOutModule
+//{
+//public:
+//	// Virtual destructor, because it helps fight C+++ funny-business.
+//	virtual ~SndOutModule() {}
+//
+//	// Returns a unique identification string for this driver.
+//	// (usually just matches the driver's cpp filename)
+//	virtual const wchar_t* GetIdent() const=0;
+//
+//	// Returns the long name / description for this driver.
+//	// (for use in configuration screen)
+//	virtual const wchar_t* GetLongName() const=0;
+//
+//	virtual s32  Init()=0;
+//	virtual void Close()=0;
+//	virtual s32  Test() const=0;
+//
+//	// Gui function: Used to open the configuration box for this driver.
+//	virtual void Configure(uptr parent)=0;
+//
+//	// Loads settings from the INI file for this driver
+//	virtual void ReadSettings()=0;
+//
+//	// Saves settings to the INI file for this driver
+//	virtual void WriteSettings() const=0;
+//
+//	virtual bool Is51Out() const=0;
+//
+//	// Returns the number of empty samples in the output buffer.
+//	// (which is effectively the amount of data played since the last update)
+//	virtual int GetEmptySampleCount() const=0;
+//};
+//
+//
+//#ifdef _MSC_VER
+////internal
+//extern SndOutModule* WaveOut;
+//extern SndOutModule* DSoundOut;
+//extern SndOutModule* XAudio2Out;
+//#endif
+//
+//extern SndOutModule* mods[];
+//
+//// =====================================================================================================
+//
+//extern void RecordStart();
+//extern void RecordStop();
+//extern void RecordWrite( const StereoOut16& sample );
+//
+//extern s32  DspLoadLibrary(wchar_t *fileName, int modNum);
+//extern void DspCloseLibrary();
+//extern int  DspProcess(s16 *buffer, int samples);
+//extern void DspUpdate(); // to let the Dsp process window messages
--- a/desmume/src/metaspu/SoundTouch/3dnow_win.cpp
+++ b/desmume/src/metaspu/SoundTouch/3dnow_win.cpp
@ -0,0 +1,350 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Win32 version of the AMD 3DNow! optimized routines for AMD K6-2/Athlon 
+/// processors. All 3DNow! optimized functions have been gathered into this
+/// single source code file, regardless to their class or original source code 
+/// file, in order to ease porting the library to other compiler and processor 
+/// platforms.
+///
+/// By the way; the performance gain depends heavily on the CPU generation: On 
+/// K6-2 these routines provided speed-up of even 2.4 times, while on Athlon the 
+/// difference to the original routines stayed at unremarkable 8%! Such a small 
+/// improvement on Athlon is due to 3DNow can perform only two operations in 
+/// parallel, and obviously also the Athlon FPU is doing a very good job with
+/// the standard C floating point routines! Here these routines are anyway, 
+/// although it might not be worth the effort to convert these to GCC platform, 
+/// for Athlon CPU at least. The situation is different regarding the SSE 
+/// optimizations though, thanks to the four parallel operations of SSE that 
+/// already make a difference.
+/// 
+/// This file is to be compiled in Windows platform with Microsoft Visual C++ 
+/// Compiler. Please see '3dnow_gcc.cpp' for the gcc compiler version for all
+/// GNU platforms (if file supplied).
+///
+/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
+/// 6.0 processor pack" update to support 3DNow! instruction set. The update is 
+/// available for download at Microsoft Developers Network, see here:
+/// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
+///
+/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and 
+/// perform a search with keywords "processor pack".
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.10 $
+//
+// $Id: 3dnow_win.cpp,v 1.10 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "cpu_detect.h"
+#include "STTypes.h"
+
+#ifndef _WIN32
+#error "wrong platform - this source code file is exclusively for Win32 platform"
+#endif
+
+using namespace soundtouch;
+
+#ifdef ALLOW_3DNOW
+// 3DNow! routines available only with float sample type    
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of 3DNow! optimized functions of class 'TDStretch3DNow'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "TDStretch.h"
+#include <limits.h>
+
+// these are declared in 'TDStretch.cpp'
+extern int scanOffsets[4][24];
+
+
+// Calculates cross correlation of two buffers
+//
+// Sums the element-wise products of 'pV1' and 'pV2'. Each loop round consumes
+// eight floats from both buffers and the loop runs 'overlapLength / 4' times.
+// The sum is accumulated in single precision and widened to double on return.
+//
+// NOTE(review): the assembly loop tests its counter only after the first
+// round, so an 'overlapLength' below 4 (counter of zero) would wrap the
+// counter instead of skipping the loop. Presumably callers guarantee
+// overlapLength >= 4 -- confirm.
+double TDStretch3DNow::calcCrossCorrStereo(const float *pV1, const float *pV2) const
+{
+    uint overlapLengthLocal = overlapLength;
+    float corr;
+
+    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
+    /*
+    c-pseudocode:
+
+        corr = 0;
+        for (i = 0; i < overlapLength / 4; i ++)
+        {
+            corr += pV1[0] * pV2[0] +
+                    pV1[1] * pV2[1] +
+                    pV1[2] * pV2[2] +
+                    pV1[3] * pV2[3] +
+                    pV1[4] * pV2[4] +
+                    pV1[5] * pV2[5] +
+                    pV1[6] * pV2[6] +
+                    pV1[7] * pV2[7];
+
+            pV1 += 8;
+            pV2 += 8;
+        }
+    */
+
+    _asm 
+    {
+        // give prefetch hints to CPU of what data are to be needed soonish.
+        // give more aggressive hints on pV1 as that changes more between different calls 
+        // while pV2 stays the same.
+        prefetch [pV1]
+        prefetch [pV2]
+        prefetch [pV1 + 32]
+
+        mov     eax, dword ptr pV2
+        mov     ebx, dword ptr pV1
+
+        pxor    mm0, mm0
+
+        mov     ecx, overlapLengthLocal
+        shr     ecx, 2  // div by four
+
+    loop1:
+        movq    mm1, [eax]
+        prefetch [eax + 32]     // give a prefetch hint to CPU what data are to be needed soonish
+        pfmul   mm1, [ebx]
+        prefetch [ebx + 64]     // give a prefetch hint to CPU what data are to be needed soonish
+
+        movq    mm2, [eax + 8]
+        pfadd   mm0, mm1
+        pfmul   mm2, [ebx + 8]
+
+        movq    mm3, [eax + 16]
+        pfadd   mm0, mm2
+        pfmul   mm3, [ebx + 16]
+
+        movq    mm4, [eax + 24]
+        pfadd   mm0, mm3
+        pfmul   mm4, [ebx + 24]
+
+        add     eax, 32
+        pfadd   mm0, mm4
+        add     ebx, 32
+
+        dec     ecx
+        jnz     loop1
+
+        // add halves of mm0 together and return the result. 
+        // note: mm1 is used as a dummy parameter only, we actually don't care about its value
+        pfacc   mm0, mm1
+        movd    corr, mm0
+        femms
+    }
+
+    return corr;
+}
+
+
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of 3DNow! optimized functions of class 'FIRFilter'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "FIRFilter.h"
+
+// Starts without a coefficient table; setCoefficients() allocates it.
+FIRFilter3DNow::FIRFilter3DNow() : FIRFilter()
+{
+    filterCoeffsUnalign = NULL;
+}
+
+
+// Releases the coefficient table. The pointer is NULL when setCoefficients()
+// was never called, and delete[] of a null pointer does nothing.
+FIRFilter3DNow::~FIRFilter3DNow()
+{
+    delete[] filterCoeffsUnalign;
+}
+
+
+// (overloaded) Prepares the coefficient table read by the 3DNow! filter routine.
+//
+// The base class records the coefficients first. Afterwards a private,
+// 16-byte aligned table is rebuilt in which every tap is already divided by
+// 'resultDivider' and stored twice in a row.
+void FIRFilter3DNow::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
+{
+    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
+
+    // Replace the previous table. The four spare floats (16 bytes) leave room
+    // for rounding the start address up to the next 16-byte boundary.
+    delete[] filterCoeffsUnalign;
+    filterCoeffsUnalign = new float[2 * newLength + 4];
+    filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & -16);
+
+    // Dividing the taps here means the filtering result needs no scaling.
+    const float fDivider = (float)resultDivider;
+
+    // Write each scaled tap into two neighbouring slots.
+    float *pairOut = filterCoeffsAlign;
+    for (uint tap = 0; tap < newLength; tap ++)
+    {
+        const float scaled = coeffs[tap] / fDivider;
+        pairOut[1] = scaled;
+        pairOut[0] = scaled;
+        pairOut += 2;
+    }
+}
+
+
+// 3DNow!-optimized version of the filter routine for stereo sound
+//
+// Filters interleaved stereo floats from 'src' into 'dest' using the aligned
+// coefficient table built by setCoefficients().
+//
+// Returns the number of output sample frames written: (numSamples - length)
+// rounded down to an even number, or 0 when fewer than two frames would
+// result.
+uint FIRFilter3DNow::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
+{
+    float *filterCoeffsLocal = filterCoeffsAlign;
+    uint count = (numSamples - length) & -2;
+    uint lengthLocal = length / 4;
+
+    assert(length != 0);
+    assert(count % 2 == 0);
+
+    // The assembly loops below test their counters only after the first
+    // round. With fewer than two output frames 'count' is zero (or, for
+    // numSamples < length, a wrapped huge value), which would make the outer
+    // loop run far beyond both buffers, so bail out before entering it.
+    if (numSamples < length + 2) return 0;
+
+    /* original code:
+
+    double suml1, suml2;
+    double sumr1, sumr2;
+    uint i, j;
+
+    for (j = 0; j < count; j += 2)
+    {
+        const float *ptr;
+
+        suml1 = sumr1 = 0.0;
+        suml2 = sumr2 = 0.0;
+        ptr = src;
+        filterCoeffsLocal = filterCoeffs;
+        for (i = 0; i < lengthLocal; i ++) 
+        {
+            // unroll loop for efficiency.
+
+            suml1 += ptr[0] * filterCoeffsLocal[0] + 
+                     ptr[2] * filterCoeffsLocal[2] +
+                     ptr[4] * filterCoeffsLocal[4] +
+                     ptr[6] * filterCoeffsLocal[6];
+
+            sumr1 += ptr[1] * filterCoeffsLocal[1] + 
+                     ptr[3] * filterCoeffsLocal[3] +
+                     ptr[5] * filterCoeffsLocal[5] +
+                     ptr[7] * filterCoeffsLocal[7];
+
+            suml2 += ptr[8] * filterCoeffsLocal[0] + 
+                     ptr[10] * filterCoeffsLocal[2] +
+                     ptr[12] * filterCoeffsLocal[4] +
+                     ptr[14] * filterCoeffsLocal[6];
+
+            sumr2 += ptr[9] * filterCoeffsLocal[1] + 
+                     ptr[11] * filterCoeffsLocal[3] +
+                     ptr[13] * filterCoeffsLocal[5] +
+                     ptr[15] * filterCoeffsLocal[7];
+
+            ptr += 16;
+            filterCoeffsLocal += 8;
+        }
+        dest[0] = (float)suml1;
+        dest[1] = (float)sumr1;
+        dest[2] = (float)suml2;
+        dest[3] = (float)sumr2;
+
+        src += 4;
+        dest += 4;
+    }
+
+    */
+    _asm
+    {
+        mov     eax, dword ptr dest
+        mov     ebx, dword ptr src
+        mov     edx, count
+        shr     edx, 1
+
+    loop1:
+        // "outer loop" : during each round 2*2 output samples are calculated
+        prefetch  [ebx]                 // give a prefetch hint to CPU what data are to be needed soonish
+        prefetch  [filterCoeffsLocal]   // give a prefetch hint to CPU what data are to be needed soonish
+
+        mov     esi, ebx
+        mov     edi, filterCoeffsLocal
+        pxor    mm0, mm0
+        pxor    mm1, mm1
+        mov     ecx, lengthLocal
+
+    loop2:
+        // "inner loop" : during each round four FIR filter taps are evaluated for 2*2 output samples
+        movq    mm2, [edi]
+        movq    mm3, mm2
+        prefetch  [edi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
+        pfmul   mm2, [esi]
+        prefetch  [esi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
+        pfmul   mm3, [esi + 8]
+
+        movq    mm4, [edi + 8]
+        movq    mm5, mm4
+        pfadd   mm0, mm2
+        pfmul   mm4, [esi + 8]
+        pfadd   mm1, mm3
+        pfmul   mm5, [esi + 16]
+
+        movq    mm2, [edi + 16]
+        movq    mm6, mm2
+        pfadd   mm0, mm4
+        pfmul   mm2, [esi + 16]
+        pfadd   mm1, mm5
+        pfmul   mm6, [esi + 24]
+
+        movq    mm3, [edi + 24]
+        movq    mm7, mm3
+        pfadd   mm0, mm2
+        pfmul   mm3, [esi + 24]
+        pfadd   mm1, mm6
+        pfmul   mm7, [esi + 32]
+        add     esi, 32
+        pfadd   mm0, mm3
+        add     edi, 32
+        pfadd   mm1, mm7
+
+        dec     ecx
+        jnz     loop2
+
+        movq    [eax], mm0
+        add     ebx, 16
+        movq    [eax + 8], mm1
+        add     eax, 16
+
+        dec     edx
+        jnz     loop1
+
+        femms
+    }
+
+    return count;
+}
+
+
+#endif  // ALLOW_3DNOW
--- a/desmume/src/metaspu/SoundTouch/AAFilter.cpp
+++ b/desmume/src/metaspu/SoundTouch/AAFilter.cpp
@ -0,0 +1,184 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// FIR low-pass (anti-alias) filter with filter coefficient design routine and
+/// MMX optimization. 
+/// 
+/// Anti-alias filter is used to prevent folding of high frequencies when 
+/// transposing the sample rate with interpolation.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.9 $
+//
+// $Id: AAFilter.cpp,v 1.9 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <memory.h>
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include "AAFilter.h"
+#include "FIRFilter.h"
+
+using namespace soundtouch;
+
+// Pi, written out beyond double precision so the literal rounds correctly.
+#define PI        3.14159265358979323846
+#define TWOPI    (2 * PI)
+
+/*****************************************************************************
+ *
+ * Implementation of the class 'AAFilter'
+ *
+ *****************************************************************************/
+
+// Creates the underlying FIR filter, starts at the Nyquist cutoff (0.5) and
+// lets setLength() compute the initial set of coefficients.
+AAFilter::AAFilter(const uint length)
+    : pFIR(FIRFilter::newInstance()),
+      cutoffFreq(0.5)
+{
+    setLength(length);
+}
+
+
+
+// Destroys the FIR filter instance created in the constructor.
+AAFilter::~AAFilter()
+{
+    delete pFIR;
+}
+
+
+
+// Sets new anti-alias filter cut-off edge frequency, scaled to
+// sampling frequency (nyquist frequency = 0.5).
+// The filter will cut frequencies higher than the given frequency.
+// The coefficients are recalculated immediately; calculateCoeffs() asserts
+// that the value lies within 0 .. 0.5.
+void AAFilter::setCutoffFreq(const double newCutoffFreq)
+{
+    cutoffFreq = newCutoffFreq;
+    calculateCoeffs();
+}
+
+
+
+// Sets number of FIR filter taps
+// The coefficients are recalculated immediately; calculateCoeffs() asserts
+// that the length is non-zero and divisible by four.
+void AAFilter::setLength(const uint newLength)
+{
+    length = newLength;
+    calculateCoeffs();
+}
+
+
+
+// Calculates coefficients for a low-pass FIR filter using Hamming window
+//
+// Designs 'length' taps as a sinc low-pass at 'cutoffFreq' multiplied by a
+// Hamming window, scales them so that they sum to 16384 (2^14) and passes
+// them to the FIR filter together with the matching divide factor of 14.
+void AAFilter::calculateCoeffs()
+{
+    uint i;
+    double cntTemp, temp, tempCoeff,h, w;
+    double fc2, wc;
+    double scaleCoeff, sum;
+    double *work;
+    SAMPLETYPE *coeffs;
+
+    assert(length > 0);
+    assert(length % 4 == 0);
+    assert(cutoffFreq >= 0);
+    assert(cutoffFreq <= 0.5);
+
+    work = new double[length];
+    coeffs = new SAMPLETYPE[length];
+
+    fc2 = 2.0 * cutoffFreq; 
+    wc = PI * fc2;
+    tempCoeff = TWOPI / (double)length;
+
+    sum = 0;
+    for (i = 0; i < length; i ++) 
+    {
+        // tap position relative to the centre of the filter
+        cntTemp = (double)i - (double)(length / 2);
+
+        temp = cntTemp * wc;
+        if (temp != 0) 
+        {
+            // sinc function. No extra 'fc2' gain is applied here: the centre
+            // tap below uses the plain sinc limit of 1.0, so scaling only the
+            // off-centre taps would over-weight the centre for cutoffs below
+            // 0.5. A gain common to all taps is cancelled by the
+            // normalisation to 'sum' further down anyway.
+            h = sin(temp) / temp;
+        } 
+        else 
+        {
+            h = 1.0;                                        // limit of sin(x)/x at x = 0
+        }
+        w = 0.54 + 0.46 * cos(tempCoeff * cntTemp);       // hamming window
+
+        temp = w * h;
+        work[i] = temp;
+
+        // calc net sum of coefficients 
+        sum += temp;
+    }
+
+    // ensure the sum of coefficients is larger than zero
+    assert(sum > 0);
+
+    // ensure we've really designed a lowpass filter...
+    assert(work[length/2] > 0);
+    assert(work[length/2 + 1] > -1e-6);
+    assert(work[length/2 - 1] > -1e-6);
+
+    // Calculate a scaling coefficient in such a way that the result can be
+    // divided by 16384
+    scaleCoeff = 16384.0f / sum;
+
+    for (i = 0; i < length; i ++) 
+    {
+        // scale & round to nearest integer
+        temp = work[i] * scaleCoeff;
+        temp += (temp >= 0) ? 0.5 : -0.5;
+        // ensure no overflows
+        assert(temp >= -32768 && temp <= 32767);
+        coeffs[i] = (SAMPLETYPE)temp;
+    }
+
+    // Set coefficients. Use divide factor 14 => divide result by 2^14 = 16384
+    pFIR->setCoefficients(coeffs, length, 14);
+
+    // setCoefficients() has been handed the values; the work arrays are
+    // released here.
+    delete[] work;
+    delete[] coeffs;
+}
+
+
+// Applies the filter to the given sequence of samples. 
+// Note : The amount of outputted samples is by value of 'filter length' 
+// smaller than the amount of input samples.
+// Forwards directly to the underlying FIR filter and returns its result.
+uint AAFilter::evaluate(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples, uint numChannels) const
+{
+    return pFIR->evaluate(dest, src, numSamples, numChannels);
+}
+
+
+// Returns the tap count reported by the underlying FIR filter (not the
+// locally stored 'length' member).
+uint AAFilter::getLength() const
+{
+    return pFIR->getLength();
+}
--- a/desmume/src/metaspu/SoundTouch/AAFilter.h
+++ b/desmume/src/metaspu/SoundTouch/AAFilter.h
@ -0,0 +1,91 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Declaration of the FIR low-pass (anti-alias) filter class 'AAFilter' and
+/// its filter coefficient design routine.
+///
+/// Anti-alias filter is used to prevent folding of high frequencies when 
+/// transposing the sample rate with interpolation.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.10 $
+//
+// $Id: AAFilter.h,v 1.10 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef AAFilter_H
+#define AAFilter_H
+
+#include "STTypes.h"
+
+namespace soundtouch
+{
+
+/// Anti-alias low-pass filter: designs the coefficients for a given cut-off
+/// frequency and tap count and applies them through a FIRFilter instance.
+class AAFilter
+{
+protected:
+    /// FIR filter implementation that stores the coefficients and does the filtering
+    class FIRFilter *pFIR;
+
+    /// Low-pass filter cut-off frequency, negative = invalid
+    double cutoffFreq;
+
+    /// num of filter taps
+    uint length;
+
+    /// Calculate the FIR coefficients realizing the given cutoff-frequency
+    void calculateCoeffs();
+public:
+    AAFilter(uint length);
+
+    ~AAFilter();
+
+    /// Sets new anti-alias filter cut-off edge frequency, scaled to sampling 
+    /// frequency (nyquist frequency = 0.5). The filter will cut off the 
+    /// frequencies higher than that.
+    void setCutoffFreq(double newCutoffFreq);
+
+    /// Sets number of FIR filter taps, i.e. ~filter complexity
+    void setLength(uint newLength);
+
+    /// Returns the number of filter taps.
+    uint getLength() const;
+
+    /// Applies the filter to the given sequence of samples. 
+    /// Note : The amount of outputted samples is by value of 'filter length' 
+    /// smaller than the amount of input samples.
+    uint evaluate(SAMPLETYPE *dest, 
+                  const SAMPLETYPE *src, 
+                  uint numSamples, 
+                  uint numChannels) const;
+};
+
+}
+
+#endif
--- a/desmume/src/metaspu/SoundTouch/BPMDetect.h
+++ b/desmume/src/metaspu/SoundTouch/BPMDetect.h
@ -0,0 +1,159 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Beats-per-minute (BPM) detection routine.
+///
+/// The beat detection algorithm works as follows:
+/// - Use function 'inputSamples' to input chunks of samples to the class for
+///   analysis. It's a good idea to enter a large sound file or stream in smallish
+///   chunks of around a few kilosamples in order not to exhaust too much RAM.
+/// - Input sound data is decimated to approx 500 Hz to reduce calculation burden,
+///   which is basically ok as low (bass) frequencies mostly determine the beat rate.
+///   Simple averaging is used for anti-alias filtering because the resulting signal
+///   quality isn't of that high importance.
+/// - Decimated sound data is enveloped, i.e. the amplitude shape is detected by
+///   taking absolute value that's smoothed by sliding average. Signal levels that
+///   are below a couple of times the general RMS amplitude level are cut away to
+///   leave only notable peaks there.
+/// - Repeating sound patterns (e.g. beats) are detected by calculating short-term 
+///   autocorrelation function of the enveloped signal.
+/// - After the whole sound data file has been analyzed as above, the bpm level is 
+///   detected by function 'getBpm' that finds the highest peak of the autocorrelation 
+///   function, calculates its precise location and converts this reading to bpm's.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.5 $
+//
+// $Id: BPMDetect.h,v 1.5 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _BPMDetect_H_
+#define _BPMDetect_H_
+
+#include "STTypes.h"
+#include "FIFOSampleBuffer.h"
+
+/// Minimum allowed BPM rate. Used to restrict accepted result above a reasonable limit.
+#define MIN_BPM 45
+
+/// Maximum allowed BPM rate. Used to restrict accepted result below a reasonable limit.
+#define MAX_BPM 230
+
+
+/// Class for calculating BPM rate for audio data.
+class BPMDetect
+{
+protected:
+    /// Auto-correlation accumulator bins.
+    float *xcorr;
+
+    /// Amplitude envelope sliding average approximation level accumulator
+    float envelopeAccu;
+
+    /// RMS volume sliding average approximation level accumulator
+    float RMSVolumeAccu;
+
+    /// Sample average counter.
+    int decimateCount;
+
+    /// Sample average accumulator for FIFO-like decimation.
+    soundtouch::LONG_SAMPLETYPE decimateSum;
+
+    /// Decimate sound by this coefficient to reach approx. 500 Hz.
+    int decimateBy;
+
+    /// Auto-correlation window length
+    int windowLen;
+
+    /// Number of channels (1 = mono, 2 = stereo)
+    int channels;
+
+    /// sample rate
+    int sampleRate;
+
+    /// Beginning of auto-correlation window: autocorrelation isn't updated for
+    /// this many of the first correlation bins.
+    int windowStart;
+
+    /// FIFO-buffer for decimated processing samples.
+    soundtouch::FIFOSampleBuffer *buffer;
+
+    /// Initialize the class for processing.
+    void init(int numChannels, int sampleRate);
+
+    /// Updates auto-correlation function for given number of decimated samples that 
+    /// are read from the internal 'buffer' pipe (samples aren't removed from the pipe 
+    /// though).
+    void updateXCorr(int process_samples      ///< How many samples are processed.
+                     );
+
+    /// Decimates samples to approx. 500 Hz.
+    ///
+    /// \return Number of output samples.
+    int decimate(soundtouch::SAMPLETYPE *dest,      ///< Destination buffer
+                 const soundtouch::SAMPLETYPE *src, ///< Source sample buffer
+                 int numsamples                     ///< Number of source samples.
+                 );
+
+    /// Calculates amplitude envelope for the buffer of samples.
+    /// Result is output to 'samples'.
+    void calcEnvelope(soundtouch::SAMPLETYPE *samples,  ///< Pointer to input/output data buffer
+                      int numsamples                    ///< Number of samples in buffer
+                      );
+
+public:
+    /// Constructor.
+    BPMDetect(int numChannels,  ///< Number of channels in sample data.
+              int sampleRate    ///< Sample rate in Hz.
+              );
+
+    /// Destructor.
+    virtual ~BPMDetect();
+
+    /// Inputs a block of samples for analyzing: Envelopes the samples and then
+    /// updates the autocorrelation estimation. When whole song data has been input
+    /// in smaller blocks using this function, read the resulting bpm with 'getBpm' 
+    /// function. 
+    /// 
+    /// Notice that data in 'samples' array can be disrupted in processing.
+    void inputSamples(soundtouch::SAMPLETYPE *samples,  ///< Pointer to input/working data buffer
+                      int numSamples                    ///< Number of samples in buffer
+                      );
+
+
+    /// Analyzes the results and returns the BPM rate. Use this function to read result
+    /// after whole song data has been input to the class by consecutive calls of
+    /// 'inputSamples' function.
+    ///
+    /// \return Beats-per-minute rate, or zero if detection failed.
+    float getBpm();
+};
+
+#endif // _BPMDetect_H_
--- a/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.cpp
+++ b/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.cpp
@ -0,0 +1,252 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// A buffer class for temporarily storing sound samples; it operates as a
+/// first-in-first-out pipe.
+///
+/// Samples are added to the end of the sample buffer with the 'putSamples' 
+/// function, and are received from the beginning of the buffer by calling
+/// the 'receiveSamples' function. The class automatically removes the 
+/// outputted samples from the buffer, as well as grows the buffer size 
+/// whenever necessary.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.11 $
+//
+// $Id: FIFOSampleBuffer.cpp,v 1.11 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <stdlib.h>
+#include <memory.h>
+#include <string.h>
+#include <assert.h>
+#include <stdexcept>
+
+#include "FIFOSampleBuffer.h"
+
+using namespace soundtouch;
+
+// Constructs an empty FIFO for 'numChannels' interleaved channels. No storage
+// is allocated at this point; ensureCapacity() creates the buffer when the
+// first samples are inserted.
+FIFOSampleBuffer::FIFOSampleBuffer(uint numChannels)
+    : buffer(NULL),
+      bufferUnaligned(NULL),
+      sizeInBytes(0),
+      samplesInBuffer(0),
+      channels(numChannels),
+      bufferPos(0)
+{
+}
+
+
+// Destructor. Releases the raw allocation; 'buffer' is only an aligned
+// pointer into 'bufferUnaligned' and must not be freed separately.
+// (delete[] on a NULL pointer is a no-op, so a never-filled buffer is fine.)
+FIFOSampleBuffer::~FIFOSampleBuffer()
+{
+    delete[] bufferUnaligned;
+}
+
+
+// Sets number of channels, 1 = mono, 2 = stereo.
+//
+// The raw data already held in the buffer is left untouched; only the sample
+// count is re-expressed in terms of the new channel count.
+//
+// Throws std::runtime_error if 'numChannels' is zero, because a zero channel
+// count would lead to a division by zero both here and in getCapacity().
+void FIFOSampleBuffer::setChannels(const uint numChannels)
+{
+    if (numChannels == 0)
+    {
+        throw std::runtime_error("Illegal number of channels");
+    }
+
+    // total number of individual sample values currently stored
+    // (samples * channels) -- a value count, not a byte count
+    const uint usedValues = channels * samplesInBuffer;
+    channels = numChannels;
+    samplesInBuffer = usedValues / channels;
+}
+
+
+// Moves the not-yet-output samples back to the start of the buffer so that
+// the space occupied by already consumed samples becomes usable again.
+// Does nothing when the read position 'bufferPos' is already zero.
+void FIFOSampleBuffer::rewind()
+{
+    if (bufferPos == 0)
+    {
+        return;
+    }
+
+    // source and destination may overlap, hence memmove rather than memcpy
+    const size_t liveBytes = sizeof(SAMPLETYPE) * channels * samplesInBuffer;
+    memmove(buffer, ptrBegin(), liveBytes);
+    bufferPos = 0;
+}
+
+
+// Appends 'numSamples' samples, read from 'samples', to the end of the
+// sample buffer. The storage is grown first if it is too small.
+void FIFOSampleBuffer::putSamples(const SAMPLETYPE *samples, uint numSamples)
+{
+    // ptrEnd() makes room for the new samples and returns the write position
+    SAMPLETYPE *writePos = ptrEnd(numSamples);
+    const size_t byteCount = sizeof(SAMPLETYPE) * numSamples * channels;
+
+    memcpy(writePos, samples, byteCount);
+    samplesInBuffer += numSamples;
+}
+
+
+// Increases the sample count by 'numSamples' without copying any data.
+//
+// Intended for callers that have written samples directly to the location
+// returned by 'ptrEnd' and now need the book-keeping to reflect them.
+// Please be careful though!
+void FIFOSampleBuffer::putSamples(uint numSamples)
+{
+    ensureCapacity(samplesInBuffer + numSamples);
+    samplesInBuffer += numSamples;
+}
+
+
+// Returns a pointer to the end of the used part of the sample buffer, i.e.
+// the position where new samples are to be inserted. It may be used for
+// writing new samples into the buffer directly. Please be careful!
+//
+// 'slackCapacity' is the minimum free capacity (in samples) the caller needs
+// in order to insert its samples successfully. The buffer is grown when
+// necessary to satisfy this requirement.
+//
+// After writing through the returned pointer, remember to update the sample
+// count by calling 'putSamples(numSamples)'.
+SAMPLETYPE *FIFOSampleBuffer::ptrEnd(uint slackCapacity)
+{
+    const uint required = samplesInBuffer + slackCapacity;
+    ensureCapacity(required);
+
+    // ensureCapacity() leaves the live data at the very start of 'buffer',
+    // so the write position is directly behind the stored samples.
+    SAMPLETYPE *writePos = buffer + samplesInBuffer * channels;
+    return writePos;
+}
+
+
+// Returns a pointer to the beginning of the samples that have not been
+// output yet, i.e. 'buffer' advanced by 'bufferPos' samples.
+// This function is provided for accessing the output samples directly.
+// Please be careful!
+//
+// When using this function to output samples, also remember to 'remove' the
+// output samples from the buffer by calling the
+// 'receiveSamples(numSamples)' function.
+SAMPLETYPE *FIFOSampleBuffer::ptrBegin() const
+{
+    return buffer + bufferPos * channels;
+}
+
+
+// Ensures that the buffer has enough capacity, i.e. space for _at least_
+// 'capacityRequirement' number of samples. The buffer is grown in steps of
+// 4 kilobytes to eliminate the need for frequently growing the buffer,
+// as well as to round the buffer size up to the virtual memory page size.
+//
+// When the current capacity already suffices, the buffer is only rewound so
+// that the stored samples start at the beginning of the storage. In both
+// cases 'bufferPos' is zero on return.
+//
+// May throw if the allocation fails; in that case the buffer and all of its
+// book-keeping are left exactly as they were.
+void FIFOSampleBuffer::ensureCapacity(uint capacityRequirement)
+{
+    if (capacityRequirement <= getCapacity())
+    {
+        // enough room already: simply rewind the buffer (if necessary)
+        rewind();
+        return;
+    }
+
+    // enlarge the buffer in 4kbyte steps (round up to next 4k boundary).
+    // The new size stays in a local until the allocation has succeeded, so a
+    // failed allocation cannot leave 'sizeInBytes' (and thereby getCapacity())
+    // claiming more room than the buffer really has.
+    const uint newSizeInBytes = (capacityRequirement * channels * sizeof(SAMPLETYPE) + 4095) & -4096;
+    assert(newSizeInBytes % 2 == 0);
+
+    // over-allocate by 16 bytes so that a 16-byte aligned start can be chosen
+    SAMPLETYPE *tempUnaligned = new SAMPLETYPE[newSizeInBytes / sizeof(SAMPLETYPE) + 16 / sizeof(SAMPLETYPE)];
+    if (tempUnaligned == NULL)
+    {
+        // only reachable with a non-throwing 'new'
+        throw std::runtime_error("Couldn't allocate memory!\n");
+    }
+    SAMPLETYPE *temp = (SAMPLETYPE *)(((ulongptr)tempUnaligned + 15) & -16);
+
+    // On the very first growth 'buffer' is still NULL; passing a null source
+    // to memcpy is undefined even for a zero length, so skip the copy then.
+    if (samplesInBuffer)
+    {
+        memcpy(temp, ptrBegin(), samplesInBuffer * channels * sizeof(SAMPLETYPE));
+    }
+
+    delete[] bufferUnaligned;
+    buffer = temp;
+    bufferUnaligned = tempUnaligned;
+    sizeInBytes = newSizeInBytes;
+    bufferPos = 0;
+}
+
+
+// Returns the current buffer capacity in terms of samples, where one sample
+// covers all 'channels' values. Derived from 'sizeInBytes', so it is zero
+// until the first allocation has been made.
+uint FIFOSampleBuffer::getCapacity() const
+{
+    return sizeInBytes / (channels * sizeof(SAMPLETYPE));
+}
+
+
+// Returns the number of samples currently in the buffer, i.e. how many
+// samples are available for output.
+uint FIFOSampleBuffer::numSamples() const
+{
+    return samplesInBuffer;
+}
+
+
+// Copies up to 'maxSamples' samples from the beginning of the sample buffer
+// into 'output' and removes them from the buffer. When fewer samples are
+// stored than requested, all stored samples are delivered.
+//
+// Returns number of samples copied.
+uint FIFOSampleBuffer::receiveSamples(SAMPLETYPE *output, uint maxSamples)
+{
+    // never hand out more than what is actually stored
+    const uint count = (samplesInBuffer < maxSamples) ? samplesInBuffer : maxSamples;
+    const size_t byteCount = channels * sizeof(SAMPLETYPE) * count;
+
+    memcpy(output, ptrBegin(), byteCount);
+
+    // the copy-less overload performs the actual removal
+    return receiveSamples(count);
+}
+
+
+// Discards up to 'maxSamples' samples from the beginning of the sample buffer
+// without copying them anywhere. Used to shrink the sample count after the
+// data has been read directly through the 'ptrBegin' function.
+//
+// Returns the number of samples actually removed.
+uint FIFOSampleBuffer::receiveSamples(uint maxSamples)
+{
+    if (maxSamples < samplesInBuffer)
+    {
+        // partial removal: advance the read position past the removed part
+        samplesInBuffer -= maxSamples;
+        bufferPos += maxSamples;
+        return maxSamples;
+    }
+
+    // everything that was stored is removed
+    const uint removed = samplesInBuffer;
+    samplesInBuffer = 0;
+    return removed;
+}
+
+
+// Tells whether the sample buffer is empty: returns 1 when it holds no
+// samples and 0 otherwise.
+int FIFOSampleBuffer::isEmpty() const
+{
+    if (samplesInBuffer != 0)
+    {
+        return 0;
+    }
+    return 1;
+}
+
+
+// Clears the sample buffer. Only the book-keeping is reset; the allocated
+// storage is kept so that it can be reused by later insertions.
+void FIFOSampleBuffer::clear()
+{
+    samplesInBuffer = 0;
+    bufferPos = 0;
+}
--- a/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.h
+++ b/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.h
@ -0,0 +1,174 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// A buffer class for temporarily storing sound samples; it operates as a
+/// first-in-first-out pipe.
+///
+/// Samples are added to the end of the sample buffer with the 'putSamples' 
+/// function, and are received from the beginning of the buffer by calling
+/// the 'receiveSamples' function. The class automatically removes the 
+/// output samples from the buffer as well as grows the storage size 
+/// whenever necessary.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.9 $
+//
+// $Id: FIFOSampleBuffer.h,v 1.9 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef FIFOSampleBuffer_H
+#define FIFOSampleBuffer_H
+
+#include "FIFOSamplePipe.h"
+
+namespace soundtouch
+{
+
+/// Sample buffer working on the FIFO (first-in-first-out) principle. The class
+/// takes care of storage size adjustment and data moving during input/output
+/// operations.
+///
+/// Notice that in case of stereo audio, one sample is considered to consist of
+/// the data of both channels.
+class FIFOSampleBuffer : public FIFOSamplePipe
+{
+private:
+    /// Sample buffer.
+    SAMPLETYPE *buffer;
+
+    // Raw unaligned buffer memory. 'buffer' is made aligned by pointing it to the
+    // first 16-byte aligned location of this buffer.
+    SAMPLETYPE *bufferUnaligned;
+
+    /// Sample buffer size in bytes
+    uint sizeInBytes;
+
+    /// How many samples are currently in buffer.
+    uint samplesInBuffer;
+
+    /// Channels, 1=mono, 2=stereo.
+    uint channels;
+
+    /// Current read position in the buffer, in samples. It is advanced when samples
+    /// are removed from the pipe, so that the buffer needs to be actually rewound
+    /// (data moved) only when new data is put to the pipe.
+    uint bufferPos;
+
+    /// Rewind the buffer by moving data from position pointed by 'bufferPos' to real 
+    /// beginning of the buffer.
+    void rewind();
+
+    /// Ensures that the buffer has capacity for at least this many samples.
+    void ensureCapacity(const uint capacityRequirement);
+
+    /// Returns current capacity.
+    uint getCapacity() const;
+ 
+public:
+
+    /// Constructor
+    FIFOSampleBuffer(uint numChannels = 2     ///< Number of channels, 1=mono, 2=stereo.
+                                              ///< Default is stereo.
+                     );
+
+    /// destructor
+    ~FIFOSampleBuffer();
+
+    /// Returns a pointer to the beginning of the output samples. 
+    /// This function is provided for accessing the output samples directly. 
+    /// Please be careful not to corrupt the book-keeping!
+    ///
+    /// When using this function to output samples, also remember to 'remove' the
+    /// output samples from the buffer by calling the 
+    /// 'receiveSamples(numSamples)' function
+    virtual SAMPLETYPE *ptrBegin() const;
+
+    /// Returns a pointer to the end of the used part of the sample buffer (i.e. 
+    /// where the new samples are to be inserted). This function may be used for 
+    /// inserting new samples into the sample buffer directly. Please be careful
+    /// not to corrupt the book-keeping!
+    ///
+    /// When using this function as means for inserting new samples, also remember 
+    /// to increase the sample count afterwards, by calling  the 
+    /// 'putSamples(numSamples)' function.
+    SAMPLETYPE *ptrEnd(
+                uint slackCapacity   ///< How much free capacity (in samples) there _at least_ 
+                                     ///< should be so that the caller can successfully insert the 
+                                     ///< desired samples to the buffer. If necessary, the function 
+                                     ///< grows the buffer size to comply with this requirement.
+                );
+
+    /// Adds 'numSamples' pcs of samples from the 'samples' memory position to
+    /// the sample buffer.
+    virtual void putSamples(const SAMPLETYPE *samples,  ///< Pointer to samples.
+                            uint numSamples                         ///< Number of samples to insert.
+                            );
+
+    /// Adjusts the book-keeping to increase number of samples in the buffer without 
+    /// copying any actual samples.
+    ///
+    /// This function is used to update the number of samples in the sample buffer
+    /// when accessing the buffer directly with 'ptrEnd' function. Please be 
+    /// careful though!
+    virtual void putSamples(uint numSamples   ///< Number of samples that have been inserted.
+                            );
+
+    /// Output samples from beginning of the sample buffer. Copies requested samples to 
+    /// output buffer and removes them from the sample buffer. If there are fewer than 
+    /// 'maxSamples' samples in the buffer, returns all that are available.
+    ///
+    /// \return Number of samples returned.
+    virtual uint receiveSamples(SAMPLETYPE *output, ///< Buffer where to copy output samples.
+                                uint maxSamples                 ///< How many samples to receive at max.
+                                );
+
+    /// Adjusts book-keeping so that given number of samples are removed from beginning of the 
+    /// sample buffer without copying them anywhere. 
+    ///
+    /// Used to reduce the number of samples in the buffer when accessing the sample buffer directly
+    /// with 'ptrBegin' function.
+    virtual uint receiveSamples(uint maxSamples   ///< Remove this many samples from the beginning of pipe.
+                                );
+
+    /// Returns number of samples currently available.
+    virtual uint numSamples() const;
+
+    /// Sets number of channels, 1 = mono, 2 = stereo.
+    void setChannels(uint numChannels);
+
+    /// Returns nonzero if there aren't any samples available for outputting.
+    virtual int isEmpty() const;
+
+    /// Clears all the samples.
+    virtual void clear();
+};
+
+}
+
+#endif
--- a/desmume/src/metaspu/SoundTouch/FIFOSamplePipe.h
+++ b/desmume/src/metaspu/SoundTouch/FIFOSamplePipe.h
@ -0,0 +1,217 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// 'FIFOSamplePipe' : An abstract base class for classes that manipulate sound
+/// samples by operating like a first-in-first-out pipe: New samples are fed
+/// into one end of the pipe with the 'putSamples' function, and the processed
+/// samples are received from the other end with the 'receiveSamples' function.
+///
+/// 'FIFOProcessor' : A base class for classes that do signal processing with
+/// the samples while operating like a first-in-first-out pipe. When samples
+/// are input with the 'putSamples' function, the class processes them
+/// and moves the processed samples to the given 'output' pipe object, which
+/// may be either another processing stage, or a fifo sample buffer object.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.8 $
+//
+// $Id: FIFOSamplePipe.h,v 1.8 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef FIFOSamplePipe_H
+#define FIFOSamplePipe_H
+
+#include <assert.h>
+#include <stdlib.h>
+#include "STTypes.h"
+
+namespace soundtouch
+{
+
+/// Abstract base class for FIFO (first-in-first-out) sample processing classes.
+class FIFOSamplePipe
+{
+public:
+    /// Virtual destructor, so that a concrete pipe can be destroyed safely
+    /// through a 'FIFOSamplePipe' pointer.
+    virtual ~FIFOSamplePipe() {}
+
+    /// Returns a pointer to the beginning of the output samples. 
+    /// This function is provided for accessing the output samples directly. 
+    /// Please be careful not to corrupt the book-keeping!
+    ///
+    /// When using this function to output samples, also remember to 'remove' the
+    /// output samples from the buffer by calling the 
+    /// 'receiveSamples(numSamples)' function
+    virtual SAMPLETYPE *ptrBegin() const = 0;
+
+    /// Adds 'numSamples' pcs of samples from the 'samples' memory position to
+    /// the sample buffer.
+    virtual void putSamples(const SAMPLETYPE *samples,  ///< Pointer to samples.
+                            uint numSamples                         ///< Number of samples to insert.
+                            ) = 0;
+
+
+    /// Moves all samples from the 'other' pipe instance to this instance:
+    /// they are appended here and then removed from 'other'.
+    void moveSamples(FIFOSamplePipe &other  ///< Other pipe instance from which to receive the data.
+         )
+    {
+        // same type as numSamples() returns, to avoid a signed/unsigned round trip
+        uint oNumSamples = other.numSamples();
+
+        putSamples(other.ptrBegin(), oNumSamples);
+        other.receiveSamples(oNumSamples);
+    }
+
+    /// Output samples from beginning of the sample buffer. Copies requested samples to 
+    /// output buffer and removes them from the sample buffer. If there are fewer than 
+    /// 'maxSamples' samples in the buffer, returns all that are available.
+    ///
+    /// \return Number of samples returned.
+    virtual uint receiveSamples(SAMPLETYPE *output, ///< Buffer where to copy output samples.
+                                uint maxSamples                 ///< How many samples to receive at max.
+                                ) = 0;
+
+    /// Adjusts book-keeping so that given number of samples are removed from beginning of the 
+    /// sample buffer without copying them anywhere. 
+    ///
+    /// Used to reduce the number of samples in the buffer when accessing the sample buffer directly
+    /// with 'ptrBegin' function.
+    virtual uint receiveSamples(uint maxSamples   ///< Remove this many samples from the beginning of pipe.
+                                ) = 0;
+
+    /// Returns number of samples currently available.
+    virtual uint numSamples() const = 0;
+
+    /// Returns nonzero if there aren't any samples available for outputting.
+    virtual int isEmpty() const = 0;
+
+    /// Clears all the samples.
+    virtual void clear() = 0;
+};
+
+
+
+/// Base class for sound processing stages that work on the FIFO principle.
+/// Stages derived from it can be chained together, so that samples fed into
+/// the first stage automatically pass through every stage of the chain.
+///
+/// A stage processes the samples it is given and puts the result into its
+/// 'output' pipe, which is either another processing stage or a FIFO sample
+/// buffer. All read-side calls below are simply forwarded to that pipe.
+class FIFOProcessor :public FIFOSamplePipe
+{
+protected:
+    /// Pipe into which the processed samples are put.
+    FIFOSamplePipe *output;
+
+    /// Sets the output pipe. May be used only while no output pipe has been
+    /// set yet, and the new pipe must not be NULL (both checked by assert).
+    void setOutPipe(FIFOSamplePipe *pOutput)
+    {
+        assert(output == NULL);
+        assert(pOutput != NULL);
+        output = pOutput;
+    }
+
+
+    /// Constructor that leaves the output pipe undefined (NULL); it has to be
+    /// set afterwards with the 'setOutPipe' function.
+    FIFOProcessor()
+        : output(NULL)
+    {
+    }
+
+
+    /// Constructor that configures the output pipe right away.
+    FIFOProcessor(FIFOSamplePipe *pOutput   ///< Output pipe.
+                 )
+        : output(pOutput)
+    {
+    }
+
+
+    /// Destructor. The output pipe is not deleted here.
+    virtual ~FIFOProcessor()
+    {
+    }
+
+
+    /// Returns a pointer to the beginning of the output samples, as reported
+    /// by the output pipe. Provided for accessing the output samples directly;
+    /// please be careful not to corrupt the book-keeping!
+    ///
+    /// After reading samples through this pointer, remember to 'remove' them
+    /// from the pipe by calling the 'receiveSamples(numSamples)' function.
+    virtual SAMPLETYPE *ptrBegin() const
+    {
+        return output->ptrBegin();
+    }
+
+public:
+
+    /// Copies up to 'maxSamples' samples from the beginning of the output pipe
+    /// to 'outBuffer' and removes them from the pipe.
+    ///
+    /// \return Number of samples returned.
+    virtual uint receiveSamples(SAMPLETYPE *outBuffer, ///< Buffer where to copy output samples.
+                                uint maxSamples                    ///< How many samples to receive at max.
+                                )
+    {
+        return output->receiveSamples(outBuffer, maxSamples);
+    }
+
+
+    /// Removes up to 'maxSamples' samples from the beginning of the output
+    /// pipe without copying them anywhere.
+    ///
+    /// Used to reduce the number of samples in the pipe after the data has
+    /// been accessed directly with the 'ptrBegin' function.
+    virtual uint receiveSamples(uint maxSamples   ///< Remove this many samples from the beginning of pipe.
+                                )
+    {
+        return output->receiveSamples(maxSamples);
+    }
+
+
+    /// Returns number of samples currently available in the output pipe.
+    virtual uint numSamples() const
+    {
+        return output->numSamples();
+    }
+
+
+    /// Returns nonzero if the output pipe has no samples available.
+    virtual int isEmpty() const
+    {
+        return output->isEmpty();
+    }
+};
+
+}
+
+#endif
--- a/desmume/src/metaspu/SoundTouch/FIRFilter.cpp
+++ b/desmume/src/metaspu/SoundTouch/FIRFilter.cpp
@ -0,0 +1,272 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// General FIR digital filter routines with MMX optimization. 
+///
+/// Note : MMX optimized functions reside in a separate, platform-specific file, 
+/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.16 $
+//
+// $Id: FIRFilter.cpp,v 1.16 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <memory.h>
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <stdexcept>
+#include "FIRFilter.h"
+#include "cpu_detect.h"
+
+using namespace soundtouch;
+
+/*****************************************************************************
+ *
+ * Implementation of the class 'FIRFilter'
+ *
+ *****************************************************************************/
+
+// Constructs a filter without coefficients. 'setCoefficients' has to be
+// called before the filter can be evaluated.
+FIRFilter::FIRFilter()
+{
+    resultDivFactor = 0;
+    // set together with 'resultDivFactor' in setCoefficients(); give it a
+    // defined value until then instead of leaving it uninitialized
+    resultDivider = 0;
+    length = 0;
+    lengthDiv8 = 0;
+    filterCoeffs = NULL;
+}
+
+
+// Destructor. Frees the coefficient table allocated by setCoefficients()
+// (a no-op while it is still NULL).
+FIRFilter::~FIRFilter()
+{
+    delete[] filterCoeffs;
+}
+
+// Usual C-version of the filter routine for stereo sound.
+//
+// 'src' and 'dest' hold interleaved data: even indices are filtered into
+// 'suml', odd indices into 'sumr', both with the same coefficients.
+// 'numSamples' counts sample pairs. Writes numSamples - length filtered pairs
+// to 'dest' and returns that count.
+//
+// The caller is expected to make sure that numSamples >= length; evaluate()
+// returns 0 before dispatching here otherwise.
+uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples) const
+{
+    uint i, j, end;
+    LONG_SAMPLETYPE suml, sumr;
+#ifdef FLOAT_SAMPLES
+    // when using floating point samples, use a scaler instead of a divider
+    // because division is much slower operation than multiplying.
+    double dScaler = 1.0 / (double)resultDivider;
+#endif
+
+    assert(length != 0);
+
+    // 'j' indexes individual values, two per stereo sample, hence the factor 2
+    end = 2 * (numSamples - length);
+
+    for (j = 0; j < end; j += 2) 
+    {
+        const SAMPLETYPE *ptr;
+
+        suml = sumr = 0;
+        ptr = src + j;
+
+        // consumes four taps per iteration, so 'length' has to be a multiple
+        // of 4; setCoefficients() only accepts multiples of 8
+        for (i = 0; i < length; i += 4) 
+        {
+            // loop is unrolled by factor of 4 here for efficiency
+            suml += ptr[2 * i + 0] * filterCoeffs[i + 0] +
+                    ptr[2 * i + 2] * filterCoeffs[i + 1] +
+                    ptr[2 * i + 4] * filterCoeffs[i + 2] +
+                    ptr[2 * i + 6] * filterCoeffs[i + 3];
+            sumr += ptr[2 * i + 1] * filterCoeffs[i + 0] +
+                    ptr[2 * i + 3] * filterCoeffs[i + 1] +
+                    ptr[2 * i + 5] * filterCoeffs[i + 2] +
+                    ptr[2 * i + 7] * filterCoeffs[i + 3];
+        }
+
+#ifdef INTEGER_SAMPLES
+        // scale the accumulated sums down by 2^resultDivFactor
+        suml >>= resultDivFactor;
+        sumr >>= resultDivFactor;
+        // saturate to 16 bit integer limits
+        suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml;
+        // saturate to 16 bit integer limits
+        sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr;
+#else
+        suml *= dScaler;
+        sumr *= dScaler;
+#endif // INTEGER_SAMPLES
+        dest[j] = (SAMPLETYPE)suml;
+        dest[j + 1] = (SAMPLETYPE)sumr;
+    }
+    return numSamples - length;
+}
+
+
+
+
+// Usual C-version of the filter routine for mono sound.
+//
+// Writes numSamples - length filtered samples to 'dest' and returns that
+// count.
+//
+// The caller is expected to make sure that numSamples >= length; evaluate()
+// returns 0 before dispatching here otherwise.
+uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples) const
+{
+    uint i, j, end;
+    LONG_SAMPLETYPE sum;
+#ifdef FLOAT_SAMPLES
+    // when using floating point samples, use a scaler instead of a divider
+    // because division is much slower operation than multiplying.
+    double dScaler = 1.0 / (double)resultDivider;
+#endif
+
+
+    assert(length != 0);
+
+    end = numSamples - length;
+    for (j = 0; j < end; j ++) 
+    {
+        sum = 0;
+        // consumes four taps per iteration, so 'length' has to be a multiple
+        // of 4; setCoefficients() only accepts multiples of 8
+        for (i = 0; i < length; i += 4) 
+        {
+            // loop is unrolled by factor of 4 here for efficiency
+            sum += src[i + 0] * filterCoeffs[i + 0] + 
+                   src[i + 1] * filterCoeffs[i + 1] + 
+                   src[i + 2] * filterCoeffs[i + 2] + 
+                   src[i + 3] * filterCoeffs[i + 3];
+        }
+#ifdef INTEGER_SAMPLES
+        // scale the accumulated sum down by 2^resultDivFactor
+        sum >>= resultDivFactor;
+        // saturate to 16 bit integer limits
+        sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum;
+#else
+        sum *= dScaler;
+#endif // INTEGER_SAMPLES
+        dest[j] = (SAMPLETYPE)sum;
+        // slide the input window forward by one sample
+        src ++;
+    }
+    return end;
+}
+
+
+// Set filter coefficients and length.
+//
+// 'coeffs' points to 'newLength' coefficients, which are copied into a table
+// owned by the filter. 'uResultDivFactor' is the result divider as a power of
+// two (the filter output is scaled down by 2^uResultDivFactor).
+//
+// Throws an exception if filter length isn't divisible by 8. If the
+// allocation of the new table fails, the filter keeps its previous state.
+void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint uResultDivFactor)
+{
+    assert(newLength > 0);
+    if (newLength % 8) throw std::runtime_error("FIR filter length not divisible by 8");
+
+    // Build the replacement table before touching any member. This way a
+    // throwing 'new' cannot leave 'filterCoeffs' pointing at freed memory,
+    // and 'coeffs' may even refer to the current table.
+    SAMPLETYPE *newCoeffs = new SAMPLETYPE[newLength];
+    memcpy(newCoeffs, coeffs, newLength * sizeof(SAMPLETYPE));
+
+    lengthDiv8 = newLength / 8;
+    length = lengthDiv8 * 8;
+    assert(length == newLength);
+
+    resultDivFactor = uResultDivFactor;
+#ifdef INTEGER_SAMPLES
+    resultDivider = (SAMPLETYPE)(1<<resultDivFactor);
+#else
+    resultDivider = (SAMPLETYPE)powf(2, (SAMPLETYPE)resultDivFactor);
+#endif
+
+    delete[] filterCoeffs;
+    filterCoeffs = newCoeffs;
+}
+
+
+// Returns the number of filter taps, i.e. the length given to the latest
+// setCoefficients() call (zero before the first call).
+uint FIRFilter::getLength() const
+{
+    return length;
+}
+
+
+
+// Applies the filter to the given sequence of samples, dispatching to the
+// stereo or the mono routine according to 'numChannels' (1 or 2).
+//
+// Returns the number of samples written to 'dest'. This is zero when fewer
+// than 'length' input samples are supplied; otherwise it is the count the
+// selected routine reports, which is smaller than the input count by the
+// filter length.
+uint FIRFilter::evaluate(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples, uint numChannels) const
+{
+    assert(numChannels == 1 || numChannels == 2);
+    assert(length > 0);
+    assert(lengthDiv8 * 8 == length);
+
+    // too little input for the filter window
+    if (numSamples < length)
+    {
+        return 0;
+    }
+    assert(resultDivFactor >= 0);
+
+    return (numChannels == 2) ? evaluateFilterStereo(dest, src, numSamples)
+                              : evaluateFilterMono(dest, src, numSamples);
+}
+
+
+
+// Operator 'new' is overloaded so that a plain "new FIRFilter" always fails
+// with a std::runtime_error. Instances have to be created with 'newInstance',
+// which picks the implementation matching the CPU and allocates it with the
+// global '::new'.
+void * FIRFilter::operator new(size_t s)
+{
+    // Notice! don't use "new FIRFilter" directly, use "newInstance" to create a new instance instead!
+    throw std::runtime_error("Don't use 'new FIRFilter', use 'newInstance' member instead!");
+    // never reached
+    return NULL;
+}
+
+
+// Creates a new filter instance, choosing the implementation from the
+// instruction set extensions reported by detectCPUextensions() and from the
+// ALLOW_* build switches: MMX is tried first, then SSE, then 3DNow!, and the
+// plain C version is used when none of them applies.
+//
+// The instance is allocated with the global '::new', because the class'
+// own 'operator new' always throws.
+FIRFilter * FIRFilter::newInstance()
+{
+    uint uExtensions = 0;
+
+    // NOTE(review): this condition is true unless both _MSC_VER and __x86_64__
+    // are defined. It presumably is meant to skip the detection in 64-bit MSVC
+    // builds -- confirm that __x86_64__ is really defined there.
+#if !defined(_MSC_VER) || !defined(__x86_64__)
+    uExtensions = detectCPUextensions();
+#endif
+
+    // Check if MMX/SSE/3DNow! instruction set extensions supported by CPU
+
+    // Each optional section below ends in a dangling 'else', so that the
+    // sections which are compiled in form a single if / else-if chain that
+    // finishes with the plain C fallback block.
+#ifdef ALLOW_MMX
+    // MMX routines available only with integer sample types
+    if (uExtensions & SUPPORT_MMX)
+    {
+        return ::new FIRFilterMMX;
+    }
+    else
+#endif // ALLOW_MMX
+
+#ifdef ALLOW_SSE
+    if (uExtensions & SUPPORT_SSE)
+    {
+        // SSE support
+        return ::new FIRFilterSSE;
+    }
+    else
+#endif // ALLOW_SSE
+
+#ifdef ALLOW_3DNOW
+    if (uExtensions & SUPPORT_3DNOW)
+    {
+        // 3DNow! support
+        return ::new FIRFilter3DNow;
+    }
+    else
+#endif // ALLOW_3DNOW
+
+    {
+        // ISA optimizations not supported, use plain C version
+        return ::new FIRFilter;
+    }
+}
--- a/desmume/src/metaspu/SoundTouch/FIRFilter.h
+++ b/desmume/src/metaspu/SoundTouch/FIRFilter.h
@ -0,0 +1,163 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// General FIR digital filter routines with MMX optimization. 
+///
+/// Note : MMX optimized functions reside in a separate, platform-specific file, 
+/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.17 $
+//
+// $Id: FIRFilter.h,v 1.17 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef FIRFilter_H
+#define FIRFilter_H
+
+#include "STTypes.h"
+
+namespace soundtouch
+{
+
+/// General FIR filter over SAMPLETYPE data. The evaluation and coefficient-setup
+/// routines are virtual so that CPU-specific subclasses can replace them.
+/// Create instances with newInstance(); the class-level operator 'new' throws.
+class FIRFilter 
+{
+protected:
+    // Number of FIR filter taps
+    uint length;    
+    // Number of FIR filter taps divided by 8 (evaluate() asserts lengthDiv8 * 8 == length)
+    uint lengthDiv8;
+
+    // Result divider factor in 2^k format
+    uint resultDivFactor;
+
+    // Result divider value.
+    SAMPLETYPE resultDivider;
+
+    // Memory for filter coefficients
+    SAMPLETYPE *filterCoeffs;
+
+    // Channel-specific workers: evaluate() calls the stereo one when
+    // numChannels == 2 and the mono one otherwise. Both return the number of
+    // samples produced.
+    virtual uint evaluateFilterStereo(SAMPLETYPE *dest, 
+                                      const SAMPLETYPE *src, 
+                                      uint numSamples) const;
+    virtual uint evaluateFilterMono(SAMPLETYPE *dest, 
+                                    const SAMPLETYPE *src, 
+                                    uint numSamples) const;
+
+public:
+    FIRFilter();
+    virtual ~FIRFilter();
+
+    /// Operator 'new' is overloaded only to reject direct use: the implementation
+    /// throws std::runtime_error. Call newInstance() instead.
+    void * operator new(size_t s);
+
+    /// Creates a filter object with the global '::new': a CPU-optimized subclass
+    /// when it is compiled in (ALLOW_MMX / ALLOW_SSE / ALLOW_3DNOW) and the CPU
+    /// reports the extension, otherwise a plain FIRFilter.
+    static FIRFilter *newInstance();
+
+    /// Applies the filter to the given sequence of samples. 
+    /// Note : The amount of outputted samples is by value of 'filter_length' 
+    /// smaller than the amount of input samples.
+    ///
+    /// \param numChannels 1 (mono) or 2 (stereo); anything else trips an assert.
+    /// \return Number of samples copied to 'dest'; 0 if 'numSamples' is smaller
+    ///         than the filter length.
+    uint evaluate(SAMPLETYPE *dest, 
+                  const SAMPLETYPE *src, 
+                  uint numSamples, 
+                  uint numChannels) const;
+
+    uint getLength() const;
+
+    virtual void setCoefficients(const SAMPLETYPE *coeffs, 
+                                 uint newLength, 
+                                 uint uResultDivFactor);
+};
+
+
+// Optional subclasses that implement CPU-specific optimizations:
+
+#ifdef ALLOW_MMX
+
+    /// Class that implements MMX optimized functions exclusive for 16bit integer samples type.
+    /// Only the stereo evaluation and the coefficient setup are declared here; the
+    /// mono path is inherited from FIRFilter. The 'short' signatures match the base
+    /// class virtuals only when SAMPLETYPE is 'short'.
+    class FIRFilterMMX : public FIRFilter
+    {
+    protected:
+        // NOTE(review): presumably the raw coefficient allocation and an aligned
+        // pointer into it - defined in the platform-specific source, confirm there.
+        short *filterCoeffsUnalign;
+        short *filterCoeffsAlign;
+
+        virtual uint evaluateFilterStereo(short *dest, const short *src, uint numSamples) const;
+    public:
+        FIRFilterMMX();
+        ~FIRFilterMMX();
+
+        virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
+    };
+
+#endif // ALLOW_MMX
+
+
+#ifdef ALLOW_3DNOW
+
+    /// Class that implements 3DNow! optimized functions exclusive for floating point samples type.
+    /// Only the stereo evaluation and the coefficient setup are declared here; the
+    /// mono path is inherited from FIRFilter. The 'float' signatures match the base
+    /// class virtuals only when SAMPLETYPE is 'float'.
+    class FIRFilter3DNow : public FIRFilter
+    {
+    protected:
+        // NOTE(review): presumably the raw coefficient allocation and an aligned
+        // pointer into it - defined in the platform-specific source, confirm there.
+        float *filterCoeffsUnalign;
+        float *filterCoeffsAlign;
+
+        virtual uint evaluateFilterStereo(float *dest, const float *src, uint numSamples) const;
+    public:
+        FIRFilter3DNow();
+        ~FIRFilter3DNow();
+        virtual void setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor);
+    };
+
+#endif  // ALLOW_3DNOW
+
+
+#ifdef ALLOW_SSE
+    /// Class that implements SSE optimized functions exclusive for floating point samples type.
+    /// Only the stereo evaluation and the coefficient setup are declared here; the
+    /// mono path is inherited from FIRFilter. The 'float' signatures match the base
+    /// class virtuals only when SAMPLETYPE is 'float'.
+    class FIRFilterSSE : public FIRFilter
+    {
+    protected:
+        // NOTE(review): presumably the raw coefficient allocation and an aligned
+        // pointer into it - defined in the platform-specific source, confirm there.
+        float *filterCoeffsUnalign;
+        float *filterCoeffsAlign;
+
+        virtual uint evaluateFilterStereo(float *dest, const float *src, uint numSamples) const;
+    public:
+        FIRFilterSSE();
+        ~FIRFilterSSE();
+
+        virtual void setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor);
+    };
+
+#endif // ALLOW_SSE
+
+}
+
+#endif  // FIRFilter_H
--- a/desmume/src/metaspu/SoundTouch/Makefile.am
+++ b/desmume/src/metaspu/SoundTouch/Makefile.am
@ -0,0 +1,42 @@
+## Process this file with automake to create Makefile.in
+##
+## $Id: Makefile.am,v 1.3 2006/02/05 18:33:34 Olli Exp $
+##
+## Copyright (C) 2003 - David W. Durham
+##
+## This file is part of SoundTouch, an audio processing library for pitch/time adjustments
+##
+## SoundTouch is free software; you can redistribute it and/or modify it under the
+## terms of the GNU General Public License as published by the Free Software
+## Foundation; either version 2 of the License, or (at your option) any later
+## version.
+##
+## SoundTouch is distributed in the hope that it will be useful, but WITHOUT ANY
+## WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+## A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along with
+## this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+## Place - Suite 330, Boston, MA  02111-1307, USA
+
+AUTOMAKE_OPTIONS = foreign
+
+# cpu_detect_x86_gcc.cpp is listed with the headers as well as in the sources below.
+noinst_HEADERS=AAFilter.h cpu_detect.h FIRFilter.h RateTransposer.h TDStretch.h cpu_detect_x86_gcc.cpp
+noinst_LIBRARIES = libSoundTouch.a
+
+# Per-target flags REPLACE AM_CXXFLAGS for that target, so AM_CXXFLAGS (defined
+# further down: -O3 and -fcheck-new) must be referenced explicitly or it is never
+# applied to libSoundTouch.a.
+libSoundTouch_a_CXXFLAGS = $(AM_CXXFLAGS) -msse -mmmx
+libSoundTouch_a_CFLAGS = -msse -mmmx
+
+#lib_LTLIBRARIES=libSoundTouch.la
+# the mmx_gcc.cpp and cpu_detect_x86_gcc.cpp may need to be conditionally included here from things discovered in configure.ac
+libSoundTouch_a_SOURCES=AAFilter.cpp FIRFilter.cpp FIFOSampleBuffer.cpp mmx_optimized.cpp sse_optimized.cpp \
+RateTransposer.cpp SoundTouch.cpp TDStretch.cpp WavFile.cpp cpu_detect_x86_gcc.cpp
+
+# ??? test for -fcheck-new in configure.ac
+# other compiler flags to add
+AM_CXXFLAGS=-O3 -msse -fcheck-new
+#-I../../include
+
+# other linking flags to add
+#libSoundTouch_la_LIBADD=
+
+
--- a/desmume/src/metaspu/SoundTouch/RateTransposer.cpp
+++ b/desmume/src/metaspu/SoundTouch/RateTransposer.cpp
@ -0,0 +1,626 @@
+////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Sample rate transposer. Changes sample rate by using linear interpolation 
+/// together with anti-alias filtering (first order interpolation with anti-
+/// alias filtering should be quite adequate for this application)
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/03/19 10:05:49 $
+// File revision : $Revision: 1.13 $
+//
+// $Id: RateTransposer.cpp,v 1.13 2006/03/19 10:05:49 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <memory.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+#include <stdexcept>
+#include "RateTransposer.h"
+#include "AAFilter.h"
+
+using namespace soundtouch;
+
+
+/// A linear samplerate transposer class that uses integer (fixed-point)
+/// arithmetics for the transposing.
+class RateTransposerInteger : public RateTransposer
+{
+protected:
+    /// Fixed-point interpolation position, in units where SCALE is one whole
+    /// input sample; carried over between calls.
+    int iSlopeCount;
+    /// Transposing rate in the same fixed-point units (rate * SCALE, rounded).
+    uint uRate;
+    /// Last input sample (left / right) of the previous batch, used to interpolate
+    /// across the batch boundary. The mono routine uses sPrevSampleL only.
+    SAMPLETYPE sPrevSampleL, sPrevSampleR;
+
+    /// Resets the interpolation position and the remembered previous samples to zero.
+    virtual void resetRegisters();
+
+    virtual uint transposeStereo(SAMPLETYPE *dest, 
+                         const SAMPLETYPE *src, 
+                         uint numSamples);
+    virtual uint transposeMono(SAMPLETYPE *dest, 
+                       const SAMPLETYPE *src, 
+                       uint numSamples);
+
+public:
+    RateTransposerInteger();
+    virtual ~RateTransposerInteger();
+
+    /// Sets new target rate. Normal rate = 1.0, smaller values represent slower 
+    /// rate, larger faster rates.
+    virtual void setRate(float newRate);
+
+};
+
+
+/// A linear samplerate transposer class that uses floating point arithmetics
+/// for the transposing.
+class RateTransposerFloat : public RateTransposer
+{
+protected:
+    /// Interpolation position between two input samples (1.0 = one whole input
+    /// sample); carried over between calls.
+    float fSlopeCount;
+    /// NOTE(review): not referenced by any method in RateTransposer.cpp; the
+    /// transposing routines step by the inherited 'fRate'.
+    float fRateStep;
+    /// Last input sample (left / right) of the previous batch, used to interpolate
+    /// across the batch boundary. The mono routine uses sPrevSampleL only.
+    SAMPLETYPE sPrevSampleL, sPrevSampleR;
+
+    /// Resets the interpolation position and the remembered previous samples to zero.
+    virtual void resetRegisters();
+
+    virtual uint transposeStereo(SAMPLETYPE *dest, 
+                         const SAMPLETYPE *src, 
+                         uint numSamples);
+    virtual uint transposeMono(SAMPLETYPE *dest, 
+                       const SAMPLETYPE *src, 
+                       uint numSamples);
+
+public:
+    RateTransposerFloat();
+    virtual ~RateTransposerFloat();
+};
+
+
+
+// Fallback min/max macros, defined only when 'min' is not already a macro.
+// Arguments and results are fully parenthesized so that operands containing
+// operators of lower precedence than the comparison (e.g. '&', '^', '|') expand
+// correctly. Each argument is still evaluated twice - do not pass expressions
+// with side effects.
+#ifndef min
+#define min(a,b) (((a) > (b)) ? (b) : (a))
+#define max(a,b) (((a) < (b)) ? (b) : (a))
+#endif
+
+
+// Class-level operator 'new' is overloaded only to forbid direct allocation: use
+// RateTransposer::newInstance(), which picks the integer or floating point
+// implementation and allocates it with the global '::new'.
+void * RateTransposer::operator new(size_t /*s*/)
+{
+    // Notice! don't use "new RateTransposer" directly, use "newInstance" to create a new instance instead!
+    // Throw instead of assert-and-return-NULL: with asserts compiled out, a
+    // throwing allocation function must not return a null pointer.
+    throw std::runtime_error("Don't use 'new RateTransposer', use 'newInstance' member instead!");
+}
+
+
+// Factory: creates the integer implementation when INTEGER_SAMPLES is defined and
+// the floating point one otherwise. Uses the global '::new' because the
+// class-level operator 'new' must not be used.
+RateTransposer *RateTransposer::newInstance()
+{
+#ifdef INTEGER_SAMPLES
+    typedef RateTransposerInteger Implementation;
+#else
+    typedef RateTransposerFloat Implementation;
+#endif
+    return ::new Implementation;
+}
+
+
+// Constructor. Registers 'outputBuffer' as the FIFOProcessor output and sets the
+// defaults below. 'fRate' is not set here: the concrete subclass constructors
+// call setRate(1.0f) once their virtual overrides are in place.
+RateTransposer::RateTransposer() : FIFOProcessor(&outputBuffer)
+{
+    // Anti-alias filter with the default length of 32 taps, enabled from the start
+    pAAFilter = new AAFilter(32);
+    bUseAAFilter = TRUE;
+
+    // Stereo until setChannels() says otherwise
+    uChannels = 2;
+}
+
+
+
+// Destructor: releases the anti-alias filter allocated by the constructor.
+RateTransposer::~RateTransposer()
+{
+    delete pAAFilter;
+}
+
+
+
+/// Enables/disables the anti-alias filter. Zero to disable, nonzero to enable.
+/// Only the flag is stored; processSamples() checks it on each call.
+void RateTransposer::enableAAFilter(const BOOL newMode)
+{
+    bUseAAFilter = newMode;
+}
+
+
+/// Returns nonzero if anti-alias filter is enabled (the value last passed to
+/// enableAAFilter(), or TRUE if it was never called).
+BOOL RateTransposer::isAAFilterEnabled() const
+{
+    return bUseAAFilter;
+}
+
+
+/// Returns the anti-alias filter object. The filter stays owned by this
+/// RateTransposer, whose destructor deletes it.
+AAFilter *RateTransposer::getAAFilter() const
+{
+    return pAAFilter;
+}
+
+
+
+// Sets new target rate. Normal rate = 1.0, smaller values represent slower 
+// rate, larger faster rates. Also re-tunes the anti-alias filter cutoff.
+void RateTransposer::setRate(float newRate)
+{
+    fRate = newRate;
+
+    // Design a new anti-alias filter: the cutoff is 0.5 scaled down by the rate
+    // when speeding up (rate > 1) and 0.5 scaled by the rate otherwise.
+    const float fCutoff = (newRate > 1.0f) ? (0.5f / newRate) : (0.5f * newRate);
+    pAAFilter->setCutoffFreq(fCutoff);
+}
+
+
+// Hands whatever is currently held in 'storeBuffer' over to 'outputBuffer' via
+// moveSamples(). Does nothing when 'storeBuffer' is empty.
+void RateTransposer::flushStoreBuffer()
+{
+    if (!storeBuffer.isEmpty())
+    {
+        outputBuffer.moveSamples(storeBuffer);
+    }
+}
+
+
+// Adds 'numSamples' pcs of samples from the 'samples' memory position into
+// the input of the object. The samples are processed immediately by
+// processSamples().
+void RateTransposer::putSamples(const SAMPLETYPE *samples, uint numSamples)
+{
+    processSamples(samples, numSamples);
+}
+
+
+
+// Transposes up the sample rate, causing the observed playback 'rate' of the
+// sound to decrease. Used when the rate is below 1.0: the input is transposed
+// first and the anti-alias filter is applied to the transposed data afterwards.
+void RateTransposer::upsample(const SAMPLETYPE *src, uint numSamples)
+{
+    // Room needed in 'storeBuffer' for the transposed samples
+    // (+16 is to reserve some slack in the destination buffer)
+    const int capacity = (int)((float)numSamples / fRate + 16.0f);
+
+    // Step 1: transpose straight onto the end of 'storeBuffer', then commit
+    // the samples that were actually written
+    const int transposed = transpose(storeBuffer.ptrEnd(capacity), src, numSamples);
+    storeBuffer.putSamples(transposed);
+
+    // Step 2: run the anti-alias filter over everything held in 'storeBuffer',
+    // writing the result onto the end of 'outputBuffer'
+    const int pending = storeBuffer.numSamples();
+    const int filtered = pAAFilter->evaluate(outputBuffer.ptrEnd(pending), 
+        storeBuffer.ptrBegin(), pending, uChannels);
+    outputBuffer.putSamples(filtered);
+
+    // Step 3: remove the processed samples from 'storeBuffer'
+    storeBuffer.receiveSamples(filtered);
+}
+
+
+// Transposes down the sample rate, causing the observed playback 'rate' of the
+// sound to increase. Used when the rate is 1.0 or above: the anti-alias filter
+// is applied first to remove high frequencies (preventing them from folding over
+// the lower frequencies), then the filtered data is transposed.
+void RateTransposer::downsample(const SAMPLETYPE *src, uint numSamples)
+{
+    int count, sizeTemp;
+
+    // Add the new samples to the end of the storeBuffer
+    storeBuffer.putSamples(src, numSamples);
+
+    // Anti-alias filter the samples to prevent folding and output the filtered 
+    // data to tempBuffer. Note : because of the FIR filter length, the
+    // filtering routine takes in 'filter_length' more samples than it outputs.
+    // tempBuffer is used as scratch space only: nothing is committed to it.
+    assert(tempBuffer.isEmpty());
+    sizeTemp = storeBuffer.numSamples();
+
+    count = pAAFilter->evaluate(tempBuffer.ptrEnd(sizeTemp), 
+        storeBuffer.ptrBegin(), sizeTemp, uChannels);
+
+    // Not enough samples collected yet for the filter to produce any output:
+    // keep them in 'storeBuffer' for the next call, and skip the transposer
+    // rather than relying on every transposeMono/transposeStereo implementation
+    // to cope with an empty batch.
+    if (count == 0) return;
+
+    // Remove the filtered samples from 'storeBuffer'
+    storeBuffer.receiveSamples(count);
+
+    // Transpose the samples (+16 is to reserve some slack in the destination buffer)
+    sizeTemp = (int)((float)numSamples / fRate + 16.0f);
+    count = transpose(outputBuffer.ptrEnd(sizeTemp), tempBuffer.ptrBegin(), count);
+    outputBuffer.putSamples(count);
+}
+
+
+// Transposes 'numSamples' input samples and appends the result to 'outputBuffer'.
+// With the anti-alias filter enabled the work is delegated to upsample() (rate
+// below 1.0) or downsample() (otherwise); with the filter disabled the input is
+// transposed directly. Nothing is returned - results are collected from the
+// output buffer.
+void RateTransposer::processSamples(const SAMPLETYPE *src, uint numSamples)
+{
+    if (numSamples == 0) return;
+    assert(pAAFilter);
+
+    if (bUseAAFilter != FALSE) 
+    {
+        // Transpose with anti-alias filter
+        if (fRate < 1.0f) upsample(src, numSamples);
+        else downsample(src, numSamples);
+        return;
+    }
+
+    // Anti-alias filter is turned off: simply transpose without applying it
+    const uint sizeReq = (int)((float)numSamples / fRate + 1.0f);
+    const uint count = transpose(outputBuffer.ptrEnd(sizeReq), src, numSamples);
+    outputBuffer.putSamples(count);
+}
+
+
+// Transposes the sample rate of the given samples using linear interpolation,
+// dispatching to the stereo routine for two channels and to the mono routine
+// otherwise. Returns the number of samples returned in the "dest" buffer.
+inline uint RateTransposer::transpose(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+{
+    return (uChannels == 2) ? transposeStereo(dest, src, numSamples)
+                            : transposeMono(dest, src, numSamples);
+}
+
+
+// Sets the number of channels, 1 = mono, 2 = stereo. A call that does not change
+// the channel count is a no-op; otherwise all internal buffers are switched and
+// the interpolation state is reset.
+void RateTransposer::setChannels(const uint numchannels)
+{
+    if (uChannels != numchannels)
+    {
+        assert(numchannels == 1 || numchannels == 2);
+        uChannels = numchannels;
+
+        storeBuffer.setChannels(uChannels);
+        tempBuffer.setChannels(uChannels);
+        outputBuffer.setChannels(uChannels);
+
+        // Inits the linear interpolation registers
+        resetRegisters();
+    }
+}
+
+
+// Clears all the samples in the object: both the output buffer and the store
+// buffer. 'tempBuffer' is not touched here; downsample() asserts that it is
+// empty on entry.
+void RateTransposer::clear()
+{
+    outputBuffer.clear();
+    storeBuffer.clear();
+}
+
+
+// Returns nonzero if there aren't any samples available for outputting, i.e.
+// both the FIFOProcessor output and 'storeBuffer' report empty.
+uint RateTransposer::isEmpty()
+{
+    // Output samples are still pending => not empty
+    if (FIFOProcessor::isEmpty() == 0) return 0;
+
+    // Output is drained; the answer now depends on the store buffer alone
+    return storeBuffer.isEmpty();
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RateTransposerInteger - integer arithmetic implementation
+// 
+
+/// fixed-point interpolation routine precision: SCALE (65536 = 2^16) represents
+/// 1.0, i.e. a rate of 1.0 is stored as SCALE and one whole input sample step
+/// equals SCALE units of 'iSlopeCount'.
+#define SCALE    65536
+
+// Constructor: zeroes the interpolation state and selects the neutral rate 1.0.
+RateTransposerInteger::RateTransposerInteger() : RateTransposer()
+{
+    // These are virtual functions, so they are called here rather than from the
+    // base class constructor, where the overrides of this class would not be
+    // the ones executed.
+    resetRegisters();
+    setRate(1.0f);
+}
+
+
+// Destructor: nothing to release here; the base class destructor frees the
+// anti-alias filter.
+RateTransposerInteger::~RateTransposerInteger()
+{
+}
+
+
+// Resets the interpolation state: position back to zero and no remembered
+// samples from a previous batch.
+void RateTransposerInteger::resetRegisters()
+{
+    sPrevSampleR = 0;
+    sPrevSampleL = 0;
+    iSlopeCount = 0;
+}
+
+
+
+// Transposes the sample rate of the given samples using linear interpolation. 
+// 'Mono' version of the routine. Returns the number of samples returned in 
+// the "dest" buffer.
+//
+// 'iSlopeCount' is the fixed-point position between two input samples (SCALE =
+// one whole sample) and, together with 'sPrevSampleL', carries over to the next
+// call so that consecutive batches are interpolated seamlessly.
+uint RateTransposerInteger::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+{
+    unsigned int i, used;
+    LONG_SAMPLETYPE temp, vol1;
+
+    // No samples, no work. Also keeps the unsigned 'numSamples - 1' below from
+    // wrapping around.
+    if (numSamples == 0) return 0;
+
+    used = 0;    
+    i = 0;
+
+    // Process the last sample saved from the previous call first...
+    while (iSlopeCount <= SCALE) 
+    {
+        vol1 = (LONG_SAMPLETYPE)(SCALE - iSlopeCount);
+        temp = vol1 * sPrevSampleL + iSlopeCount * src[0];
+        dest[i] = (SAMPLETYPE)(temp / SCALE);
+        i++;
+        iSlopeCount += uRate;
+    }
+    // now always (iSlopeCount > SCALE)
+    iSlopeCount -= SCALE;
+
+    // A single input sample has no successor inside this batch to interpolate
+    // towards; reading src[1] would be out of bounds.
+    if (numSamples == 1) goto end;
+
+    while (1)
+    {
+        // Skip over input samples until the position lies between src[used]
+        // and src[used + 1]
+        while (iSlopeCount > SCALE) 
+        {
+            iSlopeCount -= SCALE;
+            used ++;
+            if (used >= numSamples - 1) goto end;
+        }
+        vol1 = (LONG_SAMPLETYPE)(SCALE - iSlopeCount);
+        temp = src[used] * vol1 + iSlopeCount * src[used + 1];
+        dest[i] = (SAMPLETYPE)(temp / SCALE);
+
+        i++;
+        iSlopeCount += uRate;
+    }
+end:
+    // Store the last sample for the next round
+    sPrevSampleL = src[numSamples - 1];
+
+    return i;
+}
+
+
+// Transposes the sample rate of the given samples using linear interpolation. 
+// 'Stereo' version of the routine. Returns the number of samples returned in 
+// the "dest" buffer.
+//
+// 'src' and 'dest' hold left/right sample pairs; 'numSamples' and the return
+// value count pairs. 'iSlopeCount' and the 'sPrevSample' pair carry over to the
+// next call so that consecutive batches are interpolated seamlessly.
+uint RateTransposerInteger::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+{
+    unsigned int srcPos, i, used;
+    LONG_SAMPLETYPE temp, vol1;
+
+    if (numSamples == 0) return 0;  // no samples, no work
+
+    used = 0;    
+    i = 0;
+
+    // Process the last sample saved from the previous call first...
+    while (iSlopeCount <= SCALE) 
+    {
+        vol1 = (LONG_SAMPLETYPE)(SCALE - iSlopeCount);
+        temp = vol1 * sPrevSampleL + iSlopeCount * src[0];
+        dest[2 * i] = (SAMPLETYPE)(temp / SCALE);
+        temp = vol1 * sPrevSampleR + iSlopeCount * src[1];
+        dest[2 * i + 1] = (SAMPLETYPE)(temp / SCALE);
+        i++;
+        iSlopeCount += uRate;
+    }
+    // now always (iSlopeCount > SCALE)
+    iSlopeCount -= SCALE;
+
+    // A single input pair has no successor inside this batch to interpolate
+    // towards; reading src[2] / src[3] would be out of bounds.
+    if (numSamples == 1) goto end;
+
+    while (1)
+    {
+        // Skip over input pairs until the position lies between pair 'used'
+        // and pair 'used + 1'
+        while (iSlopeCount > SCALE) 
+        {
+            iSlopeCount -= SCALE;
+            used ++;
+            if (used >= numSamples - 1) goto end;
+        }
+        srcPos = 2 * used;
+        vol1 = (LONG_SAMPLETYPE)(SCALE - iSlopeCount);
+        temp = src[srcPos] * vol1 + iSlopeCount * src[srcPos + 2];
+        dest[2 * i] = (SAMPLETYPE)(temp / SCALE);
+        temp = src[srcPos + 1] * vol1 + iSlopeCount * src[srcPos + 3];
+        dest[2 * i + 1] = (SAMPLETYPE)(temp / SCALE);
+
+        i++;
+        iSlopeCount += uRate;
+    }
+end:
+    // Store the last sample for the next round
+    sPrevSampleL = src[2 * numSamples - 2];
+    sPrevSampleR = src[2 * numSamples - 1];
+
+    return i;
+}
+
+
+// Sets new target rate. Normal rate = 1.0, smaller values represent slower 
+// rate, larger faster rates. The rate is additionally stored in 'uRate' as a
+// fixed-point value (rate * SCALE, rounded by adding 0.5 before truncation).
+void RateTransposerInteger::setRate(float newRate)
+{
+    uRate = (int)(newRate * SCALE + 0.5f);
+    // The base class stores the float rate and re-tunes the anti-alias filter
+    RateTransposer::setRate(newRate);
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// RateTransposerFloat - floating point arithmetic implementation
+// 
+//////////////////////////////////////////////////////////////////////////////
+
+// Constructor: zeroes the interpolation state and selects the neutral rate 1.0.
+RateTransposerFloat::RateTransposerFloat() : RateTransposer()
+{
+    // These are virtual functions, so they are called here rather than from the
+    // base class constructor, where the overrides of this class would not be
+    // the ones executed.
+    resetRegisters();
+    setRate(1.0f);
+}
+
+
+// Destructor: nothing to release here; the base class destructor frees the
+// anti-alias filter.
+RateTransposerFloat::~RateTransposerFloat()
+{
+}
+
+
+// Resets the interpolation state: position back to zero and no remembered
+// samples from a previous batch.
+void RateTransposerFloat::resetRegisters()
+{
+    sPrevSampleR = 0;
+    sPrevSampleL = 0;
+    fSlopeCount = 0;
+}
+
+
+
+// Transposes the sample rate of the given samples using linear interpolation. 
+// 'Mono' version of the routine. Returns the number of samples returned in 
+// the "dest" buffer.
+//
+// 'fSlopeCount' is the position between two input samples (1.0 = one whole
+// sample) and, together with 'sPrevSampleL', carries over to the next call so
+// that consecutive batches are interpolated seamlessly.
+uint RateTransposerFloat::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+{
+    unsigned int i, used;
+
+    // No samples, no work. Without this guard src[0] would be read from an empty
+    // batch and the unsigned 'numSamples - 1' below would wrap around, so the
+    // main loop would never terminate.
+    if (numSamples == 0) return 0;
+
+    used = 0;    
+    i = 0;
+
+    // Process the last sample saved from the previous call first...
+    while (fSlopeCount <= 1.0f) 
+    {
+        dest[i] = (SAMPLETYPE)((1.0f - fSlopeCount) * sPrevSampleL + fSlopeCount * src[0]);
+        i++;
+        fSlopeCount += fRate;
+    }
+    // now always (fSlopeCount > 1.0f)
+    fSlopeCount -= 1.0f;
+
+    // A single input sample has no successor inside this batch to interpolate towards
+    if (numSamples == 1) goto end;
+
+    while (1)
+    {
+        // Skip over input samples until the position lies between src[used]
+        // and src[used + 1]
+        while (fSlopeCount > 1.0f) 
+        {
+            fSlopeCount -= 1.0f;
+            used ++;
+            if (used >= numSamples - 1) goto end;
+        }
+        dest[i] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[used] + fSlopeCount * src[used + 1]);
+        i++;
+        fSlopeCount += fRate;
+    }
+end:
+    // Store the last sample for the next round
+    sPrevSampleL = src[numSamples - 1];
+
+    return i;
+}
+
+
+// Transposes the sample rate of the given samples using linear interpolation. 
+// 'Stereo' version of the routine. Returns the number of samples returned in 
+// the "dest" buffer.
+//
+// 'src' and 'dest' hold left/right sample pairs; 'numSamples' and the return
+// value count pairs. 'fSlopeCount' (1.0 = one whole input sample) and the
+// 'sPrevSample' pair carry over to the next call.
+uint RateTransposerFloat::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+{
+    unsigned int srcPos, i, used;
+
+    if (numSamples == 0) return 0;  // no samples, no work
+
+    used = 0;    
+    i = 0;
+
+    // Process the last sample saved from the previous call first...
+    while (fSlopeCount <= 1.0f) 
+    {
+        dest[2 * i] = (SAMPLETYPE)((1.0f - fSlopeCount) * sPrevSampleL + fSlopeCount * src[0]);
+        dest[2 * i + 1] = (SAMPLETYPE)((1.0f - fSlopeCount) * sPrevSampleR + fSlopeCount * src[1]);
+        i++;
+        fSlopeCount += fRate;
+    }
+    // now always (fSlopeCount > 1.0f)
+    fSlopeCount -= 1.0f;
+
+    // A single input pair has no successor inside this batch to interpolate towards
+    if (numSamples == 1) goto end;
+
+    while (1)
+    {
+        // Skip over input pairs until the position lies between pair 'used'
+        // and pair 'used + 1'
+        while (fSlopeCount > 1.0f) 
+        {
+            fSlopeCount -= 1.0f;
+            used ++;
+            if (used >= numSamples - 1) goto end;
+        }
+        srcPos = 2 * used;
+
+        dest[2 * i] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[srcPos] 
+            + fSlopeCount * src[srcPos + 2]);
+        dest[2 * i + 1] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[srcPos + 1] 
+            + fSlopeCount * src[srcPos + 3]);
+
+        i++;
+        fSlopeCount += fRate;
+    }
+end:
+    // Store the last sample for the next round
+    sPrevSampleL = src[2 * numSamples - 2];
+    sPrevSampleR = src[2 * numSamples - 1];
+
+    return i;
+}
--- a/desmume/src/metaspu/SoundTouch/RateTransposer.h
+++ b/desmume/src/metaspu/SoundTouch/RateTransposer.h
@ -0,0 +1,162 @@
+////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Sample rate transposer. Changes sample rate by using linear interpolation 
+/// together with anti-alias filtering (first order interpolation with anti-
+/// alias filtering should be quite adequate for this application).
+///
+/// Use either of the derived classes of 'RateTransposerInteger' or 
+/// 'RateTransposerFloat' for corresponding integer/floating point transposing
+/// algorithm implementation.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.10 $
+//
+// $Id: RateTransposer.h,v 1.10 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef RateTransposer_H
+#define RateTransposer_H
+
+#include "AAFilter.h"
+#include "FIFOSamplePipe.h"
+#include "FIFOSampleBuffer.h"
+
+#include "STTypes.h"
+
+namespace soundtouch
+{
+
+/// A common linear samplerate transposer class.
+///
+/// Note: Use function "RateTransposer::newInstance()" to create a new class 
+/// instance instead of the "new" operator; that function automatically 
+/// chooses a correct implementation depending on if integer or floating 
+/// arithmetics are to be used.
+class RateTransposer : public FIFOProcessor
+{
+protected:
+    /// Anti-alias filter object (allocated in the constructor, deleted in the destructor)
+    AAFilter *pAAFilter;
+
+    /// Current transposing rate; 1.0 = unchanged. Set through setRate().
+    float fRate;
+
+    /// Number of channels: 1 = mono, 2 = stereo
+    uint uChannels;
+
+    /// Buffer for collecting samples to feed the anti-alias filter between
+    /// two batches
+    FIFOSampleBuffer storeBuffer;
+
+    /// Buffer for keeping samples between transposing & anti-alias filter
+    /// (scratch space for downsample(), which expects it to be empty on entry)
+    FIFOSampleBuffer tempBuffer;
+
+    /// Output sample buffer
+    FIFOSampleBuffer outputBuffer;
+
+    /// Nonzero when the anti-alias filter is applied by processSamples()
+    BOOL bUseAAFilter;
+
+    /// NOTE(review): declared only; no definition appears in RateTransposer.cpp.
+    void init();
+
+    /// Resets the interpolation state kept by the concrete implementation.
+    virtual void resetRegisters() = 0;
+
+    /// Linear-interpolation workers supplied by the concrete implementation.
+    /// Each returns the number of samples written to 'dest'.
+    virtual uint transposeStereo(SAMPLETYPE *dest, 
+                         const SAMPLETYPE *src, 
+                         uint numSamples) = 0;
+    virtual uint transposeMono(SAMPLETYPE *dest, 
+                       const SAMPLETYPE *src, 
+                       uint numSamples) = 0;
+    /// Dispatches to transposeStereo() for two channels, transposeMono() otherwise.
+    uint transpose(SAMPLETYPE *dest, 
+                   const SAMPLETYPE *src, 
+                   uint numSamples);
+
+    /// Hands the contents of 'storeBuffer' over to 'outputBuffer'.
+    void flushStoreBuffer();
+
+    /// Anti-alias filter first, then transpose (used for rates of 1.0 and above).
+    void downsample(const SAMPLETYPE *src, 
+                    uint numSamples);
+    /// Transpose first, then anti-alias filter (used for rates below 1.0).
+    void upsample(const SAMPLETYPE *src, 
+                 uint numSamples);
+
+    /// Transposes sample rate by applying anti-alias filter to prevent folding
+    /// (or by plain transposing when the filter is disabled). The result is
+    /// appended to 'outputBuffer'; nothing is returned.
+    void processSamples(const SAMPLETYPE *src, 
+                        uint numSamples);
+
+
+public:
+    RateTransposer();
+    virtual ~RateTransposer();
+
+    /// Operator 'new' is overloaded only to catch direct use of "new" on this
+    /// class; create instances with newInstance() instead.
+    void *operator new(size_t s);
+
+    /// Use this function instead of "new" operator to create a new instance of this class. 
+    /// This function automatically chooses a correct implementation, depending on if 
+    /// integer or floating point arithmetics are to be used.
+    static RateTransposer *newInstance();
+
+    /// Returns the output buffer object
+    FIFOSamplePipe *getOutput() { return &outputBuffer; };
+
+    /// Returns the store buffer object
+    FIFOSamplePipe *getStore() { return &storeBuffer; };
+
+    /// Return anti-alias filter object
+    AAFilter *getAAFilter() const;
+
+    /// Enables/disables the anti-alias filter. Zero to disable, nonzero to enable
+    void enableAAFilter(BOOL newMode);
+
+    /// Returns nonzero if anti-alias filter is enabled.
+    BOOL isAAFilterEnabled() const;
+
+    /// Sets new target rate. Normal rate = 1.0, smaller values represent slower 
+    /// rate, larger faster rates.
+    virtual void setRate(float newRate);
+
+    /// Sets the number of channels, 1 = mono, 2 = stereo
+    void setChannels(uint channels);
+
+    /// Adds 'numSamples' pcs of samples from the 'samples' memory position into
+    /// the input of the object.
+    void putSamples(const SAMPLETYPE *samples, uint numSamples);
+
+    /// Clears all the samples in the object
+    void clear();
+
+    /// Returns nonzero if there aren't any samples available for outputting.
+    uint isEmpty();
+};
+
+}
+
+#endif
--- a/desmume/src/metaspu/SoundTouch/STTypes.h
+++ b/desmume/src/metaspu/SoundTouch/STTypes.h
@ -0,0 +1,202 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Common type definitions for SoundTouch audio processing library.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.16 $
+//
+// $Id: STTypes.h,v 1.16 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef STTypes_H
+#define STTypes_H
+
+//#define INTEGER_SAMPLES 1
+
+/// Shorthand aliases for the unsigned integer types used throughout the library.
+typedef unsigned int    uint;
+typedef unsigned long   ulong;
+
+/// Unsigned integer type that, judging by its name, is meant to hold a pointer
+/// value as a number, so it has to be at least as wide as a pointer.
+/// 'unsigned long' is wide enough on 32-bit targets and on LP64 systems, but it
+/// is only 32 bits wide on 64-bit Windows (LLP64). Microsoft's compiler signals
+/// that target with _WIN64 rather than __x86_64__, so either macro selects the
+/// 64-bit type.
+#if defined(__x86_64__) || defined(_WIN64)
+typedef unsigned long long   ulongptr;
+#else
+typedef unsigned long   ulongptr;
+#endif
+
+
+#ifdef __GNUC__
+    // In GCC builds, the settings normally generated into soundtouch_config.h by the configure scripts are given inline below
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if you have the `m' library (-lm). */
+#define HAVE_LIBM 1
+
+/* Define to 1 if your system has a GNU libc compatible `malloc' function, and
+   to 0 otherwise. */
+#define HAVE_MALLOC 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Use Integer as Sample type */
+//#define INTEGER_SAMPLES 1
+
+/* Define as the return type of signal handlers (`int' or `void'). */
+#define RETSIGTYPE void
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+#endif
+
+#ifndef _WINDEF_
+    // if these aren't defined already by Windows headers, define now
+
+    typedef int BOOL;
+
+#ifndef FALSE
+    #define FALSE   0
+#endif
+
+#ifndef TRUE
+    #define TRUE    1
+#endif
+
+#endif  // _WINDEF_
+
+
+namespace soundtouch
+{
+/// Activate these undef's to overrule a possible sample type 
+/// setting inherited from some other header file:
+//#undef INTEGER_SAMPLES
+//#undef FLOAT_SAMPLES
+
+#if !(INTEGER_SAMPLES || FLOAT_SAMPLES)
+   
+    /// Choose either 32bit floating point or 16bit integer sampletype
+    /// by choosing one of the following defines, unless this selection 
+    /// has already been done in some other file.
+    ///
+    /// Notes:
+    /// - In Windows environment, choose the sample format with the
+    ///   following defines.
+    /// - In GNU environment, the floating point samples are used by 
+    ///   default, but integer samples can be chosen by giving the 
+    ///   following switch to the configure script:
+    ///       ./configure --enable-integer-samples
+    ///   However, if you still prefer to select the sample format here 
+    ///   also in GNU environment, then please #undef the INTEGER_SAMPLES
+    ///   and FLOAT_SAMPLES defines first as in comments above.
+    //#define INTEGER_SAMPLES     1    //< 16bit integer samples
+    #define FLOAT_SAMPLES       1    //< 32bit float samples
+ 
+ #endif
+
+    /// Define ALLOW_OPTIMIZATIONS to allow CPU-specific assembler optimizations. Notice that 
+    /// having this enabled on non-x86 platforms doesn't matter; the compiler can 
+    /// drop unsupported extensions on different platforms automatically. 
+    /// However, if you're having difficulties getting the optimized routines 
+    /// compiled with your compiler (e.g. some gcc compiler versions may be picky), 
+    /// you may wish to disable the optimizations to make the library compile.
+	#if !defined(_MSC_VER) || !defined(__x86_64__)
+	#define ALLOW_OPTIMIZATIONS 1
+	#define ALLOW_NONEXACT_SIMD_OPTIMIZATION    1
+	#endif
+    // NOTE(review): Microsoft's compiler does not predefine __x86_64__ (it uses _M_X64), so the test above
+    // appears to be always true for it -- confirm whether MSVC x64 builds were meant to be excluded.
+    // ALLOW_NONEXACT_SIMD_OPTIMIZATION (set above): if defined, allows the SIMD-optimized 
+    // routines to take minor shortcuts for improved performance. Undefine to require 
+    // faithfully similar SIMD calculations as in normal C implementation.
+    
+
+
+    #ifdef INTEGER_SAMPLES
+        // 16bit integer sample type
+        typedef short SAMPLETYPE;
+        // data type for sample accumulation: use an integer of at least 32 bits to prevent overflows
+        typedef long  LONG_SAMPLETYPE;
+
+        #ifdef FLOAT_SAMPLES
+            // check that only one sample type is defined
+            #error "conflicting sample types defined"
+        #endif // FLOAT_SAMPLES
+
+        #ifdef ALLOW_OPTIMIZATIONS
+            #if (_WIN32 || __i386__ || __x86_64__)
+                // Allow MMX optimizations
+                #define ALLOW_MMX   1
+            #endif
+        #endif
+
+    #else
+
+        // floating point samples
+        typedef float  SAMPLETYPE;
+        // data type for sample accumulation: Use double to utilize full precision.
+        typedef double LONG_SAMPLETYPE;
+
+        #ifdef ALLOW_OPTIMIZATIONS
+                // Allow SSE optimizations (the 3DNow! switch below is commented out)
+            #if _WIN32
+               // #define ALLOW_3DNOW     1
+            #endif
+
+            #if (_WIN32 || __i386__ || __x86_64__)
+                #define ALLOW_SSE       1
+            #endif
+        #endif
+
+    #endif  // INTEGER_SAMPLES
+};
+
+#endif
--- a/desmume/src/metaspu/SoundTouch/SoundTouch.cbp
+++ b/desmume/src/metaspu/SoundTouch/SoundTouch.cbp
@ -0,0 +1,83 @@
+<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
+<CodeBlocks_project_file>
+	<FileVersion major="1" minor="6" />
+	<Project>
+		<Option title="SoundTouch" />
+		<Option pch_mode="2" />
+		<Option compiler="gcc" />
+		<Build>
+			<Target title="Debug">
+				<Option output="../../deps/debug/libsoundtouch-dbg" prefix_auto="1" extension_auto="1" />
+				<Option working_dir="" />
+				<Option object_output="./.objs/debug" />
+				<Option type="2" />
+				<Option compiler="gcc" />
+				<Option createDefFile="1" />
+				<Compiler>
+					<Add option="-g" />
+					<Add option="-O0" />
+				</Compiler>
+			</Target>
+			<Target title="Release">
+				<Option output="../../deps/libsoundtouch" prefix_auto="1" extension_auto="1" />
+				<Option working_dir="" />
+				<Option object_output="./.objs/release" />
+				<Option type="2" />
+				<Option compiler="gcc" />
+				<Option createDefFile="1" />
+				<Compiler>
+					<Add option="-fexpensive-optimizations" />
+					<Add option="-O3" />
+					<Add option="-W" />
+				</Compiler>
+				<Linker>
+					<Add option="-s" />
+				</Linker>
+			</Target>
+			<Target title="Devel">
+				<Option output="../../deps/devel/libsoundtouch-dev" prefix_auto="1" extension_auto="1" />
+				<Option working_dir="" />
+				<Option object_output="./.objs/devel" />
+				<Option type="2" />
+				<Option compiler="gcc" />
+				<Option createDefFile="1" />
+				<Compiler>
+					<Add option="-O1" />
+					<Add option="-W" />
+					<Add option="-g" />
+				</Compiler>
+			</Target>
+		</Build>
+		<Compiler>
+			<Add option="-march=athlon-xp" />
+			<Add option="-march=prescott" />
+		</Compiler>
+		<Unit filename="AAFilter.cpp" />
+		<Unit filename="AAFilter.h" />
+		<Unit filename="BPMDetect.h" />
+		<Unit filename="FIFOSampleBuffer.cpp" />
+		<Unit filename="FIFOSampleBuffer.h" />
+		<Unit filename="FIFOSamplePipe.h" />
+		<Unit filename="FIRFilter.cpp" />
+		<Unit filename="FIRFilter.h" />
+		<Unit filename="RateTransposer.cpp" />
+		<Unit filename="RateTransposer.h" />
+		<Unit filename="STTypes.h" />
+		<Unit filename="SoundTouch.cpp" />
+		<Unit filename="SoundTouch.h" />
+		<Unit filename="TDStretch.cpp" />
+		<Unit filename="TDStretch.h" />
+		<Unit filename="WavFile.cpp" />
+		<Unit filename="WavFile.h" />
+		<Unit filename="cpu_detect.h" />
+		<Unit filename="cpu_detect_x86_gcc.cpp" />
+		<Unit filename="mmx_optimized.cpp" />
+		<Unit filename="sse_optimized.cpp" />
+		<Extensions>
+			<envvars />
+			<code_completion />
+			<lib_finder disable_auto="1" />
+			<debugger />
+		</Extensions>
+	</Project>
+</CodeBlocks_project_file>
--- a/desmume/src/metaspu/SoundTouch/SoundTouch.cpp
+++ b/desmume/src/metaspu/SoundTouch/SoundTouch.cpp
@ -0,0 +1,474 @@
+//////////////////////////////////////////////////////////////////////////////
+///
+/// SoundTouch - main class for tempo/pitch/rate adjusting routines. 
+///
+/// Notes:
+/// - Initialize the SoundTouch object instance by setting up the sound stream 
+///   parameters with functions 'setSampleRate' and 'setChannels', then set 
+///   desired tempo/pitch/rate settings with the corresponding functions.
+///
+/// - The SoundTouch class behaves like a first-in-first-out pipeline: The 
+///   samples that are to be processed are fed into one end of the pipe by calling
+///   function 'putSamples', while the ready processed samples can be read 
+///   from the other end of the pipeline with function 'receiveSamples'.
+/// 
+/// - The SoundTouch processing classes require certain sized 'batches' of 
+///   samples in order to process the sound. For this reason the classes buffer 
+///   incoming samples until there are enough of samples available for 
+///   processing, then they carry out the processing step and consequently
+///   make the processed samples available for outputting.
+/// 
+/// - For the above reason, the processing routines introduce a certain 
+///   'latency' between the input and output, so that the samples input to
+///   SoundTouch may not be immediately available in the output, and the 
+///   number of samples ready for output may not be in direct 
+///   relationship with the number of samples input so far.
+///
+/// - The tempo/pitch/rate control parameters can be altered during processing.
+///   Please notice though that they aren't currently protected by semaphores,
+///   so in multi-thread application external semaphore protection may be
+///   required.
+///
+/// - This class utilizes classes 'TDStretch' for tempo change (without modifying
+///   pitch) and 'RateTransposer' for changing the playback rate (that is, both 
+///   tempo and pitch in the same ratio) of the sound. The third available control 
+///   'pitch' (change pitch but maintain tempo) is produced by combining
+///   the two other controls.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.13 $
+//
+// $Id: SoundTouch.cpp,v 1.13 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <assert.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <stdexcept>
+#include <stdio.h>
+
+#include "SoundTouch.h"
+#include "TDStretch.h"
+#include "RateTransposer.h"
+#include "cpu_detect.h"
+
+using namespace soundtouch;
+
+/// Prints the library version string (SOUNDTOUCH_VERSION) to stdout. Declared with C linkage.
+extern "C" void soundtouch_ac_test()
+{
+    printf("SoundTouch Version: %s\n",SOUNDTOUCH_VERSION);
+} 
+
+
+SoundTouch::SoundTouch()
+{
+    // Stream format is unknown until setChannels() / setSampleRate() are called.
+    channels  = 0;
+    bSrateSet = FALSE;
+
+    // Neutral control values: no rate, tempo or pitch change requested.
+    virtualTempo = 1.0;
+    virtualRate  = 1.0;
+    virtualPitch = 1.0;
+
+    // Start the effective values at zero so that calcEffectiveRateAndTempo()
+    // below sees them change and hands the initial settings to both stages.
+    tempo = 0;
+    rate  = 0;
+
+    // Create the two processing stages; the time stretcher is the initial output.
+    pRateTransposer = RateTransposer::newInstance();
+    pTDStretch = TDStretch::newInstance();
+    setOutPipe(pTDStretch);
+
+    calcEffectiveRateAndTempo();
+}
+
+
+
+SoundTouch::~SoundTouch()   // frees the two processing stages created in the constructor
+{
+    delete pRateTransposer;
+    delete pTDStretch;
+}
+
+
+
+/// Returns the library version as a C string (the SOUNDTOUCH_VERSION literal).
+const char *SoundTouch::getVersionString()
+{
+    return SOUNDTOUCH_VERSION;
+}
+
+
+/// Returns the numeric library version id, i.e. the value of SOUNDTOUCH_VERSION_ID.
+uint SoundTouch::getVersionId()
+{
+    return SOUNDTOUCH_VERSION_ID;
+}
+
+
+// Sets the number of channels: 1 = mono, 2 = stereo. Any other count is
+// rejected with std::runtime_error before anything is changed.
+void SoundTouch::setChannels(uint numChannels)
+{
+    const bool supported = (numChannels == 1) || (numChannels == 2);
+    if (!supported) 
+    {
+        throw std::runtime_error("Illegal number of channels");
+    }
+
+    // remember the count, then hand it to both processing stages
+    channels = numChannels;
+    pRateTransposer->setChannels(numChannels);
+    pTDStretch->setChannels(numChannels);
+}
+
+
+
+// Sets the rate control value: 1.0 is the normal rate, smaller values are slower and
+// larger values faster. The value is stored as the virtual rate and applied at once.
+void SoundTouch::setRate(float newRate)
+{
+    virtualRate = newRate;
+    calcEffectiveRateAndTempo();    // derive the effective rate/tempo from the new value
+}
+
+
+
+// Sets the rate control value as a percentage difference from the original rate
+// (-50 .. +100 %): the stored virtual rate is 1.0 + newRate / 100, so 0 means unchanged.
+void SoundTouch::setRateChange(float newRate)
+{
+    virtualRate = 1.0f + 0.01f * newRate;
+    calcEffectiveRateAndTempo();    // derive the effective rate/tempo from the new value
+}
+
+
+
+// Sets the tempo control value: 1.0 is the normal tempo, smaller values are slower and
+// larger values faster. The value is stored as the virtual tempo and applied at once.
+void SoundTouch::setTempo(float newTempo)
+{
+    virtualTempo = newTempo;
+    calcEffectiveRateAndTempo();    // derive the effective rate/tempo from the new value
+}
+
+
+
+// Sets the tempo control value as a percentage difference from the original tempo
+// (-50 .. +100 %): the stored virtual tempo is 1.0 + newTempo / 100, so 0 means unchanged.
+void SoundTouch::setTempoChange(float newTempo)
+{
+    virtualTempo = 1.0f + 0.01f * newTempo;
+    calcEffectiveRateAndTempo();    // derive the effective rate/tempo from the new value
+}
+
+
+
+// Sets the pitch control value: 1.0 is the original pitch, smaller values are lower and
+// larger values higher. The value is stored as the virtual pitch and applied at once.
+void SoundTouch::setPitch(float newPitch)
+{
+    virtualPitch = newPitch;
+    calcEffectiveRateAndTempo();    // derive the effective rate/tempo from the new value
+}
+
+
+
+// Sets the pitch change in octaves relative to the original pitch
+// (-1.00 .. +1.00). The pitch factor is 2^newPitch, computed as exp(ln 2 * newPitch).
+void SoundTouch::setPitchOctaves(float newPitch)
+{
+    const float naturalLogOfTwo = 0.69314718056f;
+
+    virtualPitch = (float)exp(naturalLogOfTwo * newPitch);
+    calcEffectiveRateAndTempo();
+}
+
+
+
+// Sets pitch change in semi-tones compared to the original pitch (-12 .. +12).
+// Twelve semi-tones make one octave, so the value is divided by 12 and forwarded to setPitchOctaves().
+void SoundTouch::setPitchSemiTones(int newPitch)
+{
+    setPitchOctaves((float)newPitch / 12.0f);
+}
+
+
+
+void SoundTouch::setPitchSemiTones(float newPitch)   // overload taking a fractional number of semi-tones
+{
+    setPitchOctaves(newPitch / 12.0f);   // 12 semi-tones per octave
+}
+
+
+// Derives the 'effective' rate and tempo from the nominal (virtual) control
+// values, hands changed values to the processing stages and, when the
+// effective rate moves across 1.0, re-routes the pipeline so that the proper
+// stage produces the output.
+void SoundTouch::calcEffectiveRateAndTempo()
+{
+    const float previousTempo = tempo;
+    const float previousRate = rate;
+
+    tempo = virtualTempo / virtualPitch;
+    rate = virtualPitch * virtualRate;
+
+    // only bother the stages with values that actually changed
+    if (previousRate != rate) pRateTransposer->setRate(rate);
+    if (previousTempo != tempo) pTDStretch->setTempo(tempo);
+
+    // rate above 1.0: the rate transposer is the last stage; otherwise the
+    // tempo changer is.
+    const bool transposerIsLast = (rate > 1.0f);
+
+    if (transposerIsLast && output != pRateTransposer) 
+    {
+        assert(output == pTDStretch);
+
+        // samples waiting in the current output buffer go to the output of pRateTransposer
+        FIFOSamplePipe *transposerOut = pRateTransposer->getOutput();
+        transposerOut->moveSamples(*output);
+        // samples in the tempo changer's input go to the pitch transposer's input
+        pRateTransposer->moveSamples(*pTDStretch->getInput());
+
+        output = pRateTransposer;
+    } 
+    else if (!transposerIsLast && output != pTDStretch) 
+    {
+        assert(output == pRateTransposer);
+
+        // samples waiting in the current output buffer go to the output of pTDStretch
+        FIFOSamplePipe *stretchOut = pTDStretch->getOutput();
+        stretchOut->moveSamples(*output);
+        // samples in the pitch transposer's store buffer go to the tempo changer's input
+        pTDStretch->moveSamples(*pRateTransposer->getStore());
+
+        output = pTDStretch;
+    }
+}
+
+
+// Sets the sample rate and records that it has been set (putSamples() checks that flag).
+void SoundTouch::setSampleRate(uint srate)
+{
+    bSrateSet = TRUE;
+    // set sample rate, leave other tempo changer parameters as they are.
+    pTDStretch->setParameters(srate);
+}
+
+
+// Adds 'numSamples' pcs of samples from the 'samples' memory position into
+// the input of the object. Throws std::runtime_error when the sample rate or
+// the number of channels has not been set yet.
+void SoundTouch::putSamples(const SAMPLETYPE *samples, uint numSamples)
+{
+    // Refuse to process anything before the stream format is known.
+    if (bSrateSet == FALSE) 
+    {
+        throw std::runtime_error("SoundTouch : Sample rate not defined");
+    } 
+    if (channels == 0) 
+    {
+        throw std::runtime_error("SoundTouch : Number of channels not defined");
+    }
+
+    // Transpose the rate of the new samples if necessary
+    /* Bypass the nominal setting - can introduce a click in sound when tempo/pitch control crosses the nominal value...
+    if (rate == 1.0f) 
+    {
+        // The rate value is same as the original, simply evaluate the tempo changer. 
+        assert(output == pTDStretch);
+        if (pRateTransposer->isEmpty() == 0) 
+        {
+            // yet flush the last samples in the pitch transposer buffer
+            // (may happen if 'rate' changes from a non-zero value to zero)
+            pTDStretch->moveSamples(*pRateTransposer);
+        }
+        pTDStretch->putSamples(samples, numSamples);
+    } 
+    */
+
+    if (rate <= 1.0f) 
+    {
+        // Rate transposer first: transpose the rate down and pass its output
+        // on to the tempo changer.
+        assert(output == pTDStretch);
+        pRateTransposer->putSamples(samples, numSamples);
+        pTDStretch->moveSamples(*pRateTransposer);
+        return;
+    } 
+
+    // Tempo changer first, then transpose the rate up.
+    assert(rate > 1.0f);
+    assert(output == pRateTransposer);
+    pTDStretch->putSamples(samples, numSamples);
+    pRateTransposer->moveSamples(*pTDStretch);
+}
+
+
+// Flushes the last samples from the processing pipeline to the output and
+// clears the internal processing buffers.
+//
+// Note: This function is meant for extracting the last samples of a sound
+// stream. It may append blank samples to the end of the stream, so calling it
+// in the middle of a sound stream is not recommended.
+void SoundTouch::flush()
+{
+    // One block of silence; each round below feeds 64 samples from it.
+    SAMPLETYPE silence[128];
+    memset(silence, 0, sizeof(silence));
+
+    const uint readyBefore = numSamples();
+
+    // "Push" the last active samples out of the pipeline by feeding silence
+    // until new processed samples show up in the output, giving up after
+    // 128 rounds of 64 samples (8 ksamples) in any case.
+    for (int round = 0; round < 128; ++round) 
+    {
+        putSamples(silence, 64);
+        if (numSamples() != readyBefore) break;  // new samples have appeared in the output!
+    }
+
+    // Clear the working buffers, but leave the tempo changer's output
+    // untouched as that's where the flushed samples are.
+    pRateTransposer->clear();
+    pTDStretch->clearInput();
+}
+
+
+// Changes a setting controlling the processing system behaviour. See the
+// 'SETTING_...' defines for available setting ID's. Returns TRUE when the
+// setting ID was recognized, FALSE otherwise.
+BOOL SoundTouch::setSetting(uint settingId, uint value)
+{
+    // Current time-stretch parameters: each *_MS setting replaces one of them
+    // and passes the others back unchanged.
+    uint curSampleRate, curSequenceMs, curSeekWindowMs, curOverlapMs;
+    pTDStretch->getParameters(&curSampleRate, &curSequenceMs, &curSeekWindowMs, &curOverlapMs);
+
+    // on/off interpretation of 'value' for the two switch-type settings
+    const BOOL enable = (value != 0) ? TRUE : FALSE;
+    BOOL recognized = TRUE;
+
+    switch (settingId) 
+    {
+        case SETTING_USE_AA_FILTER :
+            // enables / disables anti-alias filter
+            pRateTransposer->enableAAFilter(enable);
+            break;
+
+        case SETTING_AA_FILTER_LENGTH :
+            // sets anti-alias filter length
+            pRateTransposer->getAAFilter()->setLength(value);
+            break;
+
+        case SETTING_USE_QUICKSEEK :
+            // enables / disables tempo routine quick seeking algorithm
+            pTDStretch->enableQuickSeek(enable);
+            break;
+
+        case SETTING_SEQUENCE_MS:
+            // change time-stretch sequence duration parameter
+            pTDStretch->setParameters(curSampleRate, value, curSeekWindowMs, curOverlapMs);
+            break;
+
+        case SETTING_SEEKWINDOW_MS:
+            // change time-stretch seek window length parameter
+            pTDStretch->setParameters(curSampleRate, curSequenceMs, value, curOverlapMs);
+            break;
+
+        case SETTING_OVERLAP_MS:
+            // change time-stretch overlap length parameter
+            pTDStretch->setParameters(curSampleRate, curSequenceMs, curSeekWindowMs, value);
+            break;
+
+        default :
+            recognized = FALSE;
+            break;
+    }
+
+    return recognized;
+}
+
+
+// Reads a setting controlling the processing system behaviour. See the
+// 'SETTING_...' defines for available setting ID's.
+//
+// Returns the setting value, or 0 for an unknown setting ID.
+uint SoundTouch::getSetting(uint settingId) const
+{
+    // Filter and quick-seek settings are answered directly by the stage that owns them.
+    if (settingId == SETTING_USE_AA_FILTER)
+    {
+        return pRateTransposer->isAAFilterEnabled();
+    }
+    if (settingId == SETTING_AA_FILTER_LENGTH)
+    {
+        return pRateTransposer->getAAFilter()->getLength();
+    }
+    if (settingId == SETTING_USE_QUICKSEEK)
+    {
+        return pTDStretch->isQuickSeekEnabled();
+    }
+
+    // The *_MS settings are read through TDStretch::getParameters(); only the
+    // wanted slot is given a non-NULL destination.
+    uint milliseconds;
+
+    if (settingId == SETTING_SEQUENCE_MS)
+    {
+        pTDStretch->getParameters(NULL, &milliseconds, NULL, NULL);
+        return milliseconds;
+    }
+    if (settingId == SETTING_SEEKWINDOW_MS)
+    {
+        pTDStretch->getParameters(NULL, NULL, &milliseconds, NULL);
+        return milliseconds;
+    }
+    if (settingId == SETTING_OVERLAP_MS)
+    {
+        pTDStretch->getParameters(NULL, NULL, NULL, &milliseconds);
+        return milliseconds;
+    }
+
+    return 0;
+}
+
+
+// Clears all the samples in the object's output and internal processing
+// buffers by clearing both processing stages.
+void SoundTouch::clear()
+{
+    pRateTransposer->clear();
+    pTDStretch->clear();
+}
+
+
+
+/// Returns the number of samples still waiting in the tempo changer's input
+/// pipe, or 0 when the tempo changer or its input pipe is not available.
+uint SoundTouch::numUnprocessedSamples() const
+{
+    if (!pTDStretch)
+    {
+        return 0;
+    }
+
+    FIFOSamplePipe *pendingInput = pTDStretch->getInput();
+    if (!pendingInput)
+    {
+        return 0;
+    }
+
+    return pendingInput->numSamples();
+}
--- a/desmume/src/metaspu/SoundTouch/SoundTouch.h
+++ b/desmume/src/metaspu/SoundTouch/SoundTouch.h
@ -0,0 +1,252 @@
+//////////////////////////////////////////////////////////////////////////////
+///
+/// SoundTouch - main class for tempo/pitch/rate adjusting routines. 
+///
+/// Notes:
+/// - Initialize the SoundTouch object instance by setting up the sound stream 
+///   parameters with functions 'setSampleRate' and 'setChannels', then set 
+///   desired tempo/pitch/rate settings with the corresponding functions.
+///
+/// - The SoundTouch class behaves like a first-in-first-out pipeline: The 
+///   samples that are to be processed are fed into one end of the pipe by calling
+///   function 'putSamples', while the ready processed samples can be read 
+///   from the other end of the pipeline with function 'receiveSamples'.
+/// 
+/// - The SoundTouch processing classes require certain sized 'batches' of 
+///   samples in order to process the sound. For this reason the classes buffer 
+///   incoming samples until there are enough of samples available for 
+///   processing, then they carry out the processing step and consequently
+///   make the processed samples available for outputting.
+/// 
+/// - For the above reason, the processing routines introduce a certain 
+///   'latency' between the input and output, so that the samples input to
+///   SoundTouch may not be immediately available in the output, and the 
+///   number of samples ready for output may not be in direct 
+///   relationship with the number of samples input so far.
+///
+/// - The tempo/pitch/rate control parameters can be altered during processing.
+///   Please notice though that they aren't currently protected by semaphores,
+///   so in multi-thread application external semaphore protection may be
+///   required.
+///
+/// - This class utilizes classes 'TDStretch' for tempo change (without modifying
+///   pitch) and 'RateTransposer' for changing the playback rate (that is, both 
+///   tempo and pitch in the same ratio) of the sound. The third available control 
+///   'pitch' (change pitch but maintain tempo) is produced by combining
+///   the two other controls.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.14 $
+//
+// $Id: SoundTouch.h,v 1.14 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef SoundTouch_H
+#define SoundTouch_H
+
+#include "FIFOSamplePipe.h"
+#include "STTypes.h"
+
+namespace soundtouch
+{
+
+/// SoundTouch library version string
+#define SOUNDTOUCH_VERSION          "1.3.1"
+
+/// SoundTouch library version id: the decimal number 10301 for version 1.3.1.
+/// It is written without a leading zero on purpose: "010301" would be an octal
+/// literal in C/C++ and evaluate to 4289.
+#define SOUNDTOUCH_VERSION_ID       (10301)
+
+//
+// Available setting IDs for the 'setSetting' & 'getSetting' functions:
+
+/// Enable/disable anti-alias filter in pitch transposer (0 = disable)
+#define SETTING_USE_AA_FILTER       0
+
+/// Pitch transposer anti-alias filter length (8 .. 128 taps, default = 32)
+#define SETTING_AA_FILTER_LENGTH    1
+
+/// Enable/disable quick seeking algorithm in tempo changer routine
+/// (enabling quick seeking lowers CPU utilization but causes a minor sound
+///  quality compromising)
+#define SETTING_USE_QUICKSEEK       2
+
+/// Time-stretch algorithm single processing sequence length in milliseconds. This determines 
+/// to how long sequences the original sound is chopped in the time-stretch algorithm. 
+/// See "STTypes.h" or README for more information.
+#define SETTING_SEQUENCE_MS         3
+
+/// Time-stretch algorithm seeking window length in milliseconds for algorithm that finds the 
+/// best possible overlapping location. This determines from how wide window the algorithm 
+/// may look for an optimal joining location when mixing the sound sequences back together. 
+/// See "STTypes.h" or README for more information.
+#define SETTING_SEEKWINDOW_MS       4
+
+/// Time-stretch algorithm overlap length in milliseconds. When the chopped sound sequences 
+/// are mixed back together, to form a continuous sound stream, this parameter defines over 
+/// how long period the two consecutive sequences are let to overlap each other. 
+/// See "STTypes.h" or README for more information.
+#define SETTING_OVERLAP_MS          5
+
+
+class SoundTouch : public FIFOProcessor
+{
+private:
+    /// Rate transposer class instance (allocated in the constructor, deleted in the destructor)
+    class RateTransposer *pRateTransposer;
+
+    /// Time-stretch class instance (allocated in the constructor, deleted in the destructor)
+    class TDStretch *pTDStretch;
+
+    /// Virtual rate parameter. Effective rate & tempo are calculated from these parameters.
+    float virtualRate;
+
+    /// Virtual tempo parameter. Effective rate & tempo are calculated from these parameters.
+    float virtualTempo;
+
+    /// Virtual pitch parameter. Effective rate & tempo are calculated from these parameters.
+    float virtualPitch;
+
+    /// Flag: Has sample rate been set?
+    BOOL  bSrateSet;
+
+    /// Calculates effective rate & tempo values from 'virtualRate', 'virtualTempo' and 
+    /// 'virtualPitch' parameters.
+    void calcEffectiveRateAndTempo();
+
+protected :
+    /// Number of channels
+    uint  channels;
+
+    /// Effective 'rate' value calculated from 'virtualRate', 'virtualTempo' and 'virtualPitch'
+    float rate;
+
+    /// Effective 'tempo' value calculated from 'virtualRate', 'virtualTempo' and 'virtualPitch'
+    float tempo;
+
+public:
+    SoundTouch();
+    virtual ~SoundTouch();   // NOTE(review): owns raw pointers but declares no copy ctor/assignment here -- copying looks unsafe; confirm nobody copies
+
+    /// Get SoundTouch library version string
+    static const char *getVersionString();
+
+    /// Get SoundTouch library version Id
+    static uint getVersionId();
+
+    /// Sets new rate control value. Normal rate = 1.0, smaller values
+    /// represent slower rate, larger faster rates.
+    void setRate(float newRate);
+
+    /// Sets new tempo control value. Normal tempo = 1.0, smaller values
+    /// represent slower tempo, larger faster tempo.
+    void setTempo(float newTempo);
+
+    /// Sets new rate control value as a difference in percents compared
+    /// to the original rate (-50 .. +100 %)
+    void setRateChange(float newRate);
+
+    /// Sets new tempo control value as a difference in percents compared
+    /// to the original tempo (-50 .. +100 %)
+    void setTempoChange(float newTempo);
+
+    /// Sets new pitch control value. Original pitch = 1.0, smaller values
+    /// represent lower pitches, larger values higher pitch.
+    void setPitch(float newPitch);
+
+    /// Sets pitch change in octaves compared to the original pitch  
+    /// (-1.00 .. +1.00)
+    void setPitchOctaves(float newPitch);
+
+    /// Sets pitch change in semi-tones compared to the original pitch
+    /// (-12 .. +12)
+    void setPitchSemiTones(int newPitch);
+    void setPitchSemiTones(float newPitch);
+
+    /// Sets the number of channels, 1 = mono, 2 = stereo
+    void setChannels(uint numChannels);
+
+    /// Sets sample rate.
+    void setSampleRate(uint srate);
+
+    /// Flushes the last samples from the processing pipeline to the output.
+    /// Clears also the internal processing buffers.
+    ///
+    /// Note: This function is meant for extracting the last samples of a sound
+    /// stream. This function may introduce additional blank samples in the end
+    /// of the sound stream, and thus it's not recommended to call this function
+    /// in the middle of a sound stream.
+    void flush();
+
+    /// Adds 'numSamples' pcs of samples from the 'samples' memory position into
+    /// the input of the object. Notice that sample rate _has_to_ be set before
+    /// calling this function, otherwise throws a runtime_error exception.
+    virtual void putSamples(
+            const SAMPLETYPE *samples,  ///< Pointer to sample buffer.
+            uint numSamples                         ///< Number of samples in buffer. Notice
+                                                    ///< that in case of stereo-sound a single sample
+                                                    ///< contains data for both channels.
+            );
+
+    /// Clears all the samples in the object's output and internal processing
+    /// buffers.
+    virtual void clear();
+
+    /// Changes a setting controlling the processing system behaviour. See the
+    /// 'SETTING_...' defines for available setting ID's.
+    /// 
+    /// \return 'TRUE' if the setting was successfully changed
+    BOOL setSetting(uint settingId,   ///< Setting ID number. see SETTING_... defines.
+                    uint value        ///< New setting value.
+                    );
+
+    /// Reads a setting controlling the processing system behaviour. See the
+    /// 'SETTING_...' defines for available setting ID's.
+    ///
+    /// \return the setting value.
+    uint getSetting(uint settingId    ///< Setting ID number, see SETTING_... defines.
+                    ) const;
+
+    /// Returns number of samples currently unprocessed.
+    virtual uint numUnprocessedSamples() const;
+
+
+    /// Other handy functions that are implemented in the ancestor classes (see
+    /// classes 'FIFOProcessor' and 'FIFOSamplePipe')
+    ///
+    /// - receiveSamples() : Use this function to receive 'ready' processed samples from SoundTouch.
+    /// - numSamples()     : Get number of 'ready' samples that can be received with 
+    ///                      function 'receiveSamples()'
+    /// - isEmpty()        : Returns nonzero if there aren't any 'ready' samples.
+    /// - clear()          : Clears all samples from ready/processing buffers.
+};
+
+}
+#endif
--- a/desmume/src/metaspu/SoundTouch/TDStretch.cpp
+++ b/desmume/src/metaspu/SoundTouch/TDStretch.cpp
@ -0,0 +1,940 @@
+////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo 
+/// while maintaining the original pitch by using a time domain WSOLA-like 
+/// method with several performance-increasing tweaks.
+///
+/// Note : MMX optimized functions reside in a separate, platform-specific 
+/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.24 $
+//
+// $Id: TDStretch.cpp,v 1.24 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <string.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <limits.h>
+#include <math.h>
+#include <assert.h>
+
+#include "STTypes.h"
+#include "cpu_detect.h"
+#include "TDStretch.h"
+
+using namespace soundtouch;
+
+#ifndef min
+// Fallback min/max for toolchains that do not already provide them as macros.
+// The arguments are fully parenthesised so that operands containing
+// low-precedence operators (e.g. '&' or '?:') expand correctly. Each argument
+// may be evaluated twice, so do not pass expressions with side effects.
+#define min(a,b) (((a) > (b)) ? (b) : (a))
+#define max(a,b) (((a) < (b)) ? (b) : (a))
+#endif
+
+
+
+/*****************************************************************************
+ *
+ * Constant definitions
+ *
+ *****************************************************************************/
+
+
+// Table for the hierarchical mixing position seeking algorithm, used by the
+// seekBestOverlapPosition*Quick() routines.
+//
+// Each row lists the offsets tried during one of the four search passes and is
+// terminated by a zero entry (the scan loops stop at the first zero):
+//   row 0 : coarse offsets 124...1488 in steps of 62
+//   row 1 : -100...+100 in steps of 25, relative to the best offset so far
+//   row 2 :  -20...+20  in steps of 5,  relative to the best offset so far
+//   row 3 :   -4...+4   in steps of 1,  relative to the best offset so far
+int scanOffsets[4][24]={
+    { 124,  186,  248,  310,  372,  434,  496,  558,  620,  682,  744, 806, 
+      868,  930,  992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488,   0}, 
+    {-100,  -75,  -50,  -25,   25,   50,   75,  100,    0,    0,    0,   0,
+        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0},
+    { -20,  -15,  -10,   -5,    5,   10,   15,   20,    0,    0,    0,   0,
+        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0},
+    {  -4,   -3,   -2,   -1,    1,    2,    3,    4,    0,    0,    0,   0,
+        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0}};
+
+/*****************************************************************************
+ *
+ * Implementation of the class 'TDStretch'
+ *
+ *****************************************************************************/
+
+
+// Constructs a time-stretch processor with default settings: stereo input,
+// 48000 Hz sample rate, the DEFAULT_SEQUENCE_MS / DEFAULT_SEEKWINDOW_MS /
+// DEFAULT_OVERLAP_MS time constants, quick seek disabled and nominal tempo
+// (1.0). The address of 'outputBuffer' is handed to the FIFOProcessor base.
+TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
+{
+    bQuickseek = FALSE;
+    channels = 2;
+    bMidBufferDirty = FALSE;
+
+    pMidBuffer = NULL;
+    pRefMidBuffer = NULL;
+    pRefMidBufferUnaligned = NULL;
+    overlapLength = 0;
+
+    // setParameters() finishes by calling setTempo(tempo), so 'tempo' must
+    // hold a defined value before that call; without this assignment an
+    // uninitialized float would be read and converted to an integer there.
+    tempo = 1.0f;
+
+    setParameters(48000, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
+
+    setTempo(1.0f);
+}
+
+
+
+
+// Releases the two heap arrays owned by the object: the unaligned allocation
+// held in 'pRefMidBufferUnaligned' and the mid (overlap) buffer.
+TDStretch::~TDStretch()
+{
+    delete[] pRefMidBufferUnaligned;
+    delete[] pMidBuffer;
+}
+
+
+    
+// Calculates the x having the closest 2^x value for the given value, computed
+// as log2(value) + 0.5 converted to int (i.e. truncated toward zero).
+//
+// log() has no finite result for non-positive input, and converting the
+// resulting infinity/NaN to int is undefined behaviour, so non-positive (or
+// NaN) values yield 0 instead.
+static int _getClosest2Power(double value)
+{
+    if (!(value > 0)) return 0;
+
+    return (int)(log(value) / log(2.0) + 0.5);
+}
+
+
+
+// Sets the routine control parameters, i.e. the time constants that define
+// how the sound is stretched to the desired duration, and recomputes every
+// value that is derived from them.
+//
+// 'aSampleRate'   = sample rate of the sound
+// 'aSequenceMS'   = one processing sequence length in milliseconds
+//                   (the constructor passes DEFAULT_SEQUENCE_MS)
+// 'aSeekWindowMS' = seeking window length for scanning the best overlapping
+//                   position, in milliseconds (constructor: DEFAULT_SEEKWINDOW_MS)
+// 'aOverlapMS'    = overlapping length in milliseconds
+//                   (constructor: DEFAULT_OVERLAP_MS)
+void TDStretch::setParameters(uint aSampleRate, uint aSequenceMS,
+                              uint aSeekWindowMS, uint aOverlapMS)
+{
+    // remember the raw settings so getParameters() can report them
+    sampleRate = aSampleRate;
+    sequenceMs = aSequenceMS;
+    seekWindowMs = aSeekWindowMS;
+    overlapMs = aOverlapMS;
+
+    // convert the millisecond settings into sample counts
+    seekWindowLength = (aSampleRate * aSequenceMS) / 1000;
+    seekLength = (aSampleRate * aSeekWindowMS) / 1000;
+    maxOffset = seekLength;
+
+    calculateOverlapLength(aOverlapMS);
+
+    // re-apply the current tempo so that 'sampleReq' gets recalculated
+    setTempo(tempo);
+}
+
+
+
+/// Reads back the routine control parameters, see setParameters().
+/// Every argument is optional: pass NULL for any value that is not needed and
+/// the corresponding parameter is simply not written.
+void TDStretch::getParameters(uint *pSampleRate, uint *pSequenceMs, uint *pSeekWindowMs, uint *pOverlapMs)
+{
+    if (pSampleRate != NULL)   *pSampleRate   = sampleRate;
+    if (pSequenceMs != NULL)   *pSequenceMs   = sequenceMs;
+    if (pSeekWindowMs != NULL) *pSeekWindowMs = seekWindowMs;
+    if (pOverlapMs != NULL)    *pOverlapMs    = overlapMs;
+}
+
+
+// Cross-fades the samples in 'midBuffer' with the samples in 'input' over
+// 'overlapLength' samples and writes the result to 'output'. Mono version.
+//
+// At position p the new input sample is weighted by p and the old mid-buffer
+// sample by (overlapLength - p); the sum is divided by overlapLength.
+// NOTE(review): 'overlapLength' is unsigned, so with integer samples the
+// division is presumably carried out in unsigned arithmetic - confirm that
+// this is intended.
+void TDStretch::overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const
+{
+    for (int pos = 0; pos < (int)overlapLength; pos ++)
+    {
+        const int fadeOut = overlapLength - pos;
+
+        output[pos] = (input[pos] * pos + pMidBuffer[pos] * fadeOut) / overlapLength;
+    }
+}
+
+
+
+// Zeroes the mid (overlap) buffer and marks it clean. Does nothing when the
+// buffer is not flagged dirty.
+void TDStretch::clearMidBuffer()
+{
+    if (!bMidBufferDirty) return;
+
+    // 2 * overlapLength samples are cleared, i.e. room for two channels
+    memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
+    bMidBufferDirty = FALSE;
+}
+
+
+// Discards everything on the input side: the mid (overlap) buffer and the
+// samples queued in 'inputBuffer'. The output buffer is left untouched.
+void TDStretch::clearInput()
+{
+    clearMidBuffer();
+    inputBuffer.clear();
+}
+
+
+// Clears all sample buffers: output, input and the mid (overlap) buffer.
+void TDStretch::clear()
+{
+    outputBuffer.clear();
+
+    // input buffer + mid buffer
+    clearInput();
+}
+
+
+
+// Selects the position seeking algorithm: nonzero 'enable' switches to the
+// quick (hierarchical) seek, zero selects the exhaustive one.
+void TDStretch::enableQuickSeek(BOOL enable)
+{
+    this->bQuickseek = enable;
+}
+
+
+// Reports the value last given to enableQuickSeek(): nonzero when the quick
+// seeking algorithm is in use.
+BOOL TDStretch::isQuickSeekEnabled() const
+{
+    return this->bQuickseek;
+}
+
+
+// Seeks for the optimal overlap-mixing position by dispatching to the routine
+// that matches the current channel count (2 = stereo, anything else = mono)
+// and the quick-seek setting.
+uint TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
+{
+    const bool stereo = (channels == 2);
+
+    if (bQuickseek)
+    {
+        return stereo ? seekBestOverlapPositionStereoQuick(refPos)
+                      : seekBestOverlapPositionMonoQuick(refPos);
+    }
+
+    return stereo ? seekBestOverlapPositionStereo(refPos)
+                  : seekBestOverlapPositionMono(refPos);
+}
+
+
+
+
+// Overlaps the samples in 'midBuffer' with the samples of 'input' starting at
+// sample position 'ovlPos', writing the mix to 'output'. 'ovlPos' counts
+// samples per channel, so it is doubled for interleaved stereo data.
+inline void TDStretch::overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const
+{
+    if (channels != 2)
+    {
+        // mono sound
+        overlapMono(output, input + ovlPos);
+        return;
+    }
+
+    // stereo sound
+    overlapStereo(output, input + 2 * ovlPos);
+}
+
+
+
+
+// Seeks for the optimal overlap-mixing position. Exhaustive 'stereo' version
+// of the routine.
+//
+// Every offset in [0, seekLength) is tested and the one giving the highest
+// cross-correlation against the sloped mid buffer is returned. On equal
+// correlation the lowest offset wins; 0 is returned when seekLength is 0.
+uint TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) 
+{
+    // Slope the amplitudes of the 'midBuffer' samples into 'pRefMidBuffer'
+    precalcCorrReferenceStereo();
+
+    LONG_SAMPLETYPE highestCorr = INT_MIN;
+    uint winner = 0;
+
+    for (uint candidate = 0; candidate < seekLength; candidate ++)
+    {
+        // correlation for the mixing position 'candidate' (two values per frame)
+        const LONG_SAMPLETYPE corr = calcCrossCorrStereo(refPos + 2 * candidate, pRefMidBuffer);
+
+        if (corr > highestCorr)
+        {
+            highestCorr = corr;
+            winner = candidate;
+        }
+    }
+
+    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
+    clearCrossCorrState();
+
+    return winner;
+}
+
+
+// Seeks for the optimal overlap-mixing position. Quick (hierarchical)
+// 'stereo' version of the routine.
+//
+// The best position is the one where the two overlapped sample sequences are
+// 'most alike', in terms of the highest cross-correlation value over the
+// overlapping period. Only the positions listed in 'scanOffsets' are tested.
+uint TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) 
+{
+    // Slope the amplitude of the 'midBuffer' samples into 'pRefMidBuffer'
+    precalcCorrReferenceStereo();
+
+    LONG_SAMPLETYPE highestCorr = INT_MIN;
+    uint winner = 0;
+    uint centre = 0;
+
+    // Four-pass hierarchical search driven by the 'scanOffsets' look-up table:
+    // the first pass searches with relatively coarse steps, each later pass
+    // rescans the neighbourhood of the best position so far with finer steps.
+    for (uint pass = 0; pass < 4; pass ++)
+    {
+        // each table row is terminated by a zero entry
+        for (const int *step = scanOffsets[pass]; *step != 0; step ++)
+        {
+            // NOTE(review): assuming 'uint' is an unsigned type, a negative
+            // step larger than 'centre' wraps around here and is rejected by
+            // the range check below, which also ends the current pass.
+            const uint candidate = centre + *step;
+            if (candidate >= seekLength) break;
+
+            // correlation for the mixing position 'candidate'
+            const LONG_SAMPLETYPE corr = calcCrossCorrStereo(refPos + 2 * candidate, pRefMidBuffer);
+
+            if (corr > highestCorr)
+            {
+                highestCorr = corr;
+                winner = candidate;
+            }
+        }
+        centre = winner;
+    }
+
+    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
+    clearCrossCorrState();
+
+    return winner;
+}
+
+
+
+// Seeks for the optimal overlap-mixing position. Exhaustive 'mono' version of
+// the routine.
+//
+// The best position is determined as the position where the two overlapped
+// sample sequences are 'most alike', in terms of the highest cross-correlation
+// value over the overlapping period. Every offset in [0, seekLength) is
+// tested; on equal correlation the lowest offset wins and 0 is returned when
+// seekLength is 0.
+uint TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) 
+{
+    uint bestOffs;
+    LONG_SAMPLETYPE bestCorr, corr;
+    uint tempOffset;
+
+    // Slopes the amplitude of the 'midBuffer' samples
+    precalcCorrReferenceMono();
+
+    bestCorr = INT_MIN;
+    bestOffs = 0;
+
+    // Scans for the best correlation value by testing each possible position
+    // over the permitted range.
+    for (tempOffset = 0; tempOffset < seekLength; tempOffset ++) 
+    {
+        // Calculates correlation value for the mixing position corresponding
+        // to 'tempOffset'. The input position is passed first and the sloped
+        // reference second, matching the (mixingPos, compare) parameter order
+        // and the way the other seek routines call the correlation functions.
+        corr = calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
+
+        // Checks for the highest correlation value
+        if (corr > bestCorr) 
+        {
+            bestCorr = corr;
+            bestOffs = tempOffset;
+        }
+    }
+    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
+    clearCrossCorrState();
+
+    return bestOffs;
+}
+
+
+// Seeks for the optimal overlap-mixing position. Quick (hierarchical) 'mono'
+// version of the routine.
+//
+// The best position is the one where the two overlapped sample sequences are
+// 'most alike', in terms of the highest cross-correlation value over the
+// overlapping period. Only the positions listed in 'scanOffsets' are tested.
+uint TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos) 
+{
+    // Slope the amplitude of the 'midBuffer' samples into 'pRefMidBuffer'
+    precalcCorrReferenceMono();
+
+    LONG_SAMPLETYPE highestCorr = INT_MIN;
+    uint winner = 0;
+    uint centre = 0;
+
+    // Four-pass hierarchical search driven by the 'scanOffsets' look-up table:
+    // the first pass searches with relatively coarse steps, each later pass
+    // rescans the neighbourhood of the best position so far with finer steps.
+    for (uint pass = 0; pass < 4; pass ++)
+    {
+        // each table row is terminated by a zero entry
+        for (const int *step = scanOffsets[pass]; *step != 0; step ++)
+        {
+            // NOTE(review): assuming 'uint' is an unsigned type, a negative
+            // step larger than 'centre' wraps around here and is rejected by
+            // the range check below, which also ends the current pass.
+            const uint candidate = centre + *step;
+            if (candidate >= seekLength) break;
+
+            // correlation for the mixing position 'candidate'
+            const LONG_SAMPLETYPE corr = calcCrossCorrMono(refPos + candidate, pRefMidBuffer);
+
+            if (corr > highestCorr)
+            {
+                highestCorr = corr;
+                winner = candidate;
+            }
+        }
+        centre = winner;
+    }
+
+    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
+    clearCrossCorrState();
+
+    return winner;
+}
+
+
+/// Clears cross correlation routine state if necessary. Called by each seek
+/// routine once its correlation scan has finished. This base implementation
+/// has nothing to reset; the method is declared virtual so that derived
+/// classes can supply their own clean-up.
+void TDStretch::clearCrossCorrState()
+{
+    // default implementation is empty.
+}
+
+
+// Sets new target tempo. 1.0 is the nominal tempo (the value the constructor
+// applies); smaller values represent slower tempo, larger values faster tempo.
+// Also resets the fractional skip accumulator and recalculates 'sampleReq'.
+void TDStretch::setTempo(float newTempo)
+{
+    tempo = newTempo;
+
+    // Ideal (fractional) number of input samples to advance per processed
+    // sequence, according to the tempo value
+    nominalSkip = tempo * (seekWindowLength - overlapLength);
+    skipFract = 0;
+
+    // Number of samples needed in 'inputBuffer' to process another batch:
+    // the larger of (rounded skip + overlap) and the sequence length, plus
+    // the maximum seek offset.
+    const uint roundedSkip = (int)(nominalSkip + 0.5f);
+    uint needed = roundedSkip + overlapLength;
+    if (needed < seekWindowLength)
+    {
+        needed = seekWindowLength;
+    }
+    sampleReq = needed + maxOffset;
+}
+
+
+
+// Sets the number of channels, 1 = mono, 2 = stereo, and forwards the new
+// count to the input and output buffers. A call that repeats the current
+// channel count is a no-op.
+void TDStretch::setChannels(uint numChannels)
+{
+    if (numChannels != channels)
+    {
+        assert(numChannels == 1 || numChannels == 2);
+
+        channels = numChannels;
+        inputBuffer.setChannels(numChannels);
+        outputBuffer.setChannels(numChannels);
+    }
+}
+
+
+// Nominal tempo: no stretching is needed, so samples are passed straight from
+// 'inputBuffer' to 'outputBuffer'. Must only be called while tempo == 1.0.
+void TDStretch::processNominalTempo()
+{
+    assert(tempo == 1.0f);
+
+    // Samples still waiting in pMidBuffer must be mixed in first, which needs
+    // overlapLength input samples; until those have arrived, do nothing.
+    if (bMidBufferDirty && inputBuffer.numSamples() < overlapLength)
+    {
+        return;
+    }
+
+    if (bMidBufferDirty)
+    {
+        // Do a single sliding overlap of the beginning of 'inputBuffer' with
+        // the samples in 'midBuffer' in order to prevent a clicking
+        // distortion in the output sound
+        overlap(outputBuffer.ptrEnd(overlapLength), inputBuffer.ptrBegin(), 0);
+        outputBuffer.putSamples(overlapLength);
+        inputBuffer.receiveSamples(overlapLength);
+
+        // the nominal sample flow has been caught up with, so the mid buffer
+        // is no longer needed and plain bypassing can take over
+        clearMidBuffer();
+    }
+
+    // Simply bypass samples from input to output
+    outputBuffer.moveSamples(inputBuffer);
+}
+
+
+// Processes as many complete processing frames (sequences) as 'inputBuffer'
+// currently holds and stores the result into 'outputBuffer'. Returns without
+// producing output when there is not yet enough input.
+//
+// For every frame: find the best overlap offset, cross-fade the saved tail of
+// the previous sequence ('midBuffer') with the input at that offset, copy the
+// un-faded middle of the sequence to the output, save the sequence's tail
+// into 'midBuffer' for the next frame, then drop 'nominalSkip' input samples
+// (with the fractional part carried over in 'skipFract').
+void TDStretch::processSamples()
+{
+    uint ovlSkip, offset;
+    int temp;
+
+    /* Removed this small optimization - can introduce a click to sound when tempo setting
+       crosses the nominal value
+    if (tempo == 1.0f) 
+    {
+        // tempo not changed from the original, so bypass the processing
+        processNominalTempo();
+        return;
+    }
+    */
+
+    if (bMidBufferDirty == FALSE) 
+    {
+        // if midBuffer is empty, move the first samples of the input stream 
+        // into it
+        if (inputBuffer.numSamples() < overlapLength) 
+        {
+            // wait until we've got overlapLength samples
+            return;
+        }
+        memcpy(pMidBuffer, inputBuffer.ptrBegin(), channels * overlapLength * sizeof(SAMPLETYPE));
+        inputBuffer.receiveSamples(overlapLength);
+        bMidBufferDirty = TRUE;
+    }
+
+    // Process samples as long as there are enough samples in 'inputBuffer'
+    // to form a processing frame ('sampleReq' is maintained by setTempo()).
+    while (inputBuffer.numSamples() >= sampleReq) 
+    {
+        // Scan for the best overlapping position. This is done for every
+        // frame, whatever the tempo setting is.
+        offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
+
+        // Mix the samples in the 'inputBuffer' at position of 'offset' with the 
+        // samples in 'midBuffer' using sliding overlapping
+        // ... first partially overlap with the end of the previous sequence
+        // (that's in 'midBuffer')
+        overlap(outputBuffer.ptrEnd(overlapLength), inputBuffer.ptrBegin(), offset);
+        outputBuffer.putSamples(overlapLength);
+
+        // ... then copy sequence samples from 'inputBuffer' to output.
+        // 'temp' is the length of the sequence minus the fade-in and the
+        // fade-out part; nothing is copied when it is not positive.
+        temp = (seekWindowLength - 2 * overlapLength);// & 0xfffffffe;
+        if (temp > 0)
+        {
+            outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), temp);
+        }
+
+        // Copies the end of the current sequence from 'inputBuffer' to 
+        // 'midBuffer' for being mixed with the beginning of the next 
+        // processing sequence and so on
+        assert(offset + seekWindowLength <= inputBuffer.numSamples());
+        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + seekWindowLength - overlapLength), 
+            channels * sizeof(SAMPLETYPE) * overlapLength);
+        bMidBufferDirty = TRUE;
+
+        // Remove the processed samples from the input buffer. Update
+        // the difference between integer & nominal skip step to 'skipFract'
+        // in order to prevent the error from accumulating over time.
+        skipFract += nominalSkip;   // real skip size
+        ovlSkip = (int)skipFract;   // rounded to integer skip
+        skipFract -= ovlSkip;       // maintain the fraction part, i.e. real vs. integer skip
+        inputBuffer.receiveSamples(ovlSkip);
+    }
+}
+
+
+// Feeds 'numSamples' samples starting at memory position 'samples' into the
+// input of the object, then immediately processes whatever the buffered input
+// allows.
+void TDStretch::putSamples(const SAMPLETYPE *samples, uint numSamples)
+{
+    inputBuffer.putSamples(samples, numSamples);    // queue the new input ...
+    this->processSamples();                         // ... and process what is ready
+}
+
+
+
+/// Stores the new overlap length and, when it is larger than the previous one,
+/// reallocates the mid buffer and the correlation reference buffer. A length
+/// that is equal or smaller keeps the existing allocations.
+void TDStretch::acceptNewOverlapLength(uint newOverlapLength)
+{
+    const uint previous = overlapLength;
+    overlapLength = newOverlapLength;
+
+    if (newOverlapLength <= previous)
+    {
+        return;
+    }
+
+    delete[] pMidBuffer;
+    delete[] pRefMidBufferUnaligned;
+
+    // room for two channels; flag the fresh buffer dirty so that
+    // clearMidBuffer() actually zeroes it
+    pMidBuffer = new SAMPLETYPE[overlapLength * 2];
+    bMidBufferDirty = TRUE;
+    clearMidBuffer();
+
+    // over-allocate by 16 bytes so that an aligned start address can be
+    // picked inside the block: 'pRefMidBuffer' is rounded up to the next
+    // 16 byte boundary for efficiency
+    pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)];
+    pRefMidBuffer = (SAMPLETYPE *)((((ulongptr)pRefMidBufferUnaligned) + 15) & -16);
+}
+
+
+// Operator 'new' is overloaded to catch direct use of "new TDStretch":
+// instances must be created through newInstance(), which picks a suitable
+// implementation depending on if we've a MMX/SSE/etc-capable CPU available
+// or not (and allocates with the global '::new').
+//
+// This function never returns. An allocation function must not hand back a
+// null pointer, and the assert alone disappears in NDEBUG builds, so the
+// misuse is turned into a definite abort() instead of constructing an object
+// at address NULL.
+void * TDStretch::operator new(size_t s)
+{
+    (void)s;    // size is irrelevant, no allocation is ever made here
+
+    // Notice! don't use "new TDStretch" directly, use "newInstance" to create a new instance instead!
+    assert(FALSE);  
+    abort();
+}
+
+
+// Creates a new TDStretch object, choosing the implementation class from the
+// instruction set extensions that were both enabled at compile time
+// (ALLOW_MMX / ALLOW_SSE / ALLOW_3DNOW) and reported by detectCPUextensions().
+// Falls back to the plain C++ TDStretch class otherwise. The object is
+// allocated with the global '::new'; the caller receives the raw pointer.
+TDStretch * TDStretch::newInstance()
+{
+    uint uExtensions = 0;
+
+    // CPU detection is skipped (leaving uExtensions == 0) only when both
+    // _MSC_VER and __x86_64__ are defined.
+    // NOTE(review): presumably this targets 64-bit MSVC builds - confirm that
+    // the build actually defines __x86_64__ for that compiler.
+#if !defined(_MSC_VER) || !defined(__x86_64__)
+    uExtensions = detectCPUextensions();
+#endif
+
+    // Check if MMX/SSE/3DNow! instruction set extensions supported by CPU
+
+    // Each enabled section below ends in a dangling 'else', so the sections
+    // chain together into one if/else-if ladder whose final branch is the
+    // plain C++ fallback block at the bottom.
+#ifdef ALLOW_MMX
+    // MMX routines available only with integer sample types
+    if (uExtensions & SUPPORT_MMX)
+    {
+        return ::new TDStretchMMX;
+    }
+    else
+#endif // ALLOW_MMX
+
+
+#ifdef ALLOW_SSE
+    if (uExtensions & SUPPORT_SSE)
+    {
+        // SSE support
+        return ::new TDStretchSSE;
+    }
+    else
+#endif // ALLOW_SSE
+
+
+#ifdef ALLOW_3DNOW
+    if (uExtensions & SUPPORT_3DNOW)
+    {
+        // 3DNow! support
+        return ::new TDStretch3DNow;
+    }
+    else
+#endif // ALLOW_3DNOW
+
+    {
+        // ISA optimizations not supported, use plain C version
+        return ::new TDStretch;
+    }
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Integer arithmetics specific algorithm implementations.
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#ifdef INTEGER_SAMPLES
+
+// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
+// is faster to calculate. Integer sample, stereo version.
+//
+// Frame i of 'pMidBuffer' is weighted by i * (overlapLength - i), scaled down
+// by 'slopingDivider' and stored into 'pRefMidBuffer'.
+void TDStretch::precalcCorrReferenceStereo()
+{
+    // 'slopingDivider' is declared as uint. Dividing the signed product by it
+    // directly would convert a negative product to unsigned first and yield a
+    // meaningless value for every negative sample, so divide by a signed copy.
+    const int divider = (int)slopingDivider;
+
+    for (int i = 0; i < (int)overlapLength; i ++)
+    {
+        const int weight = i * (overlapLength - i);
+        const int cnt2 = i * 2;
+
+        pRefMidBuffer[cnt2]     = (short)((pMidBuffer[cnt2] * weight) / divider);
+        pRefMidBuffer[cnt2 + 1] = (short)((pMidBuffer[cnt2 + 1] * weight) / divider);
+    }
+}
+
+
+// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
+// is faster to calculate. Integer sample, mono version.
+//
+// Sample i of 'pMidBuffer' is weighted by i * (overlapLength - i), scaled down
+// by 'slopingDivider' and stored into 'pRefMidBuffer'.
+void TDStretch::precalcCorrReferenceMono()
+{
+    // 'slopingDivider' is declared as uint. Where 'long' is no wider than
+    // 'uint', dividing the signed product by it directly would be carried out
+    // in unsigned arithmetic and corrupt the result for negative samples, so
+    // divide by a signed copy instead.
+    const long divider = (long)slopingDivider;
+
+    for (int i = 0; i < (int)overlapLength; i ++)
+    {
+        const long weight = i * (overlapLength - i);
+
+        pRefMidBuffer[i] = (short)((pMidBuffer[i] * weight) / divider);
+    }
+}
+
+
+// Cross-fades the samples in 'midBuffer' with the samples in 'input' over
+// 'overlapLength' stereo frames and writes the result to 'output'. Integer
+// sample, 'Stereo' version of the routine.
+//
+// At frame p the new input is weighted by p and the old mid-buffer content by
+// (overlapLength - p); the sum is divided by overlapLength.
+// NOTE(review): 'overlapLength' is unsigned, so the division is presumably
+// carried out in unsigned arithmetic - confirm that this is intended.
+void TDStretch::overlapStereo(short *output, const short *input) const
+{
+    for (int pos = 0; pos < (int)overlapLength; pos ++)
+    {
+        const short fadeOut = (short)(overlapLength - pos);
+        const uint left = 2 * pos;
+        const uint right = left + 1;
+
+        output[left] = (input[left] * pos + pMidBuffer[left] * fadeOut) / overlapLength;
+        output[right] = (input[right] * pos + pMidBuffer[right] * fadeOut) / overlapLength;
+    }
+}
+
+
+/// Calculates overlap period length in samples.
+/// Integer version rounds overlap length to closest power of 2
+/// for a divide scaling operation. The exponent is limited to 4...9, i.e. the
+/// overlap length is always between 16 and 512 samples.
+void TDStretch::calculateOverlapLength(uint overlapMs)
+{
+    const double ovlSamples = (sampleRate * overlapMs) / 1000.0;
+
+    // Work on a signed exponent: a negative result (overlap shorter than one
+    // sample) stored straight into the unsigned 'overlapDividerBits' would
+    // wrap around and be clamped to the maximum instead of the minimum. A zero
+    // overlap is handled here so that log(0) is never evaluated.
+    int bits = (ovlSamples > 0) ? _getClosest2Power(ovlSamples) : 0;
+    if (bits > 9) bits = 9;
+    if (bits < 4) bits = 4;
+    overlapDividerBits = (uint)bits;
+
+    const uint newOvl = 1 << bits;
+
+    acceptNewOverlapLength(newOvl);
+
+    // calculate sloping divider so that crosscorrelation operation won't 
+    // overflow 32-bit register. Max. sum of the crosscorrelation sum without 
+    // divider would be 2^30*(N^3-N)/3, where N = overlap length
+    slopingDivider = (newOvl * newOvl - 1) / 3;
+}
+
+
+// Returns the cross-correlation of 'mixingPos' and 'compare' over the overlap
+// period. Integer sample, mono version: every product is shifted right by
+// 'overlapDividerBits' before being summed. Index 0 is not included.
+long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const
+{
+    long sum = 0;
+
+    for (uint n = 1; n < overlapLength; n ++)
+    {
+        sum += (mixingPos[n] * compare[n]) >> overlapDividerBits;
+    }
+    return sum;
+}
+
+
+// Returns the cross-correlation of 'mixingPos' and 'compare' over the overlap
+// period. Integer sample, stereo version: for each frame the left and right
+// products are added, shifted right by 'overlapDividerBits' and summed.
+// Frame 0 is not included.
+long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const
+{
+    long sum = 0;
+
+    for (uint frame = 1; frame < overlapLength; frame ++)
+    {
+        const uint left = 2 * frame;
+
+        sum += (mixingPos[left] * compare[left] +
+                mixingPos[left + 1] * compare[left + 1]) >> overlapDividerBits;
+    }
+    return sum;
+}
+
+#endif // INTEGER_SAMPLES
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Floating point arithmetics specific algorithm implementations.
+//
+
+#ifdef FLOAT_SAMPLES
+
+
+// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
+// is faster to calculate. Floating point, stereo version: frame i is weighted
+// by i * (overlapLength - i) and stored into 'pRefMidBuffer'.
+void TDStretch::precalcCorrReferenceStereo()
+{
+    for (int frame = 0; frame < (int)overlapLength; frame ++)
+    {
+        const float weight = (float)frame * (float)(overlapLength - frame);
+        const int left = frame * 2;
+
+        pRefMidBuffer[left] = (float)(pMidBuffer[left] * weight);
+        pRefMidBuffer[left + 1] = (float)(pMidBuffer[left + 1] * weight);
+    }
+}
+
+
+// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
+// is faster to calculate. Floating point, mono version: sample i is weighted
+// by i * (overlapLength - i) and stored into 'pRefMidBuffer'.
+void TDStretch::precalcCorrReferenceMono()
+{
+    for (int pos = 0; pos < (int)overlapLength; pos ++)
+    {
+        const float weight = (float)pos * (float)(overlapLength - pos);
+
+        pRefMidBuffer[pos] = (float)(pMidBuffer[pos] * weight);
+    }
+}
+
+
+// Cross-fades the samples in 'midBuffer' with the samples in 'input' over
+// 'overlapLength' stereo frames and writes the result to 'output'. This is
+// the plain C++ floating point version of overlapStereo.
+//
+// At frame p the new input is weighted by p / overlapLength and the old
+// mid-buffer content by (overlapLength - p) / overlapLength.
+void TDStretch::overlapStereo(float *output, const float *input) const
+{
+    const float scale = 1.0f / (float)overlapLength;
+
+    for (int pos = 0; pos < (int)overlapLength; pos ++)
+    {
+        const float fadeOut = (float)(overlapLength - pos) * scale;
+        const float fadeIn = (float)pos * scale;
+        const uint left = 2 * pos;
+
+        output[left + 0] = input[left + 0] * fadeIn + pMidBuffer[left + 0] * fadeOut;
+        output[left + 1] = input[left + 1] * fadeIn + pMidBuffer[left + 1] * fadeOut;
+    }
+}
+
+
+/// Calculates overlap period length in samples. Floating point version: the
+/// length is at least 16 samples and is rounded down to a multiple of 8.
+void TDStretch::calculateOverlapLength(uint overlapMs)
+{
+    uint samples = (sampleRate * overlapMs) / 1000;
+
+    if (samples < 16)
+    {
+        samples = 16;
+    }
+
+    // must be divisible by 8
+    samples = (samples / 8) * 8;
+
+    acceptNewOverlapLength(samples);
+}
+
+
+
+// Returns the cross-correlation of 'mixingPos' and 'compare' over the overlap
+// period. Floating point, mono version; index 0 is not included.
+double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const
+{
+    double sum = 0;
+
+    for (uint n = 1; n < overlapLength; n ++)
+    {
+        sum += mixingPos[n] * compare[n];
+    }
+    return sum;
+}
+
+
+// Returns the cross-correlation of 'mixingPos' and 'compare' over the overlap
+// period. Floating point, stereo version: the left and right products of each
+// frame are added together and summed; frame 0 is not included.
+double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const
+{
+    double sum = 0;
+
+    for (uint frame = 1; frame < overlapLength; frame ++)
+    {
+        const uint left = 2 * frame;
+
+        sum += mixingPos[left] * compare[left] +
+               mixingPos[left + 1] * compare[left + 1];
+    }
+    return sum;
+}
+
+#endif // FLOAT_SAMPLES
--- a/desmume/src/metaspu/SoundTouch/TDStretch.h
+++ b/desmume/src/metaspu/SoundTouch/TDStretch.h
@ -0,0 +1,236 @@
+////////////////////////////////////////////////////////////////////////////////
+/// 
+/// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo 
+/// while maintaining the original pitch by using a time domain WSOLA-like method 
+/// with several performance-increasing tweaks.
+///
+/// Note : MMX optimized functions reside in a separate, platform-specific file, 
+/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.16 $
+//
+// $Id: TDStretch.h,v 1.16 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef TDStretch_H
+#define TDStretch_H
+
+#include "STTypes.h"
+#include "RateTransposer.h"
+#include "FIFOSamplePipe.h"
+
+namespace soundtouch
+{
+
+// Default values for sound processing parameters:
+
+/// Default length of a single processing sequence, in milliseconds. This determines to how 
+/// long sequences the original sound is chopped in the time-stretch algorithm.
+///
+/// The larger this value is, the fewer sequences are used in processing. In principle
+/// a bigger value sounds better when slowing down tempo, but worse when increasing tempo
+/// and vice versa.
+///
+/// Increasing this value reduces computational burden & vice versa.
+#define DEFAULT_SEQUENCE_MS     63
+
+#define DEFAULT_SEEKWINDOW_MS   17
+
+#define DEFAULT_OVERLAP_MS      7
+
+
+/// Class that does the time-stretch (tempo change) effect for the processed
+/// sound.
+///
+/// Declared as a FIFOProcessor subclass that holds one input and one output
+/// FIFOSampleBuffer, exposed through getInput() and getOutput(). Apart from
+/// those two inline accessors only declarations appear here.
+class TDStretch : public FIFOProcessor
+{
+protected:
+    uint channels;
+    uint sampleReq;
+    float tempo;
+
+    SAMPLETYPE *pMidBuffer;
+    SAMPLETYPE *pRefMidBuffer;
+    SAMPLETYPE *pRefMidBufferUnaligned;
+    uint overlapLength;
+    uint overlapDividerBits;
+    uint slopingDivider;
+    uint seekLength;
+    uint seekWindowLength;
+    uint maxOffset;
+    float nominalSkip;
+    float skipFract;
+    FIFOSampleBuffer outputBuffer;      // returned by getOutput()
+    FIFOSampleBuffer inputBuffer;       // returned by getInput()
+    BOOL bQuickseek;
+    BOOL bMidBufferDirty;
+
+    // NOTE(review): presumably the values last given to setParameters() and
+    // reported back by getParameters() - confirm against the implementation.
+    uint sampleRate;
+    uint sequenceMs;
+    uint seekWindowMs;
+    uint overlapMs;
+
+    void acceptNewOverlapLength(uint newOverlapLength);
+
+    virtual void clearCrossCorrState();
+    void calculateOverlapLength(uint overlapMs);
+
+    // Cross-correlation and seek routines are virtual so that CPU-specific
+    // subclasses (see TDStretchSSE below) can replace them.
+    virtual LONG_SAMPLETYPE calcCrossCorrStereo(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
+    virtual LONG_SAMPLETYPE calcCrossCorrMono(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
+
+    virtual uint seekBestOverlapPositionStereo(const SAMPLETYPE *refPos);
+    virtual uint seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos);
+    virtual uint seekBestOverlapPositionMono(const SAMPLETYPE *refPos);
+    virtual uint seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos);
+    uint seekBestOverlapPosition(const SAMPLETYPE *refPos);
+
+    virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const;
+    virtual void overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const;
+
+    void clearMidBuffer();
+    void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const;
+
+    void precalcCorrReferenceMono();
+    void precalcCorrReferenceStereo();
+
+    void processNominalTempo();
+
+    /// Changes the tempo of the given sound samples.
+    ///
+    /// NOTE(review): an earlier comment described a returned sample count and
+    /// a 'set_returnBuffer_size' function, but this method is declared void
+    /// and no such function is declared in this class. Presumably the result
+    /// is delivered through 'outputBuffer' - confirm against the implementation.
+    void processSamples();
+    
+public:
+    TDStretch();
+    virtual ~TDStretch();
+
+    /// Operator 'new' is overloaded so that it automatically creates a suitable instance 
+    /// depending on if we've a MMX/SSE/etc-capable CPU available or not.
+    void *operator new(size_t s);
+
+    /// Use this function instead of "new" operator to create a new instance of this class. 
+    /// This function automatically chooses a correct feature set depending on if the CPU
+    /// supports MMX/SSE/etc extensions.
+    static TDStretch *newInstance();
+    
+    /// Returns the output buffer object
+    FIFOSamplePipe *getOutput() { return &outputBuffer; };
+
+    /// Returns the input buffer object
+    FIFOSamplePipe *getInput() { return &inputBuffer; };
+
+    /// Sets new target tempo. Smaller values represent slower tempo, larger
+    /// values faster tempo.
+    /// NOTE(review): the original comment gave the normal tempo as 'SCALE';
+    /// with a float argument the neutral value is presumably 1.0 - confirm.
+    void setTempo(float newTempo);
+
+    /// NOTE(review): declared void, so the earlier "returns nonzero if there
+    /// aren't any samples available" description cannot apply. Presumably this
+    /// discards all buffered samples - confirm against the implementation.
+    virtual void clear();
+
+    /// Clears the input buffer
+    void clearInput();
+
+    /// Sets the number of channels, 1 = mono, 2 = stereo
+    void setChannels(uint numChannels);
+
+    /// Enables/disables the quick position seeking algorithm. Zero to disable, 
+    /// nonzero to enable
+    void enableQuickSeek(BOOL enable);
+
+    /// Returns nonzero if the quick seeking algorithm is enabled.
+    BOOL isQuickSeekEnabled() const;
+
+    /// Sets routine control parameters. These control are certain time constants
+    /// defining how the sound is stretched to the desired duration.
+    //
+    /// 'sampleRate' = sample rate of the sound
+    /// 'sequenceMS' = one processing sequence length in milliseconds
+    /// 'seekwindowMS' = seeking window length for scanning the best overlapping 
+    ///      position
+    /// 'overlapMS' = overlapping length
+    ///
+    /// The three millisecond arguments default to DEFAULT_SEQUENCE_MS,
+    /// DEFAULT_SEEKWINDOW_MS and DEFAULT_OVERLAP_MS respectively.
+    void setParameters(uint sampleRate,                             ///< Samplerate of sound being processed (Hz)
+                       uint sequenceMS = DEFAULT_SEQUENCE_MS,       ///< Single processing sequence length (ms)
+                       uint seekwindowMS = DEFAULT_SEEKWINDOW_MS,   ///< Offset seeking window length (ms)
+                       uint overlapMS = DEFAULT_OVERLAP_MS          ///< Sequence overlapping length (ms)
+                       );
+
+    /// Get routine control parameters, see setParameters() function.
+    /// Any of the parameters to this function can be NULL, in such case corresponding parameter
+    /// value isn't returned.
+    void getParameters(uint *pSampleRate, uint *pSequenceMs, uint *pSeekWindowMs, uint *pOverlapMs);
+
+    /// Adds 'numsamples' pcs of samples from the 'samples' memory position into
+    /// the input of the object.
+    virtual void putSamples(
+            const SAMPLETYPE *samples,  ///< Input sample data
+            uint numSamples                         ///< Number of samples in 'samples' so that one sample
+                                                    ///< contains both channels if stereo
+            );
+};
+
+
+
+// Implementation-specific class declarations:
+
+//#ifdef ALLOW_MMX
+//    /// Class that implements MMX optimized routines for 16bit integer samples type.
+//    class TDStretchMMX : public TDStretch
+//    {
+//    protected:
+//        long calcCrossCorrStereo(const short *mixingPos, const short *compare) const;
+//        virtual void overlapStereo(short *output, const short *input) const;
+//        virtual void clearCrossCorrState();
+//    };
+//#endif /// ALLOW_MMX
+//
+//
+//#ifdef ALLOW_3DNOW
+//    /// Class that implements 3DNow! optimized routines for floating point samples type.
+//    class TDStretch3DNow : public TDStretch
+//    {
+//    protected:
+//        double calcCrossCorrStereo(const float *mixingPos, const float *compare) const;
+//    };
+//#endif /// ALLOW_3DNOW
+
+
+#ifdef ALLOW_SSE
+    /// Class that implements SSE optimized routines for floating point samples type.
+    ///
+    /// Only declared when ALLOW_SSE is defined. It declares calcCrossCorrStereo()
+    /// with float pointers and a double result.
+    /// NOTE(review): this replaces the virtual TDStretch::calcCrossCorrStereo()
+    /// only if SAMPLETYPE is float and LONG_SAMPLETYPE is double; with other
+    /// sample types it would be an unrelated overload - confirm the build
+    /// configuration.
+    class TDStretchSSE : public TDStretch
+    {
+    protected:
+        double calcCrossCorrStereo(const float *mixingPos, const float *compare) const;
+    };
+
+#endif /// ALLOW_SSE
+
+}
+#endif  /// TDStretch_H
--- a/desmume/src/metaspu/SoundTouch/WavFile.cpp
+++ b/desmume/src/metaspu/SoundTouch/WavFile.cpp
@ -0,0 +1,728 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Classes for easy reading & writing of WAV sound files. 
+///
+/// For big-endian CPU, define _BIG_ENDIAN_ during compile-time to correctly
+/// parse the WAV files with such processors.
+/// 
+/// Admittedly, more complete WAV reader routines may exist in public domain,
+/// but the reason for 'yet another' one is that those generic WAV reader 
+/// libraries are exhaustingly large and cumbersome! Wanted to have something
+/// simpler here, i.e. something that's not already larger than rest of the
+/// SoundTouch/SoundStretch program...
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.15 $
+//
+// $Id: WavFile.cpp,v 1.15 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <stdio.h>
+#include <stdexcept>
+#include <string>
+#include <assert.h>
+#include <limits.h>
+
+#include <cstdlib>
+#include <cstring>
+
+#include "WavFile.h"
+
+using namespace std;
+
+// Four-character chunk tags of a WAV file. The header code below compares
+// them against (memcmp) and copies them into (memcpy) the header fields using
+// the first four bytes only; readHeaderBlock() additionally uses them as
+// NUL-terminated strings with strcmp().
+const static char riffStr[] = "RIFF";
+const static char waveStr[] = "WAVE";
+const static char fmtStr[]  = "fmt ";
+const static char dataStr[] = "data";
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Helper functions for swapping byte order to correctly read/write WAV files 
+// with big-endian CPU's: Define compile-time definition _BIG_ENDIAN_ to
+// turn-on the conversion if it appears necessary. 
+//
+// For example, Intel x86 is little-endian and doesn't require conversion,
+// while PowerPC of Mac's and many other RISC cpu's are big-endian.
+
+// Derive _BIG_ENDIAN_ automatically when the toolchain exposes BYTE_ORDER;
+// otherwise _BIG_ENDIAN_ has to be supplied on the compiler command line.
+// NOTE(review): none of the headers included above is guaranteed to define
+// BYTE_ORDER / BIG_ENDIAN, so this detection is presumably best-effort only -
+// confirm for each target platform.
+#ifdef BYTE_ORDER
+    // In gcc compiler detect the byte order automatically
+    #if BYTE_ORDER == BIG_ENDIAN
+        // big-endian platform.
+        #define _BIG_ENDIAN_
+    #endif
+#endif
+    
+#ifdef _BIG_ENDIAN_
+    // Big-endian host: the 16 and 32 bit words of a WAV file have to be
+    // byte-reversed when they move between the file and memory.
+
+    // Reverses the byte order of a 32-bit word in place.
+    static inline void _swap32(unsigned int &dwData)
+    {
+        const unsigned int fromTop    = (dwData >> 24) & 0x000000FF;
+        const unsigned int fromUpper  = (dwData >> 8)  & 0x0000FF00;
+        const unsigned int fromLower  = (dwData << 8)  & 0x00FF0000;
+        const unsigned int fromBottom = (dwData << 24) & 0xFF000000;
+
+        dwData = fromTop | fromUpper | fromLower | fromBottom;
+    }
+
+    // Exchanges the two bytes of a 16-bit word in place.
+    static inline void _swap16(unsigned short &wData)
+    {
+        const int highToLow = (wData >> 8) & 0x00FF;
+        const int lowToHigh = (wData << 8) & 0xFF00;
+
+        wData = highToLow | lowToHigh;
+    }
+
+    // Applies _swap16() to each of the 'dwNumWords' words starting at 'pData'.
+    static inline void _swap16Buffer(unsigned short *pData, unsigned int dwNumWords)
+    {
+        unsigned short *const pEnd = pData + dwNumWords;
+        unsigned short *pWord;
+
+        for (pWord = pData; pWord != pEnd; ++pWord)
+        {
+            _swap16(*pWord);
+        }
+    }
+
+#else   // BIG_ENDIAN
+    // Little-endian host: the WAV byte order already matches memory, so the
+    // helpers below exist only to keep the call sites unconditional.
+
+    // No-op counterpart of the 32-bit swap.
+    static inline void _swap32(unsigned int &dwData) { }
+
+    // No-op counterpart of the 16-bit swap.
+    static inline void _swap16(unsigned short &wData) { }
+
+    // No-op counterpart of the buffer swap.
+    static inline void _swap16Buffer(unsigned short *pData, unsigned int dwNumBytes) { }
+
+#endif  // BIG_ENDIAN
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Class WavInFile
+//
+
+// Opens 'fileName' for binary reading and parses its WAV headers.
+//
+// Throws runtime_error when the file cannot be opened, when the headers are
+// not those of a WAV file, or when the format tag ('fixed') is not 1, which
+// is the only encoding the read() routines handle.
+//
+// The destructor does not run for an object whose constructor throws, so
+// every failure path after a successful fopen() closes the stream itself;
+// otherwise the FILE handle would leak.
+WavInFile::WavInFile(const char *fileName)
+{
+    int hdrsOk;
+
+    dataRead = 0;
+
+    // Try to open the file for reading
+    fptr = fopen(fileName, "rb");
+    if (fptr == NULL) 
+    {
+        // didn't succeed
+        string msg = "Error : Unable to open file \"";
+        msg += fileName;
+        msg += "\" for reading.";
+        throw runtime_error(msg);
+    }
+
+    // Read the file headers
+    hdrsOk = readWavHeaders();
+    if (hdrsOk != 0) 
+    {
+        // Something didn't match in the wav file headers 
+        fclose(fptr);
+        fptr = NULL;
+
+        string msg = "File \"";
+        msg += fileName;
+        msg += "\" is corrupt or not a WAV file";
+        throw runtime_error(msg);
+    }
+
+    if (header.format.fixed != 1)
+    {
+        fclose(fptr);
+        fptr = NULL;
+
+        string msg = "File \"";
+        msg += fileName;
+        msg += "\" uses unsupported encoding.";
+        throw runtime_error(msg);
+    }
+}
+
+
+
+// Destructor: releases the file by delegating to close().
+WavInFile::~WavInFile()
+{
+    this->close();
+}
+
+
+
+// Seeks back to the start of the file, parses the headers again (which
+// leaves the stream positioned after them) and resets the count of sample
+// bytes consumed. A header failure is only detected by the assert.
+void WavInFile::rewind()
+{
+    fseek(fptr, 0, SEEK_SET);
+
+    const int headerStatus = readWavHeaders();
+    assert(headerStatus == 0);
+
+    dataRead = 0;
+}
+
+
+// Verifies that both the 'fmt ' and the 'data' tag are present in the parsed
+// header. Returns 0 when both match, -1 otherwise.
+int WavInFile::checkCharTags()
+{
+    const bool fmtTagOk  = (memcmp(fmtStr, header.format.fmt, 4) == 0);
+    const bool dataTagOk = (memcmp(dataStr, header.data.data_field, 4) == 0);
+
+    return (fmtTagOk && dataTagOk) ? 0 : -1;
+}
+
+
+// Reads up to 'maxElems' 8-bit samples into 'buffer' and returns how many
+// were actually read. Never reads beyond the data length announced in the
+// header. Throws runtime_error when the file is not in 8-bit format.
+//
+// A non-positive 'maxElems' reads nothing and returns 0.
+int WavInFile::read(char *buffer, int maxElems)
+{
+    uint numBytes;
+    uint bytesLeft;
+
+    // ensure it's 8 bit format
+    if (header.format.bits_per_sample != 8)
+    {
+        throw runtime_error("Error: WavInFile::read(char*, int) works only with 8bit samples.");
+    }
+    assert(sizeof(char) == 1);
+
+    // A negative request would otherwise be converted into a huge size_t
+    // count when handed to fread().
+    if (maxElems <= 0) return 0;
+
+    // Don't read more samples than are marked available in header. The limit
+    // is applied to the remaining byte count, because 'dataRead + numBytes'
+    // can wrap around for data chunks close to 4 GB.
+    bytesLeft = (dataRead < header.data.data_len) ? (header.data.data_len - dataRead) : 0;
+    numBytes = (uint)maxElems;
+    if (numBytes > bytesLeft) numBytes = bytesLeft;
+
+    numBytes = (uint)fread(buffer, 1, numBytes, fptr);
+    dataRead += numBytes;
+
+    return (int)numBytes;
+}
+
+
+// Reads up to 'maxElems' samples and delivers them as 16-bit signed integers.
+// Returns the number of samples stored in 'buffer'.
+//
+// 8-bit files: samples are stored unsigned in a WAV file (0..255, silence at
+// 128), so each byte is re-centred around zero before being scaled to 16 bits.
+// Treating the bytes as signed would turn silence into a full-scale negative
+// value.
+//
+// 16-bit files: the bytes are read directly and byte-swapped on big-endian
+// hosts. The read never goes past the data length announced in the header.
+//
+// A non-positive 'maxElems' reads nothing and returns 0.
+int WavInFile::read(short *buffer, int maxElems)
+{
+    uint numBytes;
+    uint bytesLeft;
+    int numElems;
+
+    // Negative sizes must not reach new[] or fread().
+    if (maxElems <= 0) return 0;
+
+    if (header.format.bits_per_sample == 8)
+    {
+        // 8 bit format
+        char *temp = new char[maxElems];
+        int i;
+
+        numElems = read(temp, maxElems);
+        // convert from unsigned 8 bit to signed 16 bit
+        for (i = 0; i < numElems; i ++)
+        {
+            buffer[i] = (short)(((int)(unsigned char)temp[i] - 128) * 256);
+        }
+        delete[] temp;
+    }
+    else
+    {
+        // 16 bit format
+        assert(header.format.bits_per_sample == 16);
+        assert(sizeof(short) == 2);
+
+        // Don't read more samples than are marked available in header. The
+        // limit is applied to the remaining byte count so the comparison
+        // cannot wrap around for data chunks close to 4 GB.
+        bytesLeft = (dataRead < header.data.data_len) ? (header.data.data_len - dataRead) : 0;
+        numBytes = (uint)maxElems * 2;
+        if (numBytes > bytesLeft) numBytes = bytesLeft;
+
+        numBytes = (uint)fread(buffer, 1, numBytes, fptr);
+        dataRead += numBytes;
+        numElems = (int)(numBytes / 2);
+
+        // 16bit samples, swap byte order if necessary
+        _swap16Buffer((unsigned short *)buffer, numElems);
+    }
+
+    return numElems;
+}
+
+
+
+// Reads up to 'maxElems' samples through the 16-bit reader and converts them
+// to floats scaled by 1/32768, i.e. into the range [-1..+1[.
+// Returns the number of samples stored in 'buffer'.
+int WavInFile::read(float *buffer, int maxElems)
+{
+    const double toUnitRange = 1.0 / 32768.0;
+    short *pcm = new short[maxElems];
+
+    const int count = read(pcm, maxElems);
+    for (int idx = 0; idx < count; ++idx)
+    {
+        buffer[idx] = (float)(toUnitRange * (double)pcm[idx]);
+    }
+
+    delete[] pcm;
+    return count;
+}
+
+
+// Reports end of data: 1 once every byte announced in the header has been
+// read, or when the stream itself signals end-of-file; 0 otherwise.
+int WavInFile::eof() const
+{
+    const bool allDataRead = (dataRead == header.data.data_len);
+
+    return (allDataRead || feof(fptr)) ? 1 : 0;
+}
+
+
+// Closes the file. Safe to call more than once: the destructor also calls
+// close(), so after an explicit close() the second call must not hand a NULL
+// stream to fclose(), which is undefined behaviour.
+void WavInFile::close()
+{
+    if (fptr != NULL)
+    {
+        fclose(fptr);
+        fptr = NULL;
+    }
+}
+
+
+
+// Returns 1 when the character code lies in the closed range from the space
+// character ' ' up to lowercase 'z', otherwise 0.
+static int isAlpha(char c)
+{
+    if (c < ' ') return 0;
+    if (c > 'z') return 0;
+    return 1;
+}
+
+
+// Returns 1 when every character of the NUL-terminated string passes
+// isAlpha() (an empty string therefore passes), otherwise 0.
+static int isAlphaStr(char *str)
+{
+    for (; str[0] != 0; ++str)
+    {
+        if (isAlpha(str[0]) == 0) return 0;
+    }
+
+    return 1;
+}
+
+
+// Reads the leading 'RIFF' block into header.riff and validates its tags.
+// Returns 0 when both the 'RIFF' and the 'WAVE' tag are present, else -1.
+int WavInFile::readRIFFBlock()
+{
+    fread(&(header.riff), sizeof(WavRiff), 1, fptr);
+
+    // the package length is stored little-endian in the file
+    _swap32((unsigned int &)header.riff.package_len);
+
+    const bool riffTagOk = (memcmp(riffStr, header.riff.riff_char, 4) == 0);
+    const bool waveTagOk = (memcmp(waveStr, header.riff.wave, 4) == 0);
+
+    return (riffTagOk && waveTagOk) ? 0 : -1;
+}
+
+
+
+
+// Reads one header block (chunk) at the current file position.
+//
+// Returns  0 when a block other than 'data' was consumed (the caller keeps
+//            reading),
+//          1 when the 'data' block header was read (sample data follows),
+//         -1 when the file ends unexpectedly or the block is malformed.
+//
+// All lengths come from the file and are therefore untrusted:
+//  - every fread() result is checked, so no uninitialised label or length is
+//    ever interpreted;
+//  - a 'fmt ' length that is negative or shorter than the fixed PCM fields is
+//    rejected. A negative value used to be converted into a huge size_t and
+//    let fread() write past the end of header.format;
+//  - the length of an unknown block is byte-swapped like every other 32-bit
+//    field, so big-endian hosts skip the right number of bytes.
+int WavInFile::readHeaderBlock()
+{
+    char label[5];
+
+    // read label string
+    if (fread(label, 1, 4, fptr) != 4) return -1;   // unexpected eof
+    label[4] = 0;
+
+    if (isAlphaStr(label) == 0) return -1;    // not a valid label
+
+    // Decode blocks according to their label
+    if (strcmp(label, fmtStr) == 0)
+    {
+        // number of bytes in WavFormat that follow the tag and length fields
+        const int nExpected = (int)(sizeof(header.format) - 8);
+        int nLen, nDump;
+
+        // 'fmt ' block 
+        memcpy(header.format.fmt, fmtStr, 4);
+
+        // read length of the format field
+        if (fread(&nLen, sizeof(int), 1, fptr) != 1) return -1;
+        // swap byte order if necessary
+        _swap32((unsigned int &)nLen); // int format_len;
+        header.format.format_len = nLen;
+
+        // a block that cannot hold the fixed PCM fields is malformed; this
+        // also rejects negative lengths before any arithmetic is done on them
+        if (nLen < nExpected) return -1;
+
+        // calculate how much length differs from expected
+        nDump = nLen - nExpected;
+
+        // read only as much data as we've space for
+        if (fread(&(header.format.fixed), nExpected, 1, fptr) != 1) return -1;
+
+        // swap byte order if necessary
+        _swap16((unsigned short &)header.format.fixed);            // short int fixed;
+        _swap16((unsigned short &)header.format.channel_number);   // short int channel_number;
+        _swap32((unsigned int   &)header.format.sample_rate);      // int sample_rate;
+        _swap32((unsigned int   &)header.format.byte_rate);        // int byte_rate;
+        _swap16((unsigned short &)header.format.byte_per_sample);  // short int byte_per_sample;
+        _swap16((unsigned short &)header.format.bits_per_sample);  // short int bits_per_sample;
+
+        // if format_len is larger than expected, skip the extra data
+        if (nDump > 0)
+        {
+            fseek(fptr, nDump, SEEK_CUR);
+        }
+
+        return 0;
+    }
+    else if (strcmp(label, dataStr) == 0)
+    {
+        // 'data' block
+        memcpy(header.data.data_field, dataStr, 4);
+        if (fread(&(header.data.data_len), sizeof(uint), 1, fptr) != 1) return -1;
+
+        // swap byte order if necessary
+        _swap32((unsigned int &)header.data.data_len);
+
+        return 1;
+    }
+    else
+    {
+        uint len, i;
+        char temp;
+        // unknown block
+
+        // read length
+        if (fread(&len, sizeof(len), 1, fptr) != 1) return -1;
+        // swap byte order if necessary
+        _swap32(len);
+
+        // scan through the block; reading byte by byte (rather than seeking)
+        // detects a block that claims to extend beyond the end of the file
+        for (i = 0; i < len; i ++)
+        {
+            if (fread(&temp, 1, 1, fptr) != 1) return -1;   // unexpected eof
+        }
+    }
+    return 0;
+}
+
+
+// Parses all WAV headers from the current file position: clears the header
+// structure, reads the 'RIFF' block, then consumes header blocks until the
+// 'data' block has been found.
+// Returns 0 on success and nonzero when the file structure is invalid.
+int WavInFile::readWavHeaders()
+{
+    memset(&header, 0, sizeof(header));
+
+    if (readRIFFBlock() != 0) return 1;
+
+    for (;;)
+    {
+        const int blockStatus = readHeaderBlock();
+
+        if (blockStatus < 0) return 1;  // error in file structure
+        if (blockStatus != 0) break;    // data block found
+    }
+
+    // check that all required tags are legal
+    return checkCharTags();
+}
+
+
+// Channel count as stored in the 'fmt ' block of the header.
+uint WavInFile::getNumChannels() const
+{
+    const uint numChannels = header.format.channel_number;
+    return numChannels;
+}
+
+
+// Bits per single-channel sample as stored in the 'fmt ' block of the header.
+uint WavInFile::getNumBits() const
+{
+    const uint numBits = header.format.bits_per_sample;
+    return numBits;
+}
+
+
+// Bytes occupied by one sample across all channels, derived from the channel
+// count and the bits per sample (integer division by 8).
+uint WavInFile::getBytesPerSample() const
+{
+    const uint bitsAllChannels = getNumChannels() * getNumBits();
+    return bitsAllChannels / 8;
+}
+
+
+// Sample rate as stored in the 'fmt ' block of the header.
+uint WavInFile::getSampleRate() const
+{
+    const uint sampleRate = header.format.sample_rate;
+    return sampleRate;
+}
+
+
+
+// Length of the sample data in bytes, as announced by the 'data' block.
+uint WavInFile::getDataSizeInBytes() const
+{
+    const uint dataBytes = header.data.data_len;
+    return dataBytes;
+}
+
+
+// Number of samples in the file: data length divided by the header's
+// 'byte_per_sample' field.
+//
+// 'byte_per_sample' comes from the file; a header carrying zero (or a
+// negative value) there would otherwise cause a division by zero, so such
+// headers report 0 samples.
+uint WavInFile::getNumSamples() const
+{
+    if (header.format.byte_per_sample <= 0) return 0;
+
+    return header.data.data_len / header.format.byte_per_sample;
+}
+
+
+// Length of the audio in milliseconds: 1000 * samples / sample rate, rounded
+// down.
+//
+// The product is formed in double precision because '1000 * numSamples'
+// overflows 32 bits for files longer than about 4.2 million samples; that
+// case was previously caught only by an assert that disappears in release
+// builds. A sample rate of zero (corrupt header) yields 0 instead of a
+// division by zero, and a result too large for 'uint' saturates at UINT_MAX.
+uint WavInFile::getLengthMS() const
+{
+   const uint numSamples = getNumSamples();
+   const uint sampleRate = getSampleRate();
+   double lengthMs;
+
+   if (sampleRate == 0) return 0;
+
+   lengthMs = 1000.0 * (double)numSamples / (double)sampleRate;
+   if (lengthMs >= (double)UINT_MAX) return UINT_MAX;
+
+   return (uint)lengthMs;
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// Class WavOutFile
+//
+
+// Creates (or truncates) 'fileName' for binary writing and immediately writes
+// a preliminary header built from the given format; the length fields are
+// filled in later by finishHeader().
+// Throws runtime_error when the file cannot be opened.
+WavOutFile::WavOutFile(const char *fileName, int sampleRate, int bits, int channels)
+{
+    flushTime = flushRate;
+    bytesWritten = 0;
+
+    fptr = fopen(fileName, "wb");
+    if (fptr == NULL) 
+    {
+        throw runtime_error(string("Error : Unable to open file \"") + fileName + "\" for writing.");
+    }
+
+    fillInHeader(sampleRate, bits, channels);
+    writeHeader();
+}
+
+
+
+// Destructor: finalizes the header and releases the file by delegating to
+// close().
+WavOutFile::~WavOutFile()
+{
+    this->close();
+}
+
+
+
+// Initializes the in-memory WAV header for uncompressed PCM ('fixed' = 1)
+// with the given sample rate, bits per sample and channel count. Both length
+// fields start at zero and are completed by finishHeader().
+void WavOutFile::fillInHeader(uint sampleRate, uint bits, uint channels)
+{
+    // bytes taken by one sample across all channels
+    const short frameBytes = (short)(bits * channels / 8);
+
+    // 'riff' part; package_len is unknown so far
+    memcpy(header.riff.riff_char, riffStr, 4);
+    header.riff.package_len = 0;
+    memcpy(header.riff.wave, waveStr, 4);
+
+    // 'format' part
+    memcpy(header.format.fmt, fmtStr, 4);
+    header.format.format_len      = 0x10;
+    header.format.fixed           = 1;
+    header.format.channel_number  = (short)channels;
+    header.format.sample_rate     = sampleRate;
+    header.format.byte_rate       = frameBytes * sampleRate;
+    header.format.byte_per_sample = frameBytes;
+    header.format.bits_per_sample = (short)bits;
+
+    // 'data' part; data_len is unknown so far
+    memcpy(header.data.data_field, dataStr, 4);
+    header.data.data_len = 0;
+}
+
+
+// Stores the amount of data written so far into both length fields of the
+// header (the package length is the data length plus 36) and rewrites the
+// header in the file.
+void WavOutFile::finishHeader()
+{
+    header.data.data_len = bytesWritten;
+    header.riff.package_len = bytesWritten + 36;
+
+    writeHeader();
+}
+
+
+
+// Writes the header at the start of the file and then moves the file
+// position back to the end so that sample data can be appended.
+// The byte-order conversion is done on a copy; the in-memory header stays in
+// host byte order.
+void WavOutFile::writeHeader()
+{
+    WavHeader fileImage = header;
+
+    // 32-bit fields
+    _swap32((unsigned int   &)fileImage.riff.package_len);
+    _swap32((unsigned int   &)fileImage.format.format_len);
+    _swap32((unsigned int   &)fileImage.format.sample_rate);
+    _swap32((unsigned int   &)fileImage.format.byte_rate);
+    _swap32((unsigned int   &)fileImage.data.data_len);
+
+    // 16-bit fields
+    _swap16((unsigned short &)fileImage.format.fixed);
+    _swap16((unsigned short &)fileImage.format.channel_number);
+    _swap16((unsigned short &)fileImage.format.byte_per_sample);
+    _swap16((unsigned short &)fileImage.format.bits_per_sample);
+
+    fseek(fptr, 0, SEEK_SET);
+    fwrite(&fileImage, sizeof(fileImage), 1, fptr);
+    fseek(fptr, 0, SEEK_END);
+}
+
+
+
+// Finalizes the header and closes the file. Safe to call more than once: the
+// destructor also calls close(), and without the guard a second call would
+// seek, write and fclose() on a NULL stream.
+void WavOutFile::close()
+{
+    if (fptr == NULL) return;   // already closed
+
+    finishHeader();
+    fclose(fptr);
+    fptr = NULL;
+}
+
+// Subtracts 'numElems' from the flush countdown; once the countdown drops
+// below zero it is topped up by one 'flushRate' interval and the header is
+// rewritten with the current data length.
+void WavOutFile::flush( int numElems )
+{
+	flushTime -= numElems;
+	if( flushTime >= 0 ) return;
+
+	flushTime += flushRate;
+	finishHeader();
+}
+
+// Appends 'numElems' 8-bit samples to the file.
+// Throws runtime_error when the file is not in 8-bit format or when fewer
+// bytes than requested could be written.
+void WavOutFile::write(const char *buffer, int numElems)
+{
+    if (header.format.bits_per_sample != 8)
+    {
+        throw runtime_error("Error: WavOutFile::write(const char*, int) accepts only 8bit samples.");
+    }
+    assert(sizeof(char) == 1);
+
+    const int written = (int)fwrite(buffer, 1, numElems, fptr);
+    if (written != numElems) 
+    {
+        throw runtime_error("Error while writing to a wav file.");
+    }
+
+    bytesWritten += numElems;
+	flush( numElems );
+}
+
+
+// Appends 'numElems' 16-bit signed samples to the file, converting them to
+// the file's sample width. Does nothing when 'numElems' is below 1.
+// Throws runtime_error when writing to the file fails.
+//
+// 8-bit files: WAV stores 8-bit samples unsigned (0..255, silence at 128), so
+// the upper byte of each sample is offset by 128. Writing the signed byte
+// unchanged produces files that other WAV readers play back distorted.
+// The conversion buffer is released even when the nested write throws.
+//
+// 16-bit files: samples are copied to a temporary buffer so that they can be
+// byte-swapped on big-endian hosts without modifying the caller's data.
+void WavOutFile::write(const short *buffer, int numElems)
+{
+    int res;
+
+    // 16 bit samples
+    if (numElems < 1) return;   // nothing to do
+
+    if (header.format.bits_per_sample == 8)
+    {
+        int i;
+        char *temp = new char[numElems];
+        // convert from signed 16bit format to unsigned 8bit format
+        for (i = 0; i < numElems; i ++)
+        {
+            temp[i] = (char)(unsigned char)(buffer[i] / 256 + 128);
+        }
+        // write in 8bit format
+        try
+        {
+            write(temp, numElems);
+        }
+        catch (...)
+        {
+            delete[] temp;
+            throw;
+        }
+        delete[] temp;
+    }
+    else
+    {
+        // 16bit format
+        unsigned short *pTemp = new unsigned short[numElems];
+
+        assert(header.format.bits_per_sample == 16);
+
+        // allocate temp buffer to swap byte order if necessary
+        memcpy(pTemp, buffer, numElems * 2);
+        _swap16Buffer(pTemp, numElems);
+
+        res = fwrite(pTemp, 2, numElems, fptr);
+
+        delete[] pTemp;
+
+        if (res != numElems) 
+        {
+            throw runtime_error("Error while writing to a wav file.");
+        }
+        bytesWritten += 2 * numElems;
+		flush( numElems*2 );
+    }
+}
+
+
+// Appends 'numElems' floating point samples to the file. Each sample is
+// scaled by 32768, saturated to the 16-bit range and passed to the 16-bit
+// writer, which converts further if the file is 8-bit.
+// Throws runtime_error when writing to the file fails.
+//
+// Differences from a plain "(int)(32768.0f * x)" conversion:
+//  - saturation happens before the float-to-int cast, because converting a
+//    value outside the int range is undefined behaviour; NaN is written as 0;
+//  - the conversion buffer is released even when the nested write throws;
+//  - flush() is not called here, since the 16-bit writer already accounts
+//    for the bytes it wrote and a second call would count them twice.
+void WavOutFile::write(const float *buffer, int numElems)
+{
+    int i;
+    short *temp = new short[numElems];
+    float fTemp;
+
+    // convert to 16 bit integer
+    for (i = 0; i < numElems; i ++)
+    {
+        fTemp = 32768.0f * buffer[i];
+
+        if (fTemp != fTemp)             temp[i] = 0;        // NaN
+        else if (fTemp <= -32768.0f)    temp[i] = -32768;   // saturate
+        else if (fTemp >= 32767.0f)     temp[i] = 32767;    // saturate
+        else                            temp[i] = (short)(int)fTemp;
+    }
+
+    try
+    {
+        write(temp, numElems);
+    }
+    catch (...)
+    {
+        delete[] temp;
+        throw;
+    }
+
+    delete[] temp;
+}
--- a/desmume/src/metaspu/SoundTouch/WavFile.h
+++ b/desmume/src/metaspu/SoundTouch/WavFile.h
@ -0,0 +1,264 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Classes for easy reading & writing of WAV sound files.
+///
+/// For big-endian CPU, define _BIG_ENDIAN_ during compile-time to correctly
+/// parse the WAV files with such processors.
+/// 
+/// Admittedly, more complete WAV reader routines may exist in public domain, but 
+/// the reason for 'yet another' one is that those generic WAV reader libraries are
+/// exhaustingly large and cumbersome! Wanted to have something simpler here, i.e. 
+/// something that's not already larger than rest of the SoundTouch/SoundStretch program...
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.7 $
+//
+// $Id: WavFile.h,v 1.7 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef WAVFILE_H
+#define WAVFILE_H
+
+#include <stdio.h>
+
+// Shorthand for 'unsigned int' used throughout the WAV classes.
+// NOTE(review): '#ifndef' only sees macros, so this guard does not detect a
+// 'uint' that another header introduced as a typedef - presumably harmless as
+// long as that typedef is also 'unsigned int'; confirm per platform.
+#ifndef uint
+typedef unsigned int uint;
+#endif           
+
+
+/// WAV audio file 'riff' section header
+///
+/// Read from the file as one raw block (WavInFile::readRIFFBlock) and written
+/// as part of WavHeader, so member order and sizes are the file layout.
+/// NOTE(review): assumes a 32-bit 'int' and no padding - confirm for the
+/// target ABI.
+typedef struct 
+{
+    char riff_char[4];      ///< tag, expected to be "RIFF"
+    int  package_len;       ///< WavOutFile stores 'bytes written + 36' here
+    char wave[4];           ///< tag, expected to be "WAVE"
+} WavRiff;
+
+/// WAV audio file 'format' section header
+///
+/// The members from 'fixed' onwards are read from the file as one raw block,
+/// and the whole struct is written as part of WavHeader, so member order and
+/// sizes are the file layout.
+/// NOTE(review): assumes 32-bit 'int', 16-bit 'short' and no padding - confirm
+/// for the target ABI.
+typedef struct 
+{
+    char  fmt[4];               ///< tag, expected to be "fmt "
+    int   format_len;           ///< length of the fields that follow; WavOutFile writes 0x10
+    short fixed;                ///< format tag; WavInFile accepts only 1, WavOutFile writes 1
+    short channel_number;       ///< number of channels
+    int   sample_rate;          ///< sample rate
+    int   byte_rate;            ///< WavOutFile writes byte_per_sample * sample_rate
+    short byte_per_sample;      ///< WavOutFile writes bits * channels / 8
+    short bits_per_sample;      ///< bits per single-channel sample (8 or 16 are handled)
+} WavFormat;
+
+/// WAV audio file 'data' section header
+///
+/// Written to the file as part of WavHeader, so member order and sizes are
+/// the file layout.
+typedef struct 
+{
+    char  data_field[4];    ///< tag, expected to be "data"
+    uint  data_len;         ///< number of bytes of sample data
+} WavData;
+
+
+/// WAV audio file header
+///
+/// WavOutFile writes this struct to the start of the file with a single
+/// fwrite(), so the three sections must stay in this order.
+/// NOTE(review): WavOutFile::finishHeader() adds 36 to the data length, which
+/// presumably corresponds to this struct being 44 bytes with no padding -
+/// confirm for the target ABI.
+typedef struct 
+{
+    WavRiff   riff;
+    WavFormat format;
+    WavData   data;
+} WavHeader;
+
+
+/// Class for reading WAV audio files.
+///
+/// The constructor opens the file and parses its headers; the read()
+/// overloads then deliver sample data as 8-bit, 16-bit or floating point
+/// values.
+/// NOTE(review): the class holds a raw FILE* and declares no copy constructor
+/// or assignment operator, so copying an instance would presumably lead to
+/// the same stream being closed twice - avoid copying.
+class WavInFile
+{
+private:
+    /// File pointer.
+    FILE *fptr;
+
+    /// Counter of how many bytes of sample data have been read from the file.
+    uint dataRead;
+
+    /// WAV header information
+    WavHeader header;
+
+    /// Read WAV file headers.
+    /// \return zero if all ok, nonzero if file format is invalid.
+    int readWavHeaders();
+
+    /// Checks WAV file header tags.
+    /// \return zero if all ok, nonzero if file format is invalid.
+    int checkCharTags();
+
+    /// Reads a single WAV file header block.
+    /// \return 0 if a block other than 'data' was read, 1 if the 'data' block
+    /// was found, -1 if the file format is invalid.
+    int readHeaderBlock();
+
+    /// Reads WAV file 'riff' block
+    /// \return zero if the 'RIFF' and 'WAVE' tags are present, -1 otherwise.
+    int readRIFFBlock();
+
+public:
+    /// Constructor: Opens the given WAV file. If the file can't be opened,
+    /// throws 'runtime_error' exception. The same exception is thrown if the
+    /// headers are invalid or the file uses an unsupported encoding.
+    WavInFile(const char *filename);
+
+    /// Destructor: Closes the file.
+    ~WavInFile();
+
+    /// Close the file. Notice that file is automatically closed also when the 
+    /// class instance is deleted.
+    void close();
+
+    /// Rewind to beginning of the file
+    void rewind();
+
+    /// Get sample rate.
+    uint getSampleRate() const;
+
+    /// Get number of bits per sample, i.e. 8 or 16.
+    uint getNumBits() const;
+
+    /// Get the size of the sample data in bytes, i.e. the length stored in the
+    /// 'data' section header.
+    uint getDataSizeInBytes() const;
+
+    /// Get total number of samples in file.
+    uint getNumSamples() const;
+
+    /// Get number of bytes per audio sample (e.g. 16bit stereo = 4 bytes/sample)
+    uint getBytesPerSample() const;
+    
+    /// Get number of audio channels in the file (1=mono, 2=stereo)
+    uint getNumChannels() const;
+
+    /// Get the audio file length in milliseconds
+    uint getLengthMS() const;
+
+    /// Reads audio samples from the WAV file. This routine works only for 8 bit samples
+    /// and throws 'runtime_error' for files in any other format.
+    /// Reads given number of elements from the file or if end-of-file reached, as many 
+    /// elements as are left in the file.
+    ///
+    /// \return Number of 8-bit integers read from the file.
+    int read(char *buffer, int maxElems);
+
+    /// Reads audio samples from the WAV file to 16 bit integer format. Reads given number 
+    /// of elements from the file or if end-of-file reached, as many elements as are 
+    /// left in the file.
+    ///
+    /// \return Number of 16-bit integers read from the file.
+    int read(short *buffer,     ///< Pointer to buffer where to read data.
+             int maxElems       ///< Size of 'buffer' array (number of array elements).
+             );
+
+    /// Reads audio samples from the WAV file to floating point format, converting 
+    /// sample values to range [-1,1[. Reads given number of elements from the file
+    /// or if end-of-file reached, as many elements as are left in the file.
+    ///
+    /// \return Number of elements read from the file.
+    int read(float *buffer,     ///< Pointer to buffer where to read data.
+             int maxElems       ///< Size of 'buffer' array (number of array elements).
+             );
+
+    /// Check end-of-file.
+    ///
+    /// \return Nonzero if end-of-file reached.
+    int eof() const;
+};
+
+
+
+/// Class for writing WAV audio files.
+///
+/// The constructor creates the file and writes a preliminary header; the
+/// write() overloads append sample data, and the header's length fields are
+/// refreshed periodically (see flush()) and again on close().
+/// NOTE(review): the class holds a raw FILE* and declares no copy constructor
+/// or assignment operator, so copying an instance would presumably lead to
+/// the same stream being closed twice - avoid copying.
+class WavOutFile
+{
+private:
+    /// Pointer to the WAV file
+    FILE *fptr;
+
+    /// WAV file header data.
+    WavHeader header;
+
+    /// Counter of how many bytes have been written to the file so far.
+    int bytesWritten;
+
+	/// number of bytes to be written before next flush.
+	int flushTime;
+
+    /// Fills in WAV file header information.
+    void fillInHeader(const uint sampleRate, const uint bits, const uint channels);
+
+    /// Finishes the WAV file header by supplementing information of amount of
+    /// data written to file etc
+    void finishHeader();
+
+    /// Writes the WAV file header.
+    void writeHeader();
+
+	/// Flushes the WAV file every so often -- writes header info for the current
+	/// data length and then returns the seek position to the end of the WAV for
+	/// continued writing.  This method is called from each write() method.
+	/// The argument is subtracted from 'flushTime'; the 8 and 16 bit write()
+	/// methods pass the number of bytes they wrote.
+	void flush( int numElems );
+
+	/// Flush the WAVheader every 32kb written
+	static const int flushRate = 0x8000;
+
+public:
+    /// Constructor: Creates a new WAV file. Throws a 'runtime_error' exception 
+    /// if file creation fails.
+    WavOutFile(const char *fileName,    ///< Filename
+               int sampleRate,          ///< Sample rate (e.g. 44100 etc)
+               int bits,                ///< Bits per sample (8 or 16 bits)
+               int channels             ///< Number of channels (1=mono, 2=stereo)
+               );
+
+    /// Destructor: Finalizes & closes the WAV file.
+    ~WavOutFile();
+
+    /// Write data to WAV file. This function works only with 8bit samples. 
+    /// Throws a 'runtime_error' exception if writing to file fails.
+    void write(const char *buffer,     ///< Pointer to sample data buffer.
+               int numElems             ///< How many array items are to be written to file.
+               );
+
+    /// Write data to WAV file. Throws a 'runtime_error' exception if writing to
+    /// file fails.
+    void write(const short *buffer,     ///< Pointer to sample data buffer.
+               int numElems             ///< How many array items are to be written to file.
+               );
+
+    /// Write data to WAV file in floating point format, saturating sample values to range
+    /// [-1..+1[. Throws a 'runtime_error' exception if writing to file fails.
+    void write(const float *buffer,     ///< Pointer to sample data buffer.
+               int numElems             ///< How many array items are to be written to file.
+               );
+
+    /// Finalize & close the WAV file. Automatically supplements the WAV file header
+    /// information according to written data etc.
+    ///
+    /// Notice that file is automatically closed also when the class instance is deleted.
+    void close();
+};
+
+#endif
--- a/desmume/src/metaspu/SoundTouch/build.sh
+++ b/desmume/src/metaspu/SoundTouch/build.sh
@ -0,0 +1,28 @@
+#!/bin/sh
+# Usage: build.sh [all | <make arguments>...] -- exits 1 if any build step fails.
+curdir=`pwd`
+
+echo -----------------
+echo Building SoundTouch
+echo -----------------
+
+if [ $# -gt 0 ] && [ "$1" = "all" ]
+then
+# Full rebuild: steps are chained with && so the first failure stops the sequence and reaches the check below.
+aclocal &&
+automake -a &&
+autoconf &&
+./configure &&
+make clean &&
+make install
+
+else
+make "$@"
+fi
+
+if [ $? -ne 0 ]
+then
+exit 1
+fi
+
+#cp libZeroSPU2*.so* ${PCSX2PLUGINS}
--- a/desmume/src/metaspu/SoundTouch/configure.ac
+++ b/desmume/src/metaspu/SoundTouch/configure.ac
@ -0,0 +1,37 @@
+#                                               -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+#AC_PREREQ([2.63])
+AC_INIT([FULL-PACKAGE-NAME], [VERSION], [BUG-REPORT-ADDRESS])
+AM_INIT_AUTOMAKE
+AC_CONFIG_SRCDIR([BPMDetect.h])
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_CC
+AC_PROG_RANLIB
+# NOTE(review): the resets below discard any flags supplied by the user or chosen by the compiler checks above.
+CFLAGS=
+CPPFLAGS=
+CXXFLAGS=
+CCASFLAGS=
+# Force a 32-bit build. Plain assignment is used because VAR+=value is a bash extension that a POSIX /bin/sh rejects.
+CFLAGS="$CFLAGS -m32 "
+CPPFLAGS="$CPPFLAGS -m32 "
+CXXFLAGS="$CXXFLAGS -m32 "
+CCASFLAGS="$CCASFLAGS -m32 "
+
+# Checks for header files.
+AC_CHECK_HEADERS([limits.h memory.h stdlib.h string.h])
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_C_INLINE
+AC_C_RESTRICT
+AC_TYPE_SIZE_T
+AC_HEADER_STDBOOL
+
+# Checks for library functions.
+AC_CHECK_FUNCS([memmove memset])
+
+AC_CONFIG_FILES([Makefile])
+AC_OUTPUT
--- a/desmume/src/metaspu/SoundTouch/cpu_detect.h
+++ b/desmume/src/metaspu/SoundTouch/cpu_detect.h
@ -0,0 +1,62 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// A header file for detecting the Intel MMX instructions set extension.
+///
+/// Please see 'cpu_detect_x86_win.cpp' and 'cpu_detect_x86_gcc.cpp' for the
+/// routine implementations for x86 Windows and GNU compilers, respectively;
+/// the gcc version also covers non-x86 platforms by reporting no extensions.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.4 $
+//
+// $Id: cpu_detect.h,v 1.4 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#ifndef _CPU_DETECT_H_
+#define _CPU_DETECT_H_
+
+#include "STTypes.h"
+
+#define SUPPORT_MMX         0x0001
+#define SUPPORT_3DNOW       0x0002
+#define SUPPORT_ALTIVEC     0x0004
+#define SUPPORT_SSE         0x0008
+#define SUPPORT_SSE2        0x0010
+
+/// Checks which instruction set extensions are supported by the CPU.
+/// The x86 gcc and Win32 implementations clear the bits given to disableExtensions() from the result.
+/// \return A bitmask of supported extensions, see SUPPORT_... defines.
+uint detectCPUextensions(void);
+
+/// Disables given set of instruction extensions (SUPPORT_... bits); replaces any mask set by an earlier call.
+void disableExtensions(uint wDisableMask);
+
+#endif  // _CPU_DETECT_H_
--- a/desmume/src/metaspu/SoundTouch/cpu_detect_x86_gcc.cpp
+++ b/desmume/src/metaspu/SoundTouch/cpu_detect_x86_gcc.cpp
@ -0,0 +1,138 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// gcc version of the x86 CPU detect routine.
+///
+/// This file is to be compiled on any platform with the GNU C compiler.
+/// Please see 'cpu_detect_x86_win.cpp' for the x86 Windows version 
+/// of this file.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.6 $
+//
+// $Id: cpu_detect_x86_gcc.cpp,v 1.6 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include <stdexcept>
+#include <string>
+#include "cpu_detect.h"
+
+#ifndef __GNUC__
+#error wrong platform - this source code file is for the GNU C compiler.
+#endif
+
+using namespace std;
+
+#include <stdio.h>
+//////////////////////////////////////////////////////////////////////////////
+//
+// processor instructions extension detection routines
+//
+//////////////////////////////////////////////////////////////////////////////
+
+
+// Flag variable indicating which ISA extensions are disabled (for debugging)
+static uint _dwDisabledISA = 0x00;      // 0xffffffff; //<- use this to disable all extensions
+
+// Replaces the mask of SUPPORT_... bits that detectCPUextensions() removes from its result; 0xffffffff makes it return 0 without probing.
+void disableExtensions(uint dwDisableMask)
+{
+    _dwDisabledISA = dwDisableMask;
+}
+
+
+
+/// Checks which instruction set extensions are supported by the CPU; returns SUPPORT_MMX/SSE/3DNOW bits with the disabled ones cleared.
+uint detectCPUextensions(void)
+{
+#ifndef __i386__
+    return 0; // always disable extensions unless targeting 32-bit x86 (the asm below is i386-only).
+#else
+    uint res = 0;
+
+    if (_dwDisabledISA == 0xffffffff) return 0;
+
+    asm volatile(
+        "\n\txor     %%esi, %%esi"       // clear %%esi = result register
+        // check if 'cpuid' instructions is available by toggling eflags bit 21
+        "\n\tpush    %%ebx"              // cpuid overwrites ebx, which is not in the clobber list: save it
+        "\n\tpushf"                      // save eflags to stack
+        "\n\tpop     %%eax"              // load eax from stack (with eflags)
+        "\n\tmovl    %%eax, %%ecx"       // save the original eflags values to ecx
+        "\n\txor     $0x00200000, %%eax" // toggle bit 21
+        "\n\tpush    %%eax"              // store toggled eflags to stack
+        "\n\tpopf"                       // load eflags from stack
+        "\n\tpushf"                      // save updated eflags to stack
+        "\n\tpop     %%eax"              // load from stack
+        "\n\txor     %%edx, %%edx"       // clear edx for defaulting no mmx
+        "\n\tcmp     %%ecx, %%eax"       // compare to original eflags values
+        "\n\tjz      2f"                 // jumps to 'end' (label 2) if cpuid not present
+
+        // cpuid instruction available, test for presence of mmx instructions
+
+        "\n\tmovl    $1, %%eax"
+        "\n\tcpuid"
+//        movl       $0x00800000, %edx   // force enable MMX
+        "\n\ttest    $0x00800000, %%edx"
+        "\n\tjz      2f"                 // branch to 'end' if MMX not available
+
+        "\n\tor      $0x01, %%esi"       // otherwise add MMX support bit
+
+        "\n\ttest    $0x02000000, %%edx"
+        "\n\tjz      1f"                 // branch to 'test3DNow' (label 1) if SSE not available
+
+        "\n\tor      $0x08, %%esi"       // otherwise add SSE support bit
+
+    "\n\t1:"                         // test3DNow; numeric local labels stay unique if this asm is emitted more than once
+        // test for presence of AMD extensions
+        "\n\tmov     $0x80000000, %%eax"
+        "\n\tcpuid"
+        "\n\tcmp     $0x80000000, %%eax"
+        "\n\tjbe     2f"                  // branch to 'end' if no AMD extensions detected
+
+        // test for presence of 3DNow! extension
+        "\n\tmov     $0x80000001, %%eax"
+        "\n\tcpuid"
+        "\n\ttest    $0x80000000, %%edx"
+        "\n\tjz      2f"                   // branch to 'end' if 3DNow! not detected
+
+        "\n\tor      $0x02, %%esi"         // otherwise add 3DNow support bit
+
+    "\n\t2:"                         // end
+        "\n\tpop     %%ebx"              // restore the ebx saved at the top; every path reaches this label
+        "\n\tmov     %%esi, %0"
+
+      : "=r" (res)
+      : /* no inputs */
+      : "%edx", "%eax", "%ecx", "%esi" );
+      
+    return res & ~_dwDisabledISA;
+#endif
+}
--- a/desmume/src/metaspu/SoundTouch/cpu_detect_x86_win.cpp
+++ b/desmume/src/metaspu/SoundTouch/cpu_detect_x86_win.cpp
@ -0,0 +1,126 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// Win32 version of the x86 CPU detect routine.
+///
+/// This file is to be compiled in Windows platform with Microsoft Visual C++ 
+/// Compiler. Please see 'cpu_detect_x86_gcc.cpp' for the gcc compiler version 
+/// for all GNU platforms.
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.10 $
+//
+// $Id: cpu_detect_x86_win.cpp,v 1.10 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "cpu_detect.h"
+
+#ifndef _WIN32
+#error wrong platform - this source code file is exclusively for Win32 platform
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// processor instructions extension detection routines
+//
+//////////////////////////////////////////////////////////////////////////////
+
+// Flag variable indicating which ISA extensions are disabled (for debugging)
+static uint _dwDisabledISA = 0x00;      // 0xffffffff; //<- use this to disable all extensions
+
+
+// Replaces the mask of SUPPORT_... bits that detectCPUextensions() removes from its result; 0xffffffff makes it return 0 without probing.
+void disableExtensions(uint dwDisableMask)
+{
+    _dwDisabledISA = dwDisableMask;
+}
+
+
+
+/// Checks which instruction set extensions are supported by the CPU; returns SUPPORT_MMX/SSE/3DNOW bits with the disabled ones cleared.
+uint detectCPUextensions(void)
+{
+    uint res = 0;
+    // Every extension disabled: skip the CPUID probing entirely.
+    if (_dwDisabledISA == 0xffffffff) return 0;
+    // The asm block collects SUPPORT_... bits in esi and stores them to res at 'end'; without MMX, SSE and 3DNow! are not tested.
+    _asm 
+    {
+        ; check if 'cpuid' instructions is available by toggling eflags bit 21
+        ;
+        xor     esi, esi            ; clear esi = result register
+
+        pushfd                      ; save eflags to stack
+        pop     eax                 ; load eax from stack (with eflags)
+        mov     ecx, eax            ; save the original eflags values to ecx
+        xor     eax, 0x00200000     ; toggle bit 21
+        push    eax                 ; store toggled eflags to stack
+        popfd                       ; load eflags from stack
+        pushfd                      ; save updated eflags to stack
+        pop     eax                 ; load from stack
+        xor     edx, edx            ; clear edx for defaulting no mmx
+        cmp     eax, ecx            ; compare to original eflags values
+        jz      end                 ; jumps to 'end' if cpuid not present
+
+        ; cpuid instruction available, test for presence of mmx instructions 
+        mov     eax, 1
+        cpuid
+        test    edx, 0x00800000
+        jz      end                 ; branch if MMX not available
+
+        or      esi, SUPPORT_MMX    ; otherwise add MMX support bit
+
+        test    edx, 0x02000000
+        jz      test3DNow           ; branch if SSE not available
+
+        or      esi, SUPPORT_SSE    ; otherwise add SSE support bit
+
+    test3DNow:
+        ; test for precense of AMD extensions
+        mov     eax, 0x80000000
+        cpuid
+        cmp     eax, 0x80000000
+        jbe     end                ; branch if no AMD extensions detected
+
+        ; test for precense of 3DNow! extension
+        mov     eax, 0x80000001
+        cpuid
+        test    edx, 0x80000000
+        jz      end                 ; branch if 3DNow! not detected
+
+        or      esi, SUPPORT_3DNOW  ; otherwise add 3DNow support bit
+
+    end:
+
+        mov     res, esi
+    }
+
+    return res & ~_dwDisabledISA;
+}
--- a/desmume/src/metaspu/SoundTouch/depcomp
+++ b/desmume/src/metaspu/SoundTouch/depcomp
@ -0,0 +1 @@
+link /usr/share/automake-1.10/depcomp
--- a/desmume/src/metaspu/SoundTouch/install-sh
+++ b/desmume/src/metaspu/SoundTouch/install-sh
@ -0,0 +1 @@
+link /usr/share/automake-1.10/install-sh
--- a/desmume/src/metaspu/SoundTouch/missing
+++ b/desmume/src/metaspu/SoundTouch/missing
@ -0,0 +1 @@
+link /usr/share/automake-1.10/missing
--- a/desmume/src/metaspu/SoundTouch/mmx_optimized.cpp
+++ b/desmume/src/metaspu/SoundTouch/mmx_optimized.cpp
@ -0,0 +1,305 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// MMX optimized routines. All MMX optimized functions have been gathered into 
+/// this single source code file, regardless to their class or original source 
+/// code file, in order to ease porting the library to other compiler and 
+/// processor platforms.
+///
+/// The MMX-optimizations are programmed using MMX compiler intrinsics that
+/// are supported both by Microsoft Visual C++ and GCC compilers, so this file
+/// should compile with both toolsets.
+///
+/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
+/// 6.0 processor pack" update to support compiler intrinsic syntax. The update
+/// is available for download at Microsoft Developers Network, see here:
+/// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/06 18:52:43 $
+// File revision : $Revision: 1.1 $
+//
+// $Id: mmx_optimized.cpp,v 1.1 2006/02/06 18:52:43 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "STTypes.h"
+
+#ifdef ALLOW_MMX
+// MMX routines available only with integer sample type
+
+#if !(_WIN32 || __i386__ || __x86_64__)
+#error "wrong platform - this source code file is exclusively for x86 platforms"
+#endif
+
+using namespace soundtouch;
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of MMX optimized functions of class 'TDStretchMMX'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "TDStretch.h"
+#include <mmintrin.h>
+#include <limits.h>
+
+
+// Calculates cross correlation of two buffers; partial product sums are shifted right by overlapDividerBits before being accumulated.
+long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
+{
+    const __m64 *pVec1, *pVec2;
+    __m64 shifter;
+    __m64 accu;
+    long corr;
+    uint i;
+    // NOTE(review): the buffers are read as __m64 vectors; the C-style casts below also drop the const qualifier.
+    pVec1 = (__m64*)pV1;
+    pVec2 = (__m64*)pV2;
+
+    shifter = _m_from_int(overlapDividerBits);
+    accu = _mm_setzero_si64();
+
+    // Each round consumes 4 __m64 vectors (16 shorts) from each buffer to improve
+    // CPU-level parallelization; any remainder of overlapLength / 8 is not processed.
+    for (i = 0; i < overlapLength / 8; i ++)
+    {
+        __m64 temp;
+
+        // dictionary of intrinsics:
+        // _mm_madd_pi16 : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
+        // _mm_add_pi32  : 2*32bit add
+        // _mm_sra_pi32  : 32bit arithmetic right-shift
+
+        temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]),
+                            _mm_madd_pi16(pVec1[1], pVec2[1]));
+        accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+
+        temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]),
+                            _mm_madd_pi16(pVec1[3], pVec2[3]));
+        accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+
+        pVec1 += 4;
+        pVec2 += 4;
+    }
+
+    // add the high dword of accu onto its low dword,
+    // and finally store the low dword into the variable "corr"
+
+    accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
+    corr = _m_to_int(accu);
+
+    // Clear MMX state
+    _m_empty();
+
+    return corr;
+    // Note: _m_empty() above has already issued EMMS, so the MMX state is clean on return;
+    // clearCrossCorrState() performs the same reset on its own.
+}
+
+
+
+void TDStretchMMX::clearCrossCorrState()
+{
+    // Clear MMX state (_m_empty is the intrinsic form of the EMMS instruction shown below)
+    _m_empty();
+    //_asm EMMS;
+}
+
+
+
+// MMX-optimized version of the function overlapStereo: cross-fades from pMidBuffer to 'input', writing the mix to 'output'.
+void TDStretchMMX::overlapStereo(short *output, const short *input) const
+{
+    const __m64 *pVinput, *pVMidBuf;
+    __m64 *pVdest;
+    __m64 mix1, mix2, adder, shifter;
+    uint i;
+
+    pVinput  = (const __m64*)input;
+    pVMidBuf = (const __m64*)pMidBuffer;
+    pVdest   = (__m64*)output;
+
+    // mix1  = mixer values for 1st stereo sample
+    // mix2  = mixer values for 2nd stereo sample
+    // adder = adder for updating mixer values after each round
+    // (the pMidBuffer weight starts at overlapLength and falls; the input weight starts at 0 and rises)
+    mix1  = _mm_set_pi16(0, overlapLength,   0, overlapLength);
+    adder = _mm_set_pi16(1, -1, 1, -1);
+    mix2  = _mm_add_pi16(mix1, adder);
+    adder = _mm_add_pi16(adder, adder);
+
+    shifter = _m_from_int(overlapDividerBits);
+
+    for (i = 0; i < overlapLength / 4; i ++)
+    {
+        __m64 temp1, temp2;
+                
+        // load & shuffle data so that input & mixbuffer data samples are paired
+        temp1 = _mm_unpacklo_pi16(pVMidBuf[0], pVinput[0]);     // = i0l m0l i0r m0r
+        temp2 = _mm_unpackhi_pi16(pVMidBuf[0], pVinput[0]);     // = i1l m1l i1r m1r
+
+        // temp = (temp .* mix) >> shifter
+        temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter);
+        temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter);
+        pVdest[0] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit
+
+        // update mix += adder
+        mix1 = _mm_add_pi16(mix1, adder);
+        mix2 = _mm_add_pi16(mix2, adder);
+
+        // --- second round begins here ---
+
+        // load & shuffle data so that input & mixbuffer data samples are paired
+        temp1 = _mm_unpacklo_pi16(pVMidBuf[1], pVinput[1]);       // = i2l m2l i2r m2r
+        temp2 = _mm_unpackhi_pi16(pVMidBuf[1], pVinput[1]);       // = i3l m3l i3r m3r
+
+        // temp = (temp .* mix) >> shifter
+        temp1 = _mm_sra_pi32(_mm_madd_pi16(temp1, mix1), shifter);
+        temp2 = _mm_sra_pi32(_mm_madd_pi16(temp2, mix2), shifter);
+        pVdest[1] = _mm_packs_pi32(temp1, temp2); // pack 2*2*32bit => 4*16bit
+
+        // update mix += adder
+        mix1 = _mm_add_pi16(mix1, adder);
+        mix2 = _mm_add_pi16(mix2, adder);
+
+        pVinput  += 2;
+        pVMidBuf += 2;
+        pVdest   += 2;
+    }
+
+    _m_empty(); // clear MMX state
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of MMX optimized functions of class 'FIRFilter'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "FIRFilter.h"
+
+
+FIRFilterMMX::FIRFilterMMX() : FIRFilter()
+{
+    filterCoeffsUnalign = NULL;  // nothing allocated yet; setCoefficients() creates the buffer
+}
+
+
+FIRFilterMMX::~FIRFilterMMX()
+{
+    delete[] filterCoeffsUnalign;  // releases the buffer from setCoefficients(); a no-op while still NULL
+}
+
+
+// (overloaded) Stores the coefficients via the base class, then builds a 16-byte-aligned copy laid out for the MMX routine
+void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
+{
+    uint i;
+    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
+    // Over-allocate by 8 shorts (16 bytes) so the start address can be rounded up to the next 16-byte boundary.
+    // Ensure that filter coeffs array is aligned to 16-byte boundary
+    delete[] filterCoeffsUnalign;
+    filterCoeffsUnalign = new short[2 * newLength + 8];
+    filterCoeffsAlign = (short *)(((ulongptr)filterCoeffsUnalign + 15) & -16);
+    // NOTE(review): the loop runs to the member 'length' (presumably set by the base-class call above) and reads coeffs[i + 3] -- confirm it is a multiple of 4.
+    // rearrange the filter coefficients for mmx routines: per 4 taps, store c0 c2 c0 c2 then c1 c3 c1 c3
+    for (i = 0;i < length; i += 4) 
+    {
+        filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
+        filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
+        filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
+        filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
+
+        filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
+        filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
+        filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
+        filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
+    }
+}
+
+
+
+// mmx-optimized version of the filter routine for stereo sound; returns (numSamples rounded down to even) - length, or 0 if nothing can be filtered
+uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
+{
+    // Each output frame consumes 'length' consecutive input frames, so only numSamples - length frames can be produced.
+    uint i, j;
+    __m64 *pVdest = (__m64*)dest;
+
+    if (length < 2 || numSamples < length) return 0;   // too few taps or too little input; also keeps the unsigned subtractions below from wrapping
+    // Two output frames per round. Stopping at numSamples - length (not numSamples) keeps the reads inside the input buffer.
+    for (i = 0; i < (numSamples - length) / 2; i ++)
+    {
+        __m64 accu1;
+        __m64 accu2;
+        const __m64 *pVsrc = (const __m64*)src;
+        const __m64 *pVfilter = (const __m64*)filterCoeffsAlign;
+
+        accu1 = accu2 = _mm_setzero_si64();
+        for (j = 0; j < lengthDiv8 * 2; j ++)
+        {
+            __m64 temp1, temp2;
+
+            temp1 = _mm_unpacklo_pi16(pVsrc[0], pVsrc[1]);  // = l2 l0 r2 r0
+            temp2 = _mm_unpackhi_pi16(pVsrc[0], pVsrc[1]);  // = l3 l1 r3 r1
+
+            accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp1, pVfilter[0]));  // += l2*f2+l0*f0 r2*f2+r0*f0
+            accu1 = _mm_add_pi32(accu1, _mm_madd_pi16(temp2, pVfilter[1]));  // += l3*f3+l1*f1 r3*f3+r1*f1
+
+            temp1 = _mm_unpacklo_pi16(pVsrc[1], pVsrc[2]);  // = l4 l2 r4 r2
+
+            accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp2, pVfilter[0]));  // += l3*f2+l1*f0 r3*f2+r1*f0
+            accu2 = _mm_add_pi32(accu2, _mm_madd_pi16(temp1, pVfilter[1]));  // += l4*f3+l2*f1 r4*f3+r2*f1
+
+            // accu1 += l2*f2+l0*f0 r2*f2+r0*f0
+            //       += l3*f3+l1*f1 r3*f3+r1*f1
+
+            // accu2 += l3*f2+l1*f0 r3*f2+r1*f0
+            //          l4*f3+l2*f1 r4*f3+r2*f1
+
+            pVfilter += 2;
+            pVsrc += 2;
+        }
+        // accu >>= resultDivFactor
+        accu1 = _mm_srai_pi32(accu1, resultDivFactor);
+        accu2 = _mm_srai_pi32(accu2, resultDivFactor);
+
+        // pack 2*2*32bits => 4*16 bits
+        pVdest[0] = _mm_packs_pi32(accu1, accu2);
+        src += 4;
+        pVdest ++;
+    }
+
+   _m_empty();  // clear MMX state
+
+    return (numSamples & 0xfffffffe) - length;
+}
+
+#endif  // ALLOW_MMX
--- a/desmume/src/metaspu/SoundTouch/sse_optimized.cpp
+++ b/desmume/src/metaspu/SoundTouch/sse_optimized.cpp
@ -0,0 +1,484 @@
+////////////////////////////////////////////////////////////////////////////////
+///
+/// SSE optimized routines for Pentium-III, Athlon-XP and later CPUs. All SSE 
+/// optimized functions have been gathered into this single source 
+/// code file, regardless to their class or original source code file, in order 
+/// to ease porting the library to other compiler and processor platforms.
+///
+/// The SSE-optimizations are programmed using SSE compiler intrinsics that
+/// are supported both by Microsoft Visual C++ and GCC compilers, so this file
+/// should compile with both toolsets.
+///
+/// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
+/// 6.0 processor pack" update to support SSE instruction set. The update is 
+/// available for download at Microsoft Developers Network, see here:
+/// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
+///
+/// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and 
+/// perform a search with keywords "processor pack".
+///
+/// Author        : Copyright (c) Olli Parviainen
+/// Author e-mail : oparviai 'at' iki.fi
+/// SoundTouch WWW: http://www.surina.net/soundtouch
+///
+////////////////////////////////////////////////////////////////////////////////
+//
+// Last changed  : $Date: 2006/02/05 16:44:06 $
+// File revision : $Revision: 1.2 $
+//
+// $Id: sse_optimized.cpp,v 1.2 2006/02/05 16:44:06 Olli Exp $
+//
+////////////////////////////////////////////////////////////////////////////////
+//
+// License :
+//
+//  SoundTouch audio processing library
+//  Copyright (c) Olli Parviainen
+//
+//  This library is free software; you can redistribute it and/or
+//  modify it under the terms of the GNU Lesser General Public
+//  License as published by the Free Software Foundation; either
+//  version 2.1 of the License, or (at your option) any later version.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+//  Lesser General Public License for more details.
+//
+//  You should have received a copy of the GNU Lesser General Public
+//  License along with this library; if not, write to the Free Software
+//  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+//
+////////////////////////////////////////////////////////////////////////////////
+
+#include "cpu_detect.h"
+#include "STTypes.h"
+
+using namespace soundtouch;
+
+#ifdef ALLOW_SSE
+
+// SSE routines available only with float sample type    
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of SSE optimized functions of class 'TDStretchSSE'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "TDStretch.h"
+#include <xmmintrin.h>
+
+// Calculates cross correlation of two buffers
+//
+// Multiplies the floats at 'pV1' and 'pV2' pairwise and returns the sum of the
+// products. Each loop round handles 16 floats and the loop runs
+// 'overlapLength / 8' times, so 2 * overlapLength floats are read from each buffer.
+//
+// pV1 : first buffer; loaded through _MM_LOAD, whose alignment requirement
+//       depends on ALLOW_NONEXACT_SIMD_OPTIMIZATION (see below).
+// pV2 : second buffer; dereferenced as __m128, so it has to be aligned to a
+//       16-byte boundary.
+//
+// Returns the correlation sum, or -1e50 when ALLOW_NONEXACT_SIMD_OPTIMIZATION is
+// defined and 'pV1' is not 16-byte aligned.
+double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
+{
+    uint i;
+    __m128 vSum, *pVec2;
+
+    // Note. It means a major slow-down if the routine needs to tolerate 
+    // unaligned __m128 memory accesses. It's way faster if we can skip 
+    // unaligned slots and use _mm_load_ps instruction instead of _mm_loadu_ps.
+    // This can mean up to ~ 10-fold difference (incl. part of which is
+    // due to skipping every second round for stereo sound though).
+    //
+    // Compile-time define ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
+    // for choosing if this little cheating is allowed.
+
+#ifdef ALLOW_NONEXACT_SIMD_OPTIMIZATION
+    // Little cheating allowed, return valid correlation only for 
+    // aligned locations, meaning every second round for stereo sound.
+
+    #define _MM_LOAD    _mm_load_ps
+
+    // Only the low four address bits are inspected here, so the narrowing cast
+    // does not affect the result of the alignment test.
+    if (((ulong)pV1) & 15) return -1e50;    // skip unaligned locations
+
+#else
+    // No cheating allowed, use unaligned load & take the resulting
+    // performance hit.
+    #define _MM_LOAD    _mm_loadu_ps
+#endif 
+
+    // ensure overlapLength is divisible by 8
+    assert((overlapLength % 8) == 0);
+
+    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
+    // Note: pV2 _must_ be aligned to 16-byte boundary, pV1 need not.
+    pVec2 = (__m128*)pV2;
+    vSum = _mm_setzero_ps();
+
+    // Unroll the loop by factor of 4 * 4 operations
+    for (i = 0; i < overlapLength / 8; i ++) 
+    {
+        // vSum += pV1[0..3] * pV2[0..3]
+        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pV1),pVec2[0]));
+
+        // vSum += pV1[4..7] * pV2[4..7]
+        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pV1 + 4), pVec2[1]));
+
+        // vSum += pV1[8..11] * pV2[8..11]
+        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pV1 + 8), pVec2[2]));
+
+        // vSum += pV1[12..15] * pV2[12..15]
+        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pV1 + 12), pVec2[3]));
+
+        pV1 += 16;
+        pVec2 += 4;
+    }
+
+    // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
+    // (horizontal add of the four partial sums held in the vector accumulator)
+    float *pvSum = (float*)&vSum;
+    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]);
+
+    /* This is approximately corresponding routine in C-language:
+    double corr;
+    uint i;
+
+    // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
+    corr = 0.0;
+    for (i = 0; i < overlapLength / 8; i ++) 
+    {
+        corr += pV1[0] * pV2[0] +
+                pV1[1] * pV2[1] +
+                pV1[2] * pV2[2] +
+                pV1[3] * pV2[3] +
+                pV1[4] * pV2[4] +
+                pV1[5] * pV2[5] +
+                pV1[6] * pV2[6] +
+                pV1[7] * pV2[7] +
+                pV1[8] * pV2[8] +
+                pV1[9] * pV2[9] +
+                pV1[10] * pV2[10] +
+                pV1[11] * pV2[11] +
+                pV1[12] * pV2[12] +
+                pV1[13] * pV2[13] +
+                pV1[14] * pV2[14] +
+                pV1[15] * pV2[15];
+
+        pV1 += 16;
+        pV2 += 16;
+    }
+    */
+
+    /* This is corresponding routine in assembler. This may be teeny-weeny bit faster
+       than intrinsic version, but more difficult to maintain & get compiled on multiple
+       platforms.
+
+    uint overlapLengthLocal = overlapLength;
+    float corr;
+
+    _asm 
+    {
+        // Very important note: data in 'pV2' _must_ be aligned to 
+        // 16-byte boundary!
+
+        // give prefetch hints to CPU of what data are to be needed soonish
+        // give more aggressive hints on pV1 as that changes while pV2 stays
+        // same between runs
+        prefetcht0 [pV1]
+        prefetcht0 [pV2]
+        prefetcht0 [pV1 + 32]
+
+        mov     eax, dword ptr pV1
+        mov     ebx, dword ptr pV2
+
+        xorps   xmm0, xmm0
+
+        mov     ecx, overlapLengthLocal
+        shr     ecx, 3  // div by eight
+
+    loop1:
+        prefetcht0 [eax + 64]     // give a prefetch hint to CPU what data are to be needed soonish
+        prefetcht0 [ebx + 32]     // give a prefetch hint to CPU what data are to be needed soonish
+        movups  xmm1, [eax]
+        mulps   xmm1, [ebx]
+        addps   xmm0, xmm1
+
+        movups  xmm2, [eax + 16]
+        mulps   xmm2, [ebx + 16]
+        addps   xmm0, xmm2
+
+        prefetcht0 [eax + 96]     // give a prefetch hint to CPU what data are to be needed soonish
+        prefetcht0 [ebx + 64]     // give a prefetch hint to CPU what data are to be needed soonish
+
+        movups  xmm3, [eax + 32]
+        mulps   xmm3, [ebx + 32]
+        addps   xmm0, xmm3
+
+        movups  xmm4, [eax + 48]
+        mulps   xmm4, [ebx + 48]
+        addps   xmm0, xmm4
+
+        add     eax, 64
+        add     ebx, 64
+
+        dec     ecx
+        jnz     loop1
+
+        // add the four floats of xmm0 together and return the result. 
+
+        movhlps xmm1, xmm0          // move 3 & 4 of xmm0 to 1 & 2 of xmm1
+        addps   xmm1, xmm0
+        movaps  xmm2, xmm1
+        shufps  xmm2, xmm2, 0x01    // move 2 of xmm2 as 1 of xmm2
+        addss   xmm2, xmm1
+        movss   corr, xmm2
+    }
+
+    return (double)corr;
+    */
+}
+
+
+//////////////////////////////////////////////////////////////////////////////
+//
+// implementation of SSE optimized functions of class 'FIRFilter'
+//
+//////////////////////////////////////////////////////////////////////////////
+
+#include "FIRFilter.h"
+
+// Constructs the filter with no coefficient storage allocated yet; the buffers
+// are created by setCoefficients().
+FIRFilterSSE::FIRFilterSSE() : FIRFilter()
+{
+    // Clear both pointers so that neither holds an indeterminate value before
+    // the first setCoefficients() call: 'filterCoeffsUnalign' is handed to
+    // delete[], and 'filterCoeffsAlign' is what the filter routine reads from.
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
+}
+
+
+// Frees the coefficient storage. Only the unaligned pointer is released:
+// setCoefficients() derives 'filterCoeffsAlign' from it, so that one is an
+// address inside the same allocation and must not be deleted on its own.
+FIRFilterSSE::~FIRFilterSSE()
+{
+    delete [] filterCoeffsUnalign;
+}
+
+
+// (overloaded) Calculates filter coefficients for SSE routine
+//
+// After letting the base class store the coefficients, builds a private copy for
+// the SSE filter routine: the copy lives in a 16-byte aligned buffer, every
+// coefficient is pre-divided by 'resultDivider', and every coefficient is stored
+// twice in a row.
+//
+// coeffs           : 'newLength' filter coefficients
+// newLength        : number of coefficients
+// uResultDivFactor : forwarded unchanged to FIRFilter::setCoefficients()
+void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
+{
+    uint i;
+    float fDivider;
+
+    FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
+
+    // Scale the filter coefficients so that it won't be necessary to scale the filtering result
+    // also rearrange coefficients suitably for the SSE routine
+    // Ensure that filter coeffs array is aligned to 16-byte boundary
+    delete[] filterCoeffsUnalign;
+    // 4 extra floats (16 bytes) leave room for rounding the start address up.
+    filterCoeffsUnalign = new float[2 * newLength + 4];
+    // Round up to the next multiple of 16 using a pointer-sized integer.
+    // 'unsigned long' is only 32 bits wide on 64-bit Windows, so casting through
+    // it would chop off the upper half of the address.
+    filterCoeffsAlign = (float *)(((size_t)filterCoeffsUnalign + 15) & ~(size_t)15);
+
+    fDivider = (float)resultDivider;
+
+    // duplicate each (pre-scaled) coefficient into two consecutive slots
+    for (i = 0; i < newLength; i ++)
+    {
+        filterCoeffsAlign[2 * i + 0] =
+        filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
+    }
+}
+
+
+
+// SSE-optimized version of the filter routine for stereo sound
+//
+// Filters interleaved stereo float data with the coefficients in
+// 'filterCoeffsAlign'. Each pass of the outer loop stores four floats to 'dest'
+// and advances both 'source' and 'dest' by four floats.
+//
+// dest       : receives the filtered data; written with unaligned stores
+// source     : input data; read with unaligned loads
+// numSamples : used together with 'length' to derive the output count below
+//
+// Returns 'count', i.e. (numSamples - length) rounded down to an even number, or
+// 0 when that value is smaller than 2.
+uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
+{
+    // '& -2' clears the lowest bit, making the count even.
+    int count = (numSamples - length) & -2;
+    int j;
+
+    assert(count % 2 == 0);
+
+    if (count < 2) return 0;
+
+    assert((length % 8) == 0);
+    assert(((unsigned long)filterCoeffsAlign) % 16 == 0);
+
+    // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2'
+    for (j = 0; j < count; j += 2)
+    {
+        const float *pSrc;
+        const __m128 *pFil;
+        __m128 sum1, sum2;
+        uint i;
+
+        pSrc = source;                      // source audio data
+        pFil = (__m128*)filterCoeffsAlign;  // filter coefficients. NOTE: Assumes coefficients 
+                                            // are aligned to 16-byte boundary
+        sum1 = sum2 = _mm_setzero_ps();
+
+        for (i = 0; i < length / 8; i ++) 
+        {
+            // Unroll loop for efficiency & calculate filter for 2*2 stereo samples 
+            // at each pass
+
+            // sum1 is accu for 2*2 filtered stereo sound data at the primary sound data offset
+            // sum2 is accu for 2*2 filtered stereo sound data for the next sound sample offset.
+
+            // sum2 always reads two floats further into the source than sum1
+            // while both use the same coefficient vector.
+            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc)    , pFil[0]));
+            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 2), pFil[0]));
+
+            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 4), pFil[1]));
+            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 6), pFil[1]));
+
+            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 8) ,  pFil[2]));
+            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 10), pFil[2]));
+
+            sum1 = _mm_add_ps(sum1, _mm_mul_ps(_mm_loadu_ps(pSrc + 12), pFil[3]));
+            sum2 = _mm_add_ps(sum2, _mm_mul_ps(_mm_loadu_ps(pSrc + 14), pFil[3]));
+
+            pSrc += 16;
+            pFil += 4;
+        }
+
+        // Now sum1 and sum2 both have a filtered 2-channel sample each, but we still need
+        // to sum the two hi- and lo-floats of these registers together.
+
+        // post-shuffle & add the filtered values and store to dest.
+        _mm_storeu_ps(dest, _mm_add_ps(
+                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(1,0,3,2)),   // s2_1 s2_0 s1_3 s1_2
+                    _mm_shuffle_ps(sum1, sum2, _MM_SHUFFLE(3,2,1,0))    // s2_3 s2_2 s1_1 s1_0
+                    ));
+        source += 4;
+        dest += 4;
+    }
+
+    // Ideas for further improvement:
+    // 1. If it could be guaranteed that 'source' were always aligned to 16-byte 
+    //    boundary, a faster aligned '_mm_load_ps' instruction could be used.
+    // 2. If it could be guaranteed that 'dest' were always aligned to 16-byte 
+    //    boundary, a faster '_mm_store_ps' instruction could be used.
+
+    return (uint)count;
+
+    /* original routine in C-language. please notice the C-version has differently 
+       organized coefficients though.
+    double suml1, suml2;
+    double sumr1, sumr2;
+    uint i, j;
+
+    for (j = 0; j < count; j += 2)
+    {
+        const float *ptr;
+        const float *pFil;
+
+        suml1 = sumr1 = 0.0;
+        suml2 = sumr2 = 0.0;
+        ptr = src;
+        pFil = filterCoeffs;
+        for (i = 0; i < lengthLocal; i ++) 
+        {
+            // unroll loop for efficiency.
+
+            suml1 += ptr[0] * pFil[0] + 
+                     ptr[2] * pFil[2] +
+                     ptr[4] * pFil[4] +
+                     ptr[6] * pFil[6];
+
+            sumr1 += ptr[1] * pFil[1] + 
+                     ptr[3] * pFil[3] +
+                     ptr[5] * pFil[5] +
+                     ptr[7] * pFil[7];
+
+            suml2 += ptr[8] * pFil[0] + 
+                     ptr[10] * pFil[2] +
+                     ptr[12] * pFil[4] +
+                     ptr[14] * pFil[6];
+
+            sumr2 += ptr[9] * pFil[1] + 
+                     ptr[11] * pFil[3] +
+                     ptr[13] * pFil[5] +
+                     ptr[15] * pFil[7];
+
+            ptr += 16;
+            pFil += 8;
+        }
+        dest[0] = (float)suml1;
+        dest[1] = (float)sumr1;
+        dest[2] = (float)suml2;
+        dest[3] = (float)sumr2;
+
+        src += 4;
+        dest += 4;
+    }
+    */
+
+
+    /* Similar routine in assembly, again obsoleted due to maintainability
+    _asm
+    {
+        // Very important note: data in 'src' _must_ be aligned to 
+        // 16-byte boundary!
+        mov     edx, count
+        mov     ebx, dword ptr src
+        mov     eax, dword ptr dest
+        shr     edx, 1
+
+    loop1:
+        // "outer loop" : during each round 2*2 output samples are calculated
+
+        // give prefetch hints to CPU of what data are to be needed soonish
+        prefetcht0 [ebx]
+        prefetcht0 [filterCoeffsLocal]
+
+        mov     esi, ebx
+        mov     edi, filterCoeffsLocal
+        xorps   xmm0, xmm0
+        xorps   xmm1, xmm1
+        mov     ecx, lengthLocal
+
+    loop2:
+        // "inner loop" : during each round eight FIR filter taps are evaluated for 2*2 samples
+        prefetcht0 [esi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
+        prefetcht0 [edi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
+
+        movups  xmm2, [esi]         // possibly unaligned load
+        movups  xmm3, [esi + 8]     // possibly unaligned load
+        mulps   xmm2, [edi]
+        mulps   xmm3, [edi]
+        addps   xmm0, xmm2
+        addps   xmm1, xmm3
+
+        movups  xmm4, [esi + 16]    // possibly unaligned load
+        movups  xmm5, [esi + 24]    // possibly unaligned load
+        mulps   xmm4, [edi + 16]
+        mulps   xmm5, [edi + 16]
+        addps   xmm0, xmm4
+        addps   xmm1, xmm5
+
+        prefetcht0 [esi + 64]     // give a prefetch hint to CPU what data are to be needed soonish
+        prefetcht0 [edi + 64]     // give a prefetch hint to CPU what data are to be needed soonish
+
+        movups  xmm6, [esi + 32]    // possibly unaligned load
+        movups  xmm7, [esi + 40]    // possibly unaligned load
+        mulps   xmm6, [edi + 32]
+        mulps   xmm7, [edi + 32]
+        addps   xmm0, xmm6
+        addps   xmm1, xmm7
+
+        movups  xmm4, [esi + 48]    // possibly unaligned load
+        movups  xmm5, [esi + 56]    // possibly unaligned load
+        mulps   xmm4, [edi + 48]
+        mulps   xmm5, [edi + 48]
+        addps   xmm0, xmm4
+        addps   xmm1, xmm5
+
+        add     esi, 64
+        add     edi, 64
+        dec     ecx
+        jnz     loop2
+
+        // Now xmm0 and xmm1 both have a filtered 2-channel sample each, but we still need
+        // to sum the two hi- and lo-floats of these registers together.
+
+        movhlps xmm2, xmm0          // xmm2 = xmm2_3 xmm2_2 xmm0_3 xmm0_2
+        movlhps xmm2, xmm1          // xmm2 = xmm1_1 xmm1_0 xmm0_3 xmm0_2
+        shufps  xmm0, xmm1, 0xe4    // xmm0 = xmm1_3 xmm1_2 xmm0_1 xmm0_0
+        addps   xmm0, xmm2
+
+        movaps  [eax], xmm0
+        add     ebx, 16
+        add     eax, 16
+
+        dec     edx
+        jnz     loop1
+    }
+    */
+}
+
+#endif  // ALLOW_SSE
--- a/desmume/src/metaspu/Timestretcher.cpp
+++ b/desmume/src/metaspu/Timestretcher.cpp
@ -0,0 +1,354 @@
+/* SPU2-X, A plugin for Emulating the Sound Processing Unit of the Playstation 2
+* Developed and maintained by the Pcsx2 Development Team.
+* 
+* Original portions from SPU2ghz are (c) 2008 by David Quintana [gigaherz]
+*
+* SPU2-X is free software: you can redistribute it and/or modify it under the terms
+* of the GNU Lesser General Public License as published by the Free Software Found-
+* ation, either version 3 of the License, or (at your option) any later version.
+*
+* SPU2-X is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+* without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+* PURPOSE.  See the GNU Lesser General Public License for more details.
+*
+* You should have received a copy of the GNU Lesser General Public License
+* along with SPU2-X.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+//#include "Global.h"
+#include <math.h>
+#include "types.h"
+#include "SoundTouch/SoundTouch.h"
+#include "SndOut.h"
+//#include "SoundTouch/WavFile.h"
+
+#include "win32/Dialogs.h"
+
+
+
+// The SoundTouch instance shared by all timestretch routines in this file.
+// NULL until soundtouchInit() creates it.
+static soundtouch::SoundTouch* pSoundTouch = NULL;
+
+// Bookkeeping for the periodic "Timestretch Stats" console message.
+static int ts_stats_stretchblocks = 0;	// updates that applied a stretched tempo
+static int ts_stats_normalblocks = 0;	// updates that ran at nominal tempo
+static int ts_stats_logcounter = 0;		// progress-making writes since the last report
+
+
+// data prediction amount, used to "commit" data that hasn't
+// finished timestretch processing.
+s32 SndBuffer::m_predictData = 0;
+
+// records the buffer status seen by the previous tempo update, as returned by
+// GetStatusPct() (-1.0 to 1.0, with 0.0 being 50% full)
+float SndBuffer::lastPct = 0.0f;
+
+// emergency adjustment computed by the previous tempo update; it gets blended
+// into the next one.
+float SndBuffer::lastEmergencyAdj = 0.0f;
+
+// cTempo: base tempo that slowly follows the game's speed.
+// eTempo: the tempo value most recently passed to SoundTouch.
+float SndBuffer::cTempo = 1.0f;
+float SndBuffer::eTempo = 1.0f;
+
+// while above zero, tempo updates are skipped and this is counted down.
+int SndBuffer::freezeTempo = 0;
+
+// Adds 'samples' to the running prediction counter (m_predictData), which
+// GetStatusPct() counts on top of the data already in the buffer.
+void SndBuffer::PredictDataWrite( int samples )
+{
+	m_predictData = m_predictData + samples;
+}
+
+// Calculate the buffer status percentage.
+// The fill level (buffered data plus predicted data) is measured relative to
+// the half-full mark and scaled by half the buffer size:
+//    1.0 = buffer overflow!
+//    0.0 = buffer nominal (50% full)
+//   -1.0 = buffer underflow!
+float SndBuffer::GetStatusPct()
+{
+	// Get the buffer status of the output driver too, so that we can
+	// obtain a more accurate overall buffer status.
+	// (not hooked up yet -- the driver's contribution is a constant zero)
+
+	int drvempty = 0;
+	//int drvempty = mods[OutputModule]->GetEmptySampleCount(); // / 2;
+	//TODO 
+
+	//ConLog( "Data %d >>> driver: %d   predict: %d\n", data, drvempty, predictData );
+
+	const float fillLevel = (float)(m_data + m_predictData - drvempty);
+
+	float offsetFromNominal = fillLevel - (m_size/2);
+	offsetFromNominal = offsetFromNominal / (m_size/2);
+	return offsetFromNominal;
+}
+
+// Re-evaluates the playback tempo from the current buffer status and pushes the
+// result to SoundTouch.
+//
+// Reads GetStatusPct() and the values remembered from the previous update
+// (lastPct, lastEmergencyAdj), then updates cTempo (slowly adapting base tempo)
+// and eTempo (tempo actually handed to pSoundTouch->setTempo). While
+// freezeTempo is counting down the call returns without touching anything else.
+void SndBuffer::UpdateTempoChange()
+{
+	// Hold-off countdown. The counter stops at zero instead of being decremented
+	// on every call forever, which would eventually overflow the signed int
+	// (undefined behavior) during a long session.
+	if( freezeTempo > 0 )
+	{
+		if( --freezeTempo > 0 )
+			return;
+	}
+
+	float statusPct = GetStatusPct();
+	float pctChange = statusPct - lastPct;
+
+	float tempoChange;
+	float emergencyAdj = 0;
+	float newcee = cTempo;		// workspace var. for cTempo
+
+	// IMPORTANT!
+	// If you plan to tweak these values, make sure you're using a release build
+	// OUTSIDE THE DEBUGGER to test it!  The Visual Studio debugger can really cause
+	// erratic behavior in the audio buffers, and makes the timestretcher seem a
+	// lot more inconsistent than it really is.
+
+	// We have two factors.
+	//   * Distance from nominal buffer status (50% full)
+	//   * The change from previous update to this update.
+
+	// Prediction based on the buffer change:
+	// (linear seems to work better here)
+
+	tempoChange = pctChange * 0.75f;
+
+	if( statusPct * tempoChange < 0.0f )
+	{
+		// only apply tempo change if it is in synch with the buffer status.
+		// In other words, if the buffer is high (over 0%), and is decreasing,
+		// ignore it.  It'll just muck things up.
+
+		tempoChange = 0;
+	}
+
+	// Sudden spikes in framerate can cause the nominal buffer status
+	// to go critical, in which case we have to enact an emergency
+	// stretch. The following cubic formulas do that.  Values near
+	// the extremities give much larger results than those near 0.
+	// And the value is added only this time, and does not accumulate.
+	// (otherwise a large value like this would cause problems down the road)
+
+	// Constants:
+	// Weight - weights the statusPct's "emergency" consideration.
+	//   higher values here will make the buffer perform more drastic
+	//   compensations at the outer edges of the buffer (at -75 or +75%
+	//   or beyond, for example).
+
+	// Range - scales the adjustment to the given range (more or less).
+	//   The actual range is dependent on the weight used, so if you increase
+	//   Weight you'll usually want to decrease Range somewhat to compensate.
+
+	// Prediction based on the buffer fill status:
+
+	const float statusWeight = 2.99f;
+	const float statusRange = 0.068f;
+
+	// "non-emergency" deadzone:  In this area stretching will be strongly discouraged.
+	// Note: due to the nature of timestretch latency, it's always a wee bit harder to
+	// cope with low fps (underruns) than it is high fps (overruns).  So to help out a
+	// little, the low-end portions of this check are less forgiving than the high-sides.
+
+	if( cTempo < 0.965f || cTempo > 1.060f ||
+		pctChange < -0.38f || pctChange > 0.54f ||
+		statusPct < -0.32f || statusPct > 0.39f ||
+		eTempo < 0.89f || eTempo > 1.19f )
+	{
+		emergencyAdj = ( pow( statusPct*statusWeight, 3.0f ) * statusRange);
+	}
+
+	// Smooth things out by factoring our previous adjustment into this one.
+	// It helps make the system 'feel' a little smarter by  giving it at least
+	// one packet worth of history to help work off of:
+
+	emergencyAdj = (emergencyAdj * 0.75f) + (lastEmergencyAdj * 0.25f );
+
+	lastEmergencyAdj = emergencyAdj;
+	lastPct = statusPct;
+
+	// Accumulate a fraction of the tempo change into the tempo itself.
+	// This helps the system run "smarter" to games that run consistently
+	// fast or slow by altering the base tempo to something closer to the
+	// game's active speed.  In tests most games normalize within 2 seconds
+	// at 100ms latency, which is pretty good (larger buffers normalize even
+	// quicker).
+
+	newcee += newcee * (tempoChange+emergencyAdj) * 0.03f;
+
+	// Apply tempoChange as a scale of cTempo.  That way the effect is proportional
+	// to the current tempo.  (otherwise tempos rate of change at the extremes would
+	// be too drastic)
+
+	float newTempo = newcee + ( emergencyAdj * cTempo );
+
+	// ... and as a final optimization, only stretch if the new tempo is outside
+	// a nominal threshold.  Keep this threshold check small, because it could
+	// cause some serious side effects otherwise. (enlarging the cTempo check above
+	// is usually better/safer)
+	if( newTempo < 0.970f || newTempo > 1.045f )
+	{
+		cTempo = (float)newcee;
+
+		// keep both tempos inside sane limits before handing them on
+		if( newTempo < 0.10f ) newTempo = 0.10f;
+		else if( newTempo > 10.0f ) newTempo = 10.0f;
+
+		if( cTempo < 0.15f ) cTempo = 0.15f;
+		else if( cTempo > 7.5f ) cTempo = 7.5f;
+
+		pSoundTouch->setTempo( eTempo = (float)newTempo );
+		ts_stats_stretchblocks++;
+
+		/*ConLog(" * SPU2: [Nominal %d%%] [Emergency: %d%%] (baseTempo: %d%% ) (newTempo: %d%%) (buffer: %d%%)\n",
+			//(relation < 0.0) ? "Normalize" : "",
+			(int)(tempoChange * 100.0 * 0.03),
+			(int)(emergencyAdj * 100.0),
+			(int)(cTempo * 100.0),
+			(int)(newTempo * 100.0),
+			(int)(statusPct * 100.0)
+		);*/
+	}
+	else
+	{
+		// Nominal operation -- turn off stretching.
+		// note: eTempo 'slides' toward 1.0 for smoother audio and better
+		// protection against spikes.
+		if( cTempo != 1.0f )
+		{
+			cTempo = 1.0f;
+			eTempo = ( 1.0f + eTempo ) * 0.5f;
+			pSoundTouch->setTempo( eTempo );
+		}
+		else
+		{
+			if( eTempo != cTempo )
+				pSoundTouch->setTempo( eTempo=cTempo );
+			ts_stats_normalblocks++;
+		}
+	}
+}
+
+// Underrun handler: lowers the base tempo by 12% and the effective tempo by
+// 30% (never below 0.1), then applies the effective tempo to SoundTouch.
+void SndBuffer::timeStretchUnderrun()
+{
+	// timeStretcher failed its job.  We need to slow down the audio some.
+
+	cTempo = cTempo - (cTempo * 0.12f);
+	eTempo = eTempo - (eTempo * 0.30f);
+
+	if( eTempo < 0.1f )
+	{
+		eTempo = 0.1f;
+	}
+
+	pSoundTouch->setTempo( eTempo );
+}
+
+// Overrun handler: raises the base tempo by 12% and the effective tempo by
+// 40% (never above 7.5), then applies the effective tempo to SoundTouch.
+// Returns the amount of data to throw away: two packets worth.
+s32 SndBuffer::timeStretchOverrun()
+{
+	// If we overran it means the timestretcher failed.  We need to speed
+	// up audio playback.
+	cTempo = cTempo + cTempo * 0.12f;
+	eTempo = eTempo + eTempo * 0.40f;
+
+	if( eTempo > 7.5f )
+	{
+		eTempo = 7.5f;
+	}
+
+	pSoundTouch->setTempo( eTempo );
+
+	// Throw out just a little bit (two packets worth) to help
+	// give the TS some room to work:
+
+	return SndOutPacketSize*2;
+}
+
+// Converts one packet (SndOutPacketSize entries) from StereoOut32 to
+// StereoOutFloat in place: each entry is read through the integer view of the
+// buffer and written back through the float view at the same index.
+static void CvtPacketToFloat( StereoOut32* srcdest )
+{
+	const StereoOut32* const asInt = (StereoOut32*)srcdest;
+	StereoOutFloat* const asFloat = (StereoOutFloat*)srcdest;
+
+	for( uint idx=0; idx<SndOutPacketSize; ++idx )
+	{
+		asFloat[idx] = (StereoOutFloat)asInt[idx];
+	}
+}
+
+// Converts 'size' entries from StereoOutFloat back to StereoOut32 in place,
+// reading through the float view of the buffer and writing through the integer
+// view at the same index.
+// Parameter note: Size should always be a multiple of 128, thanks!
+static void CvtPacketToInt( StereoOut32* srcdest, uint size )
+{
+	//jASSUME( (size & 127) == 0 );
+	
+	const StereoOutFloat* const asFloat = (StereoOutFloat*)srcdest;
+	StereoOut32* const asInt = srcdest;
+
+	for( uint idx=0; idx<size; ++idx )
+	{
+		asInt[idx] = (StereoOut32)asFloat[idx];
+	}
+}
+
+// Feeds the packet held in sndTempBuffer through SoundTouch and writes whatever
+// SoundTouch gives back to the output buffer.
+//
+// Steps: register a prediction for the packet, convert it to float in place,
+// hand it to SoundTouch, drain every chunk SoundTouch has ready (converting
+// each back to integer and passing it to _WriteSamples), then re-evaluate the
+// tempo. Every 301st call that produced output prints the share of tempo
+// updates that used a stretched tempo.
+void SndBuffer::timeStretchWrite()
+{
+	bool progress = false;
+
+	// data prediction helps keep the tempo adjustments more accurate.
+	// The timestretcher returns packets in belated "clump" form.
+	// Meaning that most of the time we'll get nothing back, and then
+	// suddenly we'll get several chunks back at once.  Thus we use
+	// data prediction to make the timestretcher more responsive.
+
+	PredictDataWrite( (int)( SndOutPacketSize / eTempo ) );
+	CvtPacketToFloat( sndTempBuffer );
+
+	pSoundTouch->putSamples( (float*)sndTempBuffer, SndOutPacketSize );
+
+	int tempProgress;
+	while( tempProgress = pSoundTouch->receiveSamples( (float*)sndTempBuffer, SndOutPacketSize),
+		tempProgress != 0 )
+	{
+		// Hint: It's assumed that pSoundTouch will return chunks of 128 bytes (it always does as
+		// long as the SSE optimizations are enabled), which means we can do our own SSE opts here.
+		
+		CvtPacketToInt( sndTempBuffer, tempProgress );
+		_WriteSamples( sndTempBuffer, tempProgress );
+		progress = true;
+	}
+
+	UpdateTempoChange();
+
+	//TODO
+	//if( MsgOverruns() )
+	{
+		if( progress )
+		{
+			if( ++ts_stats_logcounter > 300 )
+			{
+				ts_stats_logcounter = 0;
+
+				// Tempo updates that are frozen, or that only slide the tempo
+				// back to nominal, bump neither counter, so the total can be
+				// zero. Report 0% then instead of dividing by zero.
+				const int totalBlocks = ts_stats_normalblocks + ts_stats_stretchblocks;
+				const int stretchedPct = ( totalBlocks > 0 )
+					? ( ts_stats_stretchblocks * 100 ) / totalBlocks
+					: 0;
+
+				printf( " * SPU2 > Timestretch Stats > %d%% of packets stretched.\n",
+					stretchedPct );
+				ts_stats_normalblocks = 0;
+				ts_stats_stretchblocks = 0;
+			}
+		}
+	}
+}
+
+// Creates the SoundTouch instance, configures it (sample rate, two channels,
+// quick-seek and anti-alias filter off, project settings, tempo 1) and puts
+// the timestretch management variables into their start-up state.
+void SndBuffer::soundtouchInit()
+{
+	pSoundTouch = new soundtouch::SoundTouch();
+
+	soundtouch::SoundTouch& stretcher = *pSoundTouch;
+
+	stretcher.setSampleRate(SampleRate);
+	stretcher.setChannels(2);
+
+	stretcher.setSetting( SETTING_USE_QUICKSEEK, 0 );
+	stretcher.setSetting( SETTING_USE_AA_FILTER, 0 );
+
+	SoundtouchCfg::ApplySettings( stretcher );
+
+	stretcher.setTempo(1);
+
+	// some timestretch management vars:
+
+	lastEmergencyAdj = 0;
+	lastPct = 0;
+	eTempo = 1.0f;
+	cTempo = 1.0f;
+
+	m_predictData = 0;
+
+	// just freeze tempo changes for a while at startup.
+	// the driver buffers are bogus anyway.
+	freezeTempo = 16;
+}
+
+// reset timestretch management vars, and delay updates a bit:
+// Does nothing when no SoundTouch instance exists. Otherwise clears the
+// instance, sets its tempo back to 1 and restores the same start-up values
+// that soundtouchInit() assigns.
+void SndBuffer::soundtouchClearContents()
+{
+	if( !pSoundTouch )
+	{
+		return;
+	}
+
+	pSoundTouch->clear();
+	pSoundTouch->setTempo(1);
+
+	lastEmergencyAdj = 0;
+	lastPct = 0;
+	eTempo = 1.0f;
+	cTempo = 1.0f;
+
+	m_predictData = 0;
+	freezeTempo = 16;
+}
+
+// Destroys the SoundTouch instance created by soundtouchInit().
+void SndBuffer::soundtouchCleanup()
+{
+	delete pSoundTouch;
+
+	// Forget the pointer once the object is gone. soundtouchClearContents()
+	// relies on a NULL check to detect "no instance", and a repeated cleanup
+	// must not delete the same object twice.
+	pSoundTouch = NULL;
+}
--- a/desmume/src/metaspu/metaspu.cpp
+++ b/desmume/src/metaspu/metaspu.cpp
@ -0,0 +1,471 @@
+/*  Copyright 2009 DeSmuME team
+
+    This file is part of DeSmuME
+
+    DeSmuME is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DeSmuME is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DeSmuME; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
+*/
+
+#include "types.h"
+#include "metaspu.h"
+#include <queue>
+#include <vector>
+#include <assert.h>
+
+//for pcsx2 method
+//(haven't bothered to get it compiling in gcc yet)
+#ifdef _MSC_VER
+#include "SndOut.h"
+#endif
+
+
+// Returns the magnitude of val: -val when val compares below zero, val itself otherwise.
+template<typename T> inline T _abs(T val)
+{
+	return (val < 0) ? -val : val;
+}
+
+// Steps val towards target by |incr| and returns the result.
+// If the step does not bring the value strictly closer to target (it overshoots,
+// val already equals target, or incr is zero) target itself is returned.
+template<typename T> inline T moveValueTowards(T val, T target, T incr)
+{
+	const T step = _abs(incr);
+	const T distanceBefore = _abs(target-val);
+
+	T moved = val;
+	if(val<target) moved += step;
+	else if(val>target) moved -= step;
+
+	const T distanceAfter = _abs(target-moved);
+	return (distanceAfter >= distanceBefore) ? target : moved;
+}
+
+
+// "Z" synchronization method (created for ESynchMethod_Z by metaspu_construct).
+// Incoming stereo samples are queued in an Adjustobuf, which hands them back out
+// at a slightly varying rate chosen to pull the long-term average queue depth
+// towards the middle of its [minLatency,maxLatency] window.
+class ZeromusSynchronizer : public ISynchronizingAudioBuffer
+{
+public:
+	ZeromusSynchronizer()
+		: mixqueue_go(false)
+		,
+		// (minLatency,maxLatency) in samples; builds without NDEBUG use a much larger window
+		#ifdef NDEBUG
+		adjustobuf(200,1000)
+		#else
+		adjustobuf(22000,44000)
+		#endif
+	{
+
+	}
+
+	// true while output is running; false while waiting for the queue to fill up (again)
+	bool mixqueue_go;
+
+	// queues samples_provided stereo pairs read from buf (interleaved left,right)
+	virtual void enqueue_samples(s16* buf, int samples_provided)
+	{
+		for(int i=0;i<samples_provided;i++) {
+			s16 left = *buf++;
+			s16 right = *buf++;
+			adjustobuf.enqueue(left,right);
+		}
+	}
+
+	//returns the number of samples actually supplied, which may not match the number requested
+	virtual int output_samples(s16* buf, int samples_requested)
+	{
+		int done = 0;
+		if(!mixqueue_go) {
+			// (re)start only once more than 200 samples are queued;
+			// the call that flips the flag still supplies nothing
+			if(adjustobuf.size > 200)
+				mixqueue_go = true;
+		}
+		else
+		{
+			for(int i=0;i<samples_requested;i++) {
+				if(adjustobuf.size==0) {
+					// ran dry: stop and wait for the queue to build up again
+					mixqueue_go = false;
+					break;
+				}
+				done++;
+				s16 left, right;
+				adjustobuf.dequeue(left,right);
+				*buf++ = left;
+				*buf++ = right;
+			}
+		}
+
+		return done;
+	}
+
+private:
+	// FIFO of stereo samples with a variable read-out rate
+	class Adjustobuf
+	{
+	public:
+		// The initializer list follows the member declaration order, which is the
+		// order C++ initializes members in regardless of how the list is written.
+		Adjustobuf(int _minLatency, int _maxLatency)
+			: rate(1.0f)
+			, cursor(0.0f)
+			, minLatency(_minLatency)
+			, targetLatency((_maxLatency + _minLatency)/2)
+			, maxLatency(_maxLatency)
+			, size(0)
+			, rollingTotalSize(0)
+			, kAverageSize(80000)
+		{
+			curr[0] = curr[1] = 0;
+		}
+
+		// rate: input samples consumed per output sample; cursor: accumulated fractional read position
+		float rate, cursor;
+		int minLatency, targetLatency, maxLatency;
+		// interleaved left,right values
+		std::queue<s16> buffer;
+		// number of stereo pairs currently in buffer
+		int size;
+		// the pair most recently popped; repeated until the cursor passes the next one
+		s16 curr[2];
+
+		// the queue depth recorded at each of the last kAverageSize dequeue() calls
+		std::queue<int> statsHistory;
+
+		void enqueue(s16 left, s16 right)
+		{
+			buffer.push(left);
+			buffer.push(right);
+			size++;
+		}
+
+		// sum of everything currently in statsHistory
+		s64 rollingTotalSize;
+
+		// length of the averaging window, in dequeue() calls
+		u32 kAverageSize;
+
+		// records the current queue depth and, once a full window of history exists,
+		// recomputes rate from the average depth
+		void addStatistic()
+		{
+			statsHistory.push(size);
+			rollingTotalSize += size;
+			if(statsHistory.size()>kAverageSize)
+			{
+				rollingTotalSize -= statsHistory.front();
+				statsHistory.pop();
+
+				// Divide in floating point: an integer division would drop the
+				// fractional part of the average before it is turned into a rate.
+				float averageSize = (float)((double)rollingTotalSize / (double)kAverageSize);
+				//static int ctr=0;  ctr++; if((ctr&127)==0) printf("avg size: %f curr size: %d rate: %f\n",averageSize,size,rate);
+				{
+					// below target depth -> read slower than 1.0, above -> read faster;
+					// the correction is the depth error divided by the window length
+					float targetRate;
+					if(averageSize < targetLatency)
+					{
+						targetRate = 1.0f - (targetLatency-averageSize)/kAverageSize;
+					}
+					else if(averageSize > targetLatency) {
+						targetRate = 1.0f + (averageSize-targetLatency)/kAverageSize;
+					} else targetRate = 1.0f;
+
+					//rate = moveValueTowards(rate,targetRate,0.001f);
+					rate = targetRate;
+				}
+
+			}
+
+
+		}
+
+		// Produces one output pair. Yields silence (0,0) when the queue is empty;
+		// otherwise advances the cursor by rate, pops one queued pair for every whole
+		// step crossed and returns the most recently popped pair (no interpolation).
+		void dequeue(s16& left, s16& right)
+		{
+			left = right = 0;
+			addStatistic();
+			if(size==0) { return; }
+			cursor += rate;
+			while(cursor>1.0f) {
+				cursor -= 1.0f;
+				if(size>0) {
+					curr[0] = buffer.front(); buffer.pop();
+					curr[1] = buffer.front(); buffer.pop();
+					size--;
+				}
+			}
+			left = curr[0];
+			right = curr[1];
+		}
+	} adjustobuf;
+};
+
+// "N" synchronization method (created for ESynchMethod_N by metaspu_construct).
+// Queued input is passed through unchanged while the queue and the output request
+// are of similar size; otherwise the whole queue is squeezed (crossfade) or
+// stretched (ping-pong replay) into a single output request.
+class NitsujaSynchronizer : public ISynchronizingAudioBuffer
+{
+private:
+	// one stereo sample pair
+	struct ssamp
+	{
+		s16 l, r;
+		ssamp() {} // leaves l and r uninitialized
+		ssamp(s16 ll, s16 rr) : l(ll), r(rr) {}
+	};
+
+	// pending input, oldest pair first
+	std::vector<ssamp> sampleQueue;
+
+	// returns values going between 0 and y-1 in a saw wave pattern, based on x
+	// (for x = 0,1,2,... the result counts 0..y-1, then y-1..0, repeating every 2*y;
+	// the call sites in this class only pass x >= 0)
+	static FORCEINLINE int pingpong(int x, int y)
+	{
+		x %= 2*y;
+		if(x >= y)
+			x = 2*y - x - 1;
+		return x;
+
+		// in case we want to switch to odd buffer sizes for more sharpness
+		//x %= 2*(y-1);
+		//if(x >= y)
+		//	x = 2*(y-1) - x;
+		//return x;
+	}
+
+	// linear blend between lhs and rhs: lhs at cur <= start, rhs at cur >= end,
+	// and an integer-weighted mix of the two (per channel) in between
+	static FORCEINLINE ssamp crossfade (ssamp lhs, ssamp rhs,  int cur, int start, int end)
+	{
+		if(cur <= start)
+			return lhs;
+		if(cur >= end)
+			return rhs;
+
+		// in case we want sine wave interpolation instead of linear here
+		//float ang = 3.14159f * (float)(cur - start) / (float)(end - start);
+		//cur = start + (int)((1-cosf(ang))*0.5f * (end - start));
+
+		int inNum = cur - start;
+		int outNum = end - cur;
+		int denom = end - start;
+
+		int lrv = ((int)lhs.l * outNum + (int)rhs.l * inNum) / denom;
+		int rrv = ((int)lhs.r * outNum + (int)rhs.r * inNum) / denom;
+
+		return ssamp(lrv,rrv);
+	}
+
+	// writes one pair (left, then right) to outbuf and advances outbuf past it
+	static FORCEINLINE void emit_sample(s16*& outbuf, ssamp sample)
+	{
+		*outbuf++ = sample.l;
+		*outbuf++ = sample.r;
+	}
+
+	// writes the first `samples` pairs of samplebuf to outbuf
+	static FORCEINLINE void emit_samples(s16*& outbuf, const ssamp* samplebuf, int samples)
+	{
+		for(int i=0;i<samples;i++)
+			emit_sample(outbuf,samplebuf[i]);
+	}
+
+public:
+	NitsujaSynchronizer()
+	{}
+
+	// appends samples_provided stereo pairs read from buf (interleaved left,right) to the queue
+	virtual void enqueue_samples(s16* buf, int samples_provided)
+	{
+		for(int i=0;i<samples_provided;i++)
+		{
+			sampleQueue.push_back(ssamp(buf[0],buf[1]));
+			buf += 2;
+		}
+	}
+
+	// Writes up to samples_requested stereo pairs to buf and returns how many were written.
+	// Returns 0 (writing nothing) unless more than 0x200 pairs are queued and, after
+	// rounding down to a multiple of 8, at least 8 pairs were requested.
+	virtual int output_samples(s16* buf, int samples_requested)
+	{
+		int audiosize = samples_requested;
+		int queued = sampleQueue.size();
+
+		// truncate input and output sizes to multiples of 8 because I am too lazy to deal with odd numbers
+		audiosize &= ~7;
+		queued &= ~7;
+
+		if(queued > 0x200 && audiosize > 0) // is there any work to do?
+		{
+			// are we going at normal speed?
+			// or more precisely, are the input and output queues/buffers of similar size?
+			if(queued > 900 || audiosize > queued * 2)
+			{
+				// not normal speed. we have to resample it somehow in this case.
+				if(audiosize <= queued)
+				{
+					// fast forward speed
+					// this is the easy case, just crossfade it and it sounds ok
+					// (output i fades from the i-th queued pair towards the pair
+					// the same distance from the end of the queue)
+					for(int i = 0; i < audiosize; i++)
+					{
+						int j = i + queued - audiosize;
+						ssamp outsamp = crossfade(sampleQueue[i],sampleQueue[j], i,0,audiosize);
+						emit_sample(buf,outsamp);
+					}
+				}
+				else
+				{
+					// slow motion speed
+					// here we take a very different approach,
+					// instead of crossfading it, we select a single sample from the queue
+					// and make sure that the index we use to select a sample is constantly moving
+					// and that it starts at the first sample in the queue and ends on the last one.
+					//
+					// hopefully the index doesn't move discontinuously or we'll get slight crackling
+					// (there might still be a minor bug here that causes this occasionally)
+					//
+					// here's a diagram of how the index we sample from moves:
+					//
+					// queued (this axis represents the index we sample from. the top means the end of the queue)
+					// ^
+					// |   --> audiosize (this axis represents the output index we write to, right meaning forward in output time/position)
+					// |   A           C       C  end
+					//    A A     B   C C     C
+					//   A   A   A B C   C   C
+					//  A     A A   B     C C
+					// A       A           C
+					// start
+					//
+					// yes, this means we are spending some stretches of time playing the sound backwards,
+					// but the stretches are short enough that this doesn't sound weird.
+					// this lets us avoid most crackling problems due to the endpoints matching up.
+					// TODO: it might help to calculate the approximate fundamental frequency
+					// and reduce either buffer size such that the reflections line up with it.
+
+					int midpointX = audiosize >> 1;
+					int midpointY = queued >> 1;
+
+					// all we need to do here is calculate the X position of the leftmost "B" in the above diagram.
+					// TODO: we should calculate it with a simple equation like
+					//   midpointXOffset = min(something,somethingElse);
+					// but it's a little difficult to work it out exactly
+					// so here's a stupid search for the value for now:
+
+					// 999999 marks "no previous value yet"
+					int prevA = 999999;
+					int midpointXOffset = queued/2;
+					while(true)
+					{
+						int a = abs(pingpong(midpointX - midpointXOffset, queued) - midpointY) - midpointXOffset;
+						// stop at the first offset where `a` changes sign (or hits/leaves zero) relative to the previous one
+						if(((a > 0) != (prevA > 0) || (a < 0) != (prevA < 0)) && prevA != 999999)
+						{
+							if((a + prevA)&1) // there's some sort of off-by-one problem with this search since we're moving diagonally...
+								midpointXOffset++; // but this fixes it most of the time...
+							break; // found it
+						}
+						prevA = a;
+						midpointXOffset--;
+						if(midpointXOffset < 0)
+						{
+							midpointXOffset = 0;
+							break; // failed to find it. the two sides probably meet exactly in the center.
+						}
+					}
+
+					// output positions where section "B" begins and ends, and the queue
+					// indices the "A" and "C" patterns have reached at those positions
+					int leftMidpointX = midpointX - midpointXOffset;
+					int rightMidpointX = midpointX + midpointXOffset;
+					int leftMidpointY = pingpong(leftMidpointX, queued);
+					int rightMidpointY = (queued-1) - pingpong((int)audiosize-1 - rightMidpointX + queued*2, queued);
+
+					// output the left almost-half of the sound (section "A")
+					for(int x = 0; x < leftMidpointX; x++)
+					{
+						int i = pingpong(x, queued);
+						emit_sample(buf,sampleQueue[i]);
+					}
+
+					// output the middle stretch (section "B")
+					// (the queue index moves one step per output sample, towards midpointY
+					// in the first half and on towards rightMidpointY in the second)
+					int y = leftMidpointY;
+					int dyMidLeft  = (leftMidpointY  < midpointY) ? 1 : -1;
+					int dyMidRight = (rightMidpointY > midpointY) ? 1 : -1;
+					for(int x = leftMidpointX; x < midpointX; x++, y+=dyMidLeft)
+						emit_sample(buf,sampleQueue[y]);
+					for(int x = midpointX; x < rightMidpointX; x++, y+=dyMidRight)
+						emit_sample(buf,sampleQueue[y]);
+
+					// output the end of the queued sound (section "C")
+					// (the "A" pattern mirrored, so that the last output sample is the last queued one;
+					// queued*2 is added to keep the pingpong argument non-negative)
+					for(int x = rightMidpointX; x < audiosize; x++)
+					{
+						int i = (queued-1) - pingpong((int)audiosize-1 - x + queued*2, queued);
+						emit_sample(buf,sampleQueue[i]);
+					}
+				} //end else
+
+				// both resampling paths consume the whole (rounded-down) queue
+				// and produce exactly audiosize output pairs
+				sampleQueue.erase(sampleQueue.begin(), sampleQueue.begin() + queued);
+				return audiosize;
+			}
+			else
+			{
+				// normal speed
+				// just output the samples straightforwardly.
+				//
+				// at almost-full speeds (like 50/60 FPS)
+				// what will happen is that we rapidly fluctuate between entering this branch
+				// and entering the "slow motion speed" branch above.
+				// but that's ok! because all of these branches sound similar enough that we can get away with it.
+				// so the two cases actually complement each other.
+
+				// copy min(audiosize,queued) pairs and remove exactly those from the queue
+				if(audiosize >= queued)
+				{
+					emit_samples(buf,&sampleQueue[0],queued);
+					sampleQueue.erase(sampleQueue.begin(), sampleQueue.begin() + queued);
+					return queued;
+				}
+				else
+				{
+					emit_samples(buf,&sampleQueue[0],audiosize);
+					sampleQueue.erase(sampleQueue.begin(), sampleQueue.begin()+audiosize);
+					return audiosize;
+				}
+
+			} //end normal speed
+
+		} //end if there is any work to do
+		else
+		{
+			return 0;
+		}
+
+	} //output_samples
+
+private:
+
+}; //NitsujaSynchronizer
+
+
+#ifdef _MSC_VER
+// "P" synchronization method (created for ESynchMethod_P by metaspu_construct).
+// Samples are forwarded to the SndBuffer code declared in SndOut.h and read back
+// from it one packet at a time.
+class PCSX2Synchronizer : public ISynchronizingAudioBuffer
+{
+public:
+	// interleaved left,right values already read from SndBuffer but not yet handed out
+	std::queue<s16> readySamples;
+
+	PCSX2Synchronizer()
+	{
+		SndBuffer::Init();
+	}
+
+	// passes samples_provided stereo pairs (interleaved left,right in buf) to SndBuffer::Write
+	virtual void enqueue_samples(s16* buf, int samples_provided)
+	{
+		for(int n=0; n<samples_provided; n++, buf+=2)
+		{
+			StereoOut32 frame(buf[0],buf[1]);
+			SndBuffer::Write(frame);
+		}
+	}
+
+	// always writes samples_requested stereo pairs to buf and returns that count;
+	// whenever the local queue is empty, SndBuffer::ReadSamples is called and the
+	// first SndOutPacketSize pairs of its output are queued
+	virtual int output_samples(s16* buf, int samples_requested)
+	{
+		for(int produced=0; produced<samples_requested; produced++)
+		{
+			if(readySamples.empty())
+			{
+				//SndOutPacketSize
+				StereoOut16 packet[SndOutPacketSize*2];
+				SndBuffer::ReadSamples( packet );
+				for(int k=0; k<SndOutPacketSize; k++)
+				{
+					readySamples.push(packet[k].Left);
+					readySamples.push(packet[k].Right);
+				}
+			}
+
+			// left value first, then right
+			for(int channel=0; channel<2; channel++)
+			{
+				*buf++ = readySamples.front();
+				readySamples.pop();
+			}
+		}
+		return samples_requested;
+	}
+};
+#endif
+
+
+// Creates (with new) the synchronizer that implements the requested method.
+// Returns NULL for any method without an implementation in this build; that
+// includes ESynchMethod_P when _MSC_VER is not defined.
+ISynchronizingAudioBuffer* metaspu_construct(ESynchMethod method)
+{
+	if(method == ESynchMethod_N)
+		return new NitsujaSynchronizer();
+
+	if(method == ESynchMethod_Z)
+		return new ZeromusSynchronizer();
+
+	#ifdef _MSC_VER
+	if(method == ESynchMethod_P)
+		return new PCSX2Synchronizer();
+	#endif
+
+	return NULL;
+}
--- a/desmume/src/metaspu/metaspu.h
+++ b/desmume/src/metaspu/metaspu.h
@ -0,0 +1,68 @@
+/*  Copyright 2009 DeSmuME team
+
+    This file is part of DeSmuME
+
+    DeSmuME is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    DeSmuME is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with DeSmuME; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
+*/
+
+//-------------------------
+//this file contains the METASPU system
+//which is designed to handle the task of audio synchronization
+//and is designed to be as portable as possible between multiple emulators
+//-------------------------
+
+
+#ifndef _METASPU_H_
+#define _METASPU_H_
+
+#include <algorithm>
+
+// Clamps src in place: raised to min when below it, then lowered to max when above it.
+template< typename T >
+static FORCEINLINE void Clampify( T& src, T min, T max )
+{
+	if( src < min ) src = min;
+	if( max < src ) src = max;
+}
+
+// Returns src clamped the same way Clampify does it, leaving the argument untouched:
+// raised to min when below it, then lowered to max when above it.
+template< typename T >
+static FORCEINLINE T GetClamped( T src, T min, T max )
+{
+	const T raised = ( src < min ) ? min : src;
+	return ( max < raised ) ? max : raised;
+}
+
+//interface shared by the audio synchronizers that metaspu_construct() creates:
+//samples are pushed in with enqueue_samples and pulled back out with output_samples
+class ISynchronizingAudioBuffer
+{
+public:
+	//implementations are allocated with new and handed out as pointers to this
+	//interface, so deleting through such a pointer must run the implementation's
+	//destructor (without a virtual destructor that delete is undefined behavior)
+	virtual ~ISynchronizingAudioBuffer() {}
+
+	//queues samples_provided stereo samples read from buf (interleaved left,right)
+	virtual void enqueue_samples(s16* buf, int samples_provided) = 0;
+
+	//returns the number of samples actually supplied, which may not match the number requested
+	virtual int output_samples(s16* buf, int samples_requested) = 0;
+};
+
+//synchronization mode selector
+//NOTE(review): neither enumerator is referenced in metaspu.h or metaspu.cpp; what each
+//mode actually does is decided by the code that consumes this enum - confirm there
+enum ESynchMode
+{
+	ESynchMode_DualSynchAsynch,
+	ESynchMode_Synchronous
+};
+
+//selects which ISynchronizingAudioBuffer implementation metaspu_construct() creates
+enum ESynchMethod
+{
+	ESynchMethod_N, //nitsuja's (NitsujaSynchronizer)
+	ESynchMethod_Z, //zero's (ZeromusSynchronizer)
+	ESynchMethod_P, //PCSX2 spu2-x (PCSX2Synchronizer; only compiled when _MSC_VER is defined)
+};
+
+ISynchronizingAudioBuffer* metaspu_construct(ESynchMethod method);
+
+#endif
--- a/desmume/src/metaspu/win32/ConfigSoundtouch.cpp
+++ b/desmume/src/metaspu/win32/ConfigSoundtouch.cpp
@ -0,0 +1,134 @@
+/* SPU2-X, A plugin for Emulating the Sound Processing Unit of the Playstation 2
+ * Developed and maintained by the Pcsx2 Development Team.
+ * 
+ * Original portions from SPU2ghz are (c) 2008 by David Quintana [gigaherz]
+ *
+ * SPU2-X is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * SPU2-X is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE.  See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with SPU2-X.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+//TODO - need to make a generic way for saving these configs and add in the windows
+//rc configurator
+
+#include "types.h"
+//#include "Global.h"
+#include "Dialogs.h"
+
+#include "../metaspu.h"
+
+#include "../SoundTouch/SoundTouch.h"
+
+// Current time-stretch parameters, in milliseconds. SoundtouchCfg::ApplySettings()
+// passes them to SoundTouch as SETTING_SEQUENCE_MS, SETTING_SEEKWINDOW_MS and
+// SETTING_OVERLAP_MS respectively.
+static int SequenceLenMS = 63;
+static int SeekWindowMS = 16;
+static int OverlapMS = 7;
+
+// Timestretch Slider Bounds, Min/Max
+// (also the inclusive ranges ClampValues() forces the three values above into)
+static const int SequenceLen_Min = 50;
+static const int SequenceLen_Max = 90;
+
+static const int SeekWindow_Min = 10;
+static const int SeekWindow_Max = 32;
+
+static const int Overlap_Min = 3;
+static const int Overlap_Max = 25;
+
+// Pushes the three time-stretch lengths (sequence, seek window, overlap; all in
+// milliseconds) into the given SoundTouch instance, in that order.
+void SoundtouchCfg::ApplySettings( soundtouch::SoundTouch& sndtouch )
+{
+	// { setting id, value } pairs
+	const int settings[3][2] =
+	{
+		{ SETTING_SEQUENCE_MS,		SequenceLenMS },
+		{ SETTING_SEEKWINDOW_MS,	SeekWindowMS },
+		{ SETTING_OVERLAP_MS,		OverlapMS },
+	};
+
+	for( int i = 0; i < 3; ++i )
+		sndtouch.setSetting( settings[i][0], settings[i][1] );
+}
+
+// Forces each time-stretch value into its [Min,Max] slider range.
+static void ClampValues()
+{
+	SequenceLenMS	= GetClamped( SequenceLenMS, SequenceLen_Min, SequenceLen_Max );
+	SeekWindowMS	= GetClamped( SeekWindowMS, SeekWindow_Min, SeekWindow_Max );
+	OverlapMS		= GetClamped( OverlapMS, Overlap_Min, Overlap_Max );
+}
+
+// Currently a no-op: the code that would load the three time-stretch values from
+// the configuration and clamp them is commented out, so the values keep whatever
+// they were last set to (initially the built-in defaults).
+void SoundtouchCfg::ReadSettings()
+{
+/*	SequenceLenMS	= CfgReadInt( L"SOUNDTOUCH", L"SequenceLengthMS", 50 );
+	SeekWindowMS	= CfgReadInt( L"SOUNDTOUCH", L"SeekWindowMS", 15 );
+	OverlapMS		= CfgReadInt( L"SOUNDTOUCH", L"OverlapMS", 25 );
+
+	ClampValues();	*/	
+}
+
+// Currently a no-op: the calls that would store the three time-stretch values
+// are commented out, so nothing is persisted.
+void SoundtouchCfg::WriteSettings()
+{
+	//CfgWriteInt( L"SOUNDTOUCH", L"SequenceLengthMS", SequenceLenMS );
+	//CfgWriteInt( L"SOUNDTOUCH", L"SeekWindowMS", SeekWindowMS );
+	//CfgWriteInt( L"SOUNDTOUCH", L"OverlapMS", OverlapMS );
+}
+
+//BOOL CALLBACK SoundtouchCfg::DialogProc(HWND hWnd,UINT uMsg,WPARAM wParam,LPARAM lParam)
+//{
+//	int wmId,wmEvent;
+//	wchar_t temp[384]={0};
+//
+//	switch(uMsg)
+//	{
+//		case WM_PAINT:
+//			return FALSE;
+//
+//		case WM_INITDIALOG:
+//		{
+//			INIT_SLIDER( IDC_SEQLEN_SLIDER, SequenceLen_Min, SequenceLen_Max, 20, 5, 1 );
+//			INIT_SLIDER( IDC_SEEKWIN_SLIDER, SeekWindow_Min, SeekWindow_Max, 5, 2, 1 );
+//			INIT_SLIDER( IDC_OVERLAP_SLIDER, Overlap_Min, Overlap_Max, 3, 2, 1 );
+//
+//			SendDialogMsg( hWnd, IDC_SEQLEN_SLIDER, TBM_SETPOS, TRUE, SequenceLenMS );
+//			SendDialogMsg( hWnd, IDC_SEEKWIN_SLIDER, TBM_SETPOS, TRUE, SeekWindowMS );
+//			SendDialogMsg( hWnd, IDC_OVERLAP_SLIDER, TBM_SETPOS, TRUE, OverlapMS );
+//		}
+//		
+//		case WM_COMMAND:
+//			wmId    = LOWORD(wParam); 
+//			wmEvent = HIWORD(wParam); 
+//			// Parse the menu selections:
+//			if( wmId == IDOK )
+//			{
+//				SequenceLenMS	= (int)SendDialogMsg( hWnd, IDC_SEQLEN_SLIDER, TBM_GETPOS, 0, 0 );
+//				SeekWindowMS	= (int)SendDialogMsg( hWnd, IDC_SEEKWIN_SLIDER, TBM_GETPOS, 0, 0 );
+//				OverlapMS		= (int)SendDialogMsg( hWnd, IDC_OVERLAP_SLIDER, TBM_GETPOS, 0, 0 );
+//
+//				ClampValues();
+//				WriteSettings();
+//				EndDialog(hWnd,0);
+//			}
+//			else if( wmId == IDCANCEL )
+//			{
+//				EndDialog(hWnd,0);
+//			}
+//		break;
+//		
+//		case WM_HSCROLL:
+//			DoHandleScrollMessage( hWnd, wParam, lParam );
+//		break;
+//		
+//		default:
+//			return FALSE;
+//	}
+//	return TRUE;
+//}
+
+// Currently a no-op: the code that would show the SoundTouch configuration dialog
+// (and re-read the settings afterwards) is commented out; hWnd is unused.
+void SoundtouchCfg::OpenDialog( HWND hWnd )
+{
+	//INT_PTR ret;
+	//ret = DialogBox( hInstance, MAKEINTRESOURCE(IDD_CONFIG_SOUNDTOUCH), hWnd, (DLGPROC)DialogProc );
+	//if(ret==-1)
+	//{
+	//	MessageBoxEx(GetActiveWindow(), L"Error Opening the Soundtouch advanced dialog.", L"OMG ERROR!", MB_OK, 0);
+	//	return;
+	//}
+	//ReadSettings();
+}
--- a/desmume/src/metaspu/win32/Dialogs.h
+++ b/desmume/src/metaspu/win32/Dialogs.h
@ -0,0 +1,81 @@
+/* SPU2-X, A plugin for Emulating the Sound Processing Unit of the Playstation 2
+ * Developed and maintained by the Pcsx2 Development Team.
+ * 
+ * Original portions from SPU2ghz are (c) 2008 by David Quintana [gigaherz]
+ *
+ * SPU2-X is free software: you can redistribute it and/or modify it under the terms
+ * of the GNU Lesser General Public License as published by the Free Software Found-
+ * ation, either version 3 of the License, or (at your option) any later version.
+ *
+ * SPU2-X is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
+ * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE.  See the GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with SPU2-X.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#pragma once
+
+#ifdef _WIN32
+#include <windows.h>
+#endif
+
+#include "../SoundTouch/SoundTouch.h"
+
+//#ifdef _WIN32
+//#	include "WinConfig.h"
+//#else
+//#	include "LnxConfig.h"
+//#endif
+//
+//namespace DebugConfig
+//{
+//	extern void ReadSettings();
+//	extern void WriteSettings();
+//	extern void OpenDialog();
+//	extern void EnableControls( HWND hWnd );
+//}
+//
+// Configuration entry points for the SoundTouch time-stretcher; the definitions
+// live in ConfigSoundtouch.cpp.
+namespace SoundtouchCfg
+{
+	// the bodies of these three are entirely commented out in ConfigSoundtouch.cpp
+	extern void ReadSettings();
+	extern void WriteSettings();
+	extern void OpenDialog( HWND hWnd );
+	// NOTE(review): the only definition of DialogProc in ConfigSoundtouch.cpp is
+	// commented out, so referencing it would presumably fail to link - confirm
+	extern BOOL CALLBACK DialogProc(HWND hWnd,UINT uMsg,WPARAM wParam,LPARAM lParam);
+	// passes the sequence / seek window / overlap lengths to sndtouch
+	extern void ApplySettings( soundtouch::SoundTouch& sndtouch );
+}
+//
+//extern int		SendDialogMsg( HWND hwnd, int dlgId, UINT code, WPARAM wParam, LPARAM lParam);
+//extern HRESULT	GUIDFromString( const char *str, LPGUID guid );
+//
+//extern void		AssignSliderValue( HWND idcwnd, HWND hwndDisplay, int value );
+//extern void		AssignSliderValue( HWND hWnd, int idc, int editbox, int value );
+//extern int		GetSliderValue( HWND hWnd, int idc );
+//extern BOOL		DoHandleScrollMessage( HWND hwndDisplay, WPARAM wParam, LPARAM lParam );
+//
+//extern bool		CfgFindName( const TCHAR *Section, const TCHAR* Name);
+//
+//extern void		CfgWriteBool(const TCHAR* Section, const TCHAR* Name, bool Value);
+//extern void		CfgWriteInt(const TCHAR* Section, const TCHAR* Name, int Value);
+//extern void		CfgWriteStr(const TCHAR* Section, const TCHAR* Name, const wstring& Data);
+//
+//extern bool		CfgReadBool(const TCHAR *Section,const TCHAR* Name, bool Default);
+//extern void		CfgReadStr(const TCHAR* Section, const TCHAR* Name, wstring& Data, int DataSize, const TCHAR* Default);
+//extern void		CfgReadStr(const TCHAR* Section, const TCHAR* Name, TCHAR* Data, int DataSize, const TCHAR* Default);
+//extern int		CfgReadInt(const TCHAR* Section, const TCHAR* Name,int Default);
+//
+//
+//// Items Specific to DirectSound
+//#define STRFY(x) #x
+//#define verifyc(x) Verifyc(x,STRFY(x))
+//
+//extern void Verifyc(HRESULT hr, const char* fn);
+//
+//struct ds_device_data
+//{
+//	std::wstring name;
+//	GUID guid;
+//	bool hasGuid;
+//};
+//
--- a/desmume/src/windows/DeSmuME_2005.vcproj
+++ b/desmume/src/windows/DeSmuME_2005.vcproj
@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="shift_jis"?>
+<?xml version="1.0" encoding="Windows-1252"?>
 <VisualStudioProject
 	ProjectType="Visual C++"
 	Version="8.00"
@ -274,7 +274,7 @@
 			/>
 			<Tool
 				Name="VCLinkerTool"
-				AdditionalDependencies="agg-2.5.lib lua-5.1.4-x86.lib glib-2.20.1-x86.lib vfw32.lib winmm.lib opengl32.lib glu32.lib ws2_32.lib user32.lib gdi32.lib directx\dxguid.lib shell32.lib comdlg32.lib directx\dxerr8.lib directx\dsound.lib directx\dinput8.lib directx\ddraw.lib zlib-2005-x32.lib zziplib-2005-x32.lib shlwapi.lib  winpcap\wpcap.lib 7zip.lib"
+				AdditionalDependencies="agg-2.5.lib lua-5.1.4-x86.lib glib-2.20.1-x86.lib vfw32.lib winmm.lib opengl32.lib glu32.lib ws2_32.lib user32.lib gdi32.lib directx\dxguid.lib shell32.lib comdlg32.lib directx\dxerr8.lib directx\dsound.lib directx\dinput8.lib directx\ddraw.lib zlib-2005-x32.lib zziplib-2005-x32.lib shlwapi.lib winpcap\wpcap.lib 7zip.lib"
 				OutputFile="$(OutDir)\$(ProjectName)_releaseFastBuild.exe"
 				AdditionalLibraryDirectories=".\zlib123;.\zziplib;glib-2.20.1\lib;.\lua\lib;.\7z;agg"
 				DelayLoadDLLs="wpcap.dll"
@ -1094,6 +1094,130 @@
 				>
 			</File>
 		</Filter>
+		<Filter
+			Name="metaspu"
+			>
+			<File
+				RelativePath="..\metaspu\metaspu.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\metaspu\metaspu.h"
+				>
+			</File>
+			<File
+				RelativePath="..\metaspu\SndOut.cpp"
+				>
+			</File>
+			<File
+				RelativePath="..\metaspu\SndOut.h"
+				>
+			</File>
+			<File
+				RelativePath="..\metaspu\Timestretcher.cpp"
+				>
+			</File>
+			<Filter
+				Name="SoundTouch"
+				>
+				<File
+					RelativePath="..\metaspu\SoundTouch\3dnow_win.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\AAFilter.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\AAFilter.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\BPMDetect.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\cpu_detect.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\cpu_detect_x86_win.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\FIFOSampleBuffer.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\FIFOSampleBuffer.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\FIFOSamplePipe.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\FIRFilter.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\FIRFilter.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\mmx_optimized.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\RateTransposer.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\RateTransposer.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\SoundTouch.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\SoundTouch.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\sse_optimized.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\STTypes.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\TDStretch.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\TDStretch.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\WavFile.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SoundTouch\WavFile.h"
+					>
+				</File>
+			</Filter>
+			<Filter
+				Name="win32"
+				>
+				<File
+					RelativePath="..\metaspu\win32\ConfigSoundtouch.cpp"
+					>
+				</File>
+			</Filter>
+		</Filter>
 		<File
 			RelativePath="..\addons.cpp"
 			>
--- a/desmume/src/windows/DeSmuME_2008.vcproj
+++ b/desmume/src/windows/DeSmuME_2008.vcproj
@ -109,193 +109,6 @@
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
-		<Configuration
-			Name="Release|Win32"
-			OutputDirectory="$(SolutionDir)\__bins"
-			IntermediateDirectory="$(SolutionDir)\.VS2008\$(ConfigurationName)\$(PlatformName)"
-			ConfigurationType="1"
-			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
-			WholeProgramOptimization="1"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-				CommandLine="defaultconfig\SubWCRev.bat"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-			/>
-			<Tool
-				Name="MASM"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="2"
-				InlineFunctionExpansion="2"
-				EnableIntrinsicFunctions="true"
-				FavorSizeOrSpeed="1"
-				OmitFramePointers="true"
-				EnableFiberSafeOptimizations="true"
-				WholeProgramOptimization="true"
-				AdditionalIncludeDirectories=".;..;&quot;lua\lua-5.1.4\src&quot;;&quot;glib-2.20.1\build&quot;;&quot;glib-2.20.1\build\glib&quot;;.\zlib123;.\zziplib;.\winpcap;userconfig;defaultconfig;.\7z;.\agg\include;.\agg\examples"
-				PreprocessorDefinitions="_CRT_SECURE_NO_DEPRECATE;GLIB_STATIC_COMPILATION;WIN32;HAVE_LIBZ;HAVE_LIBZZIP;SSE2;SPU_INTERPOLATE;NOMINMAX;RELEASE;EXPERIMENTAL_WIFI;NDEBUG"
-				StringPooling="true"
-				ExceptionHandling="1"
-				StructMemberAlignment="0"
-				BufferSecurityCheck="false"
-				EnableEnhancedInstructionSet="2"
-				FloatingPointModel="2"
-				WarningLevel="1"
-				DebugInformationFormat="3"
-				CallingConvention="0"
-				CompileAs="0"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLinkerTool"
-				AdditionalDependencies="agg-2.5.lib lua-5.1.4-x86.lib glib-2.20.1-x86.lib vfw32.lib winmm.lib opengl32.lib glu32.lib ws2_32.lib user32.lib gdi32.lib directx\dxguid.lib shell32.lib comdlg32.lib directx\dxerr8.lib directx\dsound.lib directx\dinput8.lib directx\ddraw.lib zlib-2005-x32.lib zziplib-2005-x32.lib shlwapi.lib  winpcap\wpcap.lib 7zip.lib comctl32.lib"
-				OutputFile="$(OutDir)\$(ProjectName)_release.exe"
-				AdditionalLibraryDirectories=".\zlib123;.\zziplib;glib-2.20.1\lib;lua\lib;.\7z;agg"
-				DelayLoadDLLs="wpcap.dll"
-				GenerateDebugInformation="true"
-				OptimizeReferences="2"
-				RandomizedBaseAddress="1"
-				DataExecutionPrevention="0"
-				Profile="false"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCManifestTool"
-				AdditionalManifestFiles="DeSmuME_x86.manifest"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-			/>
-		</Configuration>
-		<Configuration
-			Name="Release FastBuild|Win32"
-			OutputDirectory="$(SolutionDir)\__bins"
-			IntermediateDirectory="$(SolutionDir)\.VS2008\$(ConfigurationName)\$(PlatformName)"
-			ConfigurationType="1"
-			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
-			WholeProgramOptimization="0"
-			>
-			<Tool
-				Name="VCPreBuildEventTool"
-				CommandLine="defaultconfig\SubWCRev.bat"
-			/>
-			<Tool
-				Name="VCCustomBuildTool"
-			/>
-			<Tool
-				Name="MASM"
-			/>
-			<Tool
-				Name="VCXMLDataGeneratorTool"
-			/>
-			<Tool
-				Name="VCWebServiceProxyGeneratorTool"
-			/>
-			<Tool
-				Name="VCMIDLTool"
-			/>
-			<Tool
-				Name="VCCLCompilerTool"
-				Optimization="3"
-				InlineFunctionExpansion="2"
-				EnableIntrinsicFunctions="true"
-				FavorSizeOrSpeed="1"
-				OmitFramePointers="true"
-				EnableFiberSafeOptimizations="true"
-				WholeProgramOptimization="false"
-				AdditionalIncludeDirectories=".;..;&quot;lua\lua-5.1.4\src&quot;;&quot;glib-2.20.1\build&quot;;&quot;glib-2.20.1\build\glib&quot;;.\zlib123;.\zziplib;.\winpcap;userconfig;defaultconfig;.\7z;.\agg\include;.\agg\examples"
-				PreprocessorDefinitions="_CRT_SECURE_NO_DEPRECATE;GLIB_STATIC_COMPILATION;WIN32;HAVE_LIBZ;HAVE_LIBZZIP;SSE2;SPU_INTERPOLATE;NOMINMAX;RELEASE;EXPERIMENTAL_WIFI;NDEBUG"
-				StringPooling="true"
-				ExceptionHandling="1"
-				StructMemberAlignment="0"
-				BufferSecurityCheck="false"
-				EnableEnhancedInstructionSet="2"
-				FloatingPointModel="2"
-				WarningLevel="1"
-				DebugInformationFormat="3"
-				CallingConvention="0"
-				CompileAs="0"
-			/>
-			<Tool
-				Name="VCManagedResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCResourceCompilerTool"
-			/>
-			<Tool
-				Name="VCPreLinkEventTool"
-			/>
-			<Tool
-				Name="VCLinkerTool"
-				AdditionalDependencies="agg-2.5.lib lua-5.1.4-x86.lib glib-2.20.1-x86.lib vfw32.lib winmm.lib opengl32.lib glu32.lib ws2_32.lib user32.lib gdi32.lib directx\dxguid.lib shell32.lib comdlg32.lib directx\dxerr8.lib directx\dsound.lib directx\dinput8.lib directx\ddraw.lib zlib-2005-x32.lib zziplib-2005-x32.lib shlwapi.lib  winpcap\wpcap.lib 7zip.lib comctl32.lib"
-				OutputFile="$(OutDir)\$(ProjectName)_releaseFastBuild.exe"
-				AdditionalLibraryDirectories=".\zlib123;.\zziplib;&quot;glib-2.20.1\lib&quot;;lua\lib;.\7z;.\agg"
-				DelayLoadDLLs="wpcap.dll"
-				GenerateDebugInformation="true"
-				OptimizeReferences="2"
-				LinkTimeCodeGeneration="0"
-				RandomizedBaseAddress="1"
-				DataExecutionPrevention="0"
-				Profile="false"
-			/>
-			<Tool
-				Name="VCALinkTool"
-			/>
-			<Tool
-				Name="VCManifestTool"
-				AdditionalManifestFiles="DeSmuME_x86.manifest"
-			/>
-			<Tool
-				Name="VCXDCMakeTool"
-			/>
-			<Tool
-				Name="VCBscMakeTool"
-			/>
-			<Tool
-				Name="VCFxCopTool"
-			/>
-			<Tool
-				Name="VCAppVerifierTool"
-			/>
-			<Tool
-				Name="VCPostBuildEventTool"
-			/>
-		</Configuration>
 		<Configuration
 			Name="Debug|x64"
 			OutputDirectory="$(SolutionDir)\__bins"
@ -386,6 +199,99 @@
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)\__bins"
+			IntermediateDirectory="$(SolutionDir)\.VS2008\$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+				CommandLine="defaultconfig\SubWCRev.bat"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				InlineFunctionExpansion="2"
+				EnableIntrinsicFunctions="true"
+				FavorSizeOrSpeed="1"
+				OmitFramePointers="true"
+				EnableFiberSafeOptimizations="true"
+				WholeProgramOptimization="true"
+				AdditionalIncludeDirectories=".;..;&quot;lua\lua-5.1.4\src&quot;;&quot;glib-2.20.1\build&quot;;&quot;glib-2.20.1\build\glib&quot;;.\zlib123;.\zziplib;.\winpcap;userconfig;defaultconfig;.\7z;.\agg\include;.\agg\examples"
+				PreprocessorDefinitions="_CRT_SECURE_NO_DEPRECATE;GLIB_STATIC_COMPILATION;WIN32;HAVE_LIBZ;HAVE_LIBZZIP;SSE2;SPU_INTERPOLATE;NOMINMAX;RELEASE;EXPERIMENTAL_WIFI;NDEBUG"
+				StringPooling="true"
+				ExceptionHandling="1"
+				StructMemberAlignment="0"
+				BufferSecurityCheck="false"
+				EnableEnhancedInstructionSet="2"
+				FloatingPointModel="2"
+				WarningLevel="1"
+				DebugInformationFormat="3"
+				CallingConvention="0"
+				CompileAs="0"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="agg-2.5.lib lua-5.1.4-x86.lib glib-2.20.1-x86.lib vfw32.lib winmm.lib opengl32.lib glu32.lib ws2_32.lib user32.lib gdi32.lib directx\dxguid.lib shell32.lib comdlg32.lib directx\dxerr8.lib directx\dsound.lib directx\dinput8.lib directx\ddraw.lib zlib-2005-x32.lib zziplib-2005-x32.lib shlwapi.lib  winpcap\wpcap.lib 7zip.lib comctl32.lib"
+				OutputFile="$(OutDir)\$(ProjectName)_release.exe"
+				AdditionalLibraryDirectories=".\zlib123;.\zziplib;glib-2.20.1\lib;lua\lib;.\7z;agg"
+				DelayLoadDLLs="wpcap.dll"
+				GenerateDebugInformation="true"
+				OptimizeReferences="2"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="0"
+				Profile="false"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+				AdditionalManifestFiles="DeSmuME_x86.manifest"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
 		<Configuration
 			Name="Release|x64"
 			OutputDirectory="$(SolutionDir)\__bins"
@ -481,6 +387,100 @@
 				Name="VCPostBuildEventTool"
 			/>
 		</Configuration>
+		<Configuration
+			Name="Release FastBuild|Win32"
+			OutputDirectory="$(SolutionDir)\__bins"
+			IntermediateDirectory="$(SolutionDir)\.VS2008\$(ConfigurationName)\$(PlatformName)"
+			ConfigurationType="1"
+			InheritedPropertySheets="$(VCInstallDir)VCProjectDefaults\UpgradeFromVC71.vsprops"
+			WholeProgramOptimization="0"
+			>
+			<Tool
+				Name="VCPreBuildEventTool"
+				CommandLine="defaultconfig\SubWCRev.bat"
+			/>
+			<Tool
+				Name="VCCustomBuildTool"
+			/>
+			<Tool
+				Name="MASM"
+			/>
+			<Tool
+				Name="VCXMLDataGeneratorTool"
+			/>
+			<Tool
+				Name="VCWebServiceProxyGeneratorTool"
+			/>
+			<Tool
+				Name="VCMIDLTool"
+			/>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="3"
+				InlineFunctionExpansion="2"
+				EnableIntrinsicFunctions="true"
+				FavorSizeOrSpeed="1"
+				OmitFramePointers="true"
+				EnableFiberSafeOptimizations="true"
+				WholeProgramOptimization="false"
+				AdditionalIncludeDirectories=".;..;&quot;lua\lua-5.1.4\src&quot;;&quot;glib-2.20.1\build&quot;;&quot;glib-2.20.1\build\glib&quot;;.\zlib123;.\zziplib;.\winpcap;userconfig;defaultconfig;.\7z;.\agg\include;.\agg\examples"
+				PreprocessorDefinitions="_CRT_SECURE_NO_DEPRECATE;GLIB_STATIC_COMPILATION;WIN32;HAVE_LIBZ;HAVE_LIBZZIP;SSE2;SPU_INTERPOLATE;NOMINMAX;RELEASE;EXPERIMENTAL_WIFI;NDEBUG"
+				StringPooling="true"
+				ExceptionHandling="1"
+				StructMemberAlignment="0"
+				BufferSecurityCheck="false"
+				EnableEnhancedInstructionSet="2"
+				FloatingPointModel="2"
+				WarningLevel="1"
+				DebugInformationFormat="3"
+				CallingConvention="0"
+				CompileAs="0"
+			/>
+			<Tool
+				Name="VCManagedResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCResourceCompilerTool"
+			/>
+			<Tool
+				Name="VCPreLinkEventTool"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				AdditionalDependencies="agg-2.5.lib lua-5.1.4-x86.lib glib-2.20.1-x86.lib vfw32.lib winmm.lib opengl32.lib glu32.lib ws2_32.lib user32.lib gdi32.lib directx\dxguid.lib shell32.lib comdlg32.lib directx\dxerr8.lib directx\dsound.lib directx\dinput8.lib directx\ddraw.lib zlib-2005-x32.lib zziplib-2005-x32.lib shlwapi.lib  winpcap\wpcap.lib 7zip.lib comctl32.lib"
+				OutputFile="$(OutDir)\$(ProjectName)_releaseFastBuild.exe"
+				AdditionalLibraryDirectories=".\zlib123;.\zziplib;&quot;glib-2.20.1\lib&quot;;lua\lib;.\7z;.\agg"
+				DelayLoadDLLs="wpcap.dll"
+				GenerateDebugInformation="true"
+				OptimizeReferences="2"
+				LinkTimeCodeGeneration="0"
+				RandomizedBaseAddress="1"
+				DataExecutionPrevention="0"
+				Profile="false"
+			/>
+			<Tool
+				Name="VCALinkTool"
+			/>
+			<Tool
+				Name="VCManifestTool"
+				AdditionalManifestFiles="DeSmuME_x86.manifest"
+			/>
+			<Tool
+				Name="VCXDCMakeTool"
+			/>
+			<Tool
+				Name="VCBscMakeTool"
+			/>
+			<Tool
+				Name="VCFxCopTool"
+			/>
+			<Tool
+				Name="VCAppVerifierTool"
+			/>
+			<Tool
+				Name="VCPostBuildEventTool"
+			/>
+		</Configuration>
 		<Configuration
 			Name="Release FastBuild|x64"
 			OutputDirectory="$(SolutionDir)\__bins"
@ -644,7 +644,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release FastBuild|Win32"
+					Name="Release|x64"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@ -652,7 +652,7 @@
 					/>
 				</FileConfiguration>
 				<FileConfiguration
-					Name="Release|x64"
+					Name="Release FastBuild|Win32"
 					>
 					<Tool
 						Name="VCCLCompilerTool"
@ -1060,6 +1060,134 @@
 					</File>
 				</Filter>
 			</Filter>
+			<Filter
+				Name="metaspu"
+				>
+				<File
+					RelativePath="..\metaspu\metaspu.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\metaspu.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SndOut.cpp"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\SndOut.h"
+					>
+				</File>
+				<File
+					RelativePath="..\metaspu\Timestretcher.cpp"
+					>
+				</File>
+				<Filter
+					Name="win32"
+					>
+					<File
+						RelativePath="..\metaspu\win32\ConfigSoundtouch.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\win32\Dialogs.h"
+						>
+					</File>
+				</Filter>
+				<Filter
+					Name="SoundTouch"
+					>
+					<File
+						RelativePath="..\metaspu\SoundTouch\3dnow_win.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\AAFilter.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\AAFilter.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\BPMDetect.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\cpu_detect.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\cpu_detect_x86_win.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\FIFOSampleBuffer.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\FIFOSampleBuffer.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\FIFOSamplePipe.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\FIRFilter.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\FIRFilter.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\mmx_optimized.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\RateTransposer.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\RateTransposer.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\SoundTouch.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\SoundTouch.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\sse_optimized.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\STTypes.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\TDStretch.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\TDStretch.h"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\WavFile.cpp"
+						>
+					</File>
+					<File
+						RelativePath="..\metaspu\SoundTouch\WavFile.h"
+						>
+					</File>
+				</Filter>
+			</Filter>
 		</Filter>
 		<Filter
 			Name="Windows"
--- a/desmume/src/windows/main.cpp
+++ b/desmume/src/windows/main.cpp
@ -4831,6 +4831,7 @@ static void SoundSettings_updateSynchMode(HWND hDlg)
 	EnableWindow(GetDlgItem(hDlg,IDC_GROUP_SYNCHMETHOD),en);
 	EnableWindow(GetDlgItem(hDlg,IDC_SYNCHMETHOD_N),en);
 	EnableWindow(GetDlgItem(hDlg,IDC_SYNCHMETHOD_Z),en);
+	EnableWindow(GetDlgItem(hDlg,IDC_SYNCHMETHOD_P),en);
 }

 static LRESULT CALLBACK SoundSettingsDlgProc(HWND hDlg, UINT uMsg, WPARAM wParam, LPARAM lParam)
@ -4865,6 +4866,7 @@ static LRESULT CALLBACK SoundSettingsDlgProc(HWND hDlg, UINT uMsg, WPARAM wParam
 			//update the synch method
 			CheckDlgItem(hDlg,IDC_SYNCHMETHOD_N,snd_synchmethod==0);
 			CheckDlgItem(hDlg,IDC_SYNCHMETHOD_Z,snd_synchmethod==1);
+			CheckDlgItem(hDlg,IDC_SYNCHMETHOD_P,snd_synchmethod==2);

 			//setup interpolation combobox
 			SendDlgItemMessage(hDlg, IDC_SPU_INTERPOLATION_CB, CB_RESETCONTENT, 0, 0);
@ -4948,6 +4950,7 @@ static LRESULT CALLBACK SoundSettingsDlgProc(HWND hDlg, UINT uMsg, WPARAM wParam
 					//save the synch method
 					if(IsDlgCheckboxChecked(hDlg,IDC_SYNCHMETHOD_N)) snd_synchmethod = 0;
 					if(IsDlgCheckboxChecked(hDlg,IDC_SYNCHMETHOD_Z)) snd_synchmethod = 1;
+					if(IsDlgCheckboxChecked(hDlg,IDC_SYNCHMETHOD_P)) snd_synchmethod = 2;
 					WritePrivateProfileInt("Sound", "SynchMethod", snd_synchmethod, IniName);

 					SPU_SetSynchMode(snd_synchmode, snd_synchmethod);
--- a/desmume/src/windows/resource.h
+++ b/desmume/src/windows/resource.h
@ -351,6 +351,7 @@
 #define IDC_GROUP_SYNCHMETHOD           1017
 #define IDC_AUTOUPDATE_ASM              1018
 #define IDC_BGMAP_PRIO                  1018
+#define IDC_SYNCHMETHOD_P               1018
 #define IDC_BGMAP_PAL                   1019
 #define IDC_VISIBLE                     1019
 #define IDC_BGMAP_SIZE                  1020
--- a/desmume/src/windows/resources.rc
+++ b/desmume/src/windows/resources.rc