diff --git a/desmume/src/metaspu/SoundTouch/BPMDetect.h b/desmume/src/metaspu/SoundTouch/BPMDetect.h
index ac616e7e2..d9b5ccc1d 100644
--- a/desmume/src/metaspu/SoundTouch/BPMDetect.h
+++ b/desmume/src/metaspu/SoundTouch/BPMDetect.h
@@ -26,10 +26,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.5 $
+// Last changed  : $Date: 2012-08-30 16:53:44 -0300 (qui, 30 ago 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: BPMDetect.h,v 1.5 2006/02/05 16:44:06 Olli Exp $
+// $Id: BPMDetect.h 150 2012-08-30 19:53:44Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -60,11 +60,14 @@
 #include "STTypes.h"
 #include "FIFOSampleBuffer.h"
 
+namespace soundtouch
+{
+
 /// Minimum allowed BPM rate. Used to restrict accepted result above a reasonable limit.
-#define MIN_BPM 45
+#define MIN_BPM 29
 
 /// Maximum allowed BPM rate. Used to restrict accepted result below a reasonable limit.
-#define MAX_BPM 230
+#define MAX_BPM 200
 
 
 /// Class for calculating BPM rate for audio data.
@@ -75,10 +78,10 @@ protected:
     float *xcorr;
     
     /// Amplitude envelope sliding average approximation level accumulator
-    float envelopeAccu;
+    double envelopeAccu;
 
     /// RMS volume sliding average approximation level accumulator
-    float RMSVolumeAccu;
+    double RMSVolumeAccu;
 
     /// Sample average counter.
     int decimateCount;
@@ -105,9 +108,6 @@ protected:
     /// FIFO-buffer for decimated processing samples.
     soundtouch::FIFOSampleBuffer *buffer;
 
-    /// Initialize the class for processing.
-    void init(int numChannels, int sampleRate);
-
     /// Updates auto-correlation function for given number of decimated samples that 
     /// are read from the internal 'buffer' pipe (samples aren't removed from the pipe 
     /// though).
@@ -128,6 +128,9 @@ protected:
                       int numsamples                    ///< Number of samples in buffer
                       );
 
+    /// remove constant bias from xcorr data
+    void removeBias();
+
 public:
     /// Constructor.
     BPMDetect(int numChannels,  ///< Number of channels in sample data.
@@ -143,8 +146,8 @@ public:
     /// function. 
     /// 
     /// Notice that data in 'samples' array can be disrupted in processing.
-    void inputSamples(soundtouch::SAMPLETYPE *samples,  ///< Pointer to input/working data buffer
-                      int numSamples                    ///< Number of samples in buffer
+    void inputSamples(const soundtouch::SAMPLETYPE *samples,    ///< Pointer to input/working data buffer
+                      int numSamples                            ///< Number of samples in buffer
                       );
 
 
@@ -156,4 +159,6 @@ public:
     float getBpm();
 };
 
+}
+
 #endif // _BPMDetect_H_
diff --git a/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.cpp b/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.cpp
index 2bf965b76..a5eb56a7a 100644
--- a/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.cpp
+++ b/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.cpp
@@ -15,10 +15,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.11 $
+// Last changed  : $Date: 2012-11-08 16:53:01 -0200 (qui, 08 nov 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: FIFOSampleBuffer.cpp,v 1.11 2006/02/05 16:44:06 Olli Exp $
+// $Id: FIFOSampleBuffer.cpp 160 2012-11-08 18:53:01Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -47,21 +47,22 @@
 #include <memory.h>
 #include <string.h>
 #include <assert.h>
-#include <stdexcept>
 
 #include "FIFOSampleBuffer.h"
 
 using namespace soundtouch;
 
 // Constructor
-FIFOSampleBuffer::FIFOSampleBuffer(uint numChannels)
+FIFOSampleBuffer::FIFOSampleBuffer(int numChannels)
 {
+    assert(numChannels > 0);
     sizeInBytes = 0; // reasonable initial value
-    buffer = NULL;  //new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE)];
+    buffer = NULL;
     bufferUnaligned = NULL;
     samplesInBuffer = 0;
     bufferPos = 0;
-    channels = numChannels;
+    channels = (uint)numChannels;
+    ensureCapacity(32);     // allocate initial capacity 
 }
 
 
@@ -69,16 +70,19 @@ FIFOSampleBuffer::FIFOSampleBuffer(uint numChannels)
 FIFOSampleBuffer::~FIFOSampleBuffer()
 {
     delete[] bufferUnaligned;
+    bufferUnaligned = NULL;
+    buffer = NULL;
 }
 
 
 // Sets number of channels, 1 = mono, 2 = stereo
-void FIFOSampleBuffer::setChannels(const uint numChannels)
+void FIFOSampleBuffer::setChannels(int numChannels)
 {
     uint usedBytes;
 
+    assert(numChannels > 0);
     usedBytes = channels * samplesInBuffer;
-    channels = numChannels;
+    channels = (uint)numChannels;
     samplesInBuffer = usedBytes / channels;
 }
 
@@ -88,7 +92,7 @@ void FIFOSampleBuffer::setChannels(const uint numChannels)
 // location on to the beginning of the buffer.
 void FIFOSampleBuffer::rewind()
 {
-    if (bufferPos) 
+    if (buffer && bufferPos) 
     {
         memmove(buffer, ptrBegin(), sizeof(SAMPLETYPE) * channels * samplesInBuffer);
         bufferPos = 0;
@@ -98,10 +102,10 @@ void FIFOSampleBuffer::rewind()
 
 // Adds 'numSamples' pcs of samples from the 'samples' memory position to 
 // the sample buffer.
-void FIFOSampleBuffer::putSamples(const SAMPLETYPE *samples, uint numSamples)
+void FIFOSampleBuffer::putSamples(const SAMPLETYPE *samples, uint nSamples)
 {
-    memcpy(ptrEnd(numSamples), samples, sizeof(SAMPLETYPE) * numSamples * channels);
-    samplesInBuffer += numSamples;
+    memcpy(ptrEnd(nSamples), samples, sizeof(SAMPLETYPE) * nSamples * channels);
+    samplesInBuffer += nSamples;
 }
 
 
@@ -111,13 +115,13 @@ void FIFOSampleBuffer::putSamples(const SAMPLETYPE *samples, uint numSamples)
 // This function is used to update the number of samples in the sample buffer
 // when accessing the buffer directly with 'ptrEnd' function. Please be 
 // careful though!
-void FIFOSampleBuffer::putSamples(uint numSamples)
+void FIFOSampleBuffer::putSamples(uint nSamples)
 {
     uint req;
 
-    req = samplesInBuffer + numSamples;
+    req = samplesInBuffer + nSamples;
     ensureCapacity(req);
-    samplesInBuffer += numSamples;
+    samplesInBuffer += nSamples;
 }
 
 
@@ -147,8 +151,9 @@ SAMPLETYPE *FIFOSampleBuffer::ptrEnd(uint slackCapacity)
 // When using this function to output samples, also remember to 'remove' the
 // outputted samples from the buffer by calling the 
 // 'receiveSamples(numSamples)' function
-SAMPLETYPE *FIFOSampleBuffer::ptrBegin() const
+SAMPLETYPE *FIFOSampleBuffer::ptrBegin()
 {
+    assert(buffer);
     return buffer + bufferPos * channels;
 }
 
@@ -164,15 +169,19 @@ void FIFOSampleBuffer::ensureCapacity(uint capacityRequirement)
     if (capacityRequirement > getCapacity()) 
     {
         // enlarge the buffer in 4kbyte steps (round up to next 4k boundary)
-        sizeInBytes = (capacityRequirement * channels * sizeof(SAMPLETYPE) + 4095) & -4096;
+        sizeInBytes = (capacityRequirement * channels * sizeof(SAMPLETYPE) + 4095) & (uint)-4096;
         assert(sizeInBytes % 2 == 0);
         tempUnaligned = new SAMPLETYPE[sizeInBytes / sizeof(SAMPLETYPE) + 16 / sizeof(SAMPLETYPE)];
         if (tempUnaligned == NULL)
         {
-            throw std::runtime_error("Couldn't allocate memory!\n");
+            ST_THROW_RT_ERROR("Couldn't allocate memory!\n");
+        }
+        // Align the buffer to begin at 16byte cache line boundary for optimal performance
+        temp = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(tempUnaligned);
+        if (samplesInBuffer)
+        {
+            memcpy(temp, ptrBegin(), samplesInBuffer * channels * sizeof(SAMPLETYPE));
         }
-        temp = (SAMPLETYPE *)(((ulongptr)tempUnaligned + 15) & -16);
-        memcpy(temp, ptrBegin(), samplesInBuffer * channels * sizeof(SAMPLETYPE));
         delete[] bufferUnaligned;
         buffer = temp;
         bufferUnaligned = tempUnaligned;
@@ -250,3 +259,16 @@ void FIFOSampleBuffer::clear()
     samplesInBuffer = 0;
     bufferPos = 0;
 }
+
+
+/// Allows trimming (downwards) the number of samples in the pipeline.
+/// Returns the adjusted number of samples.
+uint FIFOSampleBuffer::adjustAmountOfSamples(uint numSamples)
+{
+    if (numSamples < samplesInBuffer)
+    {
+        samplesInBuffer = numSamples;
+    }
+    return samplesInBuffer;
+}
+
diff --git a/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.h b/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.h
index 09f74c9a6..665badd21 100644
--- a/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.h
+++ b/desmume/src/metaspu/SoundTouch/FIFOSampleBuffer.h
@@ -15,10 +15,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.9 $
+// Last changed  : $Date: 2012-06-13 16:29:53 -0300 (qua, 13 jun 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: FIFOSampleBuffer.h,v 1.9 2006/02/05 16:44:06 Olli Exp $
+// $Id: FIFOSampleBuffer.h 143 2012-06-13 19:29:53Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -85,15 +85,15 @@ private:
     void rewind();
 
     /// Ensures that the buffer has capacity for at least this many samples.
-    void ensureCapacity(const uint capacityRequirement);
+    void ensureCapacity(uint capacityRequirement);
 
     /// Returns current capacity.
     uint getCapacity() const;
- 
+
 public:
 
     /// Constructor
-    FIFOSampleBuffer(uint numChannels = 2     ///< Number of channels, 1=mono, 2=stereo.
+    FIFOSampleBuffer(int numChannels = 2     ///< Number of channels, 1=mono, 2=stereo.
                                               ///< Default is stereo.
                      );
 
@@ -107,7 +107,7 @@ public:
     /// When using this function to output samples, also remember to 'remove' the
     /// output samples from the buffer by calling the 
     /// 'receiveSamples(numSamples)' function
-    virtual SAMPLETYPE *ptrBegin() const;
+    virtual SAMPLETYPE *ptrBegin();
 
     /// Returns a pointer to the end of the used part of the sample buffer (i.e. 
     /// where the new samples are to be inserted). This function may be used for 
@@ -160,13 +160,17 @@ public:
     virtual uint numSamples() const;
 
     /// Sets number of channels, 1 = mono, 2 = stereo.
-    void setChannels(uint numChannels);
+    void setChannels(int numChannels);
 
     /// Returns nonzero if there aren't any samples available for outputting.
     virtual int isEmpty() const;
 
     /// Clears all the samples.
     virtual void clear();
+
+    /// Allows trimming (downwards) the number of samples in the pipeline.
+    /// Returns the adjusted number of samples.
+    uint adjustAmountOfSamples(uint numSamples);
 };
 
 }
diff --git a/desmume/src/metaspu/SoundTouch/FIFOSamplePipe.h b/desmume/src/metaspu/SoundTouch/FIFOSamplePipe.h
index 33e33c7e3..6efb9af88 100644
--- a/desmume/src/metaspu/SoundTouch/FIFOSamplePipe.h
+++ b/desmume/src/metaspu/SoundTouch/FIFOSamplePipe.h
@@ -17,10 +17,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.8 $
+// Last changed  : $Date: 2012-06-13 16:29:53 -0300 (qua, 13 jun 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: FIFOSamplePipe.h,v 1.8 2006/02/05 16:44:06 Olli Exp $
+// $Id: FIFOSamplePipe.h 143 2012-06-13 19:29:53Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -59,6 +59,10 @@ namespace soundtouch
 class FIFOSamplePipe
 {
 public:
+    // virtual default destructor
+    virtual ~FIFOSamplePipe() {}
+
+
     /// Returns a pointer to the beginning of the output samples. 
     /// This function is provided for accessing the output samples directly. 
     /// Please be careful for not to corrupt the book-keeping!
@@ -66,12 +70,12 @@ public:
     /// When using this function to output samples, also remember to 'remove' the
     /// output samples from the buffer by calling the 
     /// 'receiveSamples(numSamples)' function
-    virtual SAMPLETYPE *ptrBegin() const = 0;
+    virtual SAMPLETYPE *ptrBegin() = 0;
 
     /// Adds 'numSamples' pcs of samples from the 'samples' memory position to
     /// the sample buffer.
     virtual void putSamples(const SAMPLETYPE *samples,  ///< Pointer to samples.
-                            uint numSamples                         ///< Number of samples to insert.
+                            uint numSamples             ///< Number of samples to insert.
                             ) = 0;
 
 
@@ -110,6 +114,11 @@ public:
 
     /// Clears all the samples.
     virtual void clear() = 0;
+
+    /// Allows trimming (downwards) the number of samples in the pipeline.
+    /// Returns the adjusted number of samples.
+    virtual uint adjustAmountOfSamples(uint numSamples) = 0;
+
 };
 
 
@@ -166,7 +175,7 @@ protected:
     /// When using this function to output samples, also remember to 'remove' the
     /// output samples from the buffer by calling the 
     /// 'receiveSamples(numSamples)' function
-    virtual SAMPLETYPE *ptrBegin() const
+    virtual SAMPLETYPE *ptrBegin()
     {
         return output->ptrBegin();
     }
@@ -210,6 +219,14 @@ public:
     {
         return output->isEmpty();
     }
+
+    /// Allows trimming (downwards) the number of samples in the pipeline.
+    /// Returns the adjusted number of samples.
+    virtual uint adjustAmountOfSamples(uint numSamples)
+    {
+        return output->adjustAmountOfSamples(numSamples);
+    }
+
 };
 
 }
diff --git a/desmume/src/metaspu/SoundTouch/FIRFilter.cpp b/desmume/src/metaspu/SoundTouch/FIRFilter.cpp
index d7f2485fc..7e88b7d18 100644
--- a/desmume/src/metaspu/SoundTouch/FIRFilter.cpp
+++ b/desmume/src/metaspu/SoundTouch/FIRFilter.cpp
@@ -11,10 +11,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.16 $
+// Last changed  : $Date: 2011-09-02 15:56:11 -0300 (sex, 02 set 2011) $
+// File revision : $Revision: 4 $
 //
-// $Id: FIRFilter.cpp,v 1.16 2006/02/05 16:44:06 Olli Exp $
+// $Id: FIRFilter.cpp 131 2011-09-02 18:56:11Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -43,7 +43,6 @@
 #include <assert.h>
 #include <math.h>
 #include <stdlib.h>
-#include <stdexcept>
 #include "FIRFilter.h"
 #include "cpu_detect.h"
 
@@ -58,6 +57,7 @@ using namespace soundtouch;
 FIRFilter::FIRFilter()
 {
     resultDivFactor = 0;
+    resultDivider = 0;
     length = 0;
     lengthDiv8 = 0;
     filterCoeffs = NULL;
@@ -74,13 +74,16 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
 {
     uint i, j, end;
     LONG_SAMPLETYPE suml, sumr;
-#ifdef FLOAT_SAMPLES
+#ifdef SOUNDTOUCH_FLOAT_SAMPLES
     // when using floating point samples, use a scaler instead of a divider
     // because division is much slower operation than multiplying.
     double dScaler = 1.0 / (double)resultDivider;
 #endif
 
     assert(length != 0);
+    assert(src != NULL);
+    assert(dest != NULL);
+    assert(filterCoeffs != NULL);
 
     end = 2 * (numSamples - length);
 
@@ -104,7 +107,7 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
                     ptr[2 * i + 7] * filterCoeffs[i + 3];
         }
 
-#ifdef INTEGER_SAMPLES
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
         suml >>= resultDivFactor;
         sumr >>= resultDivFactor;
         // saturate to 16 bit integer limits
@@ -114,7 +117,7 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
 #else
         suml *= dScaler;
         sumr *= dScaler;
-#endif // INTEGER_SAMPLES
+#endif // SOUNDTOUCH_INTEGER_SAMPLES
         dest[j] = (SAMPLETYPE)suml;
         dest[j + 1] = (SAMPLETYPE)sumr;
     }
@@ -129,7 +132,7 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
 {
     uint i, j, end;
     LONG_SAMPLETYPE sum;
-#ifdef FLOAT_SAMPLES
+#ifdef SOUNDTOUCH_FLOAT_SAMPLES
     // when using floating point samples, use a scaler instead of a divider
     // because division is much slower operation than multiplying.
     double dScaler = 1.0 / (double)resultDivider;
@@ -150,13 +153,13 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
                    src[i + 2] * filterCoeffs[i + 2] + 
                    src[i + 3] * filterCoeffs[i + 3];
         }
-#ifdef INTEGER_SAMPLES
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
         sum >>= resultDivFactor;
         // saturate to 16 bit integer limits
         sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum;
 #else
         sum *= dScaler;
-#endif // INTEGER_SAMPLES
+#endif // SOUNDTOUCH_INTEGER_SAMPLES
         dest[j] = (SAMPLETYPE)sum;
         src ++;
     }
@@ -170,18 +173,14 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
 void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint uResultDivFactor)
 {
     assert(newLength > 0);
-    if (newLength % 8) throw std::runtime_error("FIR filter length not divisible by 8");
+    if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8");
 
     lengthDiv8 = newLength / 8;
     length = lengthDiv8 * 8;
     assert(length == newLength);
 
     resultDivFactor = uResultDivFactor;
-#ifdef INTEGER_SAMPLES
-    resultDivider = (SAMPLETYPE)(1<<resultDivFactor);
-#else
-    resultDivider = (SAMPLETYPE)powf(2, (SAMPLETYPE)resultDivFactor);
-#endif
+    resultDivider = (SAMPLETYPE)::pow(2.0, (int)resultDivFactor);
 
     delete[] filterCoeffs;
     filterCoeffs = new SAMPLETYPE[length];
@@ -207,7 +206,6 @@ uint FIRFilter::evaluate(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSample
     assert(length > 0);
     assert(lengthDiv8 * 8 == length);
     if (numSamples < length) return 0;
-    assert(resultDivFactor >= 0);
     if (numChannels == 2) 
     {
         return evaluateFilterStereo(dest, src, numSamples);
@@ -223,46 +221,36 @@ uint FIRFilter::evaluate(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSample
 void * FIRFilter::operator new(size_t s)
 {
     // Notice! don't use "new FIRFilter" directly, use "newInstance" to create a new instance instead!
-    throw std::runtime_error("Don't use 'new FIRFilter', use 'newInstance' member instead!");
-    return NULL;
+    ST_THROW_RT_ERROR("Error in FIRFilter::new: Don't use 'new FIRFilter', use 'newInstance' member instead!");
+    return newInstance();
 }
 
 
 FIRFilter * FIRFilter::newInstance()
 {
-    uint uExtensions = 0;
+    uint uExtensions;
 
-#if !defined(_MSC_VER) || !defined(__x86_64__)
     uExtensions = detectCPUextensions();
-#endif
-    // Check if MMX/SSE/3DNow! instruction set extensions supported by CPU
 
-#ifdef ALLOW_MMX
+    // Check if MMX/SSE instruction set extensions supported by CPU
+
+#ifdef SOUNDTOUCH_ALLOW_MMX
     // MMX routines available only with integer sample types
     if (uExtensions & SUPPORT_MMX)
     {
         return ::new FIRFilterMMX;
     }
     else
-#endif // ALLOW_MMX
+#endif // SOUNDTOUCH_ALLOW_MMX
 
-#ifdef ALLOW_SSE
+#ifdef SOUNDTOUCH_ALLOW_SSE
     if (uExtensions & SUPPORT_SSE)
     {
         // SSE support
         return ::new FIRFilterSSE;
     }
     else
-#endif // ALLOW_SSE
-
-#ifdef ALLOW_3DNOW
-    if (uExtensions & SUPPORT_3DNOW)
-    {
-        // 3DNow! support
-        return ::new FIRFilter3DNow;
-    }
-    else
-#endif // ALLOW_3DNOW
+#endif // SOUNDTOUCH_ALLOW_SSE
 
     {
         // ISA optimizations not supported, use plain C version
diff --git a/desmume/src/metaspu/SoundTouch/FIRFilter.h b/desmume/src/metaspu/SoundTouch/FIRFilter.h
index be5cdd294..70cde4332 100644
--- a/desmume/src/metaspu/SoundTouch/FIRFilter.h
+++ b/desmume/src/metaspu/SoundTouch/FIRFilter.h
@@ -11,10 +11,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.17 $
+// Last changed  : $Date: 2011-02-13 17:13:57 -0200 (dom, 13 fev 2011) $
+// File revision : $Revision: 4 $
 //
-// $Id: FIRFilter.h,v 1.17 2006/02/05 16:44:06 Olli Exp $
+// $Id: FIRFilter.h 104 2011-02-13 19:13:57Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -42,6 +42,7 @@
 #ifndef FIRFilter_H
 #define FIRFilter_H
 
+#include <stddef.h>
 #include "STTypes.h"
 
 namespace soundtouch
@@ -77,7 +78,7 @@ public:
 
     /// Operator 'new' is overloaded so that it automatically creates a suitable instance 
     /// depending on if we've a MMX-capable CPU available or not.
-    void * operator new(size_t s);
+    static void * operator new(size_t s);
 
     static FIRFilter *newInstance();
 
@@ -101,9 +102,9 @@ public:
 
 // Optional subclasses that implement CPU-specific optimizations:
 
-#ifdef ALLOW_MMX
+#ifdef SOUNDTOUCH_ALLOW_MMX
 
-    /// Class that implements MMX optimized functions exclusive for 16bit integer samples type.
+/// Class that implements MMX optimized functions exclusive for 16bit integer samples type.
     class FIRFilterMMX : public FIRFilter
     {
     protected:
@@ -118,29 +119,10 @@ public:
         virtual void setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor);
     };
 
-#endif // ALLOW_MMX
+#endif // SOUNDTOUCH_ALLOW_MMX
 
 
-#ifdef ALLOW_3DNOW
-
-    /// Class that implements 3DNow! optimized functions exclusive for floating point samples type.
-    class FIRFilter3DNow : public FIRFilter
-    {
-    protected:
-        float *filterCoeffsUnalign;
-        float *filterCoeffsAlign;
-
-        virtual uint evaluateFilterStereo(float *dest, const float *src, uint numSamples) const;
-    public:
-        FIRFilter3DNow();
-        ~FIRFilter3DNow();
-        virtual void setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor);
-    };
-
-#endif  // ALLOW_3DNOW
-
-
-#ifdef ALLOW_SSE
+#ifdef SOUNDTOUCH_ALLOW_SSE
     /// Class that implements SSE optimized functions exclusive for floating point samples type.
     class FIRFilterSSE : public FIRFilter
     {
@@ -156,7 +138,7 @@ public:
         virtual void setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor);
     };
 
-#endif // ALLOW_SSE
+#endif // SOUNDTOUCH_ALLOW_SSE
 
 }
 
diff --git a/desmume/src/metaspu/SoundTouch/RateTransposer.cpp b/desmume/src/metaspu/SoundTouch/RateTransposer.cpp
index b7414b90a..a3625ed49 100644
--- a/desmume/src/metaspu/SoundTouch/RateTransposer.cpp
+++ b/desmume/src/metaspu/SoundTouch/RateTransposer.cpp
@@ -10,10 +10,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/03/19 10:05:49 $
-// File revision : $Revision: 1.13 $
+// Last changed  : $Date: 2011-09-02 15:56:11 -0300 (sex, 02 set 2011) $
+// File revision : $Revision: 4 $
 //
-// $Id: RateTransposer.cpp,v 1.13 2006/03/19 10:05:49 Olli Exp $
+// $Id: RateTransposer.cpp 131 2011-09-02 18:56:11Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -42,7 +42,6 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include <limits.h>
 #include "RateTransposer.h"
 #include "AAFilter.h"
 
@@ -55,7 +54,7 @@ class RateTransposerInteger : public RateTransposer
 {
 protected:
     int iSlopeCount;
-    uint uRate;
+    int iRate;
     SAMPLETYPE sPrevSampleL, sPrevSampleR;
 
     virtual void resetRegisters();
@@ -84,7 +83,6 @@ class RateTransposerFloat : public RateTransposer
 {
 protected:
     float fSlopeCount;
-    float fRateStep;
     SAMPLETYPE sPrevSampleL, sPrevSampleR;
 
     virtual void resetRegisters();
@@ -103,25 +101,19 @@ public:
 
 
 
-#ifndef min
-#define min(a,b) ((a > b) ? b : a)
-#define max(a,b) ((a < b) ? b : a)
-#endif
-
 
 // Operator 'new' is overloaded so that it automatically creates a suitable instance 
 // depending on if we've a MMX/SSE/etc-capable CPU available or not.
 void * RateTransposer::operator new(size_t s)
 {
-    // Notice! don't use "new TDStretch" directly, use "newInstance" to create a new instance instead!
-    assert(FALSE);  
-    return NULL;
+    ST_THROW_RT_ERROR("Error in RateTransposer::new: don't use \"new RateTransposer\" directly, use \"newInstance\" to create a new instance instead!");
+    return newInstance();
 }
 
 
 RateTransposer *RateTransposer::newInstance()
 {
-#ifdef INTEGER_SAMPLES
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
     return ::new RateTransposerInteger;
 #else
     return ::new RateTransposerFloat;
@@ -132,8 +124,9 @@ RateTransposer *RateTransposer::newInstance()
 // Constructor
 RateTransposer::RateTransposer() : FIFOProcessor(&outputBuffer)
 {
-    uChannels = 2;
+    numChannels = 2;
     bUseAAFilter = TRUE;
+    fRate = 0;
 
     // Instantiates the anti-alias filter with default tap length
     // of 32
@@ -150,7 +143,7 @@ RateTransposer::~RateTransposer()
 
 
 /// Enables/disables the anti-alias filter. Zero to disable, nonzero to enable
-void RateTransposer::enableAAFilter(const BOOL newMode)
+void RateTransposer::enableAAFilter(BOOL newMode)
 {
     bUseAAFilter = newMode;
 }
@@ -163,18 +156,18 @@ BOOL RateTransposer::isAAFilterEnabled() const
 }
 
 
-AAFilter *RateTransposer::getAAFilter() const
+AAFilter *RateTransposer::getAAFilter()
 {
     return pAAFilter;
 }
 
 
 
-// Sets new target uRate. Normal uRate = 1.0, smaller values represent slower 
-// uRate, larger faster uRates.
+// Sets new target rate. Normal rate = 1.0, smaller values represent slower
+// rate, larger values faster rates.
 void RateTransposer::setRate(float newRate)
 {
-    float fCutoff;
+    double fCutoff;
 
     fRate = newRate;
 
@@ -197,45 +190,47 @@ void RateTransposer::setRate(float newRate)
 //
 // It's allowed for 'output' and 'input' parameters to point to the same
 // memory position.
+/*
 void RateTransposer::flushStoreBuffer()
 {
     if (storeBuffer.isEmpty()) return;
 
     outputBuffer.moveSamples(storeBuffer);
 }
+*/
 
 
-// Adds 'numSamples' pcs of samples from the 'samples' memory position into
+// Adds 'nSamples' pcs of samples from the 'samples' memory position into
 // the input of the object.
-void RateTransposer::putSamples(const SAMPLETYPE *samples, uint numSamples)
+void RateTransposer::putSamples(const SAMPLETYPE *samples, uint nSamples)
 {
-    processSamples(samples, numSamples);
+    processSamples(samples, nSamples);
 }
 
 
 
 // Transposes up the sample rate, causing the observed playback 'rate' of the
 // sound to decrease
-void RateTransposer::upsample(const SAMPLETYPE *src, uint numSamples)
+void RateTransposer::upsample(const SAMPLETYPE *src, uint nSamples)
 {
-    int count, sizeTemp, num;
+    uint count, sizeTemp, num;
 
     // If the parameter 'uRate' value is smaller than 'SCALE', first transpose
     // the samples and then apply the anti-alias filter to remove aliasing.
 
     // First check that there's enough room in 'storeBuffer' 
     // (+16 is to reserve some slack in the destination buffer)
-    sizeTemp = (int)((float)numSamples / fRate + 16.0f);
+    sizeTemp = (uint)((float)nSamples / fRate + 16.0f);
 
     // Transpose the samples, store the result into the end of "storeBuffer"
-    count = transpose(storeBuffer.ptrEnd(sizeTemp), src, numSamples);
+    count = transpose(storeBuffer.ptrEnd(sizeTemp), src, nSamples);
     storeBuffer.putSamples(count);
 
     // Apply the anti-alias filter to samples in "store output", output the
     // result to "dest"
     num = storeBuffer.numSamples();
     count = pAAFilter->evaluate(outputBuffer.ptrEnd(num), 
-        storeBuffer.ptrBegin(), num, uChannels);
+        storeBuffer.ptrBegin(), num, (uint)numChannels);
     outputBuffer.putSamples(count);
 
     // Remove the processed samples from "storeBuffer"
@@ -245,16 +240,16 @@ void RateTransposer::upsample(const SAMPLETYPE *src, uint numSamples)
 
 // Transposes down the sample rate, causing the observed playback 'rate' of the
 // sound to increase
-void RateTransposer::downsample(const SAMPLETYPE *src, uint numSamples)
+void RateTransposer::downsample(const SAMPLETYPE *src, uint nSamples)
 {
-    int count, sizeTemp;
+    uint count, sizeTemp;
 
     // If the parameter 'uRate' value is larger than 'SCALE', first apply the
     // anti-alias filter to remove high frequencies (prevent them from folding
-    // over the lover frequencies), then transpose. */
+    // over the lower frequencies), then transpose.
 
-    // Add the new samples to the end of the storeBuffer */
-    storeBuffer.putSamples(src, numSamples);
+    // Add the new samples to the end of the storeBuffer
+    storeBuffer.putSamples(src, nSamples);
 
     // Anti-alias filter the samples to prevent folding and output the filtered 
     // data to tempBuffer. Note : because of the FIR filter length, the
@@ -263,13 +258,15 @@ void RateTransposer::downsample(const SAMPLETYPE *src, uint numSamples)
     sizeTemp = storeBuffer.numSamples();
 
     count = pAAFilter->evaluate(tempBuffer.ptrEnd(sizeTemp), 
-        storeBuffer.ptrBegin(), sizeTemp, uChannels);
+        storeBuffer.ptrBegin(), sizeTemp, (uint)numChannels);
+
+	if (count == 0) return;
 
     // Remove the filtered samples from 'storeBuffer'
     storeBuffer.receiveSamples(count);
 
     // Transpose the samples (+16 is to reserve some slack in the destination buffer)
-    sizeTemp = (int)((float)numSamples / fRate + 16.0f);
+    sizeTemp = (uint)((float)nSamples / fRate + 16.0f);
     count = transpose(outputBuffer.ptrEnd(sizeTemp), tempBuffer.ptrBegin(), count);
     outputBuffer.putSamples(count);
 }
@@ -279,20 +276,20 @@ void RateTransposer::downsample(const SAMPLETYPE *src, uint numSamples)
 // Returns amount of samples returned in the "dest" buffer.
 // The maximum amount of samples that can be returned at a time is set by
 // the 'set_returnBuffer_size' function.
-void RateTransposer::processSamples(const SAMPLETYPE *src, uint numSamples)
+void RateTransposer::processSamples(const SAMPLETYPE *src, uint nSamples)
 {
     uint count;
     uint sizeReq;
 
-    if (numSamples == 0) return;
+    if (nSamples == 0) return;
     assert(pAAFilter);
 
     // If anti-alias filter is turned off, simply transpose without applying
     // the filter
     if (bUseAAFilter == FALSE) 
     {
-        sizeReq = (int)((float)numSamples / fRate + 1.0f);
-        count = transpose(outputBuffer.ptrEnd(sizeReq), src, numSamples);
+        sizeReq = (uint)((float)nSamples / fRate + 1.0f);
+        count = transpose(outputBuffer.ptrEnd(sizeReq), src, nSamples);
         outputBuffer.putSamples(count);
         return;
     }
@@ -300,41 +297,42 @@ void RateTransposer::processSamples(const SAMPLETYPE *src, uint numSamples)
     // Transpose with anti-alias filter
     if (fRate < 1.0f) 
     {
-        upsample(src, numSamples);
+        upsample(src, nSamples);
     } 
     else  
     {
-        downsample(src, numSamples);
+        downsample(src, nSamples);
     }
 }
 
 
 // Transposes the sample rate of the given samples using linear interpolation. 
 // Returns the number of samples returned in the "dest" buffer
-inline uint RateTransposer::transpose(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+inline uint RateTransposer::transpose(SAMPLETYPE *dest, const SAMPLETYPE *src, uint nSamples)
 {
-    if (uChannels == 2) 
+    if (numChannels == 2) 
     {
-        return transposeStereo(dest, src, numSamples);
+        return transposeStereo(dest, src, nSamples);
     } 
     else 
     {
-        return transposeMono(dest, src, numSamples);
+        return transposeMono(dest, src, nSamples);
     }
 }
 
 
 // Sets the number of channels, 1 = mono, 2 = stereo
-void RateTransposer::setChannels(const uint numchannels)
+void RateTransposer::setChannels(int nChannels)
 {
-    if (uChannels == numchannels) return;
+    assert(nChannels > 0);
+    if (numChannels == nChannels) return;
 
-    assert(numchannels == 1 || numchannels == 2);
-    uChannels = numchannels;
+    assert(nChannels == 1 || nChannels == 2);
+    numChannels = nChannels;
 
-    storeBuffer.setChannels(uChannels);
-    tempBuffer.setChannels(uChannels);
-    outputBuffer.setChannels(uChannels);
+    storeBuffer.setChannels(numChannels);
+    tempBuffer.setChannels(numChannels);
+    outputBuffer.setChannels(numChannels);
 
     // Inits the linear interpolation registers
     resetRegisters();
@@ -350,7 +348,7 @@ void RateTransposer::clear()
 
 
 // Returns nonzero if there aren't any samples available for outputting.
-uint RateTransposer::isEmpty()
+int RateTransposer::isEmpty() const
 {
     int res;
 
@@ -371,11 +369,10 @@ uint RateTransposer::isEmpty()
 // Constructor
 RateTransposerInteger::RateTransposerInteger() : RateTransposer()
 {
-    // call these here as these are virtual functions; calling these
-    // from the base class constructor wouldn't execute the overloaded
-    // versions (<master yoda>peculiar C++ can be</my>).
-    resetRegisters();
-    setRate(1.0f);
+    // Note: the calls below are explicitly class-qualified for the sake of clarity,
+    // to show that a C++ constructor cannot dispatch virtually to derived overrides.
+    RateTransposerInteger::resetRegisters();
+    RateTransposerInteger::setRate(1.0f);
 }
 
 
@@ -396,12 +393,14 @@ void RateTransposerInteger::resetRegisters()
 // Transposes the sample rate of the given samples using linear interpolation. 
 // 'Mono' version of the routine. Returns the number of samples returned in 
 // the "dest" buffer
-uint RateTransposerInteger::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+uint RateTransposerInteger::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint nSamples)
 {
     unsigned int i, used;
     LONG_SAMPLETYPE temp, vol1;
 
-    used = 0;    
+    if (nSamples == 0) return 0;  // no samples, no work
+
+	used = 0;    
     i = 0;
 
     // Process the last sample saved from the previous call first...
@@ -411,7 +410,7 @@ uint RateTransposerInteger::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *sr
         temp = vol1 * sPrevSampleL + iSlopeCount * src[0];
         dest[i] = (SAMPLETYPE)(temp / SCALE);
         i++;
-        iSlopeCount += uRate;
+        iSlopeCount += iRate;
     }
     // now always (iSlopeCount > SCALE)
     iSlopeCount -= SCALE;
@@ -422,18 +421,18 @@ uint RateTransposerInteger::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *sr
         {
             iSlopeCount -= SCALE;
             used ++;
-            if (used >= numSamples - 1) goto end;
+            if (used >= nSamples - 1) goto end;
         }
         vol1 = (LONG_SAMPLETYPE)(SCALE - iSlopeCount);
         temp = src[used] * vol1 + iSlopeCount * src[used + 1];
         dest[i] = (SAMPLETYPE)(temp / SCALE);
 
         i++;
-        iSlopeCount += uRate;
+        iSlopeCount += iRate;
     }
 end:
     // Store the last sample for the next round
-    sPrevSampleL = src[numSamples - 1];
+    sPrevSampleL = src[nSamples - 1];
 
     return i;
 }
@@ -442,12 +441,12 @@ end:
 // Transposes the sample rate of the given samples using linear interpolation. 
 // 'Stereo' version of the routine. Returns the number of samples returned in 
 // the "dest" buffer
-uint RateTransposerInteger::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+uint RateTransposerInteger::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint nSamples)
 {
     unsigned int srcPos, i, used;
     LONG_SAMPLETYPE temp, vol1;
 
-    if (numSamples == 0) return 0;  // no samples, no work
+    if (nSamples == 0) return 0;  // no samples, no work
 
     used = 0;    
     i = 0;
@@ -461,7 +460,7 @@ uint RateTransposerInteger::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *
         temp = vol1 * sPrevSampleR + iSlopeCount * src[1];
         dest[2 * i + 1] = (SAMPLETYPE)(temp / SCALE);
         i++;
-        iSlopeCount += uRate;
+        iSlopeCount += iRate;
     }
     // now always (iSlopeCount > SCALE)
     iSlopeCount -= SCALE;
@@ -472,7 +471,7 @@ uint RateTransposerInteger::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *
         {
             iSlopeCount -= SCALE;
             used ++;
-            if (used >= numSamples - 1) goto end;
+            if (used >= nSamples - 1) goto end;
         }
         srcPos = 2 * used;
         vol1 = (LONG_SAMPLETYPE)(SCALE - iSlopeCount);
@@ -482,22 +481,22 @@ uint RateTransposerInteger::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *
         dest[2 * i + 1] = (SAMPLETYPE)(temp / SCALE);
 
         i++;
-        iSlopeCount += uRate;
+        iSlopeCount += iRate;
     }
 end:
     // Store the last sample for the next round
-    sPrevSampleL = src[2 * numSamples - 2];
-    sPrevSampleR = src[2 * numSamples - 1];
+    sPrevSampleL = src[2 * nSamples - 2];
+    sPrevSampleR = src[2 * nSamples - 1];
 
     return i;
 }
 
 
-// Sets new target uRate. Normal uRate = 1.0, smaller values represent slower 
-// uRate, larger faster uRates.
+// Sets new target rate. Normal rate = 1.0, smaller values represent a slower
+// rate, larger values a faster rate.
 void RateTransposerInteger::setRate(float newRate)
 {
-    uRate = (int)(newRate * SCALE + 0.5f);
+    iRate = (int)(newRate * SCALE + 0.5f);
     RateTransposer::setRate(newRate);
 }
 
@@ -511,11 +510,10 @@ void RateTransposerInteger::setRate(float newRate)
 // Constructor
 RateTransposerFloat::RateTransposerFloat() : RateTransposer()
 {
-    // call these here as these are virtual functions; calling these
-    // from the base class constructor wouldn't execute the overloaded
-    // versions (<master yoda>peculiar C++ can be</my>).
-    resetRegisters();
-    setRate(1.0f);
+    // Note: the calls below are explicitly class-qualified for the sake of clarity,
+    // to show that a C++ constructor cannot dispatch virtually to derived overrides.
+    RateTransposerFloat::resetRegisters();
+    RateTransposerFloat::setRate(1.0f);
 }
 
 
@@ -536,7 +534,7 @@ void RateTransposerFloat::resetRegisters()
 // Transposes the sample rate of the given samples using linear interpolation. 
 // 'Mono' version of the routine. Returns the number of samples returned in 
 // the "dest" buffer
-uint RateTransposerFloat::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+uint RateTransposerFloat::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint nSamples)
 {
     unsigned int i, used;
 
@@ -552,23 +550,24 @@ uint RateTransposerFloat::transposeMono(SAMPLETYPE *dest, const SAMPLETYPE *src,
     }
     fSlopeCount -= 1.0f;
 
-    if (numSamples == 1) goto end;
-
-    while (1)
+    if (nSamples > 1)
     {
-        while (fSlopeCount > 1.0f) 
+        while (1)
         {
-            fSlopeCount -= 1.0f;
-            used ++;
-            if (used >= numSamples - 1) goto end;
+            while (fSlopeCount > 1.0f) 
+            {
+                fSlopeCount -= 1.0f;
+                used ++;
+                if (used >= nSamples - 1) goto end;
+            }
+            dest[i] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[used] + fSlopeCount * src[used + 1]);
+            i++;
+            fSlopeCount += fRate;
         }
-        dest[i] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[used] + fSlopeCount * src[used + 1]);
-        i++;
-        fSlopeCount += fRate;
     }
 end:
     // Store the last sample for the next round
-    sPrevSampleL = src[numSamples - 1];
+    sPrevSampleL = src[nSamples - 1];
 
     return i;
 }
@@ -577,11 +576,11 @@ end:
 // Transposes the sample rate of the given samples using linear interpolation. 
 // 'Mono' version of the routine. Returns the number of samples returned in 
 // the "dest" buffer
-uint RateTransposerFloat::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint numSamples)
+uint RateTransposerFloat::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, uint nSamples)
 {
     unsigned int srcPos, i, used;
 
-    if (numSamples == 0) return 0;  // no samples, no work
+    if (nSamples == 0) return 0;  // no samples, no work
 
     used = 0;    
     i = 0;
@@ -597,30 +596,31 @@ uint RateTransposerFloat::transposeStereo(SAMPLETYPE *dest, const SAMPLETYPE *sr
     // now always (iSlopeCount > 1.0f)
     fSlopeCount -= 1.0f;
 
-    if (numSamples == 1) goto end;
-
-    while (1)
+    if (nSamples > 1)
     {
-        while (fSlopeCount > 1.0f) 
+        while (1)
         {
-            fSlopeCount -= 1.0f;
-            used ++;
-            if (used >= numSamples - 1) goto end;
+            while (fSlopeCount > 1.0f) 
+            {
+                fSlopeCount -= 1.0f;
+                used ++;
+                if (used >= nSamples - 1) goto end;
+            }
+            srcPos = 2 * used;
+
+            dest[2 * i] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[srcPos] 
+                + fSlopeCount * src[srcPos + 2]);
+            dest[2 * i + 1] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[srcPos + 1] 
+                + fSlopeCount * src[srcPos + 3]);
+
+            i++;
+            fSlopeCount += fRate;
         }
-        srcPos = 2 * used;
-
-        dest[2 * i] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[srcPos] 
-            + fSlopeCount * src[srcPos + 2]);
-        dest[2 * i + 1] = (SAMPLETYPE)((1.0f - fSlopeCount) * src[srcPos + 1] 
-            + fSlopeCount * src[srcPos + 3]);
-
-        i++;
-        fSlopeCount += fRate;
     }
 end:
     // Store the last sample for the next round
-    sPrevSampleL = src[2 * numSamples - 2];
-    sPrevSampleR = src[2 * numSamples - 1];
+    sPrevSampleL = src[2 * nSamples - 2];
+    sPrevSampleR = src[2 * nSamples - 1];
 
     return i;
 }
diff --git a/desmume/src/metaspu/SoundTouch/RateTransposer.h b/desmume/src/metaspu/SoundTouch/RateTransposer.h
index f73978e63..e1e656b53 100644
--- a/desmume/src/metaspu/SoundTouch/RateTransposer.h
+++ b/desmume/src/metaspu/SoundTouch/RateTransposer.h
@@ -14,10 +14,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.10 $
+// Last changed  : $Date: 2009-02-21 13:00:14 -0300 (sáb, 21 fev 2009) $
+// File revision : $Revision: 4 $
 //
-// $Id: RateTransposer.h,v 1.10 2006/02/05 16:44:06 Olli Exp $
+// $Id: RateTransposer.h 63 2009-02-21 16:00:14Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -45,6 +45,7 @@
 #ifndef RateTransposer_H
 #define RateTransposer_H
 
+#include <stddef.h>
 #include "AAFilter.h"
 #include "FIFOSamplePipe.h"
 #include "FIFOSampleBuffer.h"
@@ -68,7 +69,7 @@ protected:
 
     float fRate;
 
-    uint uChannels;
+    int numChannels;
 
     /// Buffer for collecting samples to feed the anti-alias filter between
     /// two batches
@@ -82,8 +83,6 @@ protected:
 
     BOOL bUseAAFilter;
 
-    void init();
-
     virtual void resetRegisters() = 0;
 
     virtual uint transposeStereo(SAMPLETYPE *dest, 
@@ -92,12 +91,10 @@ protected:
     virtual uint transposeMono(SAMPLETYPE *dest, 
                        const SAMPLETYPE *src, 
                        uint numSamples) = 0;
-    uint transpose(SAMPLETYPE *dest, 
+    inline uint transpose(SAMPLETYPE *dest, 
                    const SAMPLETYPE *src, 
                    uint numSamples);
 
-    void flushStoreBuffer();
-
     void downsample(const SAMPLETYPE *src, 
                     uint numSamples);
     void upsample(const SAMPLETYPE *src, 
@@ -117,7 +114,7 @@ public:
 
     /// Operator 'new' is overloaded so that it automatically creates a suitable instance 
     /// depending on if we're to use integer or floating point arithmetics.
-    void *operator new(size_t s);
+    static void *operator new(size_t s);
 
     /// Use this function instead of "new" operator to create a new instance of this class. 
     /// This function automatically chooses a correct implementation, depending on if 
@@ -131,7 +128,7 @@ public:
     FIFOSamplePipe *getStore() { return &storeBuffer; };
 
     /// Return anti-alias filter object
-    AAFilter *getAAFilter() const;
+    AAFilter *getAAFilter();
 
     /// Enables/disables the anti-alias filter. Zero to disable, nonzero to enable
     void enableAAFilter(BOOL newMode);
@@ -144,7 +141,7 @@ public:
     virtual void setRate(float newRate);
 
     /// Sets the number of channels, 1 = mono, 2 = stereo
-    void setChannels(uint channels);
+    void setChannels(int channels);
 
     /// Adds 'numSamples' pcs of samples from the 'samples' memory position into
     /// the input of the object.
@@ -154,7 +151,7 @@ public:
     void clear();
 
     /// Returns nonzero if there aren't any samples available for outputting.
-    uint isEmpty();
+    int isEmpty() const;
 };
 
 }
diff --git a/desmume/src/metaspu/SoundTouch/STTypes.h b/desmume/src/metaspu/SoundTouch/STTypes.h
index d0a134111..0e0280a5a 100644
--- a/desmume/src/metaspu/SoundTouch/STTypes.h
+++ b/desmume/src/metaspu/SoundTouch/STTypes.h
@@ -8,10 +8,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.16 $
+// Last changed  : $Date: 2012-12-28 12:53:56 -0200 (sex, 28 dez 2012) $
+// File revision : $Revision: 3 $
 //
-// $Id: STTypes.h,v 1.16 2006/02/05 16:44:06 Olli Exp $
+// $Id: STTypes.h 162 2012-12-28 14:53:56Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -39,63 +39,25 @@
 #ifndef STTypes_H
 #define STTypes_H
 
-//#define INTEGER_SAMPLES 1
-
 typedef unsigned int    uint;
 typedef unsigned long   ulong;
 
-#ifdef __x86_64__
-typedef unsigned long long   ulongptr;
+// Patch for MinGW: on Win64 long is 32-bit
+#ifdef _WIN64
+    typedef unsigned long long ulongptr;
 #else
-typedef unsigned long   ulongptr;
+    typedef ulong ulongptr;
 #endif
 
 
-#ifdef __GNUC__
-    // In GCC, include soundtouch_config.h made by config scritps
-/* Define to 1 if you have the <inttypes.h> header file. */
-#define HAVE_INTTYPES_H 1
+// Helper macro for aligning pointer up to next 16-byte boundary
+#define SOUNDTOUCH_ALIGN_POINTER_16(x)      ( ( (ulongptr)(x) + 15 ) & ~(ulongptr)15 )
 
-/* Define to 1 if you have the `m' library (-lm). */
-#define HAVE_LIBM 1
-
-/* Define to 1 if your system has a GNU libc compatible `malloc' function, and
-   to 0 otherwise. */
-#define HAVE_MALLOC 1
-
-/* Define to 1 if you have the <memory.h> header file. */
-#define HAVE_MEMORY_H 1
-
-/* Define to 1 if you have the <stdint.h> header file. */
-#define HAVE_STDINT_H 1
-
-/* Define to 1 if you have the <stdlib.h> header file. */
-#define HAVE_STDLIB_H 1
-
-/* Define to 1 if you have the <strings.h> header file. */
-#define HAVE_STRINGS_H 1
-
-/* Define to 1 if you have the <string.h> header file. */
-#define HAVE_STRING_H 1
-
-/* Define to 1 if you have the <sys/stat.h> header file. */
-#define HAVE_SYS_STAT_H 1
-
-/* Define to 1 if you have the <sys/types.h> header file. */
-#define HAVE_SYS_TYPES_H 1
-
-/* Define to 1 if you have the <unistd.h> header file. */
-#define HAVE_UNISTD_H 1
-
-/* Use Integer as Sample type */
-//#define INTEGER_SAMPLES 1
-
-/* Define as the return type of signal handlers (`int' or `void'). */
-#define RETSIGTYPE void
-
-/* Define to 1 if you have the ANSI C header files. */
-#define STDC_HEADERS 1
 
+#if (defined(__GNUC__) && !defined(ANDROID))
+    // In GCC, include soundtouch_config.h made by config scripts.
+    // Skip this in Android compilation that uses GCC but without configure scripts.
+    #include "soundtouch_config.h"
 #endif
 
 #ifndef _WINDEF_
@@ -103,79 +65,89 @@ typedef unsigned long   ulongptr;
 
     typedef int BOOL;
 
-#ifndef FALSE
     #define FALSE   0
-#endif
-
-#ifndef TRUE
     #define TRUE    1
-#endif
 
 #endif  // _WINDEF_
 
 
 namespace soundtouch
 {
-/// Activate these undef's to overrule the possible sampletype 
-/// setting inherited from some other header file:
-//#undef INTEGER_SAMPLES
-//#undef FLOAT_SAMPLES
+    /// Activate these undef's to overrule the possible sampletype 
+    /// setting inherited from some other header file:
+    //#undef SOUNDTOUCH_INTEGER_SAMPLES
+    //#undef SOUNDTOUCH_FLOAT_SAMPLES
 
-#if !(INTEGER_SAMPLES || FLOAT_SAMPLES)
-   
-    /// Choose either 32bit floating point or 16bit integer sampletype
-    /// by choosing one of the following defines, unless this selection 
-    /// has already been done in some other file.
-    ////
-    /// Notes:
-    /// - In Windows environment, choose the sample format with the
-    ///   following defines.
-    /// - In GNU environment, the floating point samples are used by 
-    ///   default, but integer samples can be chosen by giving the 
-    ///   following switch to the configure script:
-    ///       ./configure --enable-integer-samples
-    ///   However, if you still prefer to select the sample format here 
-    ///   also in GNU environment, then please #undef the INTEGER_SAMPLE
-    ///   and FLOAT_SAMPLE defines first as in comments above.
-    //#define INTEGER_SAMPLES     1    //< 16bit integer samples
-    #define FLOAT_SAMPLES       1    //< 32bit float samples
- 
- #endif
+    #if (defined(__SOFTFP__))
+        // For Android compilation: Force use of Integer samples in case that
+        // compilation uses soft-floating point emulation - soft-fp is way too slow
+        #undef  SOUNDTOUCH_FLOAT_SAMPLES
+        #define SOUNDTOUCH_INTEGER_SAMPLES      1
+    #endif
 
-    /// Define this to allow CPU-specific assembler optimizations. Notice that 
-    /// having this enabled on non-x86 platforms doesn't matter; the compiler can 
-    /// drop unsupported extensions on different platforms automatically. 
-    /// However, if you're having difficulties getting the optimized routines 
-    /// compiled with your compler (e.g. some gcc compiler versions may be picky), 
-    /// you may wish to disable the optimizations to make the library compile.
-	#if !defined(_MSC_VER) || !defined(__x86_64__)
-	#define ALLOW_OPTIMIZATIONS 1
-	#define ALLOW_NONEXACT_SIMD_OPTIMIZATION    1
-	#endif
+    #if !(SOUNDTOUCH_INTEGER_SAMPLES || SOUNDTOUCH_FLOAT_SAMPLES)
+       
+        /// Choose either 32bit floating point or 16bit integer sampletype
+        /// by choosing one of the following defines, unless this selection 
+        /// has already been done in some other file.
+        ////
+        /// Notes:
+        /// - In Windows environment, choose the sample format with the
+        ///   following defines.
+        /// - In GNU environment, the floating point samples are used by 
+        ///   default, but integer samples can be chosen by giving the 
+        ///   following switch to the configure script:
+        ///       ./configure --enable-integer-samples
+        ///   However, if you still prefer to select the sample format here 
+        ///   also in GNU environment, then please #undef the SOUNDTOUCH_INTEGER_SAMPLES
+        ///   and SOUNDTOUCH_FLOAT_SAMPLES defines first as in comments above.
+        //#define SOUNDTOUCH_INTEGER_SAMPLES     1    //< 16bit integer samples
+        #define SOUNDTOUCH_FLOAT_SAMPLES       1    //< 32bit float samples
+     
+    #endif
 
+    #if (_M_IX86 || __i386__ || __x86_64__ || _M_X64)
+        /// Define this to allow X86-specific assembler/intrinsic optimizations. 
+        /// Notice that the library also contains plain C++ versions of each of
+        /// these routines, so if you're having difficulties getting the optimized 
+        /// routines compiled for whatever reason, you may disable these optimizations 
+        /// to make the library compile.
+
+        #define SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS     1
+
+        /// In GNU environment, allow the user to override this setting by
+        /// giving the following switch to the configure script:
+        /// ./configure --disable-x86-optimizations
+        /// ./configure --enable-x86-optimizations=no
+        #ifdef SOUNDTOUCH_DISABLE_X86_OPTIMIZATIONS
+            #undef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
+        #endif
+    #else
+        /// Always disable optimizations when not targeting an x86 system.
+        #undef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
+
+    #endif
 
     // If defined, allows the SIMD-optimized routines to take minor shortcuts 
     // for improved performance. Undefine to require faithfully similar SIMD 
     // calculations as in normal C implementation.
-    
+    #define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION    1
 
 
-    #ifdef INTEGER_SAMPLES
+    #ifdef SOUNDTOUCH_INTEGER_SAMPLES
         // 16bit integer sample type
         typedef short SAMPLETYPE;
         // data type for sample accumulation: Use 32bit integer to prevent overflows
         typedef long  LONG_SAMPLETYPE;
 
-        #ifdef FLOAT_SAMPLES
+        #ifdef SOUNDTOUCH_FLOAT_SAMPLES
             // check that only one sample type is defined
             #error "conflicting sample types defined"
-        #endif // FLOAT_SAMPLES
+        #endif // SOUNDTOUCH_FLOAT_SAMPLES
 
-        #ifdef ALLOW_OPTIMIZATIONS
-            #if (_WIN32 || __i386__ || __x86_64__)
-                // Allow MMX optimizations
-                #define ALLOW_MMX   1
-            #endif
+        #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
+            // Allow MMX optimizations
+            #define SOUNDTOUCH_ALLOW_MMX   1
         #endif
 
     #else
@@ -185,17 +157,31 @@ namespace soundtouch
         // data type for sample accumulation: Use double to utilize full precision.
         typedef double LONG_SAMPLETYPE;
 
-        #ifdef ALLOW_OPTIMIZATIONS
-                // Allow 3DNow! and SSE optimizations
-            #if _WIN32
-               // #define ALLOW_3DNOW     1
-            #endif
-            #if (_WIN32 || __i386__ || __x86_64__)
-                #define ALLOW_SSE       1
-            #endif
+        #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
+            // Allow SSE optimizations
+            #define SOUNDTOUCH_ALLOW_SSE       1
         #endif
 
-    #endif  // INTEGER_SAMPLES
+    #endif  // SOUNDTOUCH_INTEGER_SAMPLES
+
 };
 
-#endif
\ No newline at end of file
+// define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions:
+// #define ST_NO_EXCEPTION_HANDLING    1
+#ifdef ST_NO_EXCEPTION_HANDLING
+    // Exceptions disabled. Throw asserts instead if enabled.
+    #include <assert.h>
+    #define ST_THROW_RT_ERROR(x)    {assert((const char *)x);}
+#else
+    // use c++ standard exceptions
+    #include <stdexcept>
+    #define ST_THROW_RT_ERROR(x)    {throw std::runtime_error(x);}
+#endif
+
+// When this #define is active, eliminates a clicking sound when the "rate" or "pitch" 
+// parameter setting crosses from value <1 to >=1 or vice versa during processing. 
+// Default is off, as such a crossover is an atypical case and preventing the click
+// involves a slight sound quality compromise.
+//#define SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER   1
+
+#endif
diff --git a/desmume/src/metaspu/SoundTouch/SoundTouch.cpp b/desmume/src/metaspu/SoundTouch/SoundTouch.cpp
index d20fd326b..ca0fdd343 100644
--- a/desmume/src/metaspu/SoundTouch/SoundTouch.cpp
+++ b/desmume/src/metaspu/SoundTouch/SoundTouch.cpp
@@ -41,10 +41,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.13 $
+// Last changed  : $Date: 2012-06-13 16:29:53 -0300 (qua, 13 jun 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: SoundTouch.cpp,v 1.13 2006/02/05 16:44:06 Olli Exp $
+// $Id: SoundTouch.cpp 143 2012-06-13 19:29:53Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -73,7 +73,6 @@
 #include <stdlib.h>
 #include <memory.h>
 #include <math.h>
-#include <stdexcept>
 #include <stdio.h>
 
 #include "SoundTouch.h"
@@ -82,8 +81,12 @@
 #include "cpu_detect.h"
 
 using namespace soundtouch;
+    
+/// test if two floating point numbers are equal
+#define TEST_FLOAT_EQUAL(a, b)  (fabs(a - b) < 1e-10)
 
-/// Print library version string
+
+/// Print library version string for autoconf
 extern "C" void soundtouch_ac_test()
 {
     printf("SoundTouch Version: %s\n",SOUNDTOUCH_VERSION);
@@ -142,11 +145,11 @@ void SoundTouch::setChannels(uint numChannels)
 {
     if (numChannels != 1 && numChannels != 2) 
     {
-        throw std::runtime_error("Illegal number of channels");
+        ST_THROW_RT_ERROR("Illegal number of channels");
     }
     channels = numChannels;
-    pRateTransposer->setChannels(numChannels);
-    pTDStretch->setChannels(numChannels);
+    pRateTransposer->setChannels((int)numChannels);
+    pTDStretch->setChannels((int)numChannels);
 }
 
 
@@ -236,10 +239,28 @@ void SoundTouch::calcEffectiveRateAndTempo()
     tempo = virtualTempo / virtualPitch;
     rate = virtualPitch * virtualRate;
 
-    if (rate != oldRate) pRateTransposer->setRate(rate);
-    if (tempo != oldTempo) pTDStretch->setTempo(tempo);
+    if (!TEST_FLOAT_EQUAL(rate,oldRate)) pRateTransposer->setRate(rate);
+    if (!TEST_FLOAT_EQUAL(tempo, oldTempo)) pTDStretch->setTempo(tempo);
 
-    if (rate > 1.0f) 
+#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
+    if (rate <= 1.0f) 
+    {
+        if (output != pTDStretch) 
+        {
+            FIFOSamplePipe *tempoOut;
+
+            assert(output == pRateTransposer);
+            // move samples in the current output buffer to the output of pTDStretch
+            tempoOut = pTDStretch->getOutput();
+            tempoOut->moveSamples(*output);
+            // move samples in pitch transposer's store buffer to tempo changer's input
+            pTDStretch->moveSamples(*pRateTransposer->getStore());
+
+            output = pTDStretch;
+        }
+    }
+    else
+#endif
     {
         if (output != pRateTransposer) 
         {
@@ -255,23 +276,6 @@ void SoundTouch::calcEffectiveRateAndTempo()
             output = pRateTransposer;
         }
     } 
-    else 
-    {
-        if (output != pTDStretch) 
-        {
-            FIFOSamplePipe *tempoOut;
-
-            assert(output == pRateTransposer);
-            // move samples in the current output buffer to the output of pTDStretch
-            tempoOut = pTDStretch->getOutput();
-            tempoOut->moveSamples(*output);
-            // move samples in pitch transposer's store buffer to tempo changer's input
-            pTDStretch->moveSamples(*pRateTransposer->getStore());
-
-            output = pTDStretch;
-
-        }
-    }
 }
 
 
@@ -280,21 +284,21 @@ void SoundTouch::setSampleRate(uint srate)
 {
     bSrateSet = TRUE;
     // set sample rate, leave other tempo changer parameters as they are.
-    pTDStretch->setParameters(srate);
+    pTDStretch->setParameters((int)srate);
 }
 
 
 // Adds 'numSamples' pcs of samples from the 'samples' memory position into
 // the input of the object.
-void SoundTouch::putSamples(const SAMPLETYPE *samples, uint numSamples)
+void SoundTouch::putSamples(const SAMPLETYPE *samples, uint nSamples)
 {
     if (bSrateSet == FALSE) 
     {
-        throw std::runtime_error("SoundTouch : Sample rate not defined");
+        ST_THROW_RT_ERROR("SoundTouch : Sample rate not defined");
     } 
     else if (channels == 0) 
     {
-        throw std::runtime_error("SoundTouch : Number of channels not defined");
+        ST_THROW_RT_ERROR("SoundTouch : Number of channels not defined");
     }
 
     // Transpose the rate of the new samples if necessary
@@ -309,22 +313,23 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint numSamples)
             // (may happen if 'rate' changes from a non-zero value to zero)
             pTDStretch->moveSamples(*pRateTransposer);
         }
-        pTDStretch->putSamples(samples, numSamples);
+        pTDStretch->putSamples(samples, nSamples);
     } 
     */
+#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
     else if (rate <= 1.0f) 
     {
         // transpose the rate down, output the transposed sound to tempo changer buffer
         assert(output == pTDStretch);
-        pRateTransposer->putSamples(samples, numSamples);
+        pRateTransposer->putSamples(samples, nSamples);
         pTDStretch->moveSamples(*pRateTransposer);
     } 
     else 
+#endif
     {
-        assert(rate > 1.0f);
         // evaluate the tempo changer, then transpose the rate up, 
         assert(output == pRateTransposer);
-        pTDStretch->putSamples(samples, numSamples);
+        pTDStretch->putSamples(samples, nSamples);
         pRateTransposer->moveSamples(*pTDStretch);
     }
 }
@@ -340,12 +345,19 @@ void SoundTouch::putSamples(const SAMPLETYPE *samples, uint numSamples)
 void SoundTouch::flush()
 {
     int i;
-    uint nOut;
-    SAMPLETYPE buff[128];
+    int nUnprocessed;
+    int nOut;
+    SAMPLETYPE buff[64*2];   // note: allocate 2*64 to cater 64 sample frames of stereo sound
 
-    nOut = numSamples();
+    // check how many samples still await processing, and scale
+    // that by tempo & rate to get expected output sample count
+    nUnprocessed = numUnprocessedSamples();
+    nUnprocessed = (int)((double)nUnprocessed / (tempo * rate) + 0.5);
 
-    memset(buff, 0, 128 * sizeof(SAMPLETYPE));
+    nOut = numSamples();        // ready samples currently in buffer ...
+    nOut += nUnprocessed;       // ... and how many we expect there to be in the end
+    
+    memset(buff, 0, 64 * channels * sizeof(SAMPLETYPE));
     // "Push" the last active samples out from the processing pipeline by
     // feeding blank samples into the processing pipeline until new, 
     // processed samples appear in the output (not however, more than 
@@ -353,7 +365,16 @@ void SoundTouch::flush()
     for (i = 0; i < 128; i ++) 
     {
         putSamples(buff, 64);
-        if (numSamples() != nOut) break;  // new samples have appeared in the output!
+        if ((int)numSamples() >= nOut) 
+        {
+            // Enough new samples have appeared into the output!
+            // As samples come from processing with bigger chunks, now truncate it
+            // back to maximum "nOut" samples to improve duration accuracy 
+            adjustAmountOfSamples(nOut);
+
+            // finish
+            break;  
+        }
     }
 
     // Clear working buffers
@@ -366,9 +387,9 @@ void SoundTouch::flush()
 
 // Changes a setting controlling the processing system behaviour. See the
 // 'SETTING_...' defines for available setting ID's.
-BOOL SoundTouch::setSetting(uint settingId, uint value)
+BOOL SoundTouch::setSetting(int settingId, int value)
 {
-    uint sampleRate, sequenceMs, seekWindowMs, overlapMs;
+    int sampleRate, sequenceMs, seekWindowMs, overlapMs;
 
     // read current tdstretch routine parameters
     pTDStretch->getParameters(&sampleRate, &sequenceMs, &seekWindowMs, &overlapMs);
@@ -415,20 +436,20 @@ BOOL SoundTouch::setSetting(uint settingId, uint value)
 // 'SETTING_...' defines for available setting ID's.
 //
 // Returns the setting value.
-uint SoundTouch::getSetting(uint settingId) const
+int SoundTouch::getSetting(int settingId) const
 {
-    uint temp;
+    int temp;
 
     switch (settingId) 
     {
         case SETTING_USE_AA_FILTER :
-            return pRateTransposer->isAAFilterEnabled();
+            return (uint)pRateTransposer->isAAFilterEnabled();
 
         case SETTING_AA_FILTER_LENGTH :
             return pRateTransposer->getAAFilter()->getLength();
 
         case SETTING_USE_QUICKSEEK :
-            return pTDStretch->isQuickSeekEnabled();
+            return (uint)   pTDStretch->isQuickSeekEnabled();
 
         case SETTING_SEQUENCE_MS:
             pTDStretch->getParameters(NULL, &temp, NULL, NULL);
@@ -442,7 +463,13 @@ uint SoundTouch::getSetting(uint settingId) const
             pTDStretch->getParameters(NULL, NULL, NULL, &temp);
             return temp;
 
-        default :
+		case SETTING_NOMINAL_INPUT_SEQUENCE :
+			return pTDStretch->getInputSampleReq();
+
+		case SETTING_NOMINAL_OUTPUT_SEQUENCE :
+			return pTDStretch->getOutputBatchSize();
+
+		default :
             return 0;
     }
 }
diff --git a/desmume/src/metaspu/SoundTouch/SoundTouch.h b/desmume/src/metaspu/SoundTouch/SoundTouch.h
index fab3bb984..c4dd55702 100644
--- a/desmume/src/metaspu/SoundTouch/SoundTouch.h
+++ b/desmume/src/metaspu/SoundTouch/SoundTouch.h
@@ -41,10 +41,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.14 $
+// Last changed  : $Date: 2012-12-28 17:32:59 -0200 (sex, 28 dez 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: SoundTouch.h,v 1.14 2006/02/05 16:44:06 Olli Exp $
+// $Id: SoundTouch.h 163 2012-12-28 19:32:59Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -79,10 +79,10 @@ namespace soundtouch
 {
 
 /// Soundtouch library version string
-#define SOUNDTOUCH_VERSION          "1.3.1"
+#define SOUNDTOUCH_VERSION          "1.7.1"
 
 /// SoundTouch library version id
-#define SOUNDTOUCH_VERSION_ID       010301
+#define SOUNDTOUCH_VERSION_ID       (10701)
 
 //
 // Available setting IDs for the 'setSetting' & 'get_setting' functions:
@@ -116,6 +116,31 @@ namespace soundtouch
 #define SETTING_OVERLAP_MS          5
 
 
+/// Call "getSetting" with this ID to query the nominal average processing sequence
+/// size in samples. This value tells approximately how many input samples
+/// SoundTouch needs to gather before it does a DSP processing run for the sample batch.
+///
+/// Notices: 
+/// - This is a read-only parameter, i.e. setSetting ignores this parameter
+/// - Returned value is an approximate average value; the exact processing batch
+///   size may vary from time to time
+/// - This parameter value is not constant but may change depending on 
+///   tempo/pitch/rate/samplerate settings.
+#define SETTING_NOMINAL_INPUT_SEQUENCE		6
+
+
+/// Call "getSetting" with this ID to query the nominal average processing output
+/// size in samples. This value tells approximately how many output samples
+/// SoundTouch outputs once it does a DSP processing run for a batch of input samples.
+///	
+/// Notices: 
+/// - This is a read-only parameter, i.e. setSetting ignores this parameter
+/// - Returned value is an approximate average value; the exact processing batch
+///   size may vary from time to time
+/// - This parameter value is not constant but may change depending on 
+///   tempo/pitch/rate/samplerate settings.
+#define SETTING_NOMINAL_OUTPUT_SEQUENCE		7
+
 class SoundTouch : public FIFOProcessor
 {
 private:
@@ -223,16 +248,16 @@ public:
     /// 'SETTING_...' defines for available setting ID's.
     /// 
     /// \return 'TRUE' if the setting was succesfully changed
-    BOOL setSetting(uint settingId,   ///< Setting ID number. see SETTING_... defines.
-                    uint value        ///< New setting value.
+    BOOL setSetting(int settingId,   ///< Setting ID number. see SETTING_... defines.
+                    int value        ///< New setting value.
                     );
 
     /// Reads a setting controlling the processing system behaviour. See the
     /// 'SETTING_...' defines for available setting ID's.
     ///
     /// \return the setting value.
-    uint getSetting(uint settingId    ///< Setting ID number, see SETTING_... defines.
-                    ) const;
+    int getSetting(int settingId    ///< Setting ID number, see SETTING_... defines.
+                   ) const;
 
     /// Returns number of samples currently unprocessed.
     virtual uint numUnprocessedSamples() const;
diff --git a/desmume/src/metaspu/SoundTouch/TDStretch.cpp b/desmume/src/metaspu/SoundTouch/TDStretch.cpp
index 9bd2d5a32..57481e58c 100644
--- a/desmume/src/metaspu/SoundTouch/TDStretch.cpp
+++ b/desmume/src/metaspu/SoundTouch/TDStretch.cpp
@@ -13,10 +13,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.24 $
+// Last changed  : $Date: 2012-11-08 16:53:01 -0200 (qui, 08 nov 2012) $
+// File revision : $Revision: 1.12 $
 //
-// $Id: TDStretch.cpp,v 1.24 2006/02/05 16:44:06 Olli Exp $
+// $Id: TDStretch.cpp 160 2012-11-08 18:53:01Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -42,23 +42,20 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #include <string.h>
-#include <stdlib.h>
-#include <memory.h>
 #include <limits.h>
-#include <math.h>
 #include <assert.h>
+#include <math.h>
+#include <float.h>
 
 #include "STTypes.h"
 #include "cpu_detect.h"
 #include "TDStretch.h"
 
+#include <stdio.h>
+
 using namespace soundtouch;
 
-#ifndef min
-#define min(a,b) ((a > b) ? b : a)
-#define max(a,b) ((a < b) ? b : a)
-#endif
-
+#define max(x, y) (((x) > (y)) ? (x) : (y))
 
 
 /*****************************************************************************
@@ -67,17 +64,18 @@ using namespace soundtouch;
  *
  *****************************************************************************/
 
-
 // Table for the hierarchical mixing position seeking algorithm
-int scanOffsets[4][24]={
-    { 124,  186,  248,  310,  372,  434,  496,  558,  620,  682,  744, 806, 
-      868,  930,  992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488,   0}, 
+static const short _scanOffsets[5][24]={
+    { 124,  186,  248,  310,  372,  434,  496,  558,  620,  682,  744, 806,
+      868,  930,  992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488,   0},
     {-100,  -75,  -50,  -25,   25,   50,   75,  100,    0,    0,    0,   0,
         0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0},
     { -20,  -15,  -10,   -5,    5,   10,   15,   20,    0,    0,    0,   0,
         0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0},
     {  -4,   -3,   -2,   -1,    1,    2,    3,    4,    0,    0,    0,   0,
-        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0}};
+        0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,   0},
+    { 121,  114,   97,  114,   98,  105,  108,   32,  104,   99,  117,  111,
+      116,  100,  110,  117,  111,  115,    0,    0,    0,    0,    0,   0}};
 
 /*****************************************************************************
  *
@@ -88,34 +86,31 @@ int scanOffsets[4][24]={
 
 TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
 {
-    bQuickseek = FALSE;
+    bQuickSeek = FALSE;
     channels = 2;
-    bMidBufferDirty = FALSE;
 
     pMidBuffer = NULL;
-    pRefMidBufferUnaligned = NULL;
+    pMidBufferUnaligned = NULL;
     overlapLength = 0;
 
-    setParameters(48000, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
+    bAutoSeqSetting = TRUE;
+    bAutoSeekSetting = TRUE;
 
+//    outDebt = 0;
+    skipFract = 0;
+
+    tempo = 1.0f;
+    setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
     setTempo(1.0f);
-}
 
+    clear();
+}
 
 
 
 TDStretch::~TDStretch()
 {
-    delete[] pMidBuffer;
-    delete[] pRefMidBufferUnaligned;
-}
-
-
-    
-// Calculates the x having the closest 2^x value for the given value
-static int _getClosest2Power(double value)
-{
-    return (int)(log(value) / log(2.0) + 0.5);
+    delete[] pMidBufferUnaligned;
 }
 
 
@@ -129,18 +124,36 @@ static int _getClosest2Power(double value)
 //      position (default = 28 ms)
 // 'overlapMS' = overlapping length (default = 12 ms)
 
-void TDStretch::setParameters(uint aSampleRate, uint aSequenceMS, 
-                              uint aSeekWindowMS, uint aOverlapMS)
+void TDStretch::setParameters(int aSampleRate, int aSequenceMS, 
+                              int aSeekWindowMS, int aOverlapMS)
 {
-    this->sampleRate = aSampleRate;
-    this->sequenceMs = aSequenceMS;
-    this->seekWindowMs = aSeekWindowMS;
-    this->overlapMs = aOverlapMS;
+    // accept only positive parameter values - if zero or negative, use old values instead
+    if (aSampleRate > 0)   this->sampleRate = aSampleRate;
+    if (aOverlapMS > 0)    this->overlapMs = aOverlapMS;
 
-    seekLength = (sampleRate * seekWindowMs) / 1000;
-    seekWindowLength = (sampleRate * sequenceMs) / 1000;
+    if (aSequenceMS > 0)
+    {
+        this->sequenceMs = aSequenceMS;
+        bAutoSeqSetting = FALSE;
+    } 
+    else if (aSequenceMS == 0)
+    {
+        // if zero, use automatic setting
+        bAutoSeqSetting = TRUE;
+    }
 
-    maxOffset = seekLength;
+    if (aSeekWindowMS > 0) 
+    {
+        this->seekWindowMs = aSeekWindowMS;
+        bAutoSeekSetting = FALSE;
+    } 
+    else if (aSeekWindowMS == 0) 
+    {
+        // if zero, use automatic setting
+        bAutoSeekSetting = TRUE;
+    }
+
+    calcSeqParameters();
 
     calculateOverlapLength(overlapMs);
 
@@ -154,7 +167,7 @@ void TDStretch::setParameters(uint aSampleRate, uint aSequenceMS,
 /// Get routine control parameters, see setParameters() function.
 /// Any of the parameters to this function can be NULL, in such case corresponding parameter
 /// value isn't returned.
-void TDStretch::getParameters(uint *pSampleRate, uint *pSequenceMs, uint *pSeekWindowMs, uint *pOverlapMs)
+void TDStretch::getParameters(int *pSampleRate, int *pSequenceMs, int *pSeekWindowMs, int *pOverlapMs) const
 {
     if (pSampleRate)
     {
@@ -163,12 +176,12 @@ void TDStretch::getParameters(uint *pSampleRate, uint *pSequenceMs, uint *pSeekW
 
     if (pSequenceMs)
     {
-        *pSequenceMs = sequenceMs;
+        *pSequenceMs = (bAutoSeqSetting) ? (USE_AUTO_SEQUENCE_LEN) : sequenceMs;
     }
 
     if (pSeekWindowMs)
     {
-        *pSeekWindowMs = seekWindowMs;
+        *pSeekWindowMs = (bAutoSeekSetting) ? (USE_AUTO_SEEKWINDOW_LEN) : seekWindowMs;
     }
 
     if (pOverlapMs)
@@ -178,15 +191,20 @@ void TDStretch::getParameters(uint *pSampleRate, uint *pSequenceMs, uint *pSeekW
 }
 
 
-// Overlaps samples in 'midBuffer' with the samples in 'input'
-void TDStretch::overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const
+// Overlaps samples in 'midBuffer' with the samples in 'pInput'
+void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
 {
-    int i, itemp;
+    int i;
+    SAMPLETYPE m1, m2;
 
-    for (i = 0; i < (int)overlapLength ; i ++) 
+    m1 = (SAMPLETYPE)0;
+    m2 = (SAMPLETYPE)overlapLength;
+
+    for (i = 0; i < overlapLength ; i ++) 
     {
-        itemp = overlapLength - i;
-        output[i] = (input[i] * i + pMidBuffer[i] * itemp ) / overlapLength;    // >> overlapDividerBits;
+        pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
+        m1 += 1;
+        m2 -= 1;
     }
 }
 
@@ -194,11 +212,7 @@ void TDStretch::overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const
 
 void TDStretch::clearMidBuffer()
 {
-    if (bMidBufferDirty) 
-    {
-        memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
-        bMidBufferDirty = FALSE;
-    }
+    memset(pMidBuffer, 0, 2 * sizeof(SAMPLETYPE) * overlapLength);
 }
 
 
@@ -213,8 +227,7 @@ void TDStretch::clearInput()
 void TDStretch::clear()
 {
     outputBuffer.clear();
-    inputBuffer.clear();
-    clearMidBuffer();
+    clearInput();
 }
 
 
@@ -223,82 +236,60 @@ void TDStretch::clear()
 // to enable
 void TDStretch::enableQuickSeek(BOOL enable)
 {
-    bQuickseek = enable;
+    bQuickSeek = enable;
 }
 
 
 // Returns nonzero if the quick seeking algorithm is enabled.
 BOOL TDStretch::isQuickSeekEnabled() const
 {
-    return bQuickseek;
+    return bQuickSeek;
 }
 
 
 // Seeks for the optimal overlap-mixing position.
-uint TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
+int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
 {
-    if (channels == 2) 
+    if (bQuickSeek) 
     {
-        // stereo sound
-        if (bQuickseek) 
-        {
-            return seekBestOverlapPositionStereoQuick(refPos);
-        } 
-        else 
-        {
-            return seekBestOverlapPositionStereo(refPos);
-        }
+        return seekBestOverlapPositionQuick(refPos);
     } 
     else 
     {
-        // mono sound
-        if (bQuickseek) 
-        {
-            return seekBestOverlapPositionMonoQuick(refPos);
-        } 
-        else 
-        {
-            return seekBestOverlapPositionMono(refPos);
-        }
+        return seekBestOverlapPositionFull(refPos);
     }
 }
 
 
-
-
-// Overlaps samples in 'midBuffer' with the samples in 'inputBuffer' at position
+// Overlaps samples in 'midBuffer' with the samples in 'pInputBuffer' at position
 // of 'ovlPos'.
-inline void TDStretch::overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const
+inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, uint ovlPos) const
 {
     if (channels == 2) 
     {
         // stereo sound
-        overlapStereo(output, input + 2 * ovlPos);
+        overlapStereo(pOutput, pInput + 2 * ovlPos);
     } else {
         // mono sound.
-        overlapMono(output, input + ovlPos);
+        overlapMono(pOutput, pInput + ovlPos);
     }
 }
 
 
 
-
 // Seeks for the optimal overlap-mixing position. The 'stereo' version of the
 // routine
 //
 // The best position is determined as the position where the two overlapped
 // sample sequences are 'most alike', in terms of the highest cross-correlation
 // value over the overlapping period
-uint TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos) 
+int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos) 
 {
-    uint bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
-    uint i;
+    int bestOffs;
+    double bestCorr, corr;
+    int i;
 
-    // Slopes the amplitudes of the 'midBuffer' samples
-    precalcCorrReferenceStereo();
-
-    bestCorr = INT_MIN;
+    bestCorr = -FLT_MAX;   // lowest finite start value for the max search; FLT_MIN is the smallest *positive* float
     bestOffs = 0;
 
     // Scans for the best correlation value by testing each possible position
@@ -307,7 +298,10 @@ uint TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
     {
         // Calculates correlation value for the mixing position corresponding
         // to 'i'
-        corr = calcCrossCorrStereo(refPos + 2 * i, pRefMidBuffer);
+        corr = calcCrossCorr(refPos + channels * i, pMidBuffer);
+        // heuristic rule to slightly favour values close to mid of the range
+        double tmp = (double)(2 * i - seekLength) / (double)seekLength;
+        corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
 
         // Checks for the highest correlation value
         if (corr > bestCorr) 
@@ -329,18 +323,15 @@ uint TDStretch::seekBestOverlapPositionStereo(const SAMPLETYPE *refPos)
 // The best position is determined as the position where the two overlapped
 // sample sequences are 'most alike', in terms of the highest cross-correlation
 // value over the overlapping period
-uint TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos) 
+int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos) 
 {
-    uint j;
-    uint bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
-    uint scanCount, corrOffset, tempOffset;
+    int j;
+    int bestOffs;
+    double bestCorr, corr;
+    int scanCount, corrOffset, tempOffset;
 
-    // Slopes the amplitude of the 'midBuffer' samples
-    precalcCorrReferenceStereo();
-
-    bestCorr = INT_MIN;
-    bestOffs = 0;
+    bestCorr = -FLT_MAX;   // lowest finite start value for the max search; FLT_MIN is the smallest *positive* float
+    bestOffs = _scanOffsets[0][0];
     corrOffset = 0;
     tempOffset = 0;
 
@@ -353,14 +344,17 @@ uint TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
     for (scanCount = 0;scanCount < 4; scanCount ++) 
     {
         j = 0;
-        while (scanOffsets[scanCount][j]) 
+        while (_scanOffsets[scanCount][j]) 
         {
-            tempOffset = corrOffset + scanOffsets[scanCount][j];
+            tempOffset = corrOffset + _scanOffsets[scanCount][j];
             if (tempOffset >= seekLength) break;
 
             // Calculates correlation value for the mixing position corresponding
             // to 'tempOffset'
-            corr = calcCrossCorrStereo(refPos + 2 * tempOffset, pRefMidBuffer);
+            corr = (double)calcCrossCorr(refPos + channels * tempOffset, pMidBuffer);
+            // heuristic rule to slightly favour values close to mid of the range
+            double tmp = (double)(2 * tempOffset - seekLength) / seekLength;
+            corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
 
             // Checks for the highest correlation value
             if (corr > bestCorr) 
@@ -380,105 +374,6 @@ uint TDStretch::seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos)
 
 
 
-// Seeks for the optimal overlap-mixing position. The 'mono' version of the
-// routine
-//
-// The best position is determined as the position where the two overlapped
-// sample sequences are 'most alike', in terms of the highest cross-correlation
-// value over the overlapping period
-uint TDStretch::seekBestOverlapPositionMono(const SAMPLETYPE *refPos) 
-{
-    uint bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
-    uint tempOffset;
-    const SAMPLETYPE *compare;
-
-    // Slopes the amplitude of the 'midBuffer' samples
-    precalcCorrReferenceMono();
-
-    bestCorr = INT_MIN;
-    bestOffs = 0;
-
-    // Scans for the best correlation value by testing each possible position
-    // over the permitted range.
-    for (tempOffset = 0; tempOffset < seekLength; tempOffset ++) 
-    {
-        compare = refPos + tempOffset;
-
-        // Calculates correlation value for the mixing position corresponding
-        // to 'tempOffset'
-        corr = calcCrossCorrMono(pRefMidBuffer, compare);
-
-        // Checks for the highest correlation value
-        if (corr > bestCorr) 
-        {
-            bestCorr = corr;
-            bestOffs = tempOffset;
-        }
-    }
-    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
-    clearCrossCorrState();
-
-    return bestOffs;
-}
-
-
-// Seeks for the optimal overlap-mixing position. The 'mono' version of the
-// routine
-//
-// The best position is determined as the position where the two overlapped
-// sample sequences are 'most alike', in terms of the highest cross-correlation
-// value over the overlapping period
-uint TDStretch::seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos) 
-{
-    uint j;
-    uint bestOffs;
-    LONG_SAMPLETYPE bestCorr, corr;
-    uint scanCount, corrOffset, tempOffset;
-
-    // Slopes the amplitude of the 'midBuffer' samples
-    precalcCorrReferenceMono();
-
-    bestCorr = INT_MIN;
-    bestOffs = 0;
-    corrOffset = 0;
-    tempOffset = 0;
-
-    // Scans for the best correlation value using four-pass hierarchical search.
-    //
-    // The look-up table 'scans' has hierarchical position adjusting steps.
-    // In first pass the routine searhes for the highest correlation with 
-    // relatively coarse steps, then rescans the neighbourhood of the highest
-    // correlation with better resolution and so on.
-    for (scanCount = 0;scanCount < 4; scanCount ++) 
-    {
-        j = 0;
-        while (scanOffsets[scanCount][j]) 
-        {
-            tempOffset = corrOffset + scanOffsets[scanCount][j];
-            if (tempOffset >= seekLength) break;
-
-            // Calculates correlation value for the mixing position corresponding
-            // to 'tempOffset'
-            corr = calcCrossCorrMono(refPos + tempOffset, pRefMidBuffer);
-
-            // Checks for the highest correlation value
-            if (corr > bestCorr) 
-            {
-                bestCorr = corr;
-                bestOffs = tempOffset;
-            }
-            j ++;
-        }
-        corrOffset = bestOffs;
-    }
-    // clear cross correlation routine state if necessary (is so e.g. in MMX routines).
-    clearCrossCorrState();
-
-    return bestOffs;
-}
-
-
 /// clear cross correlation routine state if necessary 
 void TDStretch::clearCrossCorrState()
 {
@@ -486,29 +381,82 @@ void TDStretch::clearCrossCorrState()
 }
 
 
+/// Derives sequenceMs / seekWindowMs from 'tempo' when their auto flags are set, then refreshes seekWindowLength and seekLength
+void TDStretch::calcSeqParameters()
+{
+    // Automatic settings are a linear function of tempo (value = C + K * tempo), so that a varying
+    // processing sequence length is used at various tempo settings, between the given low...top limits
+    #define AUTOSEQ_TEMPO_LOW   0.5     // auto setting low tempo range (-50%)
+    #define AUTOSEQ_TEMPO_TOP   2.0     // auto setting top tempo range (+100%)
+
+    // sequence-ms setting values at the above low & top tempo (K = slope, C = offset of the line)
+    #define AUTOSEQ_AT_MIN      125.0
+    #define AUTOSEQ_AT_MAX      50.0
+    #define AUTOSEQ_K           ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
+    #define AUTOSEQ_C           (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW))
+
+    // seek-window-ms setting values at the above low & top tempo (K = slope, C = offset of the line)
+    #define AUTOSEEK_AT_MIN     25.0
+    #define AUTOSEEK_AT_MAX     15.0
+    #define AUTOSEEK_K          ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
+    #define AUTOSEEK_C          (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW))
+    // clamps x into the range mi...ma
+    #define CHECK_LIMITS(x, mi, ma) (((x) < (mi)) ? (mi) : (((x) > (ma)) ? (ma) : (x)))
+
+    double seq, seek;
+    
+    if (bAutoSeqSetting)
+    {
+        seq = AUTOSEQ_C + AUTOSEQ_K * tempo;
+        seq = CHECK_LIMITS(seq, AUTOSEQ_AT_MAX, AUTOSEQ_AT_MIN);   // AT_MAX (top-tempo value) is the lower bound here
+        sequenceMs = (int)(seq + 0.5);                              // round to nearest millisecond
+    }
+
+    if (bAutoSeekSetting)
+    {
+        seek = AUTOSEEK_C + AUTOSEEK_K * tempo;
+        seek = CHECK_LIMITS(seek, AUTOSEEK_AT_MAX, AUTOSEEK_AT_MIN);   // AT_MAX (top-tempo value) is the lower bound here
+        seekWindowMs = (int)(seek + 0.5);                               // round to nearest millisecond
+    }
+
+    // Convert the millisecond settings into sample counts at the current sample rate
+    seekWindowLength = (sampleRate * sequenceMs) / 1000;
+    if (seekWindowLength < 2 * overlapLength) 
+    {
+        seekWindowLength = 2 * overlapLength;   // keep the window at least two overlap lengths long
+    }
+    seekLength = (sampleRate * seekWindowMs) / 1000;
+}
+
+
+
 // Sets new target tempo. Normal tempo = 'SCALE', smaller values represent slower 
 // tempo, larger faster tempo.
 void TDStretch::setTempo(float newTempo)
 {
-    uint intskip;
+    int intskip;
 
     tempo = newTempo;
 
+    // Calculate new sequence duration
+    calcSeqParameters();
+
     // Calculate ideal skip length (according to tempo value) 
     nominalSkip = tempo * (seekWindowLength - overlapLength);
-    skipFract = 0;
     intskip = (int)(nominalSkip + 0.5f);
 
     // Calculate how many samples are needed in the 'inputBuffer' to 
     // process another batch of samples
-    sampleReq = max(intskip + overlapLength, seekWindowLength) + maxOffset;
+    //sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength / 2;
+    sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength;
 }
 
 
 
 // Sets the number of channels, 1 = mono, 2 = stereo
-void TDStretch::setChannels(uint numChannels)
+void TDStretch::setChannels(int numChannels)
 {
+    assert(numChannels > 0);
     if (channels == numChannels) return;
     assert(numChannels == 1 || numChannels == 2);
 
@@ -520,6 +468,7 @@ void TDStretch::setChannels(uint numChannels)
 
 // nominal tempo, no need for processing, just pass the samples through
 // to outputBuffer
+/*
 void TDStretch::processNominalTempo()
 {
     assert(tempo == 1.0f);
@@ -547,13 +496,15 @@ void TDStretch::processNominalTempo()
     // Simply bypass samples from input to output
     outputBuffer.moveSamples(inputBuffer);
 }
+*/
 
+#include <stdio.h>
 
 // Processes as many processing frames of the samples 'inputBuffer', store
 // the result into 'outputBuffer'
 void TDStretch::processSamples()
 {
-    uint ovlSkip, offset;
+    int ovlSkip, offset;
     int temp;
 
     /* Removed this small optimization - can introduce a click to sound when tempo setting
@@ -566,23 +517,9 @@ void TDStretch::processSamples()
     }
     */
 
-    if (bMidBufferDirty == FALSE) 
-    {
-        // if midBuffer is empty, move the first samples of the input stream 
-        // into it
-        if (inputBuffer.numSamples() < overlapLength) 
-        {
-            // wait until we've got overlapLength samples
-            return;
-        }
-        memcpy(pMidBuffer, inputBuffer.ptrBegin(), channels * overlapLength * sizeof(SAMPLETYPE));
-        inputBuffer.receiveSamples(overlapLength);
-        bMidBufferDirty = TRUE;
-    }
-
     // Process samples as long as there are enough samples in 'inputBuffer'
     // to form a processing frame.
-    while (inputBuffer.numSamples() >= sampleReq) 
+    while ((int)inputBuffer.numSamples() >= sampleReq) 
     {
         // If tempo differs from the normal ('SCALE'), scan for the best overlapping
         // position
@@ -592,23 +529,28 @@ void TDStretch::processSamples()
         // samples in 'midBuffer' using sliding overlapping
         // ... first partially overlap with the end of the previous sequence
         // (that's in 'midBuffer')
-        overlap(outputBuffer.ptrEnd(overlapLength), inputBuffer.ptrBegin(), offset);
-        outputBuffer.putSamples(overlapLength);
+        overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
+        outputBuffer.putSamples((uint)overlapLength);
 
-        // ... then copy sequence samples from 'inputBuffer' to output
-        temp = (seekWindowLength - 2 * overlapLength);// & 0xfffffffe;
-        if (temp > 0)
+        // ... then copy sequence samples from 'inputBuffer' to output:
+
+        // length of sequence
+        temp = (seekWindowLength - 2 * overlapLength);
+
+        // crosscheck that we don't have buffer overflow...
+        if ((int)inputBuffer.numSamples() < (offset + temp + overlapLength * 2))
         {
-            outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), temp);
+            break;    // shouldn't really happen; 'continue' would loop forever since no input has been consumed yet
         }
 
+        outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * (offset + overlapLength), (uint)temp);
+
         // Copies the end of the current sequence from 'inputBuffer' to 
         // 'midBuffer' for being mixed with the beginning of the next 
         // processing sequence and so on
-        assert(offset + seekWindowLength <= inputBuffer.numSamples());
-        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + seekWindowLength - overlapLength), 
+        assert((offset + temp + overlapLength * 2) <= (int)inputBuffer.numSamples());
+        memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp + overlapLength), 
             channels * sizeof(SAMPLETYPE) * overlapLength);
-        bMidBufferDirty = TRUE;
 
         // Remove the processed samples from the input buffer. Update
         // the difference between integer & nominal skip step to 'skipFract'
@@ -616,17 +558,17 @@ void TDStretch::processSamples()
         skipFract += nominalSkip;   // real skip size
         ovlSkip = (int)skipFract;   // rounded to integer skip
         skipFract -= ovlSkip;       // maintain the fraction part, i.e. real vs. integer skip
-        inputBuffer.receiveSamples(ovlSkip);
+        inputBuffer.receiveSamples((uint)ovlSkip);
     }
 }
 
 
 // Adds 'numsamples' pcs of samples from the 'samples' memory position into
 // the input of the object.
-void TDStretch::putSamples(const SAMPLETYPE *samples, uint numSamples)
+void TDStretch::putSamples(const SAMPLETYPE *samples, uint nSamples)
 {
     // Add the samples into the input buffer
-    inputBuffer.putSamples(samples, numSamples);
+    inputBuffer.putSamples(samples, nSamples);
     // Process the samples in input buffer
     processSamples();
 }
@@ -634,25 +576,23 @@ void TDStretch::putSamples(const SAMPLETYPE *samples, uint numSamples)
 
 
 /// Set new overlap length parameter & reallocate RefMidBuffer if necessary.
-void TDStretch::acceptNewOverlapLength(uint newOverlapLength)
+void TDStretch::acceptNewOverlapLength(int newOverlapLength)
 {
-    uint prevOvl;
+    int prevOvl;
 
+    assert(newOverlapLength >= 0);
     prevOvl = overlapLength;
     overlapLength = newOverlapLength;
 
     if (overlapLength > prevOvl)
     {
-        delete[] pMidBuffer;
-        delete[] pRefMidBufferUnaligned;
+        delete[] pMidBufferUnaligned;
+
+        pMidBufferUnaligned = new SAMPLETYPE[overlapLength * 2 + 16 / sizeof(SAMPLETYPE)];
+        // ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency
+        pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(pMidBufferUnaligned);
 
-        pMidBuffer = new SAMPLETYPE[overlapLength * 2];
-        bMidBufferDirty = TRUE;
         clearMidBuffer();
-
-        pRefMidBufferUnaligned = new SAMPLETYPE[2 * overlapLength + 16 / sizeof(SAMPLETYPE)];
-        // ensure that 'pRefMidBuffer' is aligned to 16 byte boundary for efficiency
-        pRefMidBuffer = (SAMPLETYPE *)((((ulongptr)pRefMidBufferUnaligned) + 15) & -16);
     }
 }
 
@@ -662,47 +602,37 @@ void TDStretch::acceptNewOverlapLength(uint newOverlapLength)
 void * TDStretch::operator new(size_t s)
 {
     // Notice! don't use "new TDStretch" directly, use "newInstance" to create a new instance instead!
-    assert(FALSE);  
-    return NULL;
+    ST_THROW_RT_ERROR("Error in TDStretch::new: Don't use 'new TDStretch' directly, use 'newInstance' member instead!");
+    return newInstance();
 }
 
 
 TDStretch * TDStretch::newInstance()
 {
-    uint uExtensions = 0;
-#if !defined(_MSC_VER) || !defined(__x86_64__)
-    uExtensions = detectCPUextensions();
-#endif
-    // Check if MMX/SSE/3DNow! instruction set extensions supported by CPU
+    uint uExtensions;
 
-#ifdef ALLOW_MMX
+    uExtensions = detectCPUextensions();
+
+    // Check if MMX/SSE instruction set extensions supported by CPU
+
+#ifdef SOUNDTOUCH_ALLOW_MMX
     // MMX routines available only with integer sample types
     if (uExtensions & SUPPORT_MMX)
     {
         return ::new TDStretchMMX;
     }
     else
-#endif // ALLOW_MMX
+#endif // SOUNDTOUCH_ALLOW_MMX
 
 
-#ifdef ALLOW_SSE
+#ifdef SOUNDTOUCH_ALLOW_SSE
     if (uExtensions & SUPPORT_SSE)
     {
         // SSE support
         return ::new TDStretchSSE;
     }
     else
-#endif // ALLOW_SSE
-
-
-#ifdef ALLOW_3DNOW
-    if (uExtensions & SUPPORT_3DNOW)
-    {
-        // 3DNow! support
-        return ::new TDStretch3DNow;
-    }
-    else
-#endif // ALLOW_3DNOW
+#endif // SOUNDTOUCH_ALLOW_SSE
 
     {
         // ISA optimizations not supported, use plain C version
@@ -717,74 +647,48 @@ TDStretch * TDStretch::newInstance()
 //
 //////////////////////////////////////////////////////////////////////////////
 
-#ifdef INTEGER_SAMPLES
-
-// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
-// is faster to calculate
-void TDStretch::precalcCorrReferenceStereo()
-{
-    int i, cnt2;
-    int temp, temp2;
-
-    for (i=0 ; i < (int)overlapLength ;i ++) 
-    {
-        temp = i * (overlapLength - i);
-        cnt2 = i * 2;
-
-        temp2 = (pMidBuffer[cnt2] * temp) / slopingDivider;
-        pRefMidBuffer[cnt2] = (short)(temp2);
-        temp2 = (pMidBuffer[cnt2 + 1] * temp) / slopingDivider;
-        pRefMidBuffer[cnt2 + 1] = (short)(temp2);
-    }
-}
-
-
-// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
-// is faster to calculate
-void TDStretch::precalcCorrReferenceMono()
-{
-    int i;
-    long temp;
-    long temp2;
-
-    for (i=0 ; i < (int)overlapLength ;i ++) 
-    {
-        temp = i * (overlapLength - i);
-        temp2 = (pMidBuffer[i] * temp) / slopingDivider;
-        pRefMidBuffer[i] = (short)temp2;
-    }
-}
-
+#ifdef SOUNDTOUCH_INTEGER_SAMPLES
 
 // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Stereo' 
 // version of the routine.
-void TDStretch::overlapStereo(short *output, const short *input) const
+void TDStretch::overlapStereo(short *poutput, const short *input) const
 {
     int i;
     short temp;
-    uint cnt2;
+    int cnt2;
 
-    for (i = 0; i < (int)overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++) 
     {
         temp = (short)(overlapLength - i);
         cnt2 = 2 * i;
-        output[cnt2] = (input[cnt2] * i + pMidBuffer[cnt2] * temp )  / overlapLength;
-        output[cnt2 + 1] = (input[cnt2 + 1] * i + pMidBuffer[cnt2 + 1] * temp ) / overlapLength;
+        poutput[cnt2] = (input[cnt2] * i + pMidBuffer[cnt2] * temp )  / overlapLength;
+        poutput[cnt2 + 1] = (input[cnt2 + 1] * i + pMidBuffer[cnt2 + 1] * temp ) / overlapLength;
     }
 }
 
+// Calculates the x having the closest 2^x value for the given value
+static int _getClosest2Power(double value)
+{
+    return (int)(log(value) / log(2.0) + 0.5);
+}
+
 
 /// Calculates overlap period length in samples.
 /// Integer version rounds overlap length to closest power of 2
 /// for a divide scaling operation.
-void TDStretch::calculateOverlapLength(uint overlapMs)
+void TDStretch::calculateOverlapLength(int aoverlapMs)
 {
-    uint newOvl;
+    int newOvl;
 
-    overlapDividerBits = _getClosest2Power((sampleRate * overlapMs) / 1000.0);
+    assert(aoverlapMs >= 0);
+
+    // Calculate the overlap length so that it is a power of 2 - thus it's easy to do
+    // integer division by right-shifting. The term "-1" at the end accounts for
+    // the extra most significant bit left unused in the result by signed multiplication.
+    overlapDividerBits = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
     if (overlapDividerBits > 9) overlapDividerBits = 9;
-    if (overlapDividerBits < 4) overlapDividerBits = 4;
-    newOvl = 1<<overlapDividerBits;
+    if (overlapDividerBits < 3) overlapDividerBits = 3;
+    newOvl = (int)pow(2.0, (int)overlapDividerBits + 1);    // +1 => account for -1 above
 
     acceptNewOverlapLength(newOvl);
 
@@ -795,106 +699,74 @@ void TDStretch::calculateOverlapLength(uint overlapMs)
 }
 
 
-long TDStretch::calcCrossCorrMono(const short *mixingPos, const short *compare) const
+double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare) const
 {
     long corr;
-    uint i;
+    long norm;
+    int i;
 
-    corr = 0;
-    for (i = 1; i < overlapLength; i ++) 
+    corr = norm = 0;
+    // Same routine for stereo and mono. For stereo, the loop is unrolled for better
+    // efficiency, which also gives slightly better resolution against rounding.
+    // For mono it's the same routine, just unrolling the loop by a factor of 4.
+    for (i = 0; i < channels * overlapLength; i += 4) 
     {
-        corr += (mixingPos[i] * compare[i]) >> overlapDividerBits;
+        corr += (mixingPos[i] * compare[i] + 
+                 mixingPos[i + 1] * compare[i + 1] +
+                 mixingPos[i + 2] * compare[i + 2] + 
+                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBits;
+        norm += (mixingPos[i] * mixingPos[i] + 
+                 mixingPos[i + 1] * mixingPos[i + 1] +
+                 mixingPos[i + 2] * mixingPos[i + 2] + 
+                 mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBits;
     }
 
-    return corr;
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    if (norm == 0) norm = 1;    // to avoid div by zero
+    return (double)corr / sqrt((double)norm);
 }
 
-
-long TDStretch::calcCrossCorrStereo(const short *mixingPos, const short *compare) const
-{
-    long corr;
-    uint i;
-
-    corr = 0;
-    for (i = 2; i < 2 * overlapLength; i += 2) 
-    {
-        corr += (mixingPos[i] * compare[i] +
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBits;
-    }
-
-    return corr;
-}
-
-#endif // INTEGER_SAMPLES
+#endif // SOUNDTOUCH_INTEGER_SAMPLES
 
 //////////////////////////////////////////////////////////////////////////////
 //
 // Floating point arithmetics specific algorithm implementations.
 //
 
-#ifdef FLOAT_SAMPLES
+#ifdef SOUNDTOUCH_FLOAT_SAMPLES
 
-
-// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
-// is faster to calculate
-void TDStretch::precalcCorrReferenceStereo()
-{
-    int i, cnt2;
-    float temp;
-
-    for (i=0 ; i < (int)overlapLength ;i ++) 
-    {
-        temp = (float)i * (float)(overlapLength - i);
-        cnt2 = i * 2;
-        pRefMidBuffer[cnt2] = (float)(pMidBuffer[cnt2] * temp);
-        pRefMidBuffer[cnt2 + 1] = (float)(pMidBuffer[cnt2 + 1] * temp);
-    }
-}
-
-
-// Slopes the amplitude of the 'midBuffer' samples so that cross correlation
-// is faster to calculate
-void TDStretch::precalcCorrReferenceMono()
+// Overlaps samples in 'midBuffer' with the samples in 'pInput'
+void TDStretch::overlapStereo(float *pOutput, const float *pInput) const
 {
     int i;
-    float temp;
-
-    for (i=0 ; i < (int)overlapLength ;i ++) 
-    {
-        temp = (float)i * (float)(overlapLength - i);
-        pRefMidBuffer[i] = (float)(pMidBuffer[i] * temp);
-    }
-}
-
-
-// SSE-optimized version of the function overlapStereo
-void TDStretch::overlapStereo(float *output, const float *input) const
-{
-    int i;
-    uint cnt2;
-    float fTemp;
     float fScale;
-    float fi;
+    float f1;
+    float f2;
 
     fScale = 1.0f / (float)overlapLength;
 
-    for (i = 0; i < (int)overlapLength ; i ++) 
+    f1 = 0;
+    f2 = 1.0f;
+
+    for (i = 0; i < 2 * (int)overlapLength ; i += 2) 
     {
-        fTemp = (float)(overlapLength - i) * fScale;
-        fi = (float)i * fScale;
-        cnt2 = 2 * i;
-        output[cnt2 + 0] = input[cnt2 + 0] * fi + pMidBuffer[cnt2 + 0] * fTemp;
-        output[cnt2 + 1] = input[cnt2 + 1] * fi + pMidBuffer[cnt2 + 1] * fTemp;
+        pOutput[i + 0] = pInput[i + 0] * f1 + pMidBuffer[i + 0] * f2;
+        pOutput[i + 1] = pInput[i + 1] * f1 + pMidBuffer[i + 1] * f2;
+
+        f1 += fScale;
+        f2 -= fScale;
     }
 }
 
 
-/// Calculates overlap period length in samples.
-void TDStretch::calculateOverlapLength(uint overlapMs)
+/// Calculates the overlap period length in samples from 'overlapInMsec' (milliseconds).
+void TDStretch::calculateOverlapLength(int overlapInMsec)
 {
-    uint newOvl;
+    int newOvl;
 
-    newOvl = (sampleRate * overlapMs) / 1000;
+    assert(overlapInMsec >= 0);
+    newOvl = (sampleRate * overlapInMsec) / 1000;
     if (newOvl < 16) newOvl = 16;
 
     // must be divisible by 8
@@ -904,35 +776,33 @@ void TDStretch::calculateOverlapLength(uint overlapMs)
 }
 
 
-
-double TDStretch::calcCrossCorrMono(const float *mixingPos, const float *compare) const
+double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare) const
 {
     double corr;
-    uint i;
+    double norm;
+    int i;
 
-    corr = 0;
-    for (i = 1; i < overlapLength; i ++) 
-    {
-        corr += mixingPos[i] * compare[i];
-    }
-
-    return corr;
-}
-
-
-double TDStretch::calcCrossCorrStereo(const float *mixingPos, const float *compare) const
-{
-    double corr;
-    uint i;
-
-    corr = 0;
-    for (i = 2; i < 2 * overlapLength; i += 2) 
+    corr = norm = 0;
+    // Same routine for stereo and mono. For stereo, the loop is unrolled by a factor of 2.
+    // For mono it's the same routine, yet unrolled by a factor of 4.
+    for (i = 0; i < channels * overlapLength; i += 4) 
     {
         corr += mixingPos[i] * compare[i] +
                 mixingPos[i + 1] * compare[i + 1];
+
+        norm += mixingPos[i] * mixingPos[i] + 
+                mixingPos[i + 1] * mixingPos[i + 1];
+
+        // unroll the loop for better CPU efficiency:
+        corr += mixingPos[i + 2] * compare[i + 2] +
+                mixingPos[i + 3] * compare[i + 3];
+
+        norm += mixingPos[i + 2] * mixingPos[i + 2] +
+                mixingPos[i + 3] * mixingPos[i + 3];
     }
 
-    return corr;
+    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
+    return corr / sqrt(norm);
 }
 
-#endif // FLOAT_SAMPLES
+#endif // SOUNDTOUCH_FLOAT_SAMPLES
diff --git a/desmume/src/metaspu/SoundTouch/TDStretch.h b/desmume/src/metaspu/SoundTouch/TDStretch.h
index 4ee5d6bbf..7ebb1091c 100644
--- a/desmume/src/metaspu/SoundTouch/TDStretch.h
+++ b/desmume/src/metaspu/SoundTouch/TDStretch.h
@@ -4,8 +4,8 @@
 /// while maintaining the original pitch by using a time domain WSOLA-like method 
 /// with several performance-increasing tweaks.
 ///
-/// Note : MMX optimized functions reside in a separate, platform-specific file, 
-/// e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'
+/// Note : MMX/SSE optimized functions reside in separate, platform-specific files 
+/// 'mmx_optimized.cpp' and 'sse_optimized.cpp'
 ///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
@@ -13,10 +13,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.16 $
+// Last changed  : $Date: 2012-04-01 16:49:30 -0300 (dom, 01 abr 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: TDStretch.h,v 1.16 2006/02/05 16:44:06 Olli Exp $
+// $Id: TDStretch.h 137 2012-04-01 19:49:30Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -44,6 +44,7 @@
 #ifndef TDStretch_H
 #define TDStretch_H
 
+#include <stddef.h>
 #include "STTypes.h"
 #include "RateTransposer.h"
 #include "FIFOSamplePipe.h"
@@ -51,7 +52,13 @@
 namespace soundtouch
 {
 
-// Default values for sound processing parameters:
+/// Default values for sound processing parameters:
+/// Notice that the default parameters are tuned for contemporary popular music 
+/// processing. For speech processing applications these parameters suit better:
+///     #define DEFAULT_SEQUENCE_MS     40
+///     #define DEFAULT_SEEKWINDOW_MS   15
+///     #define DEFAULT_OVERLAP_MS      8
+///
 
 /// Default length of a single processing sequence, in milliseconds. This determines to how 
 /// long sequences the original sound is chopped in the time-stretch algorithm.
@@ -61,11 +68,41 @@ namespace soundtouch
 /// and vice versa.
 ///
 /// Increasing this value reduces computational burden & vice versa.
-#define DEFAULT_SEQUENCE_MS     63
+//#define DEFAULT_SEQUENCE_MS         40
+#define DEFAULT_SEQUENCE_MS         USE_AUTO_SEQUENCE_LEN
 
-#define DEFAULT_SEEKWINDOW_MS   17
+/// Giving this value for the sequence length sets automatic parameter value
+/// according to tempo setting (recommended)
+#define USE_AUTO_SEQUENCE_LEN       0
 
-#define DEFAULT_OVERLAP_MS      7
+/// Seeking window default length in milliseconds for algorithm that finds the best possible 
+/// overlapping location. This determines from how wide window the algorithm may look for an 
+/// optimal joining location when mixing the sound sequences back together. 
+///
+/// The bigger this window setting is, the higher the possibility to find a better mixing
+/// position will become, but at the same time large values may cause a "drifting" artifact
+/// because consecutive sequences will be taken at more uneven intervals.
+///
+/// If there's a disturbing artifact that sounds as if a constant frequency was drifting 
+/// around, try reducing this setting.
+///
+/// Increasing this value increases computational burden & vice versa.
+//#define DEFAULT_SEEKWINDOW_MS       15
+#define DEFAULT_SEEKWINDOW_MS       USE_AUTO_SEEKWINDOW_LEN
+
+/// Giving this value for the seek window length sets automatic parameter value
+/// according to tempo setting (recommended)
+#define USE_AUTO_SEEKWINDOW_LEN     0
+
+/// Overlap length in milliseconds. When the chopped sound sequences are mixed back together, 
+/// to form a continuous sound stream, this parameter defines over how long period the two 
+/// consecutive sequences are let to overlap each other. 
+///
+/// This shouldn't be that critical a parameter. If you reduce the DEFAULT_SEQUENCE_MS setting
+/// by a large amount, you might wish to try a smaller value on this.
+///
+/// Increasing this value increases computational burden & vice versa.
+#define DEFAULT_OVERLAP_MS      8
 
 
 /// Class that does the time-stretch (tempo change) effect for the processed
@@ -73,44 +110,40 @@ namespace soundtouch
 class TDStretch : public FIFOProcessor
 {
 protected:
-    uint channels;
-    uint sampleReq;
+    int channels;
+    int sampleReq;
     float tempo;
 
     SAMPLETYPE *pMidBuffer;
-    SAMPLETYPE *pRefMidBuffer;
-    SAMPLETYPE *pRefMidBufferUnaligned;
-    uint overlapLength;
-    uint overlapDividerBits;
-    uint slopingDivider;
-    uint seekLength;
-    uint seekWindowLength;
-    uint maxOffset;
+    SAMPLETYPE *pMidBufferUnaligned;
+    int overlapLength;
+    int seekLength;
+    int seekWindowLength;
+    int overlapDividerBits;
+    int slopingDivider;
     float nominalSkip;
     float skipFract;
     FIFOSampleBuffer outputBuffer;
     FIFOSampleBuffer inputBuffer;
-    BOOL bQuickseek;
-    BOOL bMidBufferDirty;
+    BOOL bQuickSeek;
 
-    uint sampleRate;
-    uint sequenceMs;
-    uint seekWindowMs;
-    uint overlapMs;
+    int sampleRate;
+    int sequenceMs;
+    int seekWindowMs;
+    int overlapMs;
+    BOOL bAutoSeqSetting;
+    BOOL bAutoSeekSetting;
 
-    void acceptNewOverlapLength(uint newOverlapLength);
+    void acceptNewOverlapLength(int newOverlapLength);
 
     virtual void clearCrossCorrState();
-    void calculateOverlapLength(uint overlapMs);
+    void calculateOverlapLength(int overlapMs);
 
-    virtual LONG_SAMPLETYPE calcCrossCorrStereo(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
-    virtual LONG_SAMPLETYPE calcCrossCorrMono(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
+    virtual double calcCrossCorr(const SAMPLETYPE *mixingPos, const SAMPLETYPE *compare) const;
 
-    virtual uint seekBestOverlapPositionStereo(const SAMPLETYPE *refPos);
-    virtual uint seekBestOverlapPositionStereoQuick(const SAMPLETYPE *refPos);
-    virtual uint seekBestOverlapPositionMono(const SAMPLETYPE *refPos);
-    virtual uint seekBestOverlapPositionMonoQuick(const SAMPLETYPE *refPos);
-    uint seekBestOverlapPosition(const SAMPLETYPE *refPos);
+    virtual int seekBestOverlapPositionFull(const SAMPLETYPE *refPos);
+    virtual int seekBestOverlapPositionQuick(const SAMPLETYPE *refPos);
+    int seekBestOverlapPosition(const SAMPLETYPE *refPos);
 
     virtual void overlapStereo(SAMPLETYPE *output, const SAMPLETYPE *input) const;
     virtual void overlapMono(SAMPLETYPE *output, const SAMPLETYPE *input) const;
@@ -118,10 +151,7 @@ protected:
     void clearMidBuffer();
     void overlap(SAMPLETYPE *output, const SAMPLETYPE *input, uint ovlPos) const;
 
-    void precalcCorrReferenceMono();
-    void precalcCorrReferenceStereo();
-
-    void processNominalTempo();
+    void calcSeqParameters();
 
     /// Changes the tempo of the given sound samples.
     /// Returns amount of samples returned in the "output" buffer.
@@ -135,7 +165,7 @@ public:
 
     /// Operator 'new' is overloaded so that it automatically creates a suitable instance 
     /// depending on if we've a MMX/SSE/etc-capable CPU available or not.
-    void *operator new(size_t s);
+    static void *operator new(size_t s);
 
     /// Use this function instead of "new" operator to create a new instance of this class. 
     /// This function automatically chooses a correct feature set depending on if the CPU
@@ -159,7 +189,7 @@ public:
     void clearInput();
 
     /// Sets the number of channels, 1 = mono, 2 = stereo
-    void setChannels(uint numChannels);
+    void setChannels(int numChannels);
 
     /// Enables/disables the quick position seeking algorithm. Zero to disable, 
     /// nonzero to enable
@@ -176,16 +206,16 @@ public:
     /// 'seekwindowMS' = seeking window length for scanning the best overlapping 
     ///      position
     /// 'overlapMS' = overlapping length
-    void setParameters(uint sampleRate,                             ///< Samplerate of sound being processed (Hz)
-                       uint sequenceMS = DEFAULT_SEQUENCE_MS,       ///< Single processing sequence length (ms)
-                       uint seekwindowMS = DEFAULT_SEEKWINDOW_MS,   ///< Offset seeking window length (ms)
-                       uint overlapMS = DEFAULT_OVERLAP_MS          ///< Sequence overlapping length (ms)
+    void setParameters(int sampleRate,          ///< Samplerate of sound being processed (Hz)
+                       int sequenceMS = -1,     ///< Single processing sequence length (ms)
+                       int seekwindowMS = -1,   ///< Offset seeking window length (ms)
+                       int overlapMS = -1       ///< Sequence overlapping length (ms)
                        );
 
     /// Get routine control parameters, see setParameters() function.
     /// Any of the parameters to this function can be NULL, in such case corresponding parameter
     /// value isn't returned.
-    void getParameters(uint *pSampleRate, uint *pSequenceMs, uint *pSeekWindowMs, uint *pOverlapMs);
+    void getParameters(int *pSampleRate, int *pSequenceMs, int *pSeekWindowMs, int *pOverlapMs) const;
 
     /// Adds 'numsamples' pcs of samples from the 'samples' memory position into
     /// the input of the object.
@@ -194,43 +224,45 @@ public:
             uint numSamples                         ///< Number of samples in 'samples' so that one sample
                                                     ///< contains both channels if stereo
             );
+
+    /// return nominal input sample requirement for triggering a processing batch
+    int getInputSampleReq() const
+    {
+        return (int)(nominalSkip + 0.5);
+    }
+
+    /// return nominal output sample amount when running a processing batch
+    int getOutputBatchSize() const
+    {
+        return seekWindowLength - overlapLength;
+    }
 };
 
 
 
 // Implementation-specific class declarations:
 
-//#ifdef ALLOW_MMX
-//    /// Class that implements MMX optimized routines for 16bit integer samples type.
-//    class TDStretchMMX : public TDStretch
-//    {
-//    protected:
-//        long calcCrossCorrStereo(const short *mixingPos, const short *compare) const;
-//        virtual void overlapStereo(short *output, const short *input) const;
-//        virtual void clearCrossCorrState();
-//    };
-//#endif /// ALLOW_MMX
-//
-//
-//#ifdef ALLOW_3DNOW
-//    /// Class that implements 3DNow! optimized routines for floating point samples type.
-//    class TDStretch3DNow : public TDStretch
-//    {
-//    protected:
-//        double calcCrossCorrStereo(const float *mixingPos, const float *compare) const;
-//    };
-//#endif /// ALLOW_3DNOW
+#ifdef SOUNDTOUCH_ALLOW_MMX
+    /// Class that implements MMX optimized routines for 16bit integer samples type.
+    class TDStretchMMX : public TDStretch
+    {
+    protected:
+        double calcCrossCorr(const short *mixingPos, const short *compare) const;
+        virtual void overlapStereo(short *output, const short *input) const;
+        virtual void clearCrossCorrState();
+    };
+#endif /// SOUNDTOUCH_ALLOW_MMX
 
 
-#ifdef ALLOW_SSE
+#ifdef SOUNDTOUCH_ALLOW_SSE
     /// Class that implements SSE optimized routines for floating point samples type.
     class TDStretchSSE : public TDStretch
     {
     protected:
-        double calcCrossCorrStereo(const float *mixingPos, const float *compare) const;
+        double calcCrossCorr(const float *mixingPos, const float *compare) const;
     };
 
-#endif /// ALLOW_SSE
+#endif /// SOUNDTOUCH_ALLOW_SSE
 
 }
 #endif  /// TDStretch_H
diff --git a/desmume/src/metaspu/SoundTouch/WavFile.cpp b/desmume/src/metaspu/SoundTouch/WavFile.cpp
index 7f5b5a662..d32bc84d6 100644
--- a/desmume/src/metaspu/SoundTouch/WavFile.cpp
+++ b/desmume/src/metaspu/SoundTouch/WavFile.cpp
@@ -1,4 +1,4 @@
-////////////////////////////////////////////////////////////////////////////////
+ ////////////////////////////////////////////////////////////////////////////////
 ///
 /// Classes for easy reading & writing of WAV sound files. 
 ///
@@ -17,10 +17,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.15 $
+// Last changed  : $Date: 2012-09-01 05:03:26 -0300 (sáb, 01 set 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: WavFile.cpp,v 1.15 2006/02/05 16:44:06 Olli Exp $
+// $Id: WavFile.cpp 154 2012-09-01 08:03:26Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -46,22 +46,21 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #include <stdio.h>
-#include <stdexcept>
 #include <string>
+#include <sstream>
+#include <cstring>
 #include <assert.h>
 #include <limits.h>
 
-#include <cstdlib>
-#include <cstring>
-
 #include "WavFile.h"
+#include "STTypes.h"
 
 using namespace std;
 
-const static char riffStr[] = "RIFF";
-const static char waveStr[] = "WAVE";
-const static char fmtStr[]  = "fmt ";
-const static char dataStr[] = "data";
+static const char riffStr[] = "RIFF";
+static const char waveStr[] = "WAVE";
+static const char fmtStr[]  = "fmt ";
+static const char dataStr[] = "data";
 
 
 //////////////////////////////////////////////////////////////////////////////
@@ -85,29 +84,31 @@ const static char dataStr[] = "data";
     // big-endian CPU, swap bytes in 16 & 32 bit words
 
     // helper-function to swap byte-order of 32bit integer
-    static inline void _swap32(unsigned int &dwData)
+    static inline int _swap32(int &dwData)
     {
         dwData = ((dwData >> 24) & 0x000000FF) | 
-                 ((dwData >> 8)  & 0x0000FF00) | 
-                 ((dwData << 8)  & 0x00FF0000) | 
-                 ((dwData << 24) & 0xFF000000);
+               ((dwData >> 8)  & 0x0000FF00) | 
+               ((dwData << 8)  & 0x00FF0000) | 
+               ((dwData << 24) & 0xFF000000);
+        return dwData;
     }   
 
     // helper-function to swap byte-order of 16bit integer
-    static inline void _swap16(unsigned short &wData)
+    static inline short _swap16(short &wData)
     {
         wData = ((wData >> 8) & 0x00FF) | 
                 ((wData << 8) & 0xFF00);
+        return wData;
     }
 
     // helper-function to swap byte-order of buffer of 16bit integers
-    static inline void _swap16Buffer(unsigned short *pData, unsigned int dwNumWords)
+    static inline void _swap16Buffer(short *pData, int numWords)
     {
-        unsigned long i;
+        int i;
 
-        for (i = 0; i < dwNumWords; i ++)
+        for (i = 0; i < numWords; i ++)
         {
-            _swap16(pData[i]);
+            pData[i] = _swap16(pData[i]);
         }
     }
 
@@ -115,19 +116,21 @@ const static char dataStr[] = "data";
     // little-endian CPU, WAV file is ok as such
 
     // dummy helper-function
-    static inline void _swap32(unsigned int &dwData)
+    static inline int _swap32(int &dwData)
     {
         // do nothing
+        return dwData;
     }   
 
     // dummy helper-function
-    static inline void _swap16(unsigned short &wData)
+    static inline short _swap16(short &wData)
     {
         // do nothing
+        return wData;
     }
 
     // dummy helper-function
-    static inline void _swap16Buffer(unsigned short *pData, unsigned int dwNumBytes)
+    static inline void _swap16Buffer(short *pData, int numBytes)
     {
         // do nothing
     }
@@ -135,6 +138,39 @@ const static char dataStr[] = "data";
 #endif  // BIG_ENDIAN
 
 
+//////////////////////////////////////////////////////////////////////////////
+//
+// Class WavFileBase
+//
+
+WavFileBase::WavFileBase()
+{
+    convBuff = NULL;
+    convBuffSize = 0;
+}
+
+
+WavFileBase::~WavFileBase()
+{
+    delete[] convBuff;
+    convBuffSize = 0;
+}
+
+
+/// Get pointer to conversion buffer of at min. given size
+void *WavFileBase::getConvBuffer(int sizeBytes)
+{
+    if (convBuffSize < sizeBytes)
+    {
+        delete[] convBuff;
+
+        convBuffSize = (sizeBytes + 15) & -8;   // round up to following 8-byte boundary
+        convBuff = new char[convBuffSize];
+    }
+    return convBuff;
+}
+
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // Class WavInFile
@@ -142,8 +178,6 @@ const static char dataStr[] = "data";
 
 WavInFile::WavInFile(const char *fileName)
 {
-    int hdrsOk;
-
     // Try to open the file for reading
     fptr = fopen(fileName, "rb");
     if (fptr == NULL) 
@@ -152,27 +186,52 @@ WavInFile::WavInFile(const char *fileName)
         string msg = "Error : Unable to open file \"";
         msg += fileName;
         msg += "\" for reading.";
-        throw runtime_error(msg);
+        ST_THROW_RT_ERROR(msg.c_str());
     }
 
+    init();
+}
+
+
+WavInFile::WavInFile(FILE *file)
+{
+    // Use the stream handed in by the caller for reading
+    fptr = file;
+    if (!file) 
+    {
+        // didn't succeed
+        string msg = "Error : Unable to access input stream for reading";
+        ST_THROW_RT_ERROR(msg.c_str());
+    }
+
+    init();
+}
+
+
+/// Init the WAV file stream
+void WavInFile::init()
+{
+    int hdrsOk;
+
+    // assume file stream is already open
+    assert(fptr);
+
     // Read the file headers
     hdrsOk = readWavHeaders();
     if (hdrsOk != 0) 
     {
         // Something didn't match in the wav file headers 
-        string msg = "File \"";
-        msg += fileName;
-        msg += "\" is corrupt or not a WAV file";
-        throw runtime_error(msg);
+        string msg = "Input file is corrupt or not a WAV file";
+        ST_THROW_RT_ERROR(msg.c_str());
     }
 
+    /* Ignore 'fixed' field value as 32bit signed linear data can have other value than 1.
     if (header.format.fixed != 1)
     {
-        string msg = "File \"";
-        msg += fileName;
-        msg += "\" uses unsupported encoding.";
-        throw runtime_error(msg);
+        string msg = "Input file uses unsupported encoding.";
+        ST_THROW_RT_ERROR(msg.c_str());
     }
+    */
 
     dataRead = 0;
 }
@@ -181,7 +240,8 @@ WavInFile::WavInFile(const char *fileName)
 
 WavInFile::~WavInFile()
 {
-    close();
+    if (fptr) fclose(fptr);
+    fptr = NULL;
 }
 
 
@@ -197,7 +257,7 @@ void WavInFile::rewind()
 }
 
 
-int WavInFile::checkCharTags()
+int WavInFile::checkCharTags() const
 {
     // header.format.fmt should equal to 'fmt '
     if (memcmp(fmtStr, header.format.fmt, 4) != 0) return -1;
@@ -208,7 +268,7 @@ int WavInFile::checkCharTags()
 }
 
 
-int WavInFile::read(char *buffer, int maxElems)
+int WavInFile::read(unsigned char *buffer, int maxElems)
 {
     int numBytes;
     uint afterDataRead;
@@ -216,7 +276,7 @@ int WavInFile::read(char *buffer, int maxElems)
     // ensure it's 8 bit format
     if (header.format.bits_per_sample != 8)
     {
-        throw runtime_error("Error: WavInFile::read(char*, int) works only with 8bit samples.");
+        ST_THROW_RT_ERROR("Error: WavInFile::read(char*, int) works only with 8bit samples.");
     }
     assert(sizeof(char) == 1);
 
@@ -225,11 +285,12 @@ int WavInFile::read(char *buffer, int maxElems)
     if (afterDataRead > header.data.data_len) 
     {
         // Don't read more samples than are marked available in header
-        numBytes = header.data.data_len - dataRead;
+        numBytes = (int)header.data.data_len - (int)dataRead;
         assert(numBytes >= 0);
     }
 
-    numBytes = fread(buffer, 1, numBytes, fptr);
+    assert(buffer);
+    numBytes = (int)fread(buffer, 1, numBytes, fptr);
     dataRead += numBytes;
 
     return numBytes;
@@ -242,67 +303,155 @@ int WavInFile::read(short *buffer, int maxElems)
     int numBytes;
     int numElems;
 
-    if (header.format.bits_per_sample == 8)
+    assert(buffer);
+    switch (header.format.bits_per_sample)
     {
-        // 8 bit format
-        char *temp = new char[maxElems];
-        int i;
-
-        numElems = read(temp, maxElems);
-        // convert from 8 to 16 bit
-        for (i = 0; i < numElems; i ++)
+        case 8:
         {
-            buffer[i] = temp[i] << 8;
-        }
-        delete[] temp;
-    }
-    else
-    {
-        // 16 bit format
-        assert(header.format.bits_per_sample == 16);
-        assert(sizeof(short) == 2);
+            // 8 bit format
+            unsigned char *temp = (unsigned char*)getConvBuffer(maxElems);
+            int i;
 
-        numBytes = maxElems * 2;
-        afterDataRead = dataRead + numBytes;
-        if (afterDataRead > header.data.data_len) 
-        {
-            // Don't read more samples than are marked available in header
-            numBytes = header.data.data_len - dataRead;
-            assert(numBytes >= 0);
+            numElems = read(temp, maxElems);
+            // convert from 8 to 16 bit
+            for (i = 0; i < numElems; i ++)
+            {
+                buffer[i] = (short)(((short)temp[i] - 128) * 256);
+            }
+            break;
         }
 
-        numBytes = fread(buffer, 1, numBytes, fptr);
-        dataRead += numBytes;
-        numElems = numBytes / 2;
+        case 16:
+        {
+            // 16 bit format
 
-        // 16bit samples, swap byte order if necessary
-        _swap16Buffer((unsigned short *)buffer, numElems);
-    }
+            assert(sizeof(short) == 2);
+
+            numBytes = maxElems * 2;
+            afterDataRead = dataRead + numBytes;
+            if (afterDataRead > header.data.data_len) 
+            {
+                // Don't read more samples than are marked available in header
+                numBytes = (int)header.data.data_len - (int)dataRead;
+                assert(numBytes >= 0);
+            }
+
+            numBytes = (int)fread(buffer, 1, numBytes, fptr);
+            dataRead += numBytes;
+            numElems = numBytes / 2;
+
+            // 16bit samples, swap byte order if necessary
+            _swap16Buffer((short *)buffer, numElems);
+            break;
+        }
+
+        default:
+        {
+            stringstream ss;
+            ss << "\nOnly 8/16 bit sample WAV files supported in integer compilation. Can't open WAV file with ";
+            ss << (int)header.format.bits_per_sample;
+            ss << " bit sample format. ";
+            ST_THROW_RT_ERROR(ss.str().c_str());
+        }
+    };
 
     return numElems;
 }
 
 
-
+/// Read data in float format. Notice that when reading in float format 
+/// 8/16/24/32 bit sample formats are supported
 int WavInFile::read(float *buffer, int maxElems)
 {
-    short *temp = new short[maxElems];
-    int num;
-    int i;
-    double fscale;
+    unsigned int afterDataRead;
+    int numBytes;
+    int numElems;
+    int bytesPerSample;
 
-    num = read(temp, maxElems);
+    assert(buffer);
 
-    fscale = 1.0 / 32768.0;
-    // convert to floats, scale to range [-1..+1[
-    for (i = 0; i < num; i ++)
+    bytesPerSample = header.format.bits_per_sample / 8;
+    if ((bytesPerSample < 1) || (bytesPerSample > 4))
     {
-        buffer[i] = (float)(fscale * (double)temp[i]);
+        stringstream ss;
+        ss << "\nOnly 8/16/24/32 bit sample WAV files supported. Can't open WAV file with ";
+        ss << (int)header.format.bits_per_sample;
+        ss << " bit sample format. ";
+        ST_THROW_RT_ERROR(ss.str().c_str());
     }
 
-    delete[] temp;
+    numBytes = maxElems * bytesPerSample;
+    afterDataRead = dataRead + numBytes;
+    if (afterDataRead > header.data.data_len) 
+    {
+        // Don't read more samples than are marked available in header
+        numBytes = (int)header.data.data_len - (int)dataRead;
+        assert(numBytes >= 0);
+    }
 
-    return num;
+    // read raw data into temporary buffer
+    char *temp = (char*)getConvBuffer(numBytes);
+    numBytes = (int)fread(temp, 1, numBytes, fptr);
+    dataRead += numBytes;
+
+    numElems = numBytes / bytesPerSample;
+
+    // swap byte order & convert to float, depending on sample format
+    switch (bytesPerSample)
+    {
+        case 1:
+        {
+            unsigned char *temp2 = (unsigned char*)temp;
+            double conv = 1.0 / 128.0;
+            for (int i = 0; i < numElems; i ++)
+            {
+                buffer[i] = (float)(temp2[i] * conv - 1.0);
+            }
+            break;
+        }
+
+        case 2:
+        {
+            short *temp2 = (short*)temp;
+            double conv = 1.0 / 32768.0;
+            for (int i = 0; i < numElems; i ++)
+            {
+                short value = temp2[i];
+                buffer[i] = (float)(_swap16(value) * conv);
+            }
+            break;
+        }
+
+        case 3:
+        {
+            char *temp2 = (char *)temp;
+            double conv = 1.0 / 8388608.0;
+            for (int i = 0; i < numElems; i ++)
+            {
+                int value = *((int*)temp2);
+                value = _swap32(value) & 0x00ffffff;             // take 24 bits
+                value |= (value & 0x00800000) ? 0xff000000 : 0;  // extend minus sign bits
+                buffer[i] = (float)(value * conv);
+                temp2 += 3;
+            }
+            break;
+        }
+
+        case 4:
+        {
+            int *temp2 = (int *)temp;
+            double conv = 1.0 / 2147483648.0;
+            assert(sizeof(int) == 4);
+            for (int i = 0; i < numElems; i ++)
+            {
+                int value = temp2[i];
+                buffer[i] = (float)(_swap32(value) * conv);
+            }
+            break;
+        }
+    }
+
+    return numElems;
 }
 
 
@@ -313,13 +462,6 @@ int WavInFile::eof() const
 }
 
 
-void WavInFile::close()
-{
-    fclose(fptr);
-    fptr = NULL;
-}
-
-
 
 // test if character code is between a white space ' ' and little 'z'
 static int isAlpha(char c)
@@ -329,9 +471,9 @@ static int isAlpha(char c)
 
 
 // test if all characters are between a white space ' ' and little 'z'
-static int isAlphaStr(char *str)
+static int isAlphaStr(const char *str)
 {
-    int c;
+    char c;
 
     c = str[0];
     while (c) 
@@ -347,10 +489,10 @@ static int isAlphaStr(char *str)
 
 int WavInFile::readRIFFBlock()
 {
-    fread(&(header.riff), sizeof(WavRiff), 1, fptr);
+    if (fread(&(header.riff), sizeof(WavRiff), 1, fptr) != 1) return -1;
 
     // swap 32bit data byte order if necessary
-    _swap32((unsigned int &)header.riff.package_len);
+    _swap32((int &)header.riff.package_len);
 
     // header.riff.riff_char should equal to 'RIFF');
     if (memcmp(riffStr, header.riff.riff_char, 4) != 0) return -1;
@@ -369,7 +511,7 @@ int WavInFile::readHeaderBlock()
     string sLabel;
 
     // lead label string
-    fread(label, 1, 4, fptr);
+    if (fread(label, 1, 4, fptr) !=4) return -1;
     label[4] = 0;
 
     if (isAlphaStr(label) == 0) return -1;    // not a valid label
@@ -383,13 +525,13 @@ int WavInFile::readHeaderBlock()
         memcpy(header.format.fmt, fmtStr, 4);
 
         // read length of the format field
-        fread(&nLen, sizeof(int), 1, fptr);
+        if (fread(&nLen, sizeof(int), 1, fptr) != 1) return -1;
         // swap byte order if necessary
-        _swap32((unsigned int &)nLen); // int format_len;
+        _swap32(nLen); // int format_len;
         header.format.format_len = nLen;
 
         // calculate how much length differs from expected
-        nDump = nLen - (sizeof(header.format) - 8);
+        nDump = nLen - ((int)sizeof(header.format) - 8);
 
         // if format_len is larger than expected, read only as much data as we've space for
         if (nDump > 0)
@@ -398,15 +540,15 @@ int WavInFile::readHeaderBlock()
         }
 
         // read data
-        fread(&(header.format.fixed), nLen, 1, fptr);
+        if (fread(&(header.format.fixed), nLen, 1, fptr) != 1) return -1;
 
         // swap byte order if necessary
-        _swap16((unsigned short &)header.format.fixed);            // short int fixed;
-        _swap16((unsigned short &)header.format.channel_number);   // short int channel_number;
-        _swap32((unsigned int   &)header.format.sample_rate);      // int sample_rate;
-        _swap32((unsigned int   &)header.format.byte_rate);        // int byte_rate;
-        _swap16((unsigned short &)header.format.byte_per_sample);  // short int byte_per_sample;
-        _swap16((unsigned short &)header.format.bits_per_sample);  // short int bits_per_sample;
+        _swap16(header.format.fixed);            // short int fixed;
+        _swap16(header.format.channel_number);   // short int channel_number;
+        _swap32((int &)header.format.sample_rate);      // int sample_rate;
+        _swap32((int &)header.format.byte_rate);        // int byte_rate;
+        _swap16(header.format.byte_per_sample);  // short int byte_per_sample;
+        _swap16(header.format.bits_per_sample);  // short int bits_per_sample;
 
         // if format_len is larger than expected, skip the extra data
         if (nDump > 0)
@@ -420,10 +562,10 @@ int WavInFile::readHeaderBlock()
     {
         // 'data' block
         memcpy(header.data.data_field, dataStr, 4);
-        fread(&(header.data.data_len), sizeof(uint), 1, fptr);
+        if (fread(&(header.data.data_len), sizeof(uint), 1, fptr) != 1) return -1;
 
         // swap byte order if necessary
-        _swap32((unsigned int &)header.data.data_len);
+        _swap32((int &)header.data.data_len);
 
         return 1;
     }
@@ -434,11 +576,11 @@ int WavInFile::readHeaderBlock()
         // unknown block
 
         // read length
-        fread(&len, sizeof(len), 1, fptr);
+        if (fread(&len, sizeof(len), 1, fptr) != 1) return -1;
         // scan through the block
         for (i = 0; i < len; i ++)
         {
-            fread(&temp, 1, 1, fptr);
+            if (fread(&temp, 1, 1, fptr) != 1) return -1;
             if (feof(fptr)) return -1;   // unexpected eof
         }
     }
@@ -499,23 +641,31 @@ uint WavInFile::getDataSizeInBytes() const
 
 uint WavInFile::getNumSamples() const
 {
-    return header.data.data_len / header.format.byte_per_sample;
+    if (header.format.byte_per_sample == 0) return 0;
+    return header.data.data_len / (unsigned short)header.format.byte_per_sample;
 }
 
 
 uint WavInFile::getLengthMS() const
 {
-   uint numSamples;
-   uint sampleRate;
+    double numSamples;
+    double sampleRate;
 
-   numSamples = getNumSamples();
-   sampleRate = getSampleRate();
+    numSamples = (double)getNumSamples();
+    sampleRate = (double)getSampleRate();
 
-   assert(numSamples < UINT_MAX / 1000);
-   return (1000 * numSamples / sampleRate);
+    return (uint)(1000.0 * numSamples / sampleRate + 0.5);
 }
 
 
+/// Returns how many milliseconds of audio have so far been read from the file
+uint WavInFile::getElapsedMS() const
+{
+    return (uint)(1000.0 * (double)dataRead / (double)header.format.byte_rate);
+}
+
+
+
 //////////////////////////////////////////////////////////////////////////////
 //
 // Class WavOutFile
@@ -531,20 +681,35 @@ WavOutFile::WavOutFile(const char *fileName, int sampleRate, int bits, int chann
         msg += fileName;
         msg += "\" for writing.";
         //pmsg = msg.c_str;
-        throw runtime_error(msg);
+        ST_THROW_RT_ERROR(msg.c_str());
+    }
+
+    fillInHeader(sampleRate, bits, channels);
+    writeHeader();
+}
+
+
+WavOutFile::WavOutFile(FILE *file, int sampleRate, int bits, int channels)
+{
+    bytesWritten = 0;
+    fptr = file;
+    if (fptr == NULL) 
+    {
+        string msg = "Error : Unable to access output file stream.";
+        ST_THROW_RT_ERROR(msg.c_str());
     }
 
     fillInHeader(sampleRate, bits, channels);
     writeHeader();
-    
-    flushTime = flushRate;
 }
 
 
 
 WavOutFile::~WavOutFile()
 {
-    close();
+    finishHeader();
+    if (fptr) fclose(fptr);
+    fptr = NULL;
 }
 
 
@@ -560,7 +725,6 @@ void WavOutFile::fillInHeader(uint sampleRate, uint bits, uint channels)
     // copy string 'WAVE' to wave
     memcpy(&(header.riff.wave), waveStr, 4);
 
-
     // fill in the 'format' part..
 
     // copy string 'fmt ' to fmt
@@ -569,11 +733,11 @@ void WavOutFile::fillInHeader(uint sampleRate, uint bits, uint channels)
     header.format.format_len = 0x10;
     header.format.fixed = 1;
     header.format.channel_number = (short)channels;
-    header.format.sample_rate = sampleRate;
+    header.format.sample_rate = (int)sampleRate;
     header.format.bits_per_sample = (short)bits;
     header.format.byte_per_sample = (short)(bits * channels / 8);
-    header.format.byte_rate = header.format.byte_per_sample * sampleRate;
-    header.format.sample_rate = sampleRate;
+    header.format.byte_rate = header.format.byte_per_sample * (int)sampleRate;
+    header.format.sample_rate = (int)sampleRate;
 
     // fill in the 'data' part..
 
@@ -598,66 +762,55 @@ void WavOutFile::finishHeader()
 void WavOutFile::writeHeader()
 {
     WavHeader hdrTemp;
+    int res;
 
     // swap byte order if necessary
     hdrTemp = header;
-    _swap32((unsigned int   &)hdrTemp.riff.package_len);
-    _swap32((unsigned int   &)hdrTemp.format.format_len);
-    _swap16((unsigned short &)hdrTemp.format.fixed);
-    _swap16((unsigned short &)hdrTemp.format.channel_number);
-    _swap32((unsigned int   &)hdrTemp.format.sample_rate);
-    _swap32((unsigned int   &)hdrTemp.format.byte_rate);
-    _swap16((unsigned short &)hdrTemp.format.byte_per_sample);
-    _swap16((unsigned short &)hdrTemp.format.bits_per_sample);
-    _swap32((unsigned int   &)hdrTemp.data.data_len);
+    _swap32((int &)hdrTemp.riff.package_len);
+    _swap32((int &)hdrTemp.format.format_len);
+    _swap16((short &)hdrTemp.format.fixed);
+    _swap16((short &)hdrTemp.format.channel_number);
+    _swap32((int &)hdrTemp.format.sample_rate);
+    _swap32((int &)hdrTemp.format.byte_rate);
+    _swap16((short &)hdrTemp.format.byte_per_sample);
+    _swap16((short &)hdrTemp.format.bits_per_sample);
+    _swap32((int &)hdrTemp.data.data_len);
 
     // write the supplemented header in the beginning of the file
     fseek(fptr, 0, SEEK_SET);
-    fwrite(&hdrTemp, sizeof(hdrTemp), 1, fptr);
+    res = (int)fwrite(&hdrTemp, sizeof(hdrTemp), 1, fptr);
+    if (res != 1)
+    {
+        ST_THROW_RT_ERROR("Error while writing to a wav file.");
+    }
+
     // jump back to the end of the file
     fseek(fptr, 0, SEEK_END);
 }
 
 
 
-void WavOutFile::close()
-{
-    finishHeader();
-    fclose(fptr);
-    fptr = NULL;
-}
-
-void WavOutFile::flush( int numElems )
-{
-	flushTime -= numElems;
-	if( flushTime < 0 )
-	{
-		flushTime += flushRate;
-		finishHeader();
-	}
-}
-
-void WavOutFile::write(const char *buffer, int numElems)
+void WavOutFile::write(const unsigned char *buffer, int numElems)
 {
     int res;
 
     if (header.format.bits_per_sample != 8)
     {
-        throw runtime_error("Error: WavOutFile::write(const char*, int) accepts only 8bit samples.");
+        ST_THROW_RT_ERROR("Error: WavOutFile::write(const char*, int) accepts only 8bit samples.");
     }
     assert(sizeof(char) == 1);
 
-    res = fwrite(buffer, 1, numElems, fptr);
+    res = (int)fwrite(buffer, 1, numElems, fptr);
     if (res != numElems) 
     {
-        throw runtime_error("Error while writing to a wav file.");
+        ST_THROW_RT_ERROR("Error while writing to a wav file.");
     }
 
     bytesWritten += numElems;
-	flush( numElems );
 }
 
 
+
 void WavOutFile::write(const short *buffer, int numElems)
 {
     int res;
@@ -665,64 +818,134 @@ void WavOutFile::write(const short *buffer, int numElems)
     // 16 bit samples
     if (numElems < 1) return;   // nothing to do
 
-    if (header.format.bits_per_sample == 8)
+    switch (header.format.bits_per_sample)
     {
-        int i;
-        char *temp = new char[numElems];
-        // convert from 16bit format to 8bit format
-        for (i = 0; i < numElems; i ++)
+        case 8:
         {
-            temp[i] = buffer[i] >> 8;
+            int i;
+            unsigned char *temp = (unsigned char *)getConvBuffer(numElems);
+            // convert from 16bit format to 8bit format
+            for (i = 0; i < numElems; i ++)
+            {
+                temp[i] = (unsigned char)(buffer[i] / 256 + 128);
+            }
+            // write in 8bit format
+            write(temp, numElems);
+            break;
+        }
+
+        case 16:
+        {
+            // 16bit format
+
+            // use temp buffer to swap byte order if necessary
+            short *pTemp = (short *)getConvBuffer(numElems * sizeof(short));
+            memcpy(pTemp, buffer, numElems * 2);
+            _swap16Buffer(pTemp, numElems);
+
+            res = (int)fwrite(pTemp, 2, numElems, fptr);
+
+            if (res != numElems) 
+            {
+                ST_THROW_RT_ERROR("Error while writing to a wav file.");
+            }
+            bytesWritten += 2 * numElems;
+            break;
+        }
+
+        default:
+        {
+            stringstream ss;
+            ss << "\nOnly 8/16 bit sample WAV files supported in integer compilation. Can't open WAV file with ";
+            ss << (int)header.format.bits_per_sample;
+            ss << " bit sample format. ";
+            ST_THROW_RT_ERROR(ss.str().c_str());
         }
-        // write in 8bit format
-        write(temp, numElems);
-        delete[] temp;
     }
-    else
+}
+
+
+/// Convert from float to integer and saturate
+inline int saturate(float fvalue, float minval, float maxval)
+{
+    if (fvalue > maxval) 
     {
-        // 16bit format
-        unsigned short *pTemp = new unsigned short[numElems];
-
-        assert(header.format.bits_per_sample == 16);
-
-        // allocate temp buffer to swap byte order if necessary
-        memcpy(pTemp, buffer, numElems * 2);
-        _swap16Buffer(pTemp, numElems);
-
-        res = fwrite(pTemp, 2, numElems, fptr);
-
-        delete[] pTemp;
-
-        if (res != numElems) 
-        {
-            throw runtime_error("Error while writing to a wav file.");
-        }
-        bytesWritten += 2 * numElems;
-		flush( numElems*2 );
+        fvalue = maxval;
+    } 
+    else if (fvalue < minval)
+    {
+        fvalue = minval;
     }
+    return (int)fvalue;
 }
 
 
 void WavOutFile::write(const float *buffer, int numElems)
 {
-    int i;
-    short *temp = new short[numElems];
-    int iTemp;
+    int numBytes;
+    int bytesPerSample;
 
-    // convert to 16 bit integer
-    for (i = 0; i < numElems; i ++)
+    if (numElems == 0) return;
+
+    bytesPerSample = header.format.bits_per_sample / 8;
+    numBytes = numElems * bytesPerSample;
+    short *temp = (short*)getConvBuffer(numBytes);
+
+    switch (bytesPerSample)
     {
-        // convert to integer
-        iTemp = (int)(32768.0f * buffer[i]);
+        case 1:
+        {
+            unsigned char *temp2 = (unsigned char *)temp;
+            for (int i = 0; i < numElems; i ++)
+            {
+                temp2[i] = (unsigned char)saturate(buffer[i] * 128.0f + 128.0f, 0.0f, 255.0f);
+            }
+            break;
+        }
 
-        // saturate
-        if (iTemp < -32768) iTemp = -32768;
-        if (iTemp > 32767)  iTemp = 32767;
-        temp[i] = (short)iTemp;
+        case 2:
+        {
+            short *temp2 = (short *)temp;
+            for (int i = 0; i < numElems; i ++)
+            {
+                short value = (short)saturate(buffer[i] * 32768.0f, -32768.0f, 32767.0f);
+                temp2[i] = _swap16(value);
+            }
+            break;
+        }
+
+        case 3:
+        {
+            char *temp2 = (char *)temp;
+            for (int i = 0; i < numElems; i ++)
+            {
+                int value = saturate(buffer[i] * 8388608.0f, -8388608.0f, 8388607.0f);
+                *((int*)temp2) = _swap32(value);
+                temp2 += 3;
+            }
+            break;
+        }
+
+        case 4:
+        {
+            int *temp2 = (int *)temp;
+            for (int i = 0; i < numElems; i ++)
+            {
+                int value = saturate(buffer[i] * 2147483648.0f, -2147483648.0f, 2147483647.0f);
+                temp2[i] = _swap32(value);
+            }
+            break;
+        }
+
+        default:
+            assert(false);
     }
 
-    write(temp, numElems);
-	flush( numElems );
+    int res = (int)fwrite(temp, 1, numBytes, fptr);
 
-    delete[] temp;
+    if (res != numBytes) 
+    {
+        ST_THROW_RT_ERROR("Error while writing to a wav file.");
+    }
+    bytesWritten += numBytes;
 }
diff --git a/desmume/src/metaspu/SoundTouch/WavFile.h b/desmume/src/metaspu/SoundTouch/WavFile.h
index e740b88fa..1d154d57b 100644
--- a/desmume/src/metaspu/SoundTouch/WavFile.h
+++ b/desmume/src/metaspu/SoundTouch/WavFile.h
@@ -16,10 +16,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.7 $
+// Last changed  : $Date: 2012-09-01 04:57:22 -0300 (sáb, 01 set 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: WavFile.h,v 1.7 2006/02/05 16:44:06 Olli Exp $
+// $Id: WavFile.h 153 2012-09-01 07:57:22Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -92,26 +92,49 @@ typedef struct
 } WavHeader;
 
 
+/// Base class for processing WAV audio files.
+class WavFileBase
+{
+private:
+    /// Conversion working buffer;
+    char *convBuff;
+    int convBuffSize;
+
+protected:
+    WavFileBase();
+    virtual ~WavFileBase();
+
+    /// Get pointer to a conversion buffer of at least the given size (in bytes)
+    void *getConvBuffer(int sizeByte);
+};
+
+
 /// Class for reading WAV audio files.
-class WavInFile
+class WavInFile : protected WavFileBase
 {
 private:
     /// File pointer.
     FILE *fptr;
 
+    /// Position within the audio stream
+    long position;
+
     /// Counter of how many bytes of sample data have been read from the file.
-    uint dataRead;
+    long dataRead;
 
     /// WAV header information
     WavHeader header;
 
+    /// Init the WAV file stream
+    void init();
+
     /// Read WAV file headers.
     /// \return zero if all ok, nonzero if file format is invalid.
     int readWavHeaders();
 
     /// Checks WAV file header tags.
     /// \return zero if all ok, nonzero if file format is invalid.
-    int checkCharTags();
+    int checkCharTags() const;
 
     /// Reads a single WAV file header block.
     /// \return zero if all ok, nonzero if file format is invalid.
@@ -125,13 +148,11 @@ public:
     /// throws 'runtime_error' exception.
     WavInFile(const char *filename);
 
+    WavInFile(FILE *file);
+
     /// Destructor: Closes the file.
     ~WavInFile();
 
-    /// Close the file. Notice that file is automatically closed also when the 
-    /// class instance is deleted.
-    void close();
-
     /// Rewind to beginning of the file
     void rewind();
 
@@ -157,12 +178,17 @@ public:
     /// Get the audio file length in milliseconds
     uint getLengthMS() const;
 
+    /// Returns how many milliseconds of audio have so far been read from the file
+    ///
+    /// \return elapsed duration in milliseconds
+    uint getElapsedMS() const;
+
     /// Reads audio samples from the WAV file. This routine works only for 8 bit samples.
     /// Reads given number of elements from the file or if end-of-file reached, as many 
     /// elements as are left in the file.
     ///
     /// \return Number of 8-bit integers read from the file.
-    int read(char *buffer, int maxElems);
+    int read(unsigned char *buffer, int maxElems);
 
     /// Reads audio samples from the WAV file to 16 bit integer format. Reads given number 
     /// of elements from the file or if end-of-file reached, as many elements as are 
@@ -176,6 +202,7 @@ public:
     /// Reads audio samples from the WAV file to floating point format, converting 
     /// sample values to range [-1,1[. Reads given number of elements from the file
     /// or if end-of-file reached, as many elements as are left in the file.
+    /// Notice that reading in float format supports 8/16/24/32bit sample formats.
     ///
     /// \return Number of elements read from the file.
     int read(float *buffer,     ///< Pointer to buffer where to read data.
@@ -191,7 +218,7 @@ public:
 
 
 /// Class for writing WAV audio files.
-class WavOutFile
+class WavOutFile : protected WavFileBase
 {
 private:
     /// Pointer to the WAV file
@@ -203,9 +230,6 @@ private:
     /// Counter of how many bytes have been written to the file so far.
     int bytesWritten;
 
-	/// number of bytes to be written before next flush.
-	int flushTime;
-
     /// Fills in WAV file header information.
     void fillInHeader(const uint sampleRate, const uint bits, const uint channels);
 
@@ -216,14 +240,6 @@ private:
     /// Writes the WAV file header.
     void writeHeader();
 
-	/// Flushes the WAV file every so often -- writes header info for the current
-	/// data length and then returns the seek position to the end of the WAV for
-	/// continued writing.  This method is called from each write() method.
-	void flush( int numElems );
-
-	/// Flush the WAVheader every 32kb written
-	static const int flushRate = 0x8000;
-
 public:
     /// Constructor: Creates a new WAV file. Throws a 'runtime_error' exception 
     /// if file creation fails.
@@ -233,13 +249,15 @@ public:
                int channels             ///< Number of channels (1=mono, 2=stereo)
                );
 
+    WavOutFile(FILE *file, int sampleRate, int bits, int channels);
+
     /// Destructor: Finalizes & closes the WAV file.
     ~WavOutFile();
 
     /// Write data to WAV file. This function works only with 8bit samples. 
     /// Throws a 'runtime_error' exception if writing to file fails.
-    void write(const char *buffer,     ///< Pointer to sample data buffer.
-               int numElems             ///< How many array items are to be written to file.
+    void write(const unsigned char *buffer, ///< Pointer to sample data buffer.
+               int numElems                 ///< How many array items are to be written to file.
                );
 
     /// Write data to WAV file. Throws a 'runtime_error' exception if writing to
@@ -253,12 +271,6 @@ public:
     void write(const float *buffer,     ///< Pointer to sample data buffer.
                int numElems             ///< How many array items are to be written to file.
                );
-
-    /// Finalize & close the WAV file. Automatically supplements the WAV file header
-    /// information according to written data etc.
-    ///
-    /// Notice that file is automatically closed also when the class instance is deleted.
-    void close();
 };
 
 #endif
diff --git a/desmume/src/metaspu/SoundTouch/mmx_optimized.cpp b/desmume/src/metaspu/SoundTouch/mmx_optimized.cpp
index f5afb595a..8893260cb 100644
--- a/desmume/src/metaspu/SoundTouch/mmx_optimized.cpp
+++ b/desmume/src/metaspu/SoundTouch/mmx_optimized.cpp
@@ -12,7 +12,7 @@
 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
 /// 6.0 processor pack" update to support compiler intrinsic syntax. The update
 /// is available for download at Microsoft Developers Network, see here:
-/// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
+/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
 ///
 /// Author        : Copyright (c) Olli Parviainen
 /// Author e-mail : oparviai 'at' iki.fi
@@ -20,10 +20,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/06 18:52:43 $
-// File revision : $Revision: 1.1 $
+// Last changed  : $Date: 2012-11-08 16:53:01 -0200 (qui, 08 nov 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: mmx_optimized.cpp,v 1.1 2006/02/06 18:52:43 Olli Exp $
+// $Id: mmx_optimized.cpp 160 2012-11-08 18:53:01Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -50,13 +50,9 @@
 
 #include "STTypes.h"
 
-#ifdef ALLOW_MMX
+#ifdef SOUNDTOUCH_ALLOW_MMX
 // MMX routines available only with integer sample type
 
-#if !(_WIN32 || __i386__ || __x86_64__)
-#error "wrong platform - this source code file is exclusively for x86 platforms"
-#endif
-
 using namespace soundtouch;
 
 //////////////////////////////////////////////////////////////////////////////
@@ -68,28 +64,29 @@ using namespace soundtouch;
 #include "TDStretch.h"
 #include <mmintrin.h>
 #include <limits.h>
+#include <math.h>
 
 
 // Calculates cross correlation of two buffers
-long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
+double TDStretchMMX::calcCrossCorr(const short *pV1, const short *pV2) const
 {
     const __m64 *pVec1, *pVec2;
     __m64 shifter;
-    __m64 accu;
-    long corr;
-    uint i;
+    __m64 accu, normaccu;
+    long corr, norm;
+    int i;
    
     pVec1 = (__m64*)pV1;
     pVec2 = (__m64*)pV2;
 
     shifter = _m_from_int(overlapDividerBits);
-    accu = _mm_setzero_si64();
+    normaccu = accu = _mm_setzero_si64();
 
-    // Process 4 parallel sets of 2 * stereo samples each during each 
-    // round to improve CPU-level parallellization.
-    for (i = 0; i < overlapLength / 8; i ++)
+    // Process 4 parallel sets of 2 * stereo samples or 4 * mono samples 
+    // during each round for improved CPU-level parallelization.
+    for (i = 0; i < channels * overlapLength / 16; i ++)
     {
-        __m64 temp;
+        __m64 temp, temp2;
 
         // dictionary of instructions:
         // _m_pmaddwd   : 4*16bit multiply-add, resulting two 32bits = [a0*b0+a1*b1 ; a2*b2+a3*b3]
@@ -98,11 +95,17 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
 
         temp = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec2[0]),
                             _mm_madd_pi16(pVec1[1], pVec2[1]));
+        temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[0], pVec1[0]),
+                             _mm_madd_pi16(pVec1[1], pVec1[1]));
         accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+        normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
 
         temp = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec2[2]),
                             _mm_madd_pi16(pVec1[3], pVec2[3]));
+        temp2 = _mm_add_pi32(_mm_madd_pi16(pVec1[2], pVec1[2]),
+                             _mm_madd_pi16(pVec1[3], pVec1[3]));
         accu = _mm_add_pi32(accu, _mm_sra_pi32(temp, shifter));
+        normaccu = _mm_add_pi32(normaccu, _mm_sra_pi32(temp2, shifter));
 
         pVec1 += 4;
         pVec2 += 4;
@@ -114,10 +117,17 @@ long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
     accu = _mm_add_pi32(accu, _mm_srli_si64(accu, 32));
     corr = _m_to_int(accu);
 
+    normaccu = _mm_add_pi32(normaccu, _mm_srli_si64(normaccu, 32));
+    norm = _m_to_int(normaccu);
+
     // Clear MMS state
     _m_empty();
 
-    return corr;
+    // Normalize result by dividing by sqrt(norm) - this step is easiest 
+    // done using floating point operation
+    if (norm == 0) norm = 1;    // to avoid div by zero
+
+    return (double)corr / sqrt((double)norm);
     // Note: Warning about the missing EMMS instruction is harmless
     // as it'll be called elsewhere.
 }
@@ -139,7 +149,7 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const
     const __m64 *pVinput, *pVMidBuf;
     __m64 *pVdest;
     __m64 mix1, mix2, adder, shifter;
-    uint i;
+    int i;
 
     pVinput  = (const __m64*)input;
     pVMidBuf = (const __m64*)pMidBuffer;
@@ -154,7 +164,9 @@ void TDStretchMMX::overlapStereo(short *output, const short *input) const
     mix2  = _mm_add_pi16(mix1, adder);
     adder = _mm_add_pi16(adder, adder);
 
-    shifter = _m_from_int(overlapDividerBits);
+    // Overlaplength-division by shifter. "+1" is to account for "-1" deducted in
+    // overlapDividerBits calculation earlier.
+    shifter = _m_from_int(overlapDividerBits + 1);
 
     for (i = 0; i < overlapLength / 4; i ++)
     {
@@ -227,7 +239,7 @@ void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uRe
     // Ensure that filter coeffs array is aligned to 16-byte boundary
     delete[] filterCoeffsUnalign;
     filterCoeffsUnalign = new short[2 * newLength + 8];
-    filterCoeffsAlign = (short *)(((ulongptr)filterCoeffsUnalign + 15) & -16);
+    filterCoeffsAlign = (short *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
 
     // rearrange the filter coefficients for mmx routines 
     for (i = 0;i < length; i += 4) 
@@ -247,7 +259,7 @@ void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uRe
 
 
 // mmx-optimized version of the filter routine for stereo sound
-uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
+uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, uint numSamples) const
 {
     // Create stack copies of the needed member variables for asm routines :
     uint i, j;
@@ -255,7 +267,7 @@ uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uin
 
     if (length < 2) return 0;
 
-    for (i = 0; i < numSamples / 2; i ++)
+    for (i = 0; i < (numSamples - length) / 2; i ++)
     {
         __m64 accu1;
         __m64 accu2;
@@ -302,4 +314,4 @@ uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uin
     return (numSamples & 0xfffffffe) - length;
 }
 
-#endif  // ALLOW_MMX
+#endif  // SOUNDTOUCH_ALLOW_MMX
diff --git a/desmume/src/metaspu/SoundTouch/sse_optimized.cpp b/desmume/src/metaspu/SoundTouch/sse_optimized.cpp
index 57598d78d..5f6e54e33 100644
--- a/desmume/src/metaspu/SoundTouch/sse_optimized.cpp
+++ b/desmume/src/metaspu/SoundTouch/sse_optimized.cpp
@@ -12,7 +12,7 @@
 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++ 
 /// 6.0 processor pack" update to support SSE instruction set. The update is 
 /// available for download at Microsoft Developers Network, see here:
-/// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
+/// http://msdn.microsoft.com/en-us/vstudio/aa718349.aspx
 ///
 /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and 
 /// perform a search with keywords "processor pack".
@@ -23,10 +23,10 @@
 ///
 ////////////////////////////////////////////////////////////////////////////////
 //
-// Last changed  : $Date: 2006/02/05 16:44:06 $
-// File revision : $Revision: 1.2 $
+// Last changed  : $Date: 2012-11-08 16:53:01 -0200 (qui, 08 nov 2012) $
+// File revision : $Revision: 4 $
 //
-// $Id: sse_optimized.cpp,v 1.2 2006/02/05 16:44:06 Olli Exp $
+// $Id: sse_optimized.cpp 160 2012-11-08 18:53:01Z oparviai $
 //
 ////////////////////////////////////////////////////////////////////////////////
 //
@@ -56,7 +56,7 @@
 
 using namespace soundtouch;
 
-#ifdef ALLOW_SSE
+#ifdef SOUNDTOUCH_ALLOW_SSE
 
 // SSE routines available only with float sample type    
 
@@ -68,12 +68,15 @@ using namespace soundtouch;
 
 #include "TDStretch.h"
 #include <xmmintrin.h>
+#include <math.h>
 
 // Calculates cross correlation of two buffers
-double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) const
+double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2) const
 {
-    uint i;
-    __m128 vSum, *pVec2;
+    int i;
+    const float *pVec1;
+    const __m128 *pVec2;
+    __m128 vSum, vNorm;
 
     // Note. It means a major slow-down if the routine needs to tolerate 
     // unaligned __m128 memory accesses. It's way faster if we can skip 
@@ -81,16 +84,16 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
     // This can mean up to ~ 10-fold difference (incl. part of which is
     // due to skipping every second round for stereo sound though).
     //
-    // Compile-time define ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
+    // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
     // for choosing if this little cheating is allowed.
 
-#ifdef ALLOW_NONEXACT_SIMD_OPTIMIZATION
+#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
     // Little cheating allowed, return valid correlation only for 
     // aligned locations, meaning every second round for stereo sound.
 
     #define _MM_LOAD    _mm_load_ps
 
-    if (((ulong)pV1) & 15) return -1e50;    // skip unaligned locations
+    if (((ulongptr)pV1) & 15) return -1e50;    // skip unaligned locations
 
 #else
     // No cheating allowed, use unaligned load & take the resulting
@@ -103,39 +106,54 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
 
     // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
     // Note: pV2 _must_ be aligned to 16-bit boundary, pV1 need not.
-    pVec2 = (__m128*)pV2;
-    vSum = _mm_setzero_ps();
+    pVec1 = (const float*)pV1;
+    pVec2 = (const __m128*)pV2;
+    vSum = vNorm = _mm_setzero_ps();
 
-    // Unroll the loop by factor of 4 * 4 operations
-    for (i = 0; i < overlapLength / 8; i ++) 
+    // Unroll the loop by factor of 4 * 4 operations. Use same routine for
+    // stereo & mono, for mono it just means twice the amount of unrolling.
+    for (i = 0; i < channels * overlapLength / 16; i ++) 
     {
+        __m128 vTemp;
         // vSum += pV1[0..3] * pV2[0..3]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pV1),pVec2[0]));
+        vTemp = _MM_LOAD(pVec1);
+        vSum  = _mm_add_ps(vSum,  _mm_mul_ps(vTemp ,pVec2[0]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
 
         // vSum += pV1[4..7] * pV2[4..7]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pV1 + 4), pVec2[1]));
+        vTemp = _MM_LOAD(pVec1 + 4);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[1]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
 
         // vSum += pV1[8..11] * pV2[8..11]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pV1 + 8), pVec2[2]));
+        vTemp = _MM_LOAD(pVec1 + 8);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[2]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
 
         // vSum += pV1[12..15] * pV2[12..15]
-        vSum = _mm_add_ps(vSum, _mm_mul_ps(_MM_LOAD(pV1 + 12), pVec2[3]));
+        vTemp = _MM_LOAD(pVec1 + 12);
+        vSum  = _mm_add_ps(vSum, _mm_mul_ps(vTemp, pVec2[3]));
+        vNorm = _mm_add_ps(vNorm, _mm_mul_ps(vTemp ,vTemp));
 
-        pV1 += 16;
+        pVec1 += 16;
         pVec2 += 4;
     }
 
     // return value = vSum[0] + vSum[1] + vSum[2] + vSum[3]
-    float *pvSum = (float*)&vSum;
-    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]);
+    float *pvNorm = (float*)&vNorm;
+    double norm = sqrt(pvNorm[0] + pvNorm[1] + pvNorm[2] + pvNorm[3]);
+    if (norm < 1e-9) norm = 1.0;    // to avoid div by zero
 
-    /* This is approximately corresponding routine in C-language:
-    double corr;
+    float *pvSum = (float*)&vSum;
+    return (double)(pvSum[0] + pvSum[1] + pvSum[2] + pvSum[3]) / norm;
+
+    /* This is the approximately corresponding routine in C-language, including normalization:
+    double corr, norm;
     uint i;
 
     // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
-    corr = 0.0;
-    for (i = 0; i < overlapLength / 8; i ++) 
+    corr = norm = 0.0;
+    for (i = 0; i < channels * overlapLength / 16; i ++) 
     {
         corr += pV1[0] * pV2[0] +
                 pV1[1] * pV2[1] +
@@ -154,77 +172,12 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
                 pV1[14] * pV2[14] +
                 pV1[15] * pV2[15];
 
+        for (j = 0; j < 16; j ++) norm += pV1[j] * pV1[j];
+
         pV1 += 16;
         pV2 += 16;
     }
-    */
-
-    /* This is corresponding routine in assembler. This may be teeny-weeny bit faster
-       than intrinsic version, but more difficult to maintain & get compiled on multiple
-       platforms.
-
-    uint overlapLengthLocal = overlapLength;
-    float corr;
-
-    _asm 
-    {
-        // Very important note: data in 'pV2' _must_ be aligned to 
-        // 16-byte boundary!
-
-        // give prefetch hints to CPU of what data are to be needed soonish
-        // give more aggressive hints on pV1 as that changes while pV2 stays
-        // same between runs
-        prefetcht0 [pV1]
-        prefetcht0 [pV2]
-        prefetcht0 [pV1 + 32]
-
-        mov     eax, dword ptr pV1
-        mov     ebx, dword ptr pV2
-
-        xorps   xmm0, xmm0
-
-        mov     ecx, overlapLengthLocal
-        shr     ecx, 3  // div by eight
-
-    loop1:
-        prefetcht0 [eax + 64]     // give a prefetch hint to CPU what data are to be needed soonish
-        prefetcht0 [ebx + 32]     // give a prefetch hint to CPU what data are to be needed soonish
-        movups  xmm1, [eax]
-        mulps   xmm1, [ebx]
-        addps   xmm0, xmm1
-
-        movups  xmm2, [eax + 16]
-        mulps   xmm2, [ebx + 16]
-        addps   xmm0, xmm2
-
-        prefetcht0 [eax + 96]     // give a prefetch hint to CPU what data are to be needed soonish
-        prefetcht0 [ebx + 64]     // give a prefetch hint to CPU what data are to be needed soonish
-
-        movups  xmm3, [eax + 32]
-        mulps   xmm3, [ebx + 32]
-        addps   xmm0, xmm3
-
-        movups  xmm4, [eax + 48]
-        mulps   xmm4, [ebx + 48]
-        addps   xmm0, xmm4
-
-        add     eax, 64
-        add     ebx, 64
-
-        dec     ecx
-        jnz     loop1
-
-        // add the four floats of xmm0 together and return the result. 
-
-        movhlps xmm1, xmm0          // move 3 & 4 of xmm0 to 1 & 2 of xmm1
-        addps   xmm1, xmm0
-        movaps  xmm2, xmm1
-        shufps  xmm2, xmm2, 0x01    // move 2 of xmm2 as 1 of xmm2
-        addss   xmm2, xmm1
-        movss   corr, xmm2
-    }
-
-    return (double)corr;
+    return corr / sqrt(norm);
     */
 }
 
@@ -239,6 +192,7 @@ double TDStretchSSE::calcCrossCorrStereo(const float *pV1, const float *pV2) con
 
 FIRFilterSSE::FIRFilterSSE() : FIRFilter()
 {
+    filterCoeffsAlign = NULL;
     filterCoeffsUnalign = NULL;
 }
 
@@ -246,6 +200,8 @@ FIRFilterSSE::FIRFilterSSE() : FIRFilter()
 FIRFilterSSE::~FIRFilterSSE()
 {
     delete[] filterCoeffsUnalign;
+    filterCoeffsAlign = NULL;
+    filterCoeffsUnalign = NULL;
 }
 
 
@@ -258,11 +214,11 @@ void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uRe
     FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
 
     // Scale the filter coefficients so that it won't be necessary to scale the filtering result
-    // also rearrange coefficients suitably for 3DNow!
+    // also rearrange coefficients suitably for SSE
     // Ensure that filter coeffs array is aligned to 16-byte boundary
     delete[] filterCoeffsUnalign;
     filterCoeffsUnalign = new float[2 * newLength + 4];
-    filterCoeffsAlign = (float *)(((unsigned long)filterCoeffsUnalign + 15) & -16);
+    filterCoeffsAlign = (float *)SOUNDTOUCH_ALIGN_POINTER_16(filterCoeffsUnalign);
 
     fDivider = (float)resultDivider;
 
@@ -279,15 +235,18 @@ void FIRFilterSSE::setCoefficients(const float *coeffs, uint newLength, uint uRe
 // SSE-optimized version of the filter routine for stereo sound
 uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint numSamples) const
 {
-    int count = (numSamples - length) & -2;
+    int count = (int)((numSamples - length) & (uint)-2);
     int j;
 
     assert(count % 2 == 0);
 
     if (count < 2) return 0;
 
+    assert(source != NULL);
+    assert(dest != NULL);
     assert((length % 8) == 0);
-    assert(((unsigned long)filterCoeffsAlign) % 16 == 0);
+    assert(filterCoeffsAlign != NULL);
+    assert(((ulongptr)filterCoeffsAlign) % 16 == 0);
 
     // filter is evaluated for two stereo samples with each iteration, thus use of 'j += 2'
     for (j = 0; j < count; j += 2)
@@ -297,9 +256,9 @@ uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint n
         __m128 sum1, sum2;
         uint i;
 
-        pSrc = source;                      // source audio data
-        pFil = (__m128*)filterCoeffsAlign;  // filter coefficients. NOTE: Assumes coefficients 
-                                            // are aligned to 16-byte boundary
+        pSrc = (const float*)source;              // source audio data
+        pFil = (const __m128*)filterCoeffsAlign;  // filter coefficients. NOTE: Assumes coefficients 
+                                                  // are aligned to 16-byte boundary
         sum1 = sum2 = _mm_setzero_ps();
 
         for (i = 0; i < length / 8; i ++) 
@@ -397,88 +356,6 @@ uint FIRFilterSSE::evaluateFilterStereo(float *dest, const float *source, uint n
         dest += 4;
     }
     */
-
-
-    /* Similar routine in assembly, again obsoleted due to maintainability
-    _asm
-    {
-        // Very important note: data in 'src' _must_ be aligned to 
-        // 16-byte boundary!
-        mov     edx, count
-        mov     ebx, dword ptr src
-        mov     eax, dword ptr dest
-        shr     edx, 1
-
-    loop1:
-        // "outer loop" : during each round 2*2 output samples are calculated
-
-        // give prefetch hints to CPU of what data are to be needed soonish
-        prefetcht0 [ebx]
-        prefetcht0 [filterCoeffsLocal]
-
-        mov     esi, ebx
-        mov     edi, filterCoeffsLocal
-        xorps   xmm0, xmm0
-        xorps   xmm1, xmm1
-        mov     ecx, lengthLocal
-
-    loop2:
-        // "inner loop" : during each round eight FIR filter taps are evaluated for 2*2 samples
-        prefetcht0 [esi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
-        prefetcht0 [edi + 32]     // give a prefetch hint to CPU what data are to be needed soonish
-
-        movups  xmm2, [esi]         // possibly unaligned load
-        movups  xmm3, [esi + 8]     // possibly unaligned load
-        mulps   xmm2, [edi]
-        mulps   xmm3, [edi]
-        addps   xmm0, xmm2
-        addps   xmm1, xmm3
-
-        movups  xmm4, [esi + 16]    // possibly unaligned load
-        movups  xmm5, [esi + 24]    // possibly unaligned load
-        mulps   xmm4, [edi + 16]
-        mulps   xmm5, [edi + 16]
-        addps   xmm0, xmm4
-        addps   xmm1, xmm5
-
-        prefetcht0 [esi + 64]     // give a prefetch hint to CPU what data are to be needed soonish
-        prefetcht0 [edi + 64]     // give a prefetch hint to CPU what data are to be needed soonish
-
-        movups  xmm6, [esi + 32]    // possibly unaligned load
-        movups  xmm7, [esi + 40]    // possibly unaligned load
-        mulps   xmm6, [edi + 32]
-        mulps   xmm7, [edi + 32]
-        addps   xmm0, xmm6
-        addps   xmm1, xmm7
-
-        movups  xmm4, [esi + 48]    // possibly unaligned load
-        movups  xmm5, [esi + 56]    // possibly unaligned load
-        mulps   xmm4, [edi + 48]
-        mulps   xmm5, [edi + 48]
-        addps   xmm0, xmm4
-        addps   xmm1, xmm5
-
-        add     esi, 64
-        add     edi, 64
-        dec     ecx
-        jnz     loop2
-
-        // Now xmm0 and xmm1 both have a filtered 2-channel sample each, but we still need
-        // to sum the two hi- and lo-floats of these registers together.
-
-        movhlps xmm2, xmm0          // xmm2 = xmm2_3 xmm2_2 xmm0_3 xmm0_2
-        movlhps xmm2, xmm1          // xmm2 = xmm1_1 xmm1_0 xmm0_3 xmm0_2
-        shufps  xmm0, xmm1, 0xe4    // xmm0 = xmm1_3 xmm1_2 xmm0_1 xmm0_0
-        addps   xmm0, xmm2
-
-        movaps  [eax], xmm0
-        add     ebx, 16
-        add     eax, 16
-
-        dec     edx
-        jnz     loop1
-    }
-    */
 }
 
-#endif  // ALLOW_SSE
+#endif  // SOUNDTOUCH_ALLOW_SSE