3rdparty: Upgrade soundtouch lib to 2.3.1

2021-11-21 21:36:08 -05:00 · 2021-11-21 21:36:08 -05:00 · e37afd6976
parent 791f2a63ac
commit e37afd6976
18 changed files with 1141 additions and 1020 deletions
--- a/3rdparty/soundtouch/COPYING.TXT
+++ b/3rdparty/soundtouch/COPYING.TXT
@ -2,7 +2,7 @@
 		       Version 2.1, February 1999

 Copyright (C) 1991, 1999 Free Software Foundation, Inc.
-     59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 Everyone is permitted to copy and distribute verbatim copies
 of this license document, but changing it is not allowed.

@ -117,7 +117,7 @@ be combined with the library in order to run.

  0. This License Agreement applies to any software library or other
 program which contains a notice placed by the copyright holder or
-other authoried party saying it may be distributed under the terms of
+other authorized party saying it may be distributed under the terms of
 this Lesser General Public License (also called "this License").
 Each licensee is addressed as "you".

--- a/3rdparty/soundtouch/README.html
+++ b/3rdparty/soundtouch/README.html
--- a/3rdparty/soundtouch/soundtouch/FIFOSampleBuffer.h
+++ b/3rdparty/soundtouch/soundtouch/FIFOSampleBuffer.h
@ -170,6 +170,9 @@ public:
    /// allow trimming (downwards) amount of samples in pipeline.
    /// Returns adjusted amount of samples
    uint adjustAmountOfSamples(uint numSamples);
+
+    /// Add silence to end of buffer
+    void addSilent(uint nSamples);
 };

 }
--- a/3rdparty/soundtouch/soundtouch/STTypes.h
+++ b/3rdparty/soundtouch/soundtouch/STTypes.h
@ -121,10 +121,10 @@ namespace soundtouch

    #endif

-    // If defined, allows the SIMD-optimized routines to take minor shortcuts 
-    // for improved performance. Undefine to require faithfully similar SIMD 
-    // calculations as in normal C implementation.
-    #define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION    1
+    // If defined, allows the SIMD-optimized routines to skip unevenly aligned
+    // memory offsets that can cause performance penalty in some SIMD implementations.
+    // Causes slight compromise in sound quality.
+    // #define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION    1


    #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@ -149,8 +149,9 @@ namespace soundtouch

        // floating point samples
        typedef float  SAMPLETYPE;
-        // data type for sample accumulation: Use double to utilize full precision.
-        typedef double LONG_SAMPLETYPE;
+        // data type for sample accumulation: Use float also here to enable
+        // efficient autovectorization
+        typedef float LONG_SAMPLETYPE;

        #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
            // Allow SSE optimizations
@ -159,7 +160,13 @@ namespace soundtouch

    #endif  // SOUNDTOUCH_INTEGER_SAMPLES

-};
+    #if ((SOUNDTOUCH_ALLOW_SSE) || (__SSE__) || (SOUNDTOUCH_USE_NEON))
+        #if SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
+            #define ST_SIMD_AVOID_UNALIGNED
+        #endif
+    #endif
+
+}

 // define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions:
 // #define ST_NO_EXCEPTION_HANDLING    1
--- a/3rdparty/soundtouch/soundtouch/SoundTouch.h
+++ b/3rdparty/soundtouch/soundtouch/SoundTouch.h
@ -72,10 +72,10 @@ namespace soundtouch
 {

 /// Soundtouch library version string
-#define SOUNDTOUCH_VERSION          "2.1.2"
+#define SOUNDTOUCH_VERSION          "2.3.1"

 /// SoundTouch library version id
-#define SOUNDTOUCH_VERSION_ID       (20102)
+#define SOUNDTOUCH_VERSION_ID       (20301)

 //
 // Available setting IDs for the 'setSetting' & 'get_setting' functions:
--- a/3rdparty/soundtouch/source/SoundTouch/BPMDetect.cpp
+++ b/3rdparty/soundtouch/source/SoundTouch/BPMDetect.cpp
@ -313,7 +313,7 @@ void BPMDetect::updateXCorr(int process_samples)
    #pragma omp parallel for
    for (offs = windowStart; offs < windowLen; offs ++) 
    {
-        double sum;
+        float sum;
        int i;

        sum = 0;
@ -341,7 +341,6 @@ void BPMDetect::updateBeatPos(int process_samples)
    //    static double thr = 0.0003;
    double posScale = (double)this->decimateBy / (double)this->sampleRate;
    int resetDur = (int)(0.12 / posScale + 0.5);
-    double corrScale = 1.0 / (double)(windowLen - windowStart);

    // prescale pbuffer
    float tmp[XCORR_UPDATE_SEQUENCE / 2];
@ -353,7 +352,7 @@ void BPMDetect::updateBeatPos(int process_samples)
    #pragma omp parallel for
    for (int offs = windowStart; offs < windowLen; offs++)
    {
-        double sum = 0;
+        float sum = 0;
        for (int i = 0; i < process_samples; i++)
        {
            sum += tmp[i] * pBuffer[offs + i];
@ -562,7 +561,7 @@ float BPMDetect::getBpm()
 /// \return number of beats in the arrays.
 int BPMDetect::getBeats(float *pos, float *values, int max_num)
 {
-    int num = beats.size();
+    int num = (int)beats.size();
    if ((!pos) || (!values)) return num;    // pos or values NULL, return just size

    for (int i = 0; (i < num) && (i < max_num); i++)
--- a/3rdparty/soundtouch/source/SoundTouch/FIFOSampleBuffer.cpp
+++ b/3rdparty/soundtouch/source/SoundTouch/FIFOSampleBuffer.cpp
@ -265,3 +265,11 @@ uint FIFOSampleBuffer::adjustAmountOfSamples(uint numSamples)
    }
    return samplesInBuffer;
 }
+
+
+/// Add silence to end of buffer: appends 'nSamples' zero-filled samples for every channel
+void FIFOSampleBuffer::addSilent(uint nSamples)
+{
+    memset(ptrEnd(nSamples), 0, sizeof(SAMPLETYPE) * nSamples * channels);  // zero nSamples * channels values starting at the pointer returned by ptrEnd()
+    samplesInBuffer += nSamples;    // make the zero-filled samples count as buffered data
+}
--- a/3rdparty/soundtouch/source/SoundTouch/FIRFilter.cpp
+++ b/3rdparty/soundtouch/source/SoundTouch/FIRFilter.cpp
@ -60,12 +60,14 @@ FIRFilter::FIRFilter()
    length = 0;
    lengthDiv8 = 0;
    filterCoeffs = NULL;
+    filterCoeffsStereo = NULL;
 }


 FIRFilter::~FIRFilter()
 {
    delete[] filterCoeffs;
+    delete[] filterCoeffsStereo;    // also free the duplicated two-entries-per-tap coefficient array that setCoefficients() allocates
 }


@ -78,35 +80,26 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
    // because division is much slower operation than multiplying.
    double dScaler = 1.0 / (double)resultDivider;
 #endif
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;

-    assert(length != 0);
-    assert(src != NULL);
-    assert(dest != NULL);
-    assert(filterCoeffs != NULL);
+    assert((length != 0) && (length == ilength) && (src != NULL) && (dest != NULL) && (filterCoeffs != NULL));

-    end = 2 * (numSamples - length);
+    end = 2 * (numSamples - ilength);

    #pragma omp parallel for
    for (j = 0; j < end; j += 2) 
    {
        const SAMPLETYPE *ptr;
        LONG_SAMPLETYPE suml, sumr;
-        uint i;

        suml = sumr = 0;
        ptr = src + j;

-        for (i = 0; i < length; i += 4) 
+        for (int i = 0; i < ilength; i ++)
        {
-            // loop is unrolled by factor of 4 here for efficiency
-            suml += ptr[2 * i + 0] * filterCoeffs[i + 0] +
-                    ptr[2 * i + 2] * filterCoeffs[i + 1] +
-                    ptr[2 * i + 4] * filterCoeffs[i + 2] +
-                    ptr[2 * i + 6] * filterCoeffs[i + 3];
-            sumr += ptr[2 * i + 1] * filterCoeffs[i + 0] +
-                    ptr[2 * i + 3] * filterCoeffs[i + 1] +
-                    ptr[2 * i + 5] * filterCoeffs[i + 2] +
-                    ptr[2 * i + 7] * filterCoeffs[i + 3];
+            suml += ptr[2 * i] * filterCoeffsStereo[2 * i];
+            sumr += ptr[2 * i + 1] * filterCoeffsStereo[2 * i + 1];
        }

 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@ -116,14 +109,11 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
        suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml;
        // saturate to 16 bit integer limits
        sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr;
-#else
-        suml *= dScaler;
-        sumr *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
        dest[j] = (SAMPLETYPE)suml;
        dest[j + 1] = (SAMPLETYPE)sumr;
    }
-    return numSamples - length;
+    return numSamples - ilength;
 }


@ -137,31 +127,28 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
    double dScaler = 1.0 / (double)resultDivider;
 #endif

-    assert(length != 0);
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;

-    end = numSamples - length;
+    assert(ilength != 0);
+
+    end = numSamples - ilength;
    #pragma omp parallel for
-    for (j = 0; j < end; j ++) 
+    for (j = 0; j < end; j ++)
    {
        const SAMPLETYPE *pSrc = src + j;
        LONG_SAMPLETYPE sum;
-        uint i;
+        int i;

        sum = 0;
-        for (i = 0; i < length; i += 4) 
+        for (i = 0; i < ilength; i ++)
        {
-            // loop is unrolled by factor of 4 here for efficiency
-            sum += pSrc[i + 0] * filterCoeffs[i + 0] + 
-                   pSrc[i + 1] * filterCoeffs[i + 1] + 
-                   pSrc[i + 2] * filterCoeffs[i + 2] + 
-                   pSrc[i + 3] * filterCoeffs[i + 3];
+            sum += pSrc[i] * filterCoeffs[i];
        }
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
        sum >>= resultDivFactor;
        // saturate to 16 bit integer limits
        sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum;
-#else
-        sum *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
        dest[j] = (SAMPLETYPE)sum;
    }
@ -185,14 +172,18 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
    assert(filterCoeffs != NULL);
    assert(numChannels < 16);

-    end = numChannels * (numSamples - length);
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = length & -8;
+
+    end = numChannels * (numSamples - ilength);

    #pragma omp parallel for
    for (j = 0; j < end; j += numChannels)
    {
        const SAMPLETYPE *ptr;
        LONG_SAMPLETYPE sums[16];
-        uint c, i;
+        uint c;
+        int i;

        for (c = 0; c < numChannels; c ++)
        {
@ -201,7 +192,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin

        ptr = src + j;

-        for (i = 0; i < length; i ++)
+        for (i = 0; i < ilength; i ++)
        {
            SAMPLETYPE coef=filterCoeffs[i];
            for (c = 0; c < numChannels; c ++)
@ -215,13 +206,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
        {
 #ifdef SOUNDTOUCH_INTEGER_SAMPLES
            sums[c] >>= resultDivFactor;
-#else
-            sums[c] *= dScaler;
 #endif // SOUNDTOUCH_INTEGER_SAMPLES
            dest[j+c] = (SAMPLETYPE)sums[c];
        }
    }
-    return numSamples - length;
+    return numSamples - ilength;
 }


@ -233,6 +222,13 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
    assert(newLength > 0);
    if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8");

+    #ifdef SOUNDTOUCH_FLOAT_SAMPLES
+        // scale coefficients already here if using floating samples
+        double scale = 1.0 / resultDivider;
+    #else
+        short scale = 1;
+    #endif
+
    lengthDiv8 = newLength / 8;
    length = lengthDiv8 * 8;
    assert(length == newLength);
@ -242,7 +238,16 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u

    delete[] filterCoeffs;
    filterCoeffs = new SAMPLETYPE[length];
-    memcpy(filterCoeffs, coeffs, length * sizeof(SAMPLETYPE));
+    delete[] filterCoeffsStereo;
+    filterCoeffsStereo = new SAMPLETYPE[length*2];
+    for (uint i = 0; i < length; i ++)
+    {
+        filterCoeffs[i] = (SAMPLETYPE)(coeffs[i] * scale);
+        // create also stereo set of filter coefficients: this allows compiler
+        // to autovectorize filter evaluation much more efficiently
+        filterCoeffsStereo[2 * i] = (SAMPLETYPE)(coeffs[i] * scale);
+        filterCoeffsStereo[2 * i + 1] = (SAMPLETYPE)(coeffs[i] * scale);
+    }
 }


--- a/3rdparty/soundtouch/source/SoundTouch/FIRFilter.h
+++ b/3rdparty/soundtouch/source/SoundTouch/FIRFilter.h
@ -57,6 +57,7 @@ protected:

    // Memory for filter coefficients
    SAMPLETYPE *filterCoeffs;
+    SAMPLETYPE *filterCoeffsStereo;

    virtual uint evaluateFilterStereo(SAMPLETYPE *dest, 
                                      const SAMPLETYPE *src, 
--- a/3rdparty/soundtouch/source/SoundTouch/InterpolateCubic.h
+++ b/3rdparty/soundtouch/source/SoundTouch/InterpolateCubic.h
@ -41,7 +41,6 @@ namespace soundtouch
 class InterpolateCubic : public TransposerBase
 {
 protected:
-    virtual void resetRegisters();
    virtual int transposeMono(SAMPLETYPE *dest, 
                        const SAMPLETYPE *src, 
                        int &srcSamples);
@ -56,6 +55,13 @@ protected:

 public:
    InterpolateCubic();
+
+    virtual void resetRegisters();
+
+    int getLatency() const
+    {
+        return 1;    // constant latency reported by this transposer; RateTransposer::getLatency() adds it to its own figure (unit presumably samples - confirm)
+    }
 };

 }
--- a/3rdparty/soundtouch/source/SoundTouch/InterpolateLinear.cpp
+++ b/3rdparty/soundtouch/source/SoundTouch/InterpolateLinear.cpp
@ -142,7 +142,7 @@ int InterpolateLinearInteger::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE
        LONG_SAMPLETYPE temp, vol1;
    
        assert(iFract < SCALE);
-        vol1 = (SCALE - iFract);
+        vol1 = (LONG_SAMPLETYPE)(SCALE - iFract);
        for (int c = 0; c < numChannels; c ++)
        {
            temp = vol1 * src[c] + iFract * src[c + numChannels];
--- a/3rdparty/soundtouch/source/SoundTouch/InterpolateLinear.h
+++ b/3rdparty/soundtouch/source/SoundTouch/InterpolateLinear.h
@ -45,8 +45,6 @@ protected:
    int iFract;
    int iRate;

-    virtual void resetRegisters();
-
    virtual int transposeMono(SAMPLETYPE *dest, 
                       const SAMPLETYPE *src, 
                       int &srcSamples);
@ -60,6 +58,13 @@ public:
    /// Sets new target rate. Normal rate = 1.0, smaller values represent slower 
    /// rate, larger faster rates.
    virtual void setRate(double newRate);
+
+    virtual void resetRegisters();
+
+    int getLatency() const
+    {
+        return 0;    // constant latency reported by this class: none (unit presumably samples - confirm)
+    }
 };


@ -69,8 +74,6 @@ class InterpolateLinearFloat : public TransposerBase
 protected:
    double fract;

-    virtual void resetRegisters();
-
    virtual int transposeMono(SAMPLETYPE *dest, 
                       const SAMPLETYPE *src, 
                       int &srcSamples);
@ -81,6 +84,13 @@ protected:

 public:
    InterpolateLinearFloat();
+
+    virtual void resetRegisters();
+
+    int getLatency() const
+    {
+        return 0;    // constant latency reported by this transposer: none (unit presumably samples - confirm)
+    }
 };

 }
--- a/3rdparty/soundtouch/source/SoundTouch/InterpolateShannon.h
+++ b/3rdparty/soundtouch/source/SoundTouch/InterpolateShannon.h
@ -46,7 +46,6 @@ namespace soundtouch
 class InterpolateShannon : public TransposerBase
 {
 protected:
-    void resetRegisters();
    int transposeMono(SAMPLETYPE *dest, 
                        const SAMPLETYPE *src, 
                        int &srcSamples);
@ -61,6 +60,13 @@ protected:

 public:
    InterpolateShannon();
+
+    void resetRegisters();
+
+    int getLatency() const
+    {
+        return 3;    // constant latency reported by this transposer; RateTransposer::getLatency() adds it to its own figure (unit presumably samples - confirm)
+    }
 };

 }
--- a/3rdparty/soundtouch/source/SoundTouch/PeakFinder.cpp
+++ b/3rdparty/soundtouch/source/SoundTouch/PeakFinder.cpp
@ -57,7 +57,7 @@ int PeakFinder::findTop(const float *data, int peakpos) const

    refvalue = data[peakpos];

-    // seek within <EFBFBD>10 points
+    // seek within ±10 points
    start = peakpos - 10;
    if (start < minPos) start = minPos;
    end = peakpos + 10;
@ -142,7 +142,7 @@ int PeakFinder::findCrossingLevel(const float *data, float level, int peakpos, i
    peaklevel = data[peakpos];
    assert(peaklevel >= level);
    pos = peakpos;
-    while ((pos >= minPos) && (pos < maxPos))
+    while ((pos >= minPos) && (pos + direction < maxPos))
    {
        if (data[pos + direction] < level) return pos;   // crossing found
        pos += direction;
@ -256,7 +256,7 @@ double PeakFinder::detectPeak(const float *data, int aminPos, int amaxPos)

        // accept harmonic peak if 
        // (a) it is found
-        // (b) is within <EFBFBD>4% of the expected harmonic interval
+        // (b) is within ±4% of the expected harmonic interval
        // (c) has at least half x-corr value of the max. peak

        double diff = harmonic * peaktmp / highPeak;
--- a/3rdparty/soundtouch/source/SoundTouch/RateTransposer.cpp
+++ b/3rdparty/soundtouch/source/SoundTouch/RateTransposer.cpp
@ -61,6 +61,7 @@ RateTransposer::RateTransposer() : FIFOProcessor(&outputBuffer)
    // Instantiates the anti-alias filter
    pAAFilter = new AAFilter(64);
    pTransposer = TransposerBase::newInstance();
+    clear();
 }


@ -77,6 +78,7 @@ void RateTransposer::enableAAFilter(bool newMode)
 #ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
    // Disable Anti-alias filter if desirable to avoid click at rate change zero value crossover
    bUseAAFilter = newMode;
+    clear();
 #endif
 }

@ -192,6 +194,11 @@ void RateTransposer::clear()
    outputBuffer.clear();
    midBuffer.clear();
    inputBuffer.clear();
+    pTransposer->resetRegisters();
+
+    // prefill buffer to avoid losing first samples at beginning of stream
+    int prefill = getLatency();
+    inputBuffer.addSilent(prefill);
 }


@ -209,7 +216,8 @@ int RateTransposer::isEmpty() const
 /// Return approximate initial input-output latency
 int RateTransposer::getLatency() const
 {
-    return (bUseAAFilter) ? pAAFilter->getLength() : 0;
+    return pTransposer->getLatency() +    // latency reported by the active transposer ...
+        ((bUseAAFilter) ? (pAAFilter->getLength() / 2) : 0);    // ... plus half of the anti-alias filter length, counted only while the filter is enabled
 }


--- a/3rdparty/soundtouch/source/SoundTouch/RateTransposer.h
+++ b/3rdparty/soundtouch/source/SoundTouch/RateTransposer.h
@ -59,8 +59,6 @@ public:
    };

 protected:
-    virtual void resetRegisters() = 0;
-
    virtual int transposeMono(SAMPLETYPE *dest, 
                        const SAMPLETYPE *src, 
                        int &srcSamples)  = 0;
@ -83,6 +81,9 @@ public:
    virtual int transpose(FIFOSampleBuffer &dest, FIFOSampleBuffer &src);
    virtual void setRate(double newRate);
    virtual void setChannels(int channels);
+    virtual int getLatency() const = 0;
+
+    virtual void resetRegisters() = 0;

    // static factory function
    static TransposerBase *newInstance();
--- a/3rdparty/soundtouch/source/SoundTouch/TDStretch.cpp
+++ b/3rdparty/soundtouch/source/SoundTouch/TDStretch.cpp
@ -1,4 +1,4 @@
-////////////////////////////////////////////////////////////////////////////////
+///////////////////////////////////////////////////////////////////////////////
 /// 
 /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo 
 /// while maintaining the original pitch by using a time domain WSOLA-like 
@ -54,7 +54,6 @@ using namespace soundtouch;

 #define max(x, y) (((x) > (y)) ? (x) : (y))

-
 /*****************************************************************************
 *
 * Constant definitions
@ -93,11 +92,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
    bAutoSeqSetting = true;
    bAutoSeekSetting = true;

-    maxnorm = 0;
-    maxnormf = 1e8;
-
-    skipFract = 0;
-
    tempo = 1.0f;
    setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
    setTempo(1.0f);
@ -203,7 +197,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
    m1 = (SAMPLETYPE)0;
    m2 = (SAMPLETYPE)overlapLength;

-    for (i = 0; i < overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++)
    {
        pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
        m1 += 1;
@ -224,6 +218,9 @@ void TDStretch::clearInput()
    inputBuffer.clear();
    clearMidBuffer();
    isBeginning = true;
+    maxnorm = 0;
+    maxnormf = 1e8;
+    skipFract = 0;
 }


@ -311,13 +308,14 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
    bestCorr = (bestCorr + 0.1) * 0.75;

    #pragma omp parallel for
-    for (i = 1; i < seekLength; i ++) 
+    for (i = 1; i < seekLength; i ++)
    {
        double corr;
        // Calculates correlation value for the mixing position corresponding to 'i'
-#ifdef _OPENMP
+#if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
        // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
        // iterate the loop in sequential order
+        // in SIMD mode, avoid accumulator version to allow avoiding unaligned positions
        corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
 #else
        // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
@ -675,23 +673,24 @@ void TDStretch::processSamples()
            // Adjust processing offset at beginning of track by not perform initial overlapping
            // and compensating that in the 'input buffer skip' calculation
            isBeginning = false;
-            int skip = (int)(tempo * overlapLength + 0.5);
+            int skip = (int)(tempo * overlapLength + 0.5 * seekLength + 0.5);

-            #ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
-                #ifdef SOUNDTOUCH_ALLOW_SSE
-                // if SSE mode, round the skip amount to value corresponding to aligned memory address
-                if (channels == 1)
-                {
-                    skip &= -4;
-                }
-                else if (channels == 2)
-                {
-                    skip &= -2;
-                }
-                #endif
+            #ifdef ST_SIMD_AVOID_UNALIGNED
+            // in SIMD mode, round the skip amount to value corresponding to aligned memory address
+            if (channels == 1)
+            {
+                skip &= -4;
+            }
+            else if (channels == 2)
+            {
+                skip &= -2;
+            }
            #endif
            skipFract -= skip;
-            assert(nominalSkip >= -skipFract);
+            if (skipFract <= -nominalSkip)
+            {
+                skipFract = -nominalSkip;
+            }
        }

        // ... then copy sequence samples from 'inputBuffer' to output:
@ -818,7 +817,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
    short temp;
    int cnt2;

-    for (i = 0; i < overlapLength ; i ++) 
+    for (i = 0; i < overlapLength ; i ++)
    {
        temp = (short)(overlapLength - i);
        cnt2 = 2 * i;
@ -830,21 +829,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const

 // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
 // version of the routine.
-void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const
+void TDStretch::overlapMulti(short *poutput, const short *input) const
 {
-    SAMPLETYPE m1=(SAMPLETYPE)0;
-    SAMPLETYPE m2;
-    int i=0;
+    short m1;    // weight applied to 'input'; the loop below steps it from 0 up to overlapLength - 1
+    int i = 0;    // running index into the interleaved buffers, advanced once per channel value

-    for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --)
+    for (m1 = 0; m1 < overlapLength; m1 ++)
    {
+        short m2 = (short)(overlapLength - m1);    // complementary weight applied to 'pMidBuffer' (m1 + m2 == overlapLength)
        for (int c = 0; c < channels; c ++)
        {
            poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2)  / overlapLength;
            i++;
        }
-
-        m1++;
    }
 }

@ -889,20 +886,23 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
    unsigned long lnorm;
    int i;

+    #ifdef ST_SIMD_AVOID_UNALIGNED
+        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+        if (((ulongptr)mixingPos) & 15) return -1e50;
+    #endif
+
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
    corr = lnorm = 0;
-    // Same routine for stereo and mono. For stereo, unroll loop for better
-    // efficiency and gives slightly better resolution against rounding. 
-    // For mono it same routine, just  unrolls loop by factor of 4
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i += 2)
    {
        corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;  // notice: do intermediate division here to avoid integer overflow
-        corr += (mixingPos[i + 2] * compare[i + 2] + 
-                mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
        lnorm += (mixingPos[i] * mixingPos[i] + 
-                mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
-        lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + 
-                mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBitsNorm;
+                  mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
+        // do intermediate scalings to avoid integer overflow
    }

    if (lnorm > maxnorm)
@ -925,9 +925,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
 double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
 {
    long corr;
-    unsigned long lnorm;
+    long lnorm;
    int i;

+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
    // cancel first normalizer tap from previous round
    lnorm = 0;
    for (i = 1; i <= channels; i ++)
@ -936,15 +939,11 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
    }

    corr = 0;
-    // Same routine for stereo and mono. For stereo, unroll loop for better
-    // efficiency and gives slightly better resolution against rounding. 
-    // For mono it same routine, just  unrolls loop by factor of 4
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono.
+    for (i = 0; i < ilength; i += 2) 
    {
        corr += (mixingPos[i] * compare[i] + 
-                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;  // notice: do intermediate division here to avoid integer overflow
-        corr += (mixingPos[i + 2] * compare[i + 2] + 
-                 mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
+                 mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
    }

    // update normalizer with last samples of this round
@ -1045,27 +1044,24 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
 /// Calculate cross-correlation
 double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
 {
-    double corr;
-    double norm;
+    float corr;
+    float norm;
    int i;

+    #ifdef ST_SIMD_AVOID_UNALIGNED
+        // in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
+        if (((ulongptr)mixingPos) & 15) return -1e50;
+    #endif
+
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
    corr = norm = 0;
-    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    // For mono it's same routine yet unrollsd by factor of 4.
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i ++)
    {
-        corr += mixingPos[i] * compare[i] +
-                mixingPos[i + 1] * compare[i + 1];
-
-        norm += mixingPos[i] * mixingPos[i] + 
-                mixingPos[i + 1] * mixingPos[i + 1];
-
-        // unroll the loop for better CPU efficiency:
-        corr += mixingPos[i + 2] * compare[i + 2] +
-                mixingPos[i + 3] * compare[i + 3];
-
-        norm += mixingPos[i + 2] * mixingPos[i + 2] +
-                mixingPos[i + 3] * mixingPos[i + 3];
+        corr += mixingPos[i] * compare[i];
+        norm += mixingPos[i] * mixingPos[i];
    }

    anorm = norm;
@ -1076,7 +1072,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
 /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
 double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
 {
-    double corr;
+    float corr;
    int i;

    corr = 0;
@ -1087,14 +1083,13 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
        norm -= mixingPos[-i] * mixingPos[-i];
    }

-    // Same routine for stereo and mono. For Stereo, unroll by factor of 2.
-    // For mono it's same routine yet unrollsd by factor of 4.
-    for (i = 0; i < channels * overlapLength; i += 4) 
+    // hint compiler autovectorization that loop length is divisible by 8
+    int ilength = (channels * overlapLength) & -8;
+
+    // Same routine for stereo and mono
+    for (i = 0; i < ilength; i ++)
    {
-        corr += mixingPos[i] * compare[i] +
-                mixingPos[i + 1] * compare[i + 1] +
-                mixingPos[i + 2] * compare[i + 2] +
-                mixingPos[i + 3] * compare[i + 3];
+        corr += mixingPos[i] * compare[i];
    }

    // update normalizer with last samples of this round
--- a/3rdparty/soundtouch/source/SoundTouch/sse_optimized.cpp
+++ b/3rdparty/soundtouch/source/SoundTouch/sse_optimized.cpp
@ -80,7 +80,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a
    // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
    // for choosing if this little cheating is allowed.

-#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
+#ifdef ST_SIMD_AVOID_UNALIGNED
    // Little cheating allowed, return valid correlation only for 
    // aligned locations, meaning every second round for stereo sound.