3rdparty: Upgrade soundtouch lib to 2.3.1

This commit is contained in:
Christian Kenny 2021-11-21 21:36:08 -05:00 committed by refractionpcsx2
parent 791f2a63ac
commit e37afd6976
18 changed files with 1141 additions and 1020 deletions

View File

@ -2,7 +2,7 @@
Version 2.1, February 1999 Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc. Copyright (C) 1991, 1999 Free Software Foundation, Inc.
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed. of this license document, but changing it is not allowed.
@ -117,7 +117,7 @@ be combined with the library in order to run.
0. This License Agreement applies to any software library or other 0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or program which contains a notice placed by the copyright holder or
other authoried party saying it may be distributed under the terms of other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License"). this Lesser General Public License (also called "this License").
Each licensee is addressed as "you". Each licensee is addressed as "you".

File diff suppressed because it is too large Load Diff

View File

@ -170,6 +170,9 @@ public:
/// allow trimming (downwards) amount of samples in pipeline. /// allow trimming (downwards) amount of samples in pipeline.
/// Returns adjusted amount of samples /// Returns adjusted amount of samples
uint adjustAmountOfSamples(uint numSamples); uint adjustAmountOfSamples(uint numSamples);
/// Add silence to end of buffer
void addSilent(uint nSamples);
}; };
} }

View File

@ -121,10 +121,10 @@ namespace soundtouch
#endif #endif
// If defined, allows the SIMD-optimized routines to take minor shortcuts // If defined, allows the SIMD-optimized routines to skip unevenly aligned
// for improved performance. Undefine to require faithfully similar SIMD // memory offsets that can cause performance penalty in some SIMD implementations.
// calculations as in normal C implementation. // Causes slight compromise in sound quality.
#define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION 1 // #define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION 1
#ifdef SOUNDTOUCH_INTEGER_SAMPLES #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@ -149,8 +149,9 @@ namespace soundtouch
// floating point samples // floating point samples
typedef float SAMPLETYPE; typedef float SAMPLETYPE;
// data type for sample accumulation: Use double to utilize full precision. // data type for sample accumulation: Use float also here to enable
typedef double LONG_SAMPLETYPE; // efficient autovectorization
typedef float LONG_SAMPLETYPE;
#ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS #ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
// Allow SSE optimizations // Allow SSE optimizations
@ -159,7 +160,13 @@ namespace soundtouch
#endif // SOUNDTOUCH_INTEGER_SAMPLES #endif // SOUNDTOUCH_INTEGER_SAMPLES
}; #if ((SOUNDTOUCH_ALLOW_SSE) || (__SSE__) || (SOUNDTOUCH_USE_NEON))
#if SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
#define ST_SIMD_AVOID_UNALIGNED
#endif
#endif
}
// define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions: // define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions:
// #define ST_NO_EXCEPTION_HANDLING 1 // #define ST_NO_EXCEPTION_HANDLING 1

View File

@ -72,10 +72,10 @@ namespace soundtouch
{ {
/// Soundtouch library version string /// Soundtouch library version string
#define SOUNDTOUCH_VERSION "2.1.2" #define SOUNDTOUCH_VERSION "2.3.1"
/// SoundTouch library version id /// SoundTouch library version id
#define SOUNDTOUCH_VERSION_ID (20102) #define SOUNDTOUCH_VERSION_ID (20301)
// //
// Available setting IDs for the 'setSetting' & 'get_setting' functions: // Available setting IDs for the 'setSetting' & 'get_setting' functions:

View File

@ -313,7 +313,7 @@ void BPMDetect::updateXCorr(int process_samples)
#pragma omp parallel for #pragma omp parallel for
for (offs = windowStart; offs < windowLen; offs ++) for (offs = windowStart; offs < windowLen; offs ++)
{ {
double sum; float sum;
int i; int i;
sum = 0; sum = 0;
@ -341,7 +341,6 @@ void BPMDetect::updateBeatPos(int process_samples)
// static double thr = 0.0003; // static double thr = 0.0003;
double posScale = (double)this->decimateBy / (double)this->sampleRate; double posScale = (double)this->decimateBy / (double)this->sampleRate;
int resetDur = (int)(0.12 / posScale + 0.5); int resetDur = (int)(0.12 / posScale + 0.5);
double corrScale = 1.0 / (double)(windowLen - windowStart);
// prescale pbuffer // prescale pbuffer
float tmp[XCORR_UPDATE_SEQUENCE / 2]; float tmp[XCORR_UPDATE_SEQUENCE / 2];
@ -353,7 +352,7 @@ void BPMDetect::updateBeatPos(int process_samples)
#pragma omp parallel for #pragma omp parallel for
for (int offs = windowStart; offs < windowLen; offs++) for (int offs = windowStart; offs < windowLen; offs++)
{ {
double sum = 0; float sum = 0;
for (int i = 0; i < process_samples; i++) for (int i = 0; i < process_samples; i++)
{ {
sum += tmp[i] * pBuffer[offs + i]; sum += tmp[i] * pBuffer[offs + i];
@ -562,7 +561,7 @@ float BPMDetect::getBpm()
/// \return number of beats in the arrays. /// \return number of beats in the arrays.
int BPMDetect::getBeats(float *pos, float *values, int max_num) int BPMDetect::getBeats(float *pos, float *values, int max_num)
{ {
int num = beats.size(); int num = (int)beats.size();
if ((!pos) || (!values)) return num; // pos or values NULL, return just size if ((!pos) || (!values)) return num; // pos or values NULL, return just size
for (int i = 0; (i < num) && (i < max_num); i++) for (int i = 0; (i < num) && (i < max_num); i++)

View File

@ -265,3 +265,11 @@ uint FIFOSampleBuffer::adjustAmountOfSamples(uint numSamples)
} }
return samplesInBuffer; return samplesInBuffer;
} }
/// Appends 'nSamples' zero-valued samples (for each channel) to the end of the
/// buffer and grows the buffered sample count accordingly.
void FIFOSampleBuffer::addSilent(uint nSamples)
{
    // NOTE(review): ptrEnd(nSamples) is expected to return the write position
    // past the current contents with room for 'nSamples' more -- confirm
    // against its declaration.
    void *tail = ptrEnd(nSamples);

    // one SAMPLETYPE per channel for every added sample
    memset(tail, 0, sizeof(SAMPLETYPE) * nSamples * channels);

    samplesInBuffer = samplesInBuffer + nSamples;
}

View File

@ -60,12 +60,14 @@ FIRFilter::FIRFilter()
length = 0; length = 0;
lengthDiv8 = 0; lengthDiv8 = 0;
filterCoeffs = NULL; filterCoeffs = NULL;
filterCoeffsStereo = NULL;
} }
FIRFilter::~FIRFilter() FIRFilter::~FIRFilter()
{ {
delete[] filterCoeffs; delete[] filterCoeffs;
delete[] filterCoeffsStereo;
} }
@ -78,35 +80,26 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
// because division is much slower operation than multiplying. // because division is much slower operation than multiplying.
double dScaler = 1.0 / (double)resultDivider; double dScaler = 1.0 / (double)resultDivider;
#endif #endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
assert(length != 0); assert((length != 0) && (length == ilength) && (src != NULL) && (dest != NULL) && (filterCoeffs != NULL));
assert(src != NULL);
assert(dest != NULL);
assert(filterCoeffs != NULL);
end = 2 * (numSamples - length); end = 2 * (numSamples - ilength);
#pragma omp parallel for #pragma omp parallel for
for (j = 0; j < end; j += 2) for (j = 0; j < end; j += 2)
{ {
const SAMPLETYPE *ptr; const SAMPLETYPE *ptr;
LONG_SAMPLETYPE suml, sumr; LONG_SAMPLETYPE suml, sumr;
uint i;
suml = sumr = 0; suml = sumr = 0;
ptr = src + j; ptr = src + j;
for (i = 0; i < length; i += 4) for (int i = 0; i < ilength; i ++)
{ {
// loop is unrolled by factor of 4 here for efficiency suml += ptr[2 * i] * filterCoeffsStereo[2 * i];
suml += ptr[2 * i + 0] * filterCoeffs[i + 0] + sumr += ptr[2 * i + 1] * filterCoeffsStereo[2 * i + 1];
ptr[2 * i + 2] * filterCoeffs[i + 1] +
ptr[2 * i + 4] * filterCoeffs[i + 2] +
ptr[2 * i + 6] * filterCoeffs[i + 3];
sumr += ptr[2 * i + 1] * filterCoeffs[i + 0] +
ptr[2 * i + 3] * filterCoeffs[i + 1] +
ptr[2 * i + 5] * filterCoeffs[i + 2] +
ptr[2 * i + 7] * filterCoeffs[i + 3];
} }
#ifdef SOUNDTOUCH_INTEGER_SAMPLES #ifdef SOUNDTOUCH_INTEGER_SAMPLES
@ -116,14 +109,11 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml; suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml;
// saturate to 16 bit integer limits // saturate to 16 bit integer limits
sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr; sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr;
#else
suml *= dScaler;
sumr *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES #endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j] = (SAMPLETYPE)suml; dest[j] = (SAMPLETYPE)suml;
dest[j + 1] = (SAMPLETYPE)sumr; dest[j + 1] = (SAMPLETYPE)sumr;
} }
return numSamples - length; return numSamples - ilength;
} }
@ -137,31 +127,28 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
double dScaler = 1.0 / (double)resultDivider; double dScaler = 1.0 / (double)resultDivider;
#endif #endif
assert(length != 0); // hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
end = numSamples - length; assert(ilength != 0);
end = numSamples - ilength;
#pragma omp parallel for #pragma omp parallel for
for (j = 0; j < end; j ++) for (j = 0; j < end; j ++)
{ {
const SAMPLETYPE *pSrc = src + j; const SAMPLETYPE *pSrc = src + j;
LONG_SAMPLETYPE sum; LONG_SAMPLETYPE sum;
uint i; int i;
sum = 0; sum = 0;
for (i = 0; i < length; i += 4) for (i = 0; i < ilength; i ++)
{ {
// loop is unrolled by factor of 4 here for efficiency sum += pSrc[i] * filterCoeffs[i];
sum += pSrc[i + 0] * filterCoeffs[i + 0] +
pSrc[i + 1] * filterCoeffs[i + 1] +
pSrc[i + 2] * filterCoeffs[i + 2] +
pSrc[i + 3] * filterCoeffs[i + 3];
} }
#ifdef SOUNDTOUCH_INTEGER_SAMPLES #ifdef SOUNDTOUCH_INTEGER_SAMPLES
sum >>= resultDivFactor; sum >>= resultDivFactor;
// saturate to 16 bit integer limits // saturate to 16 bit integer limits
sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum; sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum;
#else
sum *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES #endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j] = (SAMPLETYPE)sum; dest[j] = (SAMPLETYPE)sum;
} }
@ -185,14 +172,18 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
assert(filterCoeffs != NULL); assert(filterCoeffs != NULL);
assert(numChannels < 16); assert(numChannels < 16);
end = numChannels * (numSamples - length); // hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
end = numChannels * (numSamples - ilength);
#pragma omp parallel for #pragma omp parallel for
for (j = 0; j < end; j += numChannels) for (j = 0; j < end; j += numChannels)
{ {
const SAMPLETYPE *ptr; const SAMPLETYPE *ptr;
LONG_SAMPLETYPE sums[16]; LONG_SAMPLETYPE sums[16];
uint c, i; uint c;
int i;
for (c = 0; c < numChannels; c ++) for (c = 0; c < numChannels; c ++)
{ {
@ -201,7 +192,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
ptr = src + j; ptr = src + j;
for (i = 0; i < length; i ++) for (i = 0; i < ilength; i ++)
{ {
SAMPLETYPE coef=filterCoeffs[i]; SAMPLETYPE coef=filterCoeffs[i];
for (c = 0; c < numChannels; c ++) for (c = 0; c < numChannels; c ++)
@ -215,13 +206,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
{ {
#ifdef SOUNDTOUCH_INTEGER_SAMPLES #ifdef SOUNDTOUCH_INTEGER_SAMPLES
sums[c] >>= resultDivFactor; sums[c] >>= resultDivFactor;
#else
sums[c] *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES #endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j+c] = (SAMPLETYPE)sums[c]; dest[j+c] = (SAMPLETYPE)sums[c];
} }
} }
return numSamples - length; return numSamples - ilength;
} }
@ -233,6 +222,13 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
assert(newLength > 0); assert(newLength > 0);
if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8"); if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8");
#ifdef SOUNDTOUCH_FLOAT_SAMPLES
// scale coefficients already here if using floating samples
double scale = 1.0 / resultDivider;
#else
short scale = 1;
#endif
lengthDiv8 = newLength / 8; lengthDiv8 = newLength / 8;
length = lengthDiv8 * 8; length = lengthDiv8 * 8;
assert(length == newLength); assert(length == newLength);
@ -242,7 +238,16 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
delete[] filterCoeffs; delete[] filterCoeffs;
filterCoeffs = new SAMPLETYPE[length]; filterCoeffs = new SAMPLETYPE[length];
memcpy(filterCoeffs, coeffs, length * sizeof(SAMPLETYPE)); delete[] filterCoeffsStereo;
filterCoeffsStereo = new SAMPLETYPE[length*2];
for (uint i = 0; i < length; i ++)
{
filterCoeffs[i] = (SAMPLETYPE)(coeffs[i] * scale);
// create also stereo set of filter coefficients: this allows compiler
// to autovectorize filter evaluation much more efficiently
filterCoeffsStereo[2 * i] = (SAMPLETYPE)(coeffs[i] * scale);
filterCoeffsStereo[2 * i + 1] = (SAMPLETYPE)(coeffs[i] * scale);
}
} }

View File

@ -57,6 +57,7 @@ protected:
// Memory for filter coefficients // Memory for filter coefficients
SAMPLETYPE *filterCoeffs; SAMPLETYPE *filterCoeffs;
SAMPLETYPE *filterCoeffsStereo;
virtual uint evaluateFilterStereo(SAMPLETYPE *dest, virtual uint evaluateFilterStereo(SAMPLETYPE *dest,
const SAMPLETYPE *src, const SAMPLETYPE *src,

View File

@ -41,7 +41,6 @@ namespace soundtouch
class InterpolateCubic : public TransposerBase class InterpolateCubic : public TransposerBase
{ {
protected: protected:
virtual void resetRegisters();
virtual int transposeMono(SAMPLETYPE *dest, virtual int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src, const SAMPLETYPE *src,
int &srcSamples); int &srcSamples);
@ -56,6 +55,13 @@ protected:
public: public:
InterpolateCubic(); InterpolateCubic();
virtual void resetRegisters();
/// Latency reported by the cubic interpolator: 1. RateTransposer::getLatency()
/// adds this value to the anti-alias filter delay.
int getLatency() const { return 1; }
}; };
} }

View File

@ -142,7 +142,7 @@ int InterpolateLinearInteger::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE
LONG_SAMPLETYPE temp, vol1; LONG_SAMPLETYPE temp, vol1;
assert(iFract < SCALE); assert(iFract < SCALE);
vol1 = (SCALE - iFract); vol1 = (LONG_SAMPLETYPE)(SCALE - iFract);
for (int c = 0; c < numChannels; c ++) for (int c = 0; c < numChannels; c ++)
{ {
temp = vol1 * src[c] + iFract * src[c + numChannels]; temp = vol1 * src[c] + iFract * src[c + numChannels];

View File

@ -45,8 +45,6 @@ protected:
int iFract; int iFract;
int iRate; int iRate;
virtual void resetRegisters();
virtual int transposeMono(SAMPLETYPE *dest, virtual int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src, const SAMPLETYPE *src,
int &srcSamples); int &srcSamples);
@ -60,6 +58,13 @@ public:
/// Sets new target rate. Normal rate = 1.0, smaller values represent slower /// Sets new target rate. Normal rate = 1.0, smaller values represent slower
/// rate, larger faster rates. /// rate, larger faster rates.
virtual void setRate(double newRate); virtual void setRate(double newRate);
virtual void resetRegisters();
/// Latency reported by this linear transposer: none (0).
/// RateTransposer::getLatency() adds this value to the anti-alias filter delay.
int getLatency() const { return 0; }
}; };
@ -69,8 +74,6 @@ class InterpolateLinearFloat : public TransposerBase
protected: protected:
double fract; double fract;
virtual void resetRegisters();
virtual int transposeMono(SAMPLETYPE *dest, virtual int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src, const SAMPLETYPE *src,
int &srcSamples); int &srcSamples);
@ -81,6 +84,13 @@ protected:
public: public:
InterpolateLinearFloat(); InterpolateLinearFloat();
virtual void resetRegisters();
/// Latency reported by the floating-point linear interpolator: none (0).
/// RateTransposer::getLatency() adds this value to the anti-alias filter delay.
int getLatency() const { return 0; }
}; };
} }

View File

@ -46,7 +46,6 @@ namespace soundtouch
class InterpolateShannon : public TransposerBase class InterpolateShannon : public TransposerBase
{ {
protected: protected:
void resetRegisters();
int transposeMono(SAMPLETYPE *dest, int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src, const SAMPLETYPE *src,
int &srcSamples); int &srcSamples);
@ -61,6 +60,13 @@ protected:
public: public:
InterpolateShannon(); InterpolateShannon();
void resetRegisters();
/// Latency reported by the Shannon interpolator: 3. RateTransposer::getLatency()
/// adds this value to the anti-alias filter delay.
int getLatency() const { return 3; }
}; };
} }

View File

@ -57,7 +57,7 @@ int PeakFinder::findTop(const float *data, int peakpos) const
refvalue = data[peakpos]; refvalue = data[peakpos];
// seek within <EFBFBD>10 points // seek within ±10 points
start = peakpos - 10; start = peakpos - 10;
if (start < minPos) start = minPos; if (start < minPos) start = minPos;
end = peakpos + 10; end = peakpos + 10;
@ -142,7 +142,7 @@ int PeakFinder::findCrossingLevel(const float *data, float level, int peakpos, i
peaklevel = data[peakpos]; peaklevel = data[peakpos];
assert(peaklevel >= level); assert(peaklevel >= level);
pos = peakpos; pos = peakpos;
while ((pos >= minPos) && (pos < maxPos)) while ((pos >= minPos) && (pos + direction < maxPos))
{ {
if (data[pos + direction] < level) return pos; // crossing found if (data[pos + direction] < level) return pos; // crossing found
pos += direction; pos += direction;
@ -256,7 +256,7 @@ double PeakFinder::detectPeak(const float *data, int aminPos, int amaxPos)
// accept harmonic peak if // accept harmonic peak if
// (a) it is found // (a) it is found
// (b) is within <EFBFBD>4% of the expected harmonic interval // (b) is within ±4% of the expected harmonic interval
// (c) has at least half x-corr value of the max. peak // (c) has at least half x-corr value of the max. peak
double diff = harmonic * peaktmp / highPeak; double diff = harmonic * peaktmp / highPeak;

View File

@ -61,6 +61,7 @@ RateTransposer::RateTransposer() : FIFOProcessor(&outputBuffer)
// Instantiates the anti-alias filter // Instantiates the anti-alias filter
pAAFilter = new AAFilter(64); pAAFilter = new AAFilter(64);
pTransposer = TransposerBase::newInstance(); pTransposer = TransposerBase::newInstance();
clear();
} }
@ -77,6 +78,7 @@ void RateTransposer::enableAAFilter(bool newMode)
#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER #ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
// Disable Anti-alias filter if desirable to avoid click at rate change zero value crossover // Disable Anti-alias filter if desirable to avoid click at rate change zero value crossover
bUseAAFilter = newMode; bUseAAFilter = newMode;
clear();
#endif #endif
} }
@ -192,6 +194,11 @@ void RateTransposer::clear()
outputBuffer.clear(); outputBuffer.clear();
midBuffer.clear(); midBuffer.clear();
inputBuffer.clear(); inputBuffer.clear();
pTransposer->resetRegisters();
// prefill buffer to avoid losing first samples at beginning of stream
int prefill = getLatency();
inputBuffer.addSilent(prefill);
} }
@ -209,7 +216,8 @@ int RateTransposer::isEmpty() const
/// Return approximate initial input-output latency /// Return approximate initial input-output latency
int RateTransposer::getLatency() const int RateTransposer::getLatency() const
{ {
return (bUseAAFilter) ? pAAFilter->getLength() : 0; return pTransposer->getLatency() +
((bUseAAFilter) ? (pAAFilter->getLength() / 2) : 0);
} }

View File

@ -59,8 +59,6 @@ public:
}; };
protected: protected:
virtual void resetRegisters() = 0;
virtual int transposeMono(SAMPLETYPE *dest, virtual int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src, const SAMPLETYPE *src,
int &srcSamples) = 0; int &srcSamples) = 0;
@ -83,6 +81,9 @@ public:
virtual int transpose(FIFOSampleBuffer &dest, FIFOSampleBuffer &src); virtual int transpose(FIFOSampleBuffer &dest, FIFOSampleBuffer &src);
virtual void setRate(double newRate); virtual void setRate(double newRate);
virtual void setChannels(int channels); virtual void setChannels(int channels);
virtual int getLatency() const = 0;
virtual void resetRegisters() = 0;
// static factory function // static factory function
static TransposerBase *newInstance(); static TransposerBase *newInstance();

View File

@ -1,4 +1,4 @@
//////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
/// ///
/// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo /// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo
/// while maintaining the original pitch by using a time domain WSOLA-like /// while maintaining the original pitch by using a time domain WSOLA-like
@ -54,7 +54,6 @@ using namespace soundtouch;
#define max(x, y) (((x) > (y)) ? (x) : (y)) #define max(x, y) (((x) > (y)) ? (x) : (y))
/***************************************************************************** /*****************************************************************************
* *
* Constant definitions * Constant definitions
@ -93,11 +92,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
bAutoSeqSetting = true; bAutoSeqSetting = true;
bAutoSeekSetting = true; bAutoSeekSetting = true;
maxnorm = 0;
maxnormf = 1e8;
skipFract = 0;
tempo = 1.0f; tempo = 1.0f;
setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS); setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
setTempo(1.0f); setTempo(1.0f);
@ -224,6 +218,9 @@ void TDStretch::clearInput()
inputBuffer.clear(); inputBuffer.clear();
clearMidBuffer(); clearMidBuffer();
isBeginning = true; isBeginning = true;
maxnorm = 0;
maxnormf = 1e8;
skipFract = 0;
} }
@ -315,9 +312,10 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
{ {
double corr; double corr;
// Calculates correlation value for the mixing position corresponding to 'i' // Calculates correlation value for the mixing position corresponding to 'i'
#ifdef _OPENMP #if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
// in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't // in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
// iterate the loop in sequential order // iterate the loop in sequential order
// in SIMD mode, also use the non-accumulating version so that unaligned positions can be skipped
corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm); corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
#else #else
// In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same // In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
@ -675,23 +673,24 @@ void TDStretch::processSamples()
// Adjust processing offset at beginning of track by not perform initial overlapping // Adjust processing offset at beginning of track by not perform initial overlapping
// and compensating that in the 'input buffer skip' calculation // and compensating that in the 'input buffer skip' calculation
isBeginning = false; isBeginning = false;
int skip = (int)(tempo * overlapLength + 0.5); int skip = (int)(tempo * overlapLength + 0.5 * seekLength + 0.5);
#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION #ifdef ST_SIMD_AVOID_UNALIGNED
#ifdef SOUNDTOUCH_ALLOW_SSE // in SIMD mode, round the skip amount to value corresponding to aligned memory address
// if SSE mode, round the skip amount to value corresponding to aligned memory address if (channels == 1)
if (channels == 1) {
{ skip &= -4;
skip &= -4; }
} else if (channels == 2)
else if (channels == 2) {
{ skip &= -2;
skip &= -2; }
}
#endif
#endif #endif
skipFract -= skip; skipFract -= skip;
assert(nominalSkip >= -skipFract); if (skipFract <= -nominalSkip)
{
skipFract = -nominalSkip;
}
} }
// ... then copy sequence samples from 'inputBuffer' to output: // ... then copy sequence samples from 'inputBuffer' to output:
@ -830,21 +829,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
// Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi' // Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
// version of the routine. // version of the routine.
void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const void TDStretch::overlapMulti(short *poutput, const short *input) const
{ {
SAMPLETYPE m1=(SAMPLETYPE)0; short m1;
SAMPLETYPE m2; int i = 0;
int i=0;
for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --) for (m1 = 0; m1 < overlapLength; m1 ++)
{ {
short m2 = (short)(overlapLength - m1);
for (int c = 0; c < channels; c ++) for (int c = 0; c < channels; c ++)
{ {
poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength; poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength;
i++; i++;
} }
m1++;
} }
} }
@ -889,20 +886,23 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
unsigned long lnorm; unsigned long lnorm;
int i; int i;
#ifdef ST_SIMD_AVOID_UNALIGNED
// in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
if (((ulongptr)mixingPos) & 15) return -1e50;
#endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
corr = lnorm = 0; corr = lnorm = 0;
// Same routine for stereo and mono. For stereo, unroll loop for better // Same routine for stereo and mono
// efficiency and gives slightly better resolution against rounding. for (i = 0; i < ilength; i += 2)
// For mono it same routine, just unrolls loop by factor of 4
for (i = 0; i < channels * overlapLength; i += 4)
{ {
corr += (mixingPos[i] * compare[i] + corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
lnorm += (mixingPos[i] * mixingPos[i] + lnorm += (mixingPos[i] * mixingPos[i] +
mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
lnorm += (mixingPos[i + 2] * mixingPos[i + 2] + // do intermediate scalings to avoid integer overflow
mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBitsNorm;
} }
if (lnorm > maxnorm) if (lnorm > maxnorm)
@ -925,9 +925,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm) double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
{ {
long corr; long corr;
unsigned long lnorm; long lnorm;
int i; int i;
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
// cancel first normalizer tap from previous round // cancel first normalizer tap from previous round
lnorm = 0; lnorm = 0;
for (i = 1; i <= channels; i ++) for (i = 1; i <= channels; i ++)
@ -936,15 +939,11 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
} }
corr = 0; corr = 0;
// Same routine for stereo and mono. For stereo, unroll loop for better // Same routine for stereo and mono.
// efficiency and gives slightly better resolution against rounding. for (i = 0; i < ilength; i += 2)
// For mono it same routine, just unrolls loop by factor of 4
for (i = 0; i < channels * overlapLength; i += 4)
{ {
corr += (mixingPos[i] * compare[i] + corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
} }
// update normalizer with last samples of this round // update normalizer with last samples of this round
@ -1045,27 +1044,24 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
/// Calculate cross-correlation /// Calculate cross-correlation
double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm) double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
{ {
double corr; float corr;
double norm; float norm;
int i; int i;
#ifdef ST_SIMD_AVOID_UNALIGNED
// in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
if (((ulongptr)mixingPos) & 15) return -1e50;
#endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
corr = norm = 0; corr = norm = 0;
// Same routine for stereo and mono. For Stereo, unroll by factor of 2. // Same routine for stereo and mono
// For mono it's same routine yet unrollsd by factor of 4. for (i = 0; i < ilength; i ++)
for (i = 0; i < channels * overlapLength; i += 4)
{ {
corr += mixingPos[i] * compare[i] + corr += mixingPos[i] * compare[i];
mixingPos[i + 1] * compare[i + 1]; norm += mixingPos[i] * mixingPos[i];
norm += mixingPos[i] * mixingPos[i] +
mixingPos[i + 1] * mixingPos[i + 1];
// unroll the loop for better CPU efficiency:
corr += mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3];
norm += mixingPos[i + 2] * mixingPos[i + 2] +
mixingPos[i + 3] * mixingPos[i + 3];
} }
anorm = norm; anorm = norm;
@ -1076,7 +1072,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value /// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm) double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
{ {
double corr; float corr;
int i; int i;
corr = 0; corr = 0;
@ -1087,14 +1083,13 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
norm -= mixingPos[-i] * mixingPos[-i]; norm -= mixingPos[-i] * mixingPos[-i];
} }
// Same routine for stereo and mono. For Stereo, unroll by factor of 2. // hint compiler autovectorization that loop length is divisible by 8
// For mono it's same routine yet unrollsd by factor of 4. int ilength = (channels * overlapLength) & -8;
for (i = 0; i < channels * overlapLength; i += 4)
// Same routine for stereo and mono
for (i = 0; i < ilength; i ++)
{ {
corr += mixingPos[i] * compare[i] + corr += mixingPos[i] * compare[i];
mixingPos[i + 1] * compare[i + 1] +
mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3];
} }
// update normalizer with last samples of this round // update normalizer with last samples of this round

View File

@ -80,7 +80,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a
// Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided // Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
// for choosing if this little cheating is allowed. // for choosing if this little cheating is allowed.
#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION #ifdef ST_SIMD_AVOID_UNALIGNED
// Little cheating allowed, return valid correlation only for // Little cheating allowed, return valid correlation only for
// aligned locations, meaning every second round for stereo sound. // aligned locations, meaning every second round for stereo sound.