3rdparty: Upgrade soundtouch lib to 2.3.1

This commit is contained in:
Christian Kenny 2021-11-21 21:36:08 -05:00 committed by refractionpcsx2
parent 791f2a63ac
commit e37afd6976
18 changed files with 1141 additions and 1020 deletions

View File

@ -2,7 +2,7 @@
Version 2.1, February 1999
Copyright (C) 1991, 1999 Free Software Foundation, Inc.
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
@ -117,7 +117,7 @@ be combined with the library in order to run.
0. This License Agreement applies to any software library or other
program which contains a notice placed by the copyright holder or
other authoried party saying it may be distributed under the terms of
other authorized party saying it may be distributed under the terms of
this Lesser General Public License (also called "this License").
Each licensee is addressed as "you".

File diff suppressed because it is too large Load Diff

View File

@ -170,6 +170,9 @@ public:
/// allow trimming (downwards) amount of samples in pipeline.
/// Returns adjusted amount of samples
uint adjustAmountOfSamples(uint numSamples);
/// Add silence to end of buffer
void addSilent(uint nSamples);
};
}

View File

@ -121,10 +121,10 @@ namespace soundtouch
#endif
// If defined, allows the SIMD-optimized routines to take minor shortcuts
// for improved performance. Undefine to require faithfully similar SIMD
// calculations as in normal C implementation.
#define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION 1
// If defined, allows the SIMD-optimized routines to skip unevenly aligned
// memory offsets that can cause performance penalty in some SIMD implementations.
// Causes slight compromise in sound quality.
// #define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION 1
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
@ -149,8 +149,9 @@ namespace soundtouch
// floating point samples
typedef float SAMPLETYPE;
// data type for sample accumulation: Use double to utilize full precision.
typedef double LONG_SAMPLETYPE;
// data type for sample accumulation: Use float also here to enable
// efficient autovectorization
typedef float LONG_SAMPLETYPE;
#ifdef SOUNDTOUCH_ALLOW_X86_OPTIMIZATIONS
// Allow SSE optimizations
@ -159,7 +160,13 @@ namespace soundtouch
#endif // SOUNDTOUCH_INTEGER_SAMPLES
};
#if ((SOUNDTOUCH_ALLOW_SSE) || (__SSE__) || (SOUNDTOUCH_USE_NEON))
#if SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
#define ST_SIMD_AVOID_UNALIGNED
#endif
#endif
}
// define ST_NO_EXCEPTION_HANDLING switch to disable throwing std exceptions:
// #define ST_NO_EXCEPTION_HANDLING 1

View File

@ -72,10 +72,10 @@ namespace soundtouch
{
/// Soundtouch library version string
#define SOUNDTOUCH_VERSION "2.1.2"
#define SOUNDTOUCH_VERSION "2.3.1"
/// SoundTouch library version id
#define SOUNDTOUCH_VERSION_ID (20102)
#define SOUNDTOUCH_VERSION_ID (20301)
//
// Available setting IDs for the 'setSetting' & 'get_setting' functions:

View File

@ -313,7 +313,7 @@ void BPMDetect::updateXCorr(int process_samples)
#pragma omp parallel for
for (offs = windowStart; offs < windowLen; offs ++)
{
double sum;
float sum;
int i;
sum = 0;
@ -341,7 +341,6 @@ void BPMDetect::updateBeatPos(int process_samples)
// static double thr = 0.0003;
double posScale = (double)this->decimateBy / (double)this->sampleRate;
int resetDur = (int)(0.12 / posScale + 0.5);
double corrScale = 1.0 / (double)(windowLen - windowStart);
// prescale pbuffer
float tmp[XCORR_UPDATE_SEQUENCE / 2];
@ -353,7 +352,7 @@ void BPMDetect::updateBeatPos(int process_samples)
#pragma omp parallel for
for (int offs = windowStart; offs < windowLen; offs++)
{
double sum = 0;
float sum = 0;
for (int i = 0; i < process_samples; i++)
{
sum += tmp[i] * pBuffer[offs + i];
@ -562,7 +561,7 @@ float BPMDetect::getBpm()
/// \return number of beats in the arrays.
int BPMDetect::getBeats(float *pos, float *values, int max_num)
{
int num = beats.size();
int num = (int)beats.size();
if ((!pos) || (!values)) return num; // pos or values NULL, return just size
for (int i = 0; (i < num) && (i < max_num); i++)

View File

@ -265,3 +265,11 @@ uint FIFOSampleBuffer::adjustAmountOfSamples(uint numSamples)
}
return samplesInBuffer;
}
/// Appends 'nSamples' samples of silence (all-zero data) to the tail of the
/// buffer and counts them as buffered samples.
void FIFOSampleBuffer::addSilent(uint nSamples)
{
    // Fetch the write position at the end of the buffer, passing the number of
    // samples about to be written.
    SAMPLETYPE *tail = ptrEnd(nSamples);

    // Each sample consists of one SAMPLETYPE value per channel.
    memset(tail, 0, sizeof(SAMPLETYPE) * nSamples * channels);

    samplesInBuffer += nSamples;
}

View File

@ -60,12 +60,14 @@ FIRFilter::FIRFilter()
length = 0;
lengthDiv8 = 0;
filterCoeffs = NULL;
filterCoeffsStereo = NULL;
}
FIRFilter::~FIRFilter()
{
delete[] filterCoeffs;
delete[] filterCoeffsStereo;
}
@ -78,35 +80,26 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
// because division is much slower operation than multiplying.
double dScaler = 1.0 / (double)resultDivider;
#endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
assert(length != 0);
assert(src != NULL);
assert(dest != NULL);
assert(filterCoeffs != NULL);
assert((length != 0) && (length == ilength) && (src != NULL) && (dest != NULL) && (filterCoeffs != NULL));
end = 2 * (numSamples - length);
end = 2 * (numSamples - ilength);
#pragma omp parallel for
for (j = 0; j < end; j += 2)
{
const SAMPLETYPE *ptr;
LONG_SAMPLETYPE suml, sumr;
uint i;
suml = sumr = 0;
ptr = src + j;
for (i = 0; i < length; i += 4)
for (int i = 0; i < ilength; i ++)
{
// loop is unrolled by factor of 4 here for efficiency
suml += ptr[2 * i + 0] * filterCoeffs[i + 0] +
ptr[2 * i + 2] * filterCoeffs[i + 1] +
ptr[2 * i + 4] * filterCoeffs[i + 2] +
ptr[2 * i + 6] * filterCoeffs[i + 3];
sumr += ptr[2 * i + 1] * filterCoeffs[i + 0] +
ptr[2 * i + 3] * filterCoeffs[i + 1] +
ptr[2 * i + 5] * filterCoeffs[i + 2] +
ptr[2 * i + 7] * filterCoeffs[i + 3];
suml += ptr[2 * i] * filterCoeffsStereo[2 * i];
sumr += ptr[2 * i + 1] * filterCoeffsStereo[2 * i + 1];
}
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
@ -116,14 +109,11 @@ uint FIRFilter::evaluateFilterStereo(SAMPLETYPE *dest, const SAMPLETYPE *src, ui
suml = (suml < -32768) ? -32768 : (suml > 32767) ? 32767 : suml;
// saturate to 16 bit integer limits
sumr = (sumr < -32768) ? -32768 : (sumr > 32767) ? 32767 : sumr;
#else
suml *= dScaler;
sumr *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j] = (SAMPLETYPE)suml;
dest[j + 1] = (SAMPLETYPE)sumr;
}
return numSamples - length;
return numSamples - ilength;
}
@ -137,31 +127,28 @@ uint FIRFilter::evaluateFilterMono(SAMPLETYPE *dest, const SAMPLETYPE *src, uint
double dScaler = 1.0 / (double)resultDivider;
#endif
assert(length != 0);
// hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
end = numSamples - length;
assert(ilength != 0);
end = numSamples - ilength;
#pragma omp parallel for
for (j = 0; j < end; j ++)
for (j = 0; j < end; j ++)
{
const SAMPLETYPE *pSrc = src + j;
LONG_SAMPLETYPE sum;
uint i;
int i;
sum = 0;
for (i = 0; i < length; i += 4)
for (i = 0; i < ilength; i ++)
{
// loop is unrolled by factor of 4 here for efficiency
sum += pSrc[i + 0] * filterCoeffs[i + 0] +
pSrc[i + 1] * filterCoeffs[i + 1] +
pSrc[i + 2] * filterCoeffs[i + 2] +
pSrc[i + 3] * filterCoeffs[i + 3];
sum += pSrc[i] * filterCoeffs[i];
}
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
sum >>= resultDivFactor;
// saturate to 16 bit integer limits
sum = (sum < -32768) ? -32768 : (sum > 32767) ? 32767 : sum;
#else
sum *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j] = (SAMPLETYPE)sum;
}
@ -185,14 +172,18 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
assert(filterCoeffs != NULL);
assert(numChannels < 16);
end = numChannels * (numSamples - length);
// hint compiler autovectorization that loop length is divisible by 8
int ilength = length & -8;
end = numChannels * (numSamples - ilength);
#pragma omp parallel for
for (j = 0; j < end; j += numChannels)
{
const SAMPLETYPE *ptr;
LONG_SAMPLETYPE sums[16];
uint c, i;
uint c;
int i;
for (c = 0; c < numChannels; c ++)
{
@ -201,7 +192,7 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
ptr = src + j;
for (i = 0; i < length; i ++)
for (i = 0; i < ilength; i ++)
{
SAMPLETYPE coef=filterCoeffs[i];
for (c = 0; c < numChannels; c ++)
@ -215,13 +206,11 @@ uint FIRFilter::evaluateFilterMulti(SAMPLETYPE *dest, const SAMPLETYPE *src, uin
{
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
sums[c] >>= resultDivFactor;
#else
sums[c] *= dScaler;
#endif // SOUNDTOUCH_INTEGER_SAMPLES
dest[j+c] = (SAMPLETYPE)sums[c];
}
}
return numSamples - length;
return numSamples - ilength;
}
@ -233,6 +222,13 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
assert(newLength > 0);
if (newLength % 8) ST_THROW_RT_ERROR("FIR filter length not divisible by 8");
#ifdef SOUNDTOUCH_FLOAT_SAMPLES
// scale coefficients already here if using floating samples
double scale = 1.0 / resultDivider;
#else
short scale = 1;
#endif
lengthDiv8 = newLength / 8;
length = lengthDiv8 * 8;
assert(length == newLength);
@ -242,7 +238,16 @@ void FIRFilter::setCoefficients(const SAMPLETYPE *coeffs, uint newLength, uint u
delete[] filterCoeffs;
filterCoeffs = new SAMPLETYPE[length];
memcpy(filterCoeffs, coeffs, length * sizeof(SAMPLETYPE));
delete[] filterCoeffsStereo;
filterCoeffsStereo = new SAMPLETYPE[length*2];
for (uint i = 0; i < length; i ++)
{
filterCoeffs[i] = (SAMPLETYPE)(coeffs[i] * scale);
// create also stereo set of filter coefficients: this allows compiler
// to autovectorize filter evaluation much more efficiently
filterCoeffsStereo[2 * i] = (SAMPLETYPE)(coeffs[i] * scale);
filterCoeffsStereo[2 * i + 1] = (SAMPLETYPE)(coeffs[i] * scale);
}
}

View File

@ -57,6 +57,7 @@ protected:
// Memory for filter coefficients
SAMPLETYPE *filterCoeffs;
SAMPLETYPE *filterCoeffsStereo;
virtual uint evaluateFilterStereo(SAMPLETYPE *dest,
const SAMPLETYPE *src,

View File

@ -41,7 +41,6 @@ namespace soundtouch
class InterpolateCubic : public TransposerBase
{
protected:
virtual void resetRegisters();
virtual int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src,
int &srcSamples);
@ -56,6 +55,13 @@ protected:
public:
InterpolateCubic();
virtual void resetRegisters();
/// Latency reported by the cubic interpolator: always 1.
/// NOTE(review): presumably counted in samples, since RateTransposer::clear()
/// hands the summed latency to addSilent() as a sample count -- confirm.
int getLatency() const
{
return 1;
}
};
}

View File

@ -142,7 +142,7 @@ int InterpolateLinearInteger::transposeMulti(SAMPLETYPE *dest, const SAMPLETYPE
LONG_SAMPLETYPE temp, vol1;
assert(iFract < SCALE);
vol1 = (SCALE - iFract);
vol1 = (LONG_SAMPLETYPE)(SCALE - iFract);
for (int c = 0; c < numChannels; c ++)
{
temp = vol1 * src[c] + iFract * src[c + numChannels];

View File

@ -45,8 +45,6 @@ protected:
int iFract;
int iRate;
virtual void resetRegisters();
virtual int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src,
int &srcSamples);
@ -60,6 +58,13 @@ public:
/// Sets new target rate. Normal rate = 1.0, smaller values represent slower
/// rate, larger faster rates.
virtual void setRate(double newRate);
virtual void resetRegisters();
/// Latency reported by this linear interpolator: always 0.
/// NOTE(review): presumably counted in samples, since RateTransposer::clear()
/// hands the summed latency to addSilent() as a sample count -- confirm.
int getLatency() const
{
return 0;
}
};
@ -69,8 +74,6 @@ class InterpolateLinearFloat : public TransposerBase
protected:
double fract;
virtual void resetRegisters();
virtual int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src,
int &srcSamples);
@ -81,6 +84,13 @@ protected:
public:
InterpolateLinearFloat();
virtual void resetRegisters();
/// Latency reported by the floating-point linear interpolator: always 0.
/// NOTE(review): presumably counted in samples, since RateTransposer::clear()
/// hands the summed latency to addSilent() as a sample count -- confirm.
int getLatency() const
{
return 0;
}
};
}

View File

@ -46,7 +46,6 @@ namespace soundtouch
class InterpolateShannon : public TransposerBase
{
protected:
void resetRegisters();
int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src,
int &srcSamples);
@ -61,6 +60,13 @@ protected:
public:
InterpolateShannon();
void resetRegisters();
/// Latency reported by the Shannon interpolator: always 3.
/// NOTE(review): presumably counted in samples, since RateTransposer::clear()
/// hands the summed latency to addSilent() as a sample count -- confirm.
int getLatency() const
{
return 3;
}
};
}

View File

@ -57,7 +57,7 @@ int PeakFinder::findTop(const float *data, int peakpos) const
refvalue = data[peakpos];
// seek within <EFBFBD>10 points
// seek within ±10 points
start = peakpos - 10;
if (start < minPos) start = minPos;
end = peakpos + 10;
@ -142,7 +142,7 @@ int PeakFinder::findCrossingLevel(const float *data, float level, int peakpos, i
peaklevel = data[peakpos];
assert(peaklevel >= level);
pos = peakpos;
while ((pos >= minPos) && (pos < maxPos))
while ((pos >= minPos) && (pos + direction < maxPos))
{
if (data[pos + direction] < level) return pos; // crossing found
pos += direction;
@ -256,7 +256,7 @@ double PeakFinder::detectPeak(const float *data, int aminPos, int amaxPos)
// accept harmonic peak if
// (a) it is found
// (b) is within <EFBFBD>4% of the expected harmonic interval
// (b) is within ±4% of the expected harmonic interval
// (c) has at least half x-corr value of the max. peak
double diff = harmonic * peaktmp / highPeak;

View File

@ -61,6 +61,7 @@ RateTransposer::RateTransposer() : FIFOProcessor(&outputBuffer)
// Instantiates the anti-alias filter
pAAFilter = new AAFilter(64);
pTransposer = TransposerBase::newInstance();
clear();
}
@ -77,6 +78,7 @@ void RateTransposer::enableAAFilter(bool newMode)
#ifndef SOUNDTOUCH_PREVENT_CLICK_AT_RATE_CROSSOVER
// Disable Anti-alias filter if desirable to avoid click at rate change zero value crossover
bUseAAFilter = newMode;
clear();
#endif
}
@ -192,6 +194,11 @@ void RateTransposer::clear()
outputBuffer.clear();
midBuffer.clear();
inputBuffer.clear();
pTransposer->resetRegisters();
// prefill buffer to avoid losing first samples at beginning of stream
int prefill = getLatency();
inputBuffer.addSilent(prefill);
}
@ -209,7 +216,8 @@ int RateTransposer::isEmpty() const
/// Return approximate initial input-output latency
int RateTransposer::getLatency() const
{
return (bUseAAFilter) ? pAAFilter->getLength() : 0;
return pTransposer->getLatency() +
((bUseAAFilter) ? (pAAFilter->getLength() / 2) : 0);
}

View File

@ -59,8 +59,6 @@ public:
};
protected:
virtual void resetRegisters() = 0;
virtual int transposeMono(SAMPLETYPE *dest,
const SAMPLETYPE *src,
int &srcSamples) = 0;
@ -83,6 +81,9 @@ public:
virtual int transpose(FIFOSampleBuffer &dest, FIFOSampleBuffer &src);
virtual void setRate(double newRate);
virtual void setChannels(int channels);
virtual int getLatency() const = 0;
virtual void resetRegisters() = 0;
// static factory function
static TransposerBase *newInstance();

View File

@ -1,4 +1,4 @@
////////////////////////////////////////////////////////////////////////////////
///////////////////////////////////////////////////////////////////////////////
///
/// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo
/// while maintaining the original pitch by using a time domain WSOLA-like
@ -54,7 +54,6 @@ using namespace soundtouch;
#define max(x, y) (((x) > (y)) ? (x) : (y))
/*****************************************************************************
*
* Constant definitions
@ -93,11 +92,6 @@ TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
bAutoSeqSetting = true;
bAutoSeekSetting = true;
maxnorm = 0;
maxnormf = 1e8;
skipFract = 0;
tempo = 1.0f;
setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
setTempo(1.0f);
@ -203,7 +197,7 @@ void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
m1 = (SAMPLETYPE)0;
m2 = (SAMPLETYPE)overlapLength;
for (i = 0; i < overlapLength ; i ++)
for (i = 0; i < overlapLength ; i ++)
{
pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
m1 += 1;
@ -224,6 +218,9 @@ void TDStretch::clearInput()
inputBuffer.clear();
clearMidBuffer();
isBeginning = true;
maxnorm = 0;
maxnormf = 1e8;
skipFract = 0;
}
@ -311,13 +308,14 @@ int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
bestCorr = (bestCorr + 0.1) * 0.75;
#pragma omp parallel for
for (i = 1; i < seekLength; i ++)
for (i = 1; i < seekLength; i ++)
{
double corr;
// Calculates correlation value for the mixing position corresponding to 'i'
#ifdef _OPENMP
#if defined(_OPENMP) || defined(ST_SIMD_AVOID_UNALIGNED)
// in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
// iterate the loop in sequential order
// in SIMD mode, avoid accumulator version to allow avoiding unaligned positions
corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
#else
// In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
@ -675,23 +673,24 @@ void TDStretch::processSamples()
// Adjust processing offset at beginning of track by not performing the initial overlapping
// and compensating for that in the 'input buffer skip' calculation
isBeginning = false;
int skip = (int)(tempo * overlapLength + 0.5);
int skip = (int)(tempo * overlapLength + 0.5 * seekLength + 0.5);
#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
#ifdef SOUNDTOUCH_ALLOW_SSE
// if SSE mode, round the skip amount to value corresponding to aligned memory address
if (channels == 1)
{
skip &= -4;
}
else if (channels == 2)
{
skip &= -2;
}
#endif
#ifdef ST_SIMD_AVOID_UNALIGNED
// in SIMD mode, round the skip amount to value corresponding to aligned memory address
if (channels == 1)
{
skip &= -4;
}
else if (channels == 2)
{
skip &= -2;
}
#endif
skipFract -= skip;
assert(nominalSkip >= -skipFract);
if (skipFract <= -nominalSkip)
{
skipFract = -nominalSkip;
}
}
// ... then copy sequence samples from 'inputBuffer' to output:
@ -818,7 +817,7 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
short temp;
int cnt2;
for (i = 0; i < overlapLength ; i ++)
for (i = 0; i < overlapLength ; i ++)
{
temp = (short)(overlapLength - i);
cnt2 = 2 * i;
@ -830,21 +829,19 @@ void TDStretch::overlapStereo(short *poutput, const short *input) const
// Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
// version of the routine.
void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const
void TDStretch::overlapMulti(short *poutput, const short *input) const
{
SAMPLETYPE m1=(SAMPLETYPE)0;
SAMPLETYPE m2;
int i=0;
short m1;
int i = 0;
for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --)
for (m1 = 0; m1 < overlapLength; m1 ++)
{
short m2 = (short)(overlapLength - m1);
for (int c = 0; c < channels; c ++)
{
poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength;
i++;
}
m1++;
}
}
@ -889,20 +886,23 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
unsigned long lnorm;
int i;
#ifdef ST_SIMD_AVOID_UNALIGNED
// in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
if (((ulongptr)mixingPos) & 15) return -1e50;
#endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
corr = lnorm = 0;
// Same routine for stereo and mono. For stereo, unroll loop for better
// efficiency and gives slightly better resolution against rounding.
// For mono it same routine, just unrolls loop by factor of 4
for (i = 0; i < channels * overlapLength; i += 4)
// Same routine for stereo and mono
for (i = 0; i < ilength; i += 2)
{
corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
lnorm += (mixingPos[i] * mixingPos[i] +
mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
lnorm += (mixingPos[i + 2] * mixingPos[i + 2] +
mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBitsNorm;
mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm;
// do intermediate scalings to avoid integer overflow
}
if (lnorm > maxnorm)
@ -925,9 +925,12 @@ double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, do
double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
{
long corr;
unsigned long lnorm;
long lnorm;
int i;
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
// cancel first normalizer tap from previous round
lnorm = 0;
for (i = 1; i <= channels; i ++)
@ -936,15 +939,11 @@ double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *c
}
corr = 0;
// Same routine for stereo and mono. For stereo, unroll loop for better
// efficiency and gives slightly better resolution against rounding.
// For mono it same routine, just unrolls loop by factor of 4
for (i = 0; i < channels * overlapLength; i += 4)
// Same routine for stereo and mono.
for (i = 0; i < ilength; i += 2)
{
corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm;
}
// update normalizer with last samples of this round
@ -1045,27 +1044,24 @@ void TDStretch::calculateOverlapLength(int overlapInMsec)
/// Calculate cross-correlation
double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
{
double corr;
double norm;
float corr;
float norm;
int i;
#ifdef ST_SIMD_AVOID_UNALIGNED
// in SIMD mode skip 'mixingPos' positions that aren't aligned to 16-byte boundary
if (((ulongptr)mixingPos) & 15) return -1e50;
#endif
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
corr = norm = 0;
// Same routine for stereo and mono. For Stereo, unroll by factor of 2.
// For mono it's same routine yet unrollsd by factor of 4.
for (i = 0; i < channels * overlapLength; i += 4)
// Same routine for stereo and mono
for (i = 0; i < ilength; i ++)
{
corr += mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1];
norm += mixingPos[i] * mixingPos[i] +
mixingPos[i + 1] * mixingPos[i + 1];
// unroll the loop for better CPU efficiency:
corr += mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3];
norm += mixingPos[i + 2] * mixingPos[i + 2] +
mixingPos[i + 3] * mixingPos[i + 3];
corr += mixingPos[i] * compare[i];
norm += mixingPos[i] * mixingPos[i];
}
anorm = norm;
@ -1076,7 +1072,7 @@ double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, do
/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
{
double corr;
float corr;
int i;
corr = 0;
@ -1087,14 +1083,13 @@ double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *c
norm -= mixingPos[-i] * mixingPos[-i];
}
// Same routine for stereo and mono. For Stereo, unroll by factor of 2.
// For mono it's same routine yet unrollsd by factor of 4.
for (i = 0; i < channels * overlapLength; i += 4)
// hint compiler autovectorization that loop length is divisible by 8
int ilength = (channels * overlapLength) & -8;
// Same routine for stereo and mono
for (i = 0; i < ilength; i ++)
{
corr += mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1] +
mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3];
corr += mixingPos[i] * compare[i];
}
// update normalizer with last samples of this round

View File

@ -80,7 +80,7 @@ double TDStretchSSE::calcCrossCorr(const float *pV1, const float *pV2, double &a
// Compile-time define SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION is provided
// for choosing if this little cheating is allowed.
#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
#ifdef ST_SIMD_AVOID_UNALIGNED
// Little cheating allowed, return valid correlation only for
// aligned locations, meaning every second round for stereo sound.