pcsx2/3rdparty/soundtouch/source/SoundTouch/TDStretch.cpp

1115 lines
34 KiB
C++

////////////////////////////////////////////////////////////////////////////////
///
/// Sampled sound tempo changer/time stretch algorithm. Changes the sound tempo
/// while maintaining the original pitch by using a time domain WSOLA-like
/// method with several performance-increasing tweaks.
///
/// Notes : MMX optimized functions reside in a separate, platform-specific
/// file, e.g. 'mmx_win.cpp' or 'mmx_gcc.cpp'.
///
/// This source file contains OpenMP optimizations that allow speeding up the
/// corss-correlation algorithm by executing it in several threads / CPU cores
/// in parallel. See the following article link for more detailed discussion
/// about SoundTouch OpenMP optimizations:
/// http://www.softwarecoven.com/parallel-computing-in-embedded-mobile-devices
///
/// Author : Copyright (c) Olli Parviainen
/// Author e-mail : oparviai 'at' iki.fi
/// SoundTouch WWW: http://www.surina.net/soundtouch
///
////////////////////////////////////////////////////////////////////////////////
//
// Last changed : $Date: 2017-04-07 22:01:22 +0300 (pe, 07 huhti 2017) $
// File revision : $Revision: 1.12 $
//
// $Id: TDStretch.cpp 249 2017-04-07 19:01:22Z oparviai $
//
////////////////////////////////////////////////////////////////////////////////
//
// License :
//
// SoundTouch audio processing library
// Copyright (c) Olli Parviainen
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
////////////////////////////////////////////////////////////////////////////////
#include <string.h>
#include <limits.h>
#include <assert.h>
#include <math.h>
#include <float.h>
#include "STTypes.h"
#include "cpu_detect.h"
#include "TDStretch.h"
using namespace soundtouch;
#define max(x, y) (((x) > (y)) ? (x) : (y))
/*****************************************************************************
*
* Constant definitions
*
*****************************************************************************/
// Table for the hierarchical mixing position seeking algorithm
const short _scanOffsets[5][24]={
{ 124, 186, 248, 310, 372, 434, 496, 558, 620, 682, 744, 806,
868, 930, 992, 1054, 1116, 1178, 1240, 1302, 1364, 1426, 1488, 0},
{-100, -75, -50, -25, 25, 50, 75, 100, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ -20, -15, -10, -5, 5, 10, 15, 20, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ -4, -3, -2, -1, 1, 2, 3, 4, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
{ 121, 114, 97, 114, 98, 105, 108, 32, 104, 99, 117, 111,
116, 100, 110, 117, 111, 115, 0, 0, 0, 0, 0, 0}};
/*****************************************************************************
*
* Implementation of the class 'TDStretch'
*
*****************************************************************************/
TDStretch::TDStretch() : FIFOProcessor(&outputBuffer)
{
bQuickSeek = false;
channels = 2;
pMidBuffer = NULL;
pMidBufferUnaligned = NULL;
overlapLength = 0;
bAutoSeqSetting = true;
bAutoSeekSetting = true;
maxnorm = 0;
maxnormf = 1e8;
skipFract = 0;
tempo = 1.0f;
setParameters(44100, DEFAULT_SEQUENCE_MS, DEFAULT_SEEKWINDOW_MS, DEFAULT_OVERLAP_MS);
setTempo(1.0f);
clear();
}
TDStretch::~TDStretch()
{
delete[] pMidBufferUnaligned;
}
// Sets routine control parameters. These control are certain time constants
// defining how the sound is stretched to the desired duration.
//
// 'sampleRate' = sample rate of the sound
// 'sequenceMS' = one processing sequence length in milliseconds (default = 82 ms)
// 'seekwindowMS' = seeking window length for scanning the best overlapping
// position (default = 28 ms)
// 'overlapMS' = overlapping length (default = 12 ms)
void TDStretch::setParameters(int aSampleRate, int aSequenceMS,
int aSeekWindowMS, int aOverlapMS)
{
// accept only positive parameter values - if zero or negative, use old values instead
if (aSampleRate > 0) this->sampleRate = aSampleRate;
if (aOverlapMS > 0) this->overlapMs = aOverlapMS;
if (aSequenceMS > 0)
{
this->sequenceMs = aSequenceMS;
bAutoSeqSetting = false;
}
else if (aSequenceMS == 0)
{
// if zero, use automatic setting
bAutoSeqSetting = true;
}
if (aSeekWindowMS > 0)
{
this->seekWindowMs = aSeekWindowMS;
bAutoSeekSetting = false;
}
else if (aSeekWindowMS == 0)
{
// if zero, use automatic setting
bAutoSeekSetting = true;
}
calcSeqParameters();
calculateOverlapLength(overlapMs);
// set tempo to recalculate 'sampleReq'
setTempo(tempo);
}
/// Get routine control parameters, see setParameters() function.
/// Any of the parameters to this function can be NULL, in such case corresponding parameter
/// value isn't returned.
void TDStretch::getParameters(int *pSampleRate, int *pSequenceMs, int *pSeekWindowMs, int *pOverlapMs) const
{
if (pSampleRate)
{
*pSampleRate = sampleRate;
}
if (pSequenceMs)
{
*pSequenceMs = (bAutoSeqSetting) ? (USE_AUTO_SEQUENCE_LEN) : sequenceMs;
}
if (pSeekWindowMs)
{
*pSeekWindowMs = (bAutoSeekSetting) ? (USE_AUTO_SEEKWINDOW_LEN) : seekWindowMs;
}
if (pOverlapMs)
{
*pOverlapMs = overlapMs;
}
}
// Overlaps samples in 'midBuffer' with the samples in 'pInput'
void TDStretch::overlapMono(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput) const
{
int i;
SAMPLETYPE m1, m2;
m1 = (SAMPLETYPE)0;
m2 = (SAMPLETYPE)overlapLength;
for (i = 0; i < overlapLength ; i ++)
{
pOutput[i] = (pInput[i] * m1 + pMidBuffer[i] * m2 ) / overlapLength;
m1 += 1;
m2 -= 1;
}
}
void TDStretch::clearMidBuffer()
{
memset(pMidBuffer, 0, channels * sizeof(SAMPLETYPE) * overlapLength);
}
void TDStretch::clearInput()
{
inputBuffer.clear();
clearMidBuffer();
isBeginning = true;
}
// Clears the sample buffers
void TDStretch::clear()
{
outputBuffer.clear();
clearInput();
}
// Enables/disables the quick position seeking algorithm. Zero to disable, nonzero
// to enable
void TDStretch::enableQuickSeek(bool enable)
{
bQuickSeek = enable;
}
// Returns nonzero if the quick seeking algorithm is enabled.
bool TDStretch::isQuickSeekEnabled() const
{
return bQuickSeek;
}
// Seeks for the optimal overlap-mixing position.
int TDStretch::seekBestOverlapPosition(const SAMPLETYPE *refPos)
{
if (bQuickSeek)
{
return seekBestOverlapPositionQuick(refPos);
}
else
{
return seekBestOverlapPositionFull(refPos);
}
}
// Overlaps samples in 'midBuffer' with the samples in 'pInputBuffer' at position
// of 'ovlPos'.
inline void TDStretch::overlap(SAMPLETYPE *pOutput, const SAMPLETYPE *pInput, uint ovlPos) const
{
#ifndef USE_MULTICH_ALWAYS
if (channels == 1)
{
// mono sound.
overlapMono(pOutput, pInput + ovlPos);
}
else if (channels == 2)
{
// stereo sound
overlapStereo(pOutput, pInput + 2 * ovlPos);
}
else
#endif // USE_MULTICH_ALWAYS
{
assert(channels > 0);
overlapMulti(pOutput, pInput + channels * ovlPos);
}
}
// Seeks for the optimal overlap-mixing position. The 'stereo' version of the
// routine
//
// The best position is determined as the position where the two overlapped
// sample sequences are 'most alike', in terms of the highest cross-correlation
// value over the overlapping period
int TDStretch::seekBestOverlapPositionFull(const SAMPLETYPE *refPos)
{
int bestOffs;
double bestCorr;
int i;
double norm;
bestCorr = -FLT_MAX;
bestOffs = 0;
// Scans for the best correlation value by testing each possible position
// over the permitted range.
bestCorr = calcCrossCorr(refPos, pMidBuffer, norm);
bestCorr = (bestCorr + 0.1) * 0.75;
#pragma omp parallel for
for (i = 1; i < seekLength; i ++)
{
double corr;
// Calculates correlation value for the mixing position corresponding to 'i'
#ifdef _OPENMP
// in parallel OpenMP mode, can't use norm accumulator version as parallel executor won't
// iterate the loop in sequential order
corr = calcCrossCorr(refPos + channels * i, pMidBuffer, norm);
#else
// In non-parallel version call "calcCrossCorrAccumulate" that is otherwise same
// as "calcCrossCorr", but saves time by reusing & updating previously stored
// "norm" value
corr = calcCrossCorrAccumulate(refPos + channels * i, pMidBuffer, norm);
#endif
// heuristic rule to slightly favour values close to mid of the range
double tmp = (double)(2 * i - seekLength) / (double)seekLength;
corr = ((corr + 0.1) * (1.0 - 0.25 * tmp * tmp));
// Checks for the highest correlation value
if (corr > bestCorr)
{
// For optimal performance, enter critical section only in case that best value found.
// in such case repeat 'if' condition as it's possible that parallel execution may have
// updated the bestCorr value in the mean time
#pragma omp critical
if (corr > bestCorr)
{
bestCorr = corr;
bestOffs = i;
}
}
}
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
adaptNormalizer();
#endif
// clear cross correlation routine state if necessary (is so e.g. in MMX routines).
clearCrossCorrState();
return bestOffs;
}
// Quick seek algorithm for improved runtime-performance: First roughly scans through the
// correlation area, and then scan surroundings of two best preliminary correlation candidates
// with improved precision
//
// Based on testing:
// - This algorithm gives on average 99% as good match as the full algorith
// - this quick seek algorithm finds the best match on ~90% of cases
// - on those 10% of cases when this algorithm doesn't find best match,
// it still finds on average ~90% match vs. the best possible match
int TDStretch::seekBestOverlapPositionQuick(const SAMPLETYPE *refPos)
{
#define _MIN(a, b) (((a) < (b)) ? (a) : (b))
#define SCANSTEP 16
#define SCANWIND 8
int bestOffs;
int i;
int bestOffs2;
float bestCorr, corr;
float bestCorr2;
double norm;
// note: 'float' types used in this function in case that the platform would need to use software-fp
bestCorr =
bestCorr2 = -FLT_MAX;
bestOffs =
bestOffs2 = SCANWIND;
// Scans for the best correlation value by testing each possible position
// over the permitted range. Look for two best matches on the first pass to
// increase possibility of ideal match.
//
// Begin from "SCANSTEP" instead of SCANWIND to make the calculation
// catch the 'middlepoint' of seekLength vector as that's the a-priori
// expected best match position
//
// Roughly:
// - 15% of cases find best result directly on the first round,
// - 75% cases find better match on 2nd round around the best match from 1st round
// - 10% cases find better match on 2nd round around the 2nd-best-match from 1st round
for (i = SCANSTEP; i < seekLength - SCANWIND - 1; i += SCANSTEP)
{
// Calculates correlation value for the mixing position corresponding
// to 'i'
corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
// heuristic rule to slightly favour values close to mid of the seek range
float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
// Checks for the highest correlation value
if (corr > bestCorr)
{
// found new best match. keep the previous best as 2nd best match
bestCorr2 = bestCorr;
bestOffs2 = bestOffs;
bestCorr = corr;
bestOffs = i;
}
else if (corr > bestCorr2)
{
// not new best, but still new 2nd best match
bestCorr2 = corr;
bestOffs2 = i;
}
}
// Scans surroundings of the found best match with small stepping
int end = _MIN(bestOffs + SCANWIND + 1, seekLength);
for (i = bestOffs - SCANWIND; i < end; i++)
{
if (i == bestOffs) continue; // this offset already calculated, thus skip
// Calculates correlation value for the mixing position corresponding
// to 'i'
corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
// heuristic rule to slightly favour values close to mid of the range
float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
// Checks for the highest correlation value
if (corr > bestCorr)
{
bestCorr = corr;
bestOffs = i;
}
}
// Scans surroundings of the 2nd best match with small stepping
end = _MIN(bestOffs2 + SCANWIND + 1, seekLength);
for (i = bestOffs2 - SCANWIND; i < end; i++)
{
if (i == bestOffs2) continue; // this offset already calculated, thus skip
// Calculates correlation value for the mixing position corresponding
// to 'i'
corr = (float)calcCrossCorr(refPos + channels*i, pMidBuffer, norm);
// heuristic rule to slightly favour values close to mid of the range
float tmp = (float)(2 * i - seekLength - 1) / (float)seekLength;
corr = ((corr + 0.1f) * (1.0f - 0.25f * tmp * tmp));
// Checks for the highest correlation value
if (corr > bestCorr)
{
bestCorr = corr;
bestOffs = i;
}
}
// clear cross correlation routine state if necessary (is so e.g. in MMX routines).
clearCrossCorrState();
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
adaptNormalizer();
#endif
return bestOffs;
}
/// For integer algorithm: adapt normalization factor divider with music so that
/// it'll not be pessimistically restrictive that can degrade quality on quieter sections
/// yet won't cause integer overflows either
void TDStretch::adaptNormalizer()
{
// Do not adapt normalizer over too silent sequences to avoid averaging filter depleting to
// too low values during pauses in music
if ((maxnorm > 1000) || (maxnormf > 40000000))
{
//norm averaging filter
maxnormf = 0.9f * maxnormf + 0.1f * (float)maxnorm;
if ((maxnorm > 800000000) && (overlapDividerBitsNorm < 16))
{
// large values, so increase divider
overlapDividerBitsNorm++;
if (maxnorm > 1600000000) overlapDividerBitsNorm++; // extra large value => extra increase
}
else if ((maxnormf < 1000000) && (overlapDividerBitsNorm > 0))
{
// extra small values, decrease divider
overlapDividerBitsNorm--;
}
}
maxnorm = 0;
}
/// clear cross correlation routine state if necessary
void TDStretch::clearCrossCorrState()
{
// default implementation is empty.
}
/// Calculates processing sequence length according to tempo setting
void TDStretch::calcSeqParameters()
{
// Adjust tempo param according to tempo, so that variating processing sequence length is used
// at varius tempo settings, between the given low...top limits
#define AUTOSEQ_TEMPO_LOW 0.5 // auto setting low tempo range (-50%)
#define AUTOSEQ_TEMPO_TOP 2.0 // auto setting top tempo range (+100%)
// sequence-ms setting values at above low & top tempo
#define AUTOSEQ_AT_MIN 90.0
#define AUTOSEQ_AT_MAX 40.0
#define AUTOSEQ_K ((AUTOSEQ_AT_MAX - AUTOSEQ_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
#define AUTOSEQ_C (AUTOSEQ_AT_MIN - (AUTOSEQ_K) * (AUTOSEQ_TEMPO_LOW))
// seek-window-ms setting values at above low & top tempoq
#define AUTOSEEK_AT_MIN 20.0
#define AUTOSEEK_AT_MAX 15.0
#define AUTOSEEK_K ((AUTOSEEK_AT_MAX - AUTOSEEK_AT_MIN) / (AUTOSEQ_TEMPO_TOP - AUTOSEQ_TEMPO_LOW))
#define AUTOSEEK_C (AUTOSEEK_AT_MIN - (AUTOSEEK_K) * (AUTOSEQ_TEMPO_LOW))
#define CHECK_LIMITS(x, mi, ma) (((x) < (mi)) ? (mi) : (((x) > (ma)) ? (ma) : (x)))
double seq, seek;
if (bAutoSeqSetting)
{
seq = AUTOSEQ_C + AUTOSEQ_K * tempo;
seq = CHECK_LIMITS(seq, AUTOSEQ_AT_MAX, AUTOSEQ_AT_MIN);
sequenceMs = (int)(seq + 0.5);
}
if (bAutoSeekSetting)
{
seek = AUTOSEEK_C + AUTOSEEK_K * tempo;
seek = CHECK_LIMITS(seek, AUTOSEEK_AT_MAX, AUTOSEEK_AT_MIN);
seekWindowMs = (int)(seek + 0.5);
}
// Update seek window lengths
seekWindowLength = (sampleRate * sequenceMs) / 1000;
if (seekWindowLength < 2 * overlapLength)
{
seekWindowLength = 2 * overlapLength;
}
seekLength = (sampleRate * seekWindowMs) / 1000;
}
// Sets new target tempo. Normal tempo = 'SCALE', smaller values represent slower
// tempo, larger faster tempo.
void TDStretch::setTempo(double newTempo)
{
int intskip;
tempo = newTempo;
// Calculate new sequence duration
calcSeqParameters();
// Calculate ideal skip length (according to tempo value)
nominalSkip = tempo * (seekWindowLength - overlapLength);
intskip = (int)(nominalSkip + 0.5);
// Calculate how many samples are needed in the 'inputBuffer' to
// process another batch of samples
//sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength / 2;
sampleReq = max(intskip + overlapLength, seekWindowLength) + seekLength;
}
// Sets the number of channels, 1 = mono, 2 = stereo
void TDStretch::setChannels(int numChannels)
{
assert(numChannels > 0);
if (channels == numChannels) return;
// assert(numChannels == 1 || numChannels == 2);
channels = numChannels;
inputBuffer.setChannels(channels);
outputBuffer.setChannels(channels);
// re-init overlap/buffer
overlapLength=0;
setParameters(sampleRate);
}
// nominal tempo, no need for processing, just pass the samples through
// to outputBuffer
/*
void TDStretch::processNominalTempo()
{
assert(tempo == 1.0f);
if (bMidBufferDirty)
{
// If there are samples in pMidBuffer waiting for overlapping,
// do a single sliding overlapping with them in order to prevent a
// clicking distortion in the output sound
if (inputBuffer.numSamples() < overlapLength)
{
// wait until we've got overlapLength input samples
return;
}
// Mix the samples in the beginning of 'inputBuffer' with the
// samples in 'midBuffer' using sliding overlapping
overlap(outputBuffer.ptrEnd(overlapLength), inputBuffer.ptrBegin(), 0);
outputBuffer.putSamples(overlapLength);
inputBuffer.receiveSamples(overlapLength);
clearMidBuffer();
// now we've caught the nominal sample flow and may switch to
// bypass mode
}
// Simply bypass samples from input to output
outputBuffer.moveSamples(inputBuffer);
}
*/
// Processes as many processing frames of the samples 'inputBuffer', store
// the result into 'outputBuffer'
void TDStretch::processSamples()
{
int ovlSkip;
int offset = 0;
int temp;
/* Removed this small optimization - can introduce a click to sound when tempo setting
crosses the nominal value
if (tempo == 1.0f)
{
// tempo not changed from the original, so bypass the processing
processNominalTempo();
return;
}
*/
// Process samples as long as there are enough samples in 'inputBuffer'
// to form a processing frame.
while ((int)inputBuffer.numSamples() >= sampleReq)
{
if (isBeginning == false)
{
// apart from the very beginning of the track,
// scan for the best overlapping position & do overlap-add
offset = seekBestOverlapPosition(inputBuffer.ptrBegin());
// Mix the samples in the 'inputBuffer' at position of 'offset' with the
// samples in 'midBuffer' using sliding overlapping
// ... first partially overlap with the end of the previous sequence
// (that's in 'midBuffer')
overlap(outputBuffer.ptrEnd((uint)overlapLength), inputBuffer.ptrBegin(), (uint)offset);
outputBuffer.putSamples((uint)overlapLength);
offset += overlapLength;
}
else
{
// Adjust processing offset at beginning of track by not perform initial overlapping
// and compensating that in the 'input buffer skip' calculation
isBeginning = false;
int skip = (int)(tempo * overlapLength + 0.5);
#ifdef SOUNDTOUCH_ALLOW_NONEXACT_SIMD_OPTIMIZATION
#ifdef SOUNDTOUCH_ALLOW_SSE
// if SSE mode, round the skip amount to value corresponding to aligned memory address
if (channels == 1)
{
skip &= -4;
}
else if (channels == 2)
{
skip &= -2;
}
#endif
#endif
skipFract -= skip;
assert(nominalSkip >= -skipFract);
}
// ... then copy sequence samples from 'inputBuffer' to output:
// crosscheck that we don't have buffer overflow...
if ((int)inputBuffer.numSamples() < (offset + seekWindowLength - overlapLength))
{
continue; // just in case, shouldn't really happen
}
// length of sequence
temp = (seekWindowLength - 2 * overlapLength);
outputBuffer.putSamples(inputBuffer.ptrBegin() + channels * offset, (uint)temp);
// Copies the end of the current sequence from 'inputBuffer' to
// 'midBuffer' for being mixed with the beginning of the next
// processing sequence and so on
assert((offset + temp + overlapLength) <= (int)inputBuffer.numSamples());
memcpy(pMidBuffer, inputBuffer.ptrBegin() + channels * (offset + temp),
channels * sizeof(SAMPLETYPE) * overlapLength);
// Remove the processed samples from the input buffer. Update
// the difference between integer & nominal skip step to 'skipFract'
// in order to prevent the error from accumulating over time.
skipFract += nominalSkip; // real skip size
ovlSkip = (int)skipFract; // rounded to integer skip
skipFract -= ovlSkip; // maintain the fraction part, i.e. real vs. integer skip
inputBuffer.receiveSamples((uint)ovlSkip);
}
}
// Adds 'numsamples' pcs of samples from the 'samples' memory position into
// the input of the object.
void TDStretch::putSamples(const SAMPLETYPE *samples, uint nSamples)
{
// Add the samples into the input buffer
inputBuffer.putSamples(samples, nSamples);
// Process the samples in input buffer
processSamples();
}
/// Set new overlap length parameter & reallocate RefMidBuffer if necessary.
void TDStretch::acceptNewOverlapLength(int newOverlapLength)
{
int prevOvl;
assert(newOverlapLength >= 0);
prevOvl = overlapLength;
overlapLength = newOverlapLength;
if (overlapLength > prevOvl)
{
delete[] pMidBufferUnaligned;
pMidBufferUnaligned = new SAMPLETYPE[overlapLength * channels + 16 / sizeof(SAMPLETYPE)];
// ensure that 'pMidBuffer' is aligned to 16 byte boundary for efficiency
pMidBuffer = (SAMPLETYPE *)SOUNDTOUCH_ALIGN_POINTER_16(pMidBufferUnaligned);
clearMidBuffer();
}
}
// Operator 'new' is overloaded so that it automatically creates a suitable instance
// depending on if we've a MMX/SSE/etc-capable CPU available or not.
void * TDStretch::operator new(size_t s)
{
// Notice! don't use "new TDStretch" directly, use "newInstance" to create a new instance instead!
ST_THROW_RT_ERROR("Error in TDStretch::new: Don't use 'new TDStretch' directly, use 'newInstance' member instead!");
return newInstance();
}
TDStretch * TDStretch::newInstance()
{
uint uExtensions;
uExtensions = detectCPUextensions();
// Check if MMX/SSE instruction set extensions supported by CPU
#ifdef SOUNDTOUCH_ALLOW_MMX
// MMX routines available only with integer sample types
if (uExtensions & SUPPORT_MMX)
{
return ::new TDStretchMMX;
}
else
#endif // SOUNDTOUCH_ALLOW_MMX
#ifdef SOUNDTOUCH_ALLOW_SSE
if (uExtensions & SUPPORT_SSE)
{
// SSE support
return ::new TDStretchSSE;
}
else
#endif // SOUNDTOUCH_ALLOW_SSE
{
// ISA optimizations not supported, use plain C version
return ::new TDStretch;
}
}
//////////////////////////////////////////////////////////////////////////////
//
// Integer arithmetics specific algorithm implementations.
//
//////////////////////////////////////////////////////////////////////////////
#ifdef SOUNDTOUCH_INTEGER_SAMPLES
// Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Stereo'
// version of the routine.
void TDStretch::overlapStereo(short *poutput, const short *input) const
{
int i;
short temp;
int cnt2;
for (i = 0; i < overlapLength ; i ++)
{
temp = (short)(overlapLength - i);
cnt2 = 2 * i;
poutput[cnt2] = (input[cnt2] * i + pMidBuffer[cnt2] * temp ) / overlapLength;
poutput[cnt2 + 1] = (input[cnt2 + 1] * i + pMidBuffer[cnt2 + 1] * temp ) / overlapLength;
}
}
// Overlaps samples in 'midBuffer' with the samples in 'input'. The 'Multi'
// version of the routine.
void TDStretch::overlapMulti(SAMPLETYPE *poutput, const SAMPLETYPE *input) const
{
SAMPLETYPE m1=(SAMPLETYPE)0;
SAMPLETYPE m2;
int i=0;
for (m2 = (SAMPLETYPE)overlapLength; m2; m2 --)
{
for (int c = 0; c < channels; c ++)
{
poutput[i] = (input[i] * m1 + pMidBuffer[i] * m2) / overlapLength;
i++;
}
m1++;
}
}
// Calculates the x having the closest 2^x value for the given value
static int _getClosest2Power(double value)
{
return (int)(log(value) / log(2.0) + 0.5);
}
/// Calculates overlap period length in samples.
/// Integer version rounds overlap length to closest power of 2
/// for a divide scaling operation.
void TDStretch::calculateOverlapLength(int aoverlapMs)
{
int newOvl;
assert(aoverlapMs >= 0);
// calculate overlap length so that it's power of 2 - thus it's easy to do
// integer division by right-shifting. Term "-1" at end is to account for
// the extra most significatnt bit left unused in result by signed multiplication
overlapDividerBitsPure = _getClosest2Power((sampleRate * aoverlapMs) / 1000.0) - 1;
if (overlapDividerBitsPure > 9) overlapDividerBitsPure = 9;
if (overlapDividerBitsPure < 3) overlapDividerBitsPure = 3;
newOvl = (int)pow(2.0, (int)overlapDividerBitsPure + 1); // +1 => account for -1 above
acceptNewOverlapLength(newOvl);
overlapDividerBitsNorm = overlapDividerBitsPure;
// calculate sloping divider so that crosscorrelation operation won't
// overflow 32-bit register. Max. sum of the crosscorrelation sum without
// divider would be 2^30*(N^3-N)/3, where N = overlap length
slopingDivider = (newOvl * newOvl - 1) / 3;
}
double TDStretch::calcCrossCorr(const short *mixingPos, const short *compare, double &norm)
{
long corr;
unsigned long lnorm;
int i;
corr = lnorm = 0;
// Same routine for stereo and mono. For stereo, unroll loop for better
// efficiency and gives slightly better resolution against rounding.
// For mono it same routine, just unrolls loop by factor of 4
for (i = 0; i < channels * overlapLength; i += 4)
{
corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
lnorm += (mixingPos[i] * mixingPos[i] +
mixingPos[i + 1] * mixingPos[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
lnorm += (mixingPos[i + 2] * mixingPos[i + 2] +
mixingPos[i + 3] * mixingPos[i + 3]) >> overlapDividerBitsNorm;
}
if (lnorm > maxnorm)
{
// modify 'maxnorm' inside critical section to avoid multi-access conflict if in OpenMP mode
#pragma omp critical
if (lnorm > maxnorm)
{
maxnorm = lnorm;
}
}
// Normalize result by dividing by sqrt(norm) - this step is easiest
// done using floating point operation
norm = (double)lnorm;
return (double)corr / sqrt((norm < 1e-9) ? 1.0 : norm);
}
/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
double TDStretch::calcCrossCorrAccumulate(const short *mixingPos, const short *compare, double &norm)
{
long corr;
unsigned long lnorm;
int i;
// cancel first normalizer tap from previous round
lnorm = 0;
for (i = 1; i <= channels; i ++)
{
lnorm -= (mixingPos[-i] * mixingPos[-i]) >> overlapDividerBitsNorm;
}
corr = 0;
// Same routine for stereo and mono. For stereo, unroll loop for better
// efficiency and gives slightly better resolution against rounding.
// For mono it same routine, just unrolls loop by factor of 4
for (i = 0; i < channels * overlapLength; i += 4)
{
corr += (mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1]) >> overlapDividerBitsNorm; // notice: do intermediate division here to avoid integer overflow
corr += (mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3]) >> overlapDividerBitsNorm;
}
// update normalizer with last samples of this round
for (int j = 0; j < channels; j ++)
{
i --;
lnorm += (mixingPos[i] * mixingPos[i]) >> overlapDividerBitsNorm;
}
norm += (double)lnorm;
if (norm > maxnorm)
{
maxnorm = (unsigned long)norm;
}
// Normalize result by dividing by sqrt(norm) - this step is easiest
// done using floating point operation
return (double)corr / sqrt((norm < 1e-9) ? 1.0 : norm);
}
#endif // SOUNDTOUCH_INTEGER_SAMPLES
//////////////////////////////////////////////////////////////////////////////
//
// Floating point arithmetics specific algorithm implementations.
//
#ifdef SOUNDTOUCH_FLOAT_SAMPLES
// Overlaps samples in 'midBuffer' with the samples in 'pInput'
void TDStretch::overlapStereo(float *pOutput, const float *pInput) const
{
int i;
float fScale;
float f1;
float f2;
fScale = 1.0f / (float)overlapLength;
f1 = 0;
f2 = 1.0f;
for (i = 0; i < 2 * (int)overlapLength ; i += 2)
{
pOutput[i + 0] = pInput[i + 0] * f1 + pMidBuffer[i + 0] * f2;
pOutput[i + 1] = pInput[i + 1] * f1 + pMidBuffer[i + 1] * f2;
f1 += fScale;
f2 -= fScale;
}
}
// Overlaps samples in 'midBuffer' with the samples in 'input'.
void TDStretch::overlapMulti(float *pOutput, const float *pInput) const
{
int i;
float fScale;
float f1;
float f2;
fScale = 1.0f / (float)overlapLength;
f1 = 0;
f2 = 1.0f;
i=0;
for (int i2 = 0; i2 < overlapLength; i2 ++)
{
// note: Could optimize this slightly by taking into account that always channels > 2
for (int c = 0; c < channels; c ++)
{
pOutput[i] = pInput[i] * f1 + pMidBuffer[i] * f2;
i++;
}
f1 += fScale;
f2 -= fScale;
}
}
/// Calculates overlapInMsec period length in samples.
void TDStretch::calculateOverlapLength(int overlapInMsec)
{
int newOvl;
assert(overlapInMsec >= 0);
newOvl = (sampleRate * overlapInMsec) / 1000;
if (newOvl < 16) newOvl = 16;
// must be divisible by 8
newOvl -= newOvl % 8;
acceptNewOverlapLength(newOvl);
}
/// Calculate cross-correlation
double TDStretch::calcCrossCorr(const float *mixingPos, const float *compare, double &anorm)
{
double corr;
double norm;
int i;
corr = norm = 0;
// Same routine for stereo and mono. For Stereo, unroll by factor of 2.
// For mono it's same routine yet unrollsd by factor of 4.
for (i = 0; i < channels * overlapLength; i += 4)
{
corr += mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1];
norm += mixingPos[i] * mixingPos[i] +
mixingPos[i + 1] * mixingPos[i + 1];
// unroll the loop for better CPU efficiency:
corr += mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3];
norm += mixingPos[i + 2] * mixingPos[i + 2] +
mixingPos[i + 3] * mixingPos[i + 3];
}
anorm = norm;
return corr / sqrt((norm < 1e-9 ? 1.0 : norm));
}
/// Update cross-correlation by accumulating "norm" coefficient by previously calculated value
double TDStretch::calcCrossCorrAccumulate(const float *mixingPos, const float *compare, double &norm)
{
double corr;
int i;
corr = 0;
// cancel first normalizer tap from previous round
for (i = 1; i <= channels; i ++)
{
norm -= mixingPos[-i] * mixingPos[-i];
}
// Same routine for stereo and mono. For Stereo, unroll by factor of 2.
// For mono it's same routine yet unrollsd by factor of 4.
for (i = 0; i < channels * overlapLength; i += 4)
{
corr += mixingPos[i] * compare[i] +
mixingPos[i + 1] * compare[i + 1] +
mixingPos[i + 2] * compare[i + 2] +
mixingPos[i + 3] * compare[i + 3];
}
// update normalizer with last samples of this round
for (int j = 0; j < channels; j ++)
{
i --;
norm += mixingPos[i] * mixingPos[i];
}
return corr / sqrt((norm < 1e-9 ? 1.0 : norm));
}
#endif // SOUNDTOUCH_FLOAT_SAMPLES