BizHawk/waterbox/pcfx/sound/OwlResampler.cpp

804 lines
19 KiB
C++
Raw Normal View History

2017-07-09 15:17:42 +00:00
/* Mednafen - Multi-system Emulator
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
// Don't pass more than about 40ms worth of audio data to Resample()
// at a time.
#define _GNU_SOURCE
#include "../defs.h"
#include <math.h>
#include <assert.h>
#include <limits.h>
#include "OwlResampler.h"
//#include "../cputest/cputest.h"
#if defined(ARCH_POWERPC_ALTIVEC) && defined(HAVE_ALTIVEC_H)
#include <altivec.h>
#endif
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif
#ifdef __FAST_MATH__
#error "OwlResampler.cpp not compatible with unsafe math optimizations!"
#endif
OwlBuffer::OwlBuffer()
{
assert(sizeof(I32_F_Pudding) == 4);
assert(sizeof(float) == 4);
memset(HRBuf, 0, sizeof(HRBuf));
accum = 0;
filter_state[0] = 0;
filter_state[1] = 0;
leftover = 0;
InputIndex = 0;
InputPhase = 0;
debias = 0;
}
OwlBuffer::~OwlBuffer()
{
}
template<unsigned DoExMix, bool Integrate, unsigned IntegrateShift, bool Lowpass, bool Highpass, bool FloatOutput>
static int32 ProcessLoop(unsigned count, int32 a, int32* b, int32* exmix0 = NULL, int32* exmix1 = NULL, unsigned lp_shift = 0, unsigned hp_shift = 0, int64* f_in = NULL)
{
int64 lp_f;
int64 hp_f;
if(Lowpass)
{
lp_f = f_in[0];
}
if(Highpass)
{
hp_f = f_in[1];
}
while(count--)
{
int32 tmp;
if(Integrate)
{
a += *b;
tmp = a >> IntegrateShift;
}
else
tmp = *b;
if(Lowpass)
{
lp_f += ((int64)((uint64)(int64)tmp << 16) - lp_f) >> lp_shift;
tmp = lp_f >> 16;
}
if(Highpass)
{
hp_f += ((int64)((uint64)(int64)tmp << 16) - hp_f) >> hp_shift;
tmp = tmp - (hp_f >> 16);
}
if(DoExMix >= 1)
{
tmp += *exmix0;
exmix0++;
}
if(DoExMix >= 2)
{
tmp += *exmix1;
exmix1++;
}
if(FloatOutput)
*(float*)b = tmp;
else
*b = tmp;
b++;
}
if(Lowpass)
f_in[0] = lp_f;
if(Highpass)
f_in[1] = hp_f;
return(a);
}
void OwlBuffer::ResampleSkipped(unsigned count)
{
memmove(HRBuf, &HRBuf[count], HRBUF_OVERFLOW_PADDING * sizeof(HRBuf[0]));
memset(&HRBuf[HRBUF_OVERFLOW_PADDING], 0, count * sizeof(HRBuf[0]));
}
void OwlBuffer::Integrate(unsigned count, unsigned lp_shift, unsigned hp_shift, RavenBuffer* mixin0, RavenBuffer* mixin1)
{
//lp_shift = hp_shift = 0;
if(lp_shift != 0 || hp_shift != 0)
{
if(mixin0 && mixin1)
accum = ProcessLoop<2, true, 3, true, true, true>(count, accum, Buf(), mixin0->Buf(), mixin1->Buf(), lp_shift, hp_shift, filter_state);
else if(mixin0)
accum = ProcessLoop<1, true, 3, true, true, true>(count, accum, Buf(), mixin0->Buf(), NULL, lp_shift, hp_shift, filter_state);
else
accum = ProcessLoop<0, true, 3, true, true, true>(count, accum, Buf(), NULL, NULL, lp_shift, hp_shift, filter_state);
}
else
{
if(mixin0 && mixin1)
accum = ProcessLoop<2, true, 3, false, false, true>(count, accum, Buf(), mixin0->Buf(), mixin1->Buf());
else if(mixin0)
accum = ProcessLoop<1, true, 3, false, false, true>(count, accum, Buf(), mixin0->Buf());
else
accum = ProcessLoop<0, true, 3, false, false, true>(count, accum, Buf());
}
if(accum >= 32767 * 256 * 8 || accum <= -32767 * 256 * 8)
{
//printf("Possible delta sample loss; accum=%d\n", accum);
}
}
//
//
//
RavenBuffer::RavenBuffer()
{
memset(BB, 0, sizeof(BB));
accum = 0;
filter_state[0] = 0;
filter_state[1] = 0;
}
RavenBuffer::~RavenBuffer()
{
}
void RavenBuffer::Process(unsigned count, bool integrate, unsigned lp_shift)
{
if(integrate)
{
if(lp_shift != 0)
accum = ProcessLoop<0, true, 3, true, false, false>(count, accum, Buf(), NULL, NULL, lp_shift, 0, filter_state);
else
accum = ProcessLoop<0, true, 3, false, false, false>(count, accum, Buf(), NULL, NULL, lp_shift, 0);
}
else
{
if(lp_shift != 0)
accum = ProcessLoop<0, false, 0, true, false, false>(count, accum, Buf(), NULL, NULL, lp_shift, 0, filter_state);
else
accum = ProcessLoop<0, false, 0, false, false, false>(count, accum, Buf(), NULL, NULL, lp_shift, 0);
}
}
void RavenBuffer::Finish(unsigned count)
{
memmove(BB, &BB[count], OwlBuffer::HRBUF_OVERFLOW_PADDING * sizeof(BB[0]));
memset(&BB[OwlBuffer::HRBUF_OVERFLOW_PADDING], 0, count * sizeof(BB[0]));
}
static void kaiser_window( double* io, int count, double beta )
{
int const accuracy = 16; //12;
double* end = io + count;
double beta2 = beta * beta * (double) -0.25;
double to_fract = beta2 / ((double) count * count);
double i = 0;
double rescale = 0; // Doesn't need an initializer, to shut up gcc
for ( ; io < end; ++io, i += 1 )
{
double x = i * i * to_fract - beta2;
double u = x;
double k = x + 1;
double n = 2;
do
{
u *= x / (n * n);
n += 1;
k += u;
}
while ( k <= u * (1 << accuracy) );
if ( !i )
rescale = 1 / k; // otherwise values get large
*io *= k * rescale;
}
}
static void gen_sinc( double* out, int size, double cutoff, double kaiser )
{
assert( size % 2 == 0 ); // size must be enev
int const half_size = size / 2;
double* const mid = &out [half_size];
// Generate right half of sinc
for ( int i = 0; i < half_size; i++ )
{
double angle = (i * 2 + 1) * (M_PI / 2);
mid [i] = sin( angle * cutoff ) / angle;
}
kaiser_window( mid, half_size, kaiser );
// Mirror for left half
for ( int i = 0; i < half_size; i++ )
out [i] = mid [half_size - 1 - i];
}
static void normalize( double* io, int size, double gain = 1.0 )
{
double sum = 0;
for ( int i = 0; i < size; i++ )
sum += io [i];
double scale = gain / sum;
for ( int i = 0; i < size; i++ )
io [i] *= scale;
}
static INLINE void DoMAC(float *wave, float *coeffs, int32 count, int32 *accum_output)
{
float acc[4] = { 0, 0, 0, 0 };
for(int c = 0; MDFN_LIKELY(c < count); c += 4)
{
acc[0] += wave[c + 0] * coeffs[c + 0];
acc[1] += wave[c + 1] * coeffs[c + 1];
acc[2] += wave[c + 2] * coeffs[c + 2];
acc[3] += wave[c + 3] * coeffs[c + 3];
}
*accum_output = (acc[0] + acc[2]) + (acc[1] + acc[3]);
}
#ifdef ARCH_X86
#include "OwlResampler_x86.inc"
#endif
#ifdef ARCH_POWERPC_ALTIVEC
#include "OwlResampler_altivec.inc"
#endif
#ifdef __ARM_NEON__
#include "OwlResampler_neon.inc"
#endif
template<typename T, unsigned sa>
static T SDP2(T v)
{
T tmp;
tmp = (v >> ((sizeof(T) * 8) - 1)) & (((T)1 << sa) - 1);
return ((v + tmp) >> sa);
}
enum
{
SIMD_NONE = 0,
#ifdef ARCH_X86
SIMD_SSE_16X,
#ifdef HAVE_INLINEASM_AVX
SIMD_AVX_32X,
SIMD_AVX_32X_P16,
#else
#warning "Compiling without AVX inline assembly."
#endif
#elif defined(ARCH_POWERPC_ALTIVEC)
SIMD_ALTIVEC,
#elif defined __ARM_NEON__
SIMD_NEON
#endif
};
template<unsigned TA_SIMD_Type>
NO_INLINE int32 OwlResampler::T_Resample(OwlBuffer* in, const uint32 in_count, int16* out, const uint32 max_out_count, const bool reverse)
{
if(reverse)
{
int32* a = in->Buf();
int32* b = in->Buf() + in_count - 1;
while(MDFN_LIKELY(a < b))
{
std::swap<int32>(*a, *b);
a++;
b--;
}
}
//
//
//
uint32 count = 0;
int32* I32Out = &IntermediateBuffer[0];
const uint32 in_count_WLO = in->leftover + in_count;
const uint32 max = std::max<int64>(0, (int64)in_count_WLO - NumCoeffs);
uint32 InputPhase = in->InputPhase;
uint32 InputIndex = in->InputIndex;
OwlBuffer::I32_F_Pudding* InSamps = in->BufPudding() - in->leftover;
int32 leftover;
if(MDFN_UNLIKELY(InputPhase >= NumPhases))
{
fprintf(stderr, "[BUG] InputPhase >= NumPhases\n"); // Save states can also trigger this.
InputPhase = 0;
}
while(InputIndex < max)
{
float* wave = &InSamps[InputIndex].f;
float* coeffs = &PInfos[InputPhase].Coeffs[0];
int32 coeff_count = NumCoeffs;
switch(TA_SIMD_Type)
{
default:
case SIMD_NONE:
DoMAC(wave, coeffs, coeff_count, I32Out);
break;
#ifdef ARCH_X86
case SIMD_SSE_16X:
DoMAC_SSE_16X(wave, coeffs, coeff_count, I32Out);
break;
#ifdef HAVE_INLINEASM_AVX
case SIMD_AVX_32X:
DoMAC_AVX_32X(wave, coeffs, coeff_count, I32Out);
break;
case SIMD_AVX_32X_P16:
DoMAC_AVX_32X_P16(wave, coeffs, coeff_count, I32Out);
break;
#endif
#elif defined(ARCH_POWERPC_ALTIVEC)
case SIMD_ALTIVEC:
DoMAC_AltiVec(wave, coeffs, coeff_count, I32Out);
break;
#elif defined __ARM_NEON__
case SIMD_NEON:
DoMAC_NEON(wave, coeffs, coeff_count, I32Out);
break;
#endif
}
I32Out++;
count++;
InputPhase = PInfos[InputPhase].Next;
InputIndex += PInfos[InputPhase].Step;
}
#if defined(ARCH_X86) && defined(HAVE_INLINEASM_AVX)
if(TA_SIMD_Type == SIMD_AVX_32X || TA_SIMD_Type == SIMD_AVX_32X_P16)
{
asm volatile("vzeroupper\n\t" : : :
#if defined(__AVX__)
"ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
#if defined(__x86_64__)
, "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15"
#endif
#endif
);
}
#endif
if(InputIndex > in_count_WLO)
{
leftover = 0;
InputIndex -= in_count_WLO;
}
else
{
leftover = (int32)in_count_WLO - (int32)InputIndex;
InputIndex = 0;
}
#if 0
for(uint32 x = 0; x < count; x++)
{
int s = IntermediateBuffer[x] >> 8;
if(s < -32768 || s > 32767)
{
//printf("Flow: %6d\n", s);
if(s < -32768)
s = -32768;
else if(s > 32767)
s = 32767;
}
out[x * 2] = s;
}
#else
{
int64 debias = in->debias;
for(uint32 x = 0; x < count; x++)
{
int32 sample = IntermediateBuffer[x];
int32 s;
debias += (((int64)((uint64)(int64)sample << 16) - debias) * debias_multiplier) >> 16;
s = SDP2<int32, 8>(sample - (debias >> 16));
if(s < -32768 || s > 32767)
{
//printf("Flow: %6d\n", s);
if(s < -32768)
s = -32768;
else if(s > 32767)
s = 32767;
}
out[x * 2] = s;
}
in->debias = debias;
}
#endif
memmove(in->Buf() - leftover,
in->Buf() + in_count - leftover,
sizeof(int32) * (leftover + OwlBuffer::HRBUF_OVERFLOW_PADDING));
memset(in->Buf() + OwlBuffer::HRBUF_OVERFLOW_PADDING, 0, sizeof(int32) * in_count);
in->leftover = leftover;
in->InputPhase = InputPhase;
in->InputIndex = InputIndex;
return count;
}
void OwlResampler::ResetBufResampState(OwlBuffer* buf)
{
memset(buf->HRBuf, 0, sizeof(buf->HRBuf[0]) * OwlBuffer::HRBUF_LEFTOVER_PADDING);
buf->InputPhase = 0;
}
OwlResampler::~OwlResampler()
{
}
//
// Flush denormals, and coefficients that could lead to denormals, to zero.
//
static float FilterDenormal(float v)
{
union
{
float f;
uint32 i;
} cat_pun;
cat_pun.f = v;
if(((cat_pun.i >> 23) & 0xFF) <= 24) // Maybe < 24 is more correct?
{
MDFN_printf("Small FP coefficient detected: 0x%08x --- raw_sign=%d, raw_exp=0x%02x, raw_mantissa=0x%06x\n", cat_pun.i, cat_pun.i >> 31, (cat_pun.i >> 23) & 0xFF, cat_pun.i & ((1U << 23) - 1));
return(0);
}
return(v);
}
OwlResampler::OwlResampler(double input_rate, double output_rate, double rate_error, double debias_corner, int quality, double nyq_fudge)
{
std::unique_ptr<double[]> FilterBuf;
double ratio = (double)output_rate / input_rate;
double cutoff;
double required_bandwidth;
double k_beta;
double k_d;
for(int i = 0; i < 256; i++)
{
int a = SDP2<int32, 3>(i);
int b = SDP2<int32, 3>(-i);
int c = i / (1 << 3);
assert(a == -b && a == c);
}
assert(sizeof(OwlBuffer::I32_F_Pudding) == 4);
InputRate = input_rate;
OutputRate = output_rate;
RateError = rate_error;
DebiasCorner = debias_corner;
Quality = quality;
IntermediateBuffer.resize(OutputRate * 4 / 50); // *4 for safety padding, / min(50,60), an approximate calculation
const uint32 cpuext = 0; //cputest_get_flags();
MDFN_printf("OwlResampler.cpp debug info:\n");
//MDFN_indent(1);
// Get the number of phases required, and adjust ratio.
{
double s_ratio = (double)input_rate / output_rate;
double findo = 0;
uint32 count = 0;
uint32 findo_i;
do
{
count++;
findo += s_ratio;
} while( fabs(1.0 - ((floor(0.5 + findo) / count) / s_ratio)) > rate_error);
s_ratio = floor(0.5 + findo) / count;
findo_i = (uint32) floor(0.5 + findo);
ratio = 1 / s_ratio;
NumPhases = count;
PInfos.resize(NumPhases);
uint32 last_indoo = 0;
for(unsigned int i = 0; i < NumPhases; i++)
{
uint32 index_pos = i * findo_i / NumPhases;
PInfos[i].Next = (i + 1) % (NumPhases);
PInfos[i].Step = index_pos - last_indoo;
last_indoo = index_pos;
}
PInfos[0].Step = findo_i - last_indoo;
Ratio_Dividend = findo_i;
Ratio_Divisor = NumPhases;
MDFN_printf("Phases: %d, Output rate: %f, %d %d\n", NumPhases, input_rate * ratio, Ratio_Dividend, Ratio_Divisor);
MDFN_printf("Desired maximum rate error: %.10f, Actual rate error: %.10f\n", rate_error, fabs((double)input_rate / output_rate * ratio - 1));
}
static const struct
{
double beta;
double d;
double obw;
} QualityTable[7] =
{
{ 5.658, 3.62, 0.65 },
{ 6.764, 4.32, 0.70 },
{ 7.865, 5.00, 0.75 },
{ 8.960, 5.70, 0.80 },
{ 10.056, 6.40, 0.85 },
{ 10.056, 6.40, 0.90 },
{ 10.056, 6.40, 0.9333 }, // 1.0 - (6.40 / 96)
};
assert(quality >= 0 && quality <= 6);
k_beta = QualityTable[quality].beta;
k_d = QualityTable[quality].d;
//
// As far as filter frequency response design goes, we clamp the output rate parameter
// to keep PCE CD and PC-FX CD-DA sample amplitudes from going wild since we're not resampling CD-DA totally properly.
//
#define OWLRESAMP_FCALC_RATE_CLAMP 128000.0 //192000.0 //96000.0 //48000.0
// A little SOMETHING to widen the transition band a bit to reduce computational complexity with higher output rates.
const double something = std::min<double>(OWLRESAMP_FCALC_RATE_CLAMP, (48000.0 + std::min<double>(OWLRESAMP_FCALC_RATE_CLAMP, output_rate)) / 2 / QualityTable[quality].obw);
//
// Note: Cutoff calculation is performed again(though slightly differently) down below after the SIMD check.
//
cutoff = QualityTable[quality].obw * (std::min<double>(something, std::min<double>(input_rate, output_rate)) / input_rate);
required_bandwidth = (std::min<double>(OWLRESAMP_FCALC_RATE_CLAMP, std::min<double>(input_rate, output_rate)) / input_rate) - cutoff;
NumCoeffs = ceil(k_d / required_bandwidth);
MDFN_printf("Initial number of coefficients per phase: %u\n", NumCoeffs);
MDFN_printf("Initial nominal cutoff frequency: %f\n", InputRate * cutoff / 2);
//
// Put this lower limit BEFORE the SIMD stuff, otherwise the NumCoeffs calculation will be off.
//
if(NumCoeffs < 16)
NumCoeffs = 16;
if(0)
{
abort(); // The sky is falling AAAAAAAAAAAAA
}
#ifdef ARCH_X86
#ifdef HAVE_INLINEASM_AVX
else if((cpuext & CPUTEST_FLAG_AVX) && (NumCoeffs + 0xF) >= 32)
{
MDFN_printf("SIMD: AVX\n");
// AVX loop can't handle less than 32 MACs properly.
NumCoeffs = std::max<uint32>(32, NumCoeffs);
// Past 32 MACs, AVX loop granularity is 16 MACs(with some ugly maaaagic~)
NumCoeffs = (NumCoeffs + 0xF) &~ 0xF;
if(NumCoeffs & 0x10)
Resample_ = &OwlResampler::T_Resample<SIMD_AVX_32X_P16>;
else
Resample_ = &OwlResampler::T_Resample<SIMD_AVX_32X>;
}
#endif
else if(cpuext & CPUTEST_FLAG_SSE)
{
MDFN_printf("SIMD: SSE\n");
// SSE loop does 16 MACs per iteration.
NumCoeffs = (NumCoeffs + 0xF) &~ 0xF;
Resample_ = &OwlResampler::T_Resample<SIMD_SSE_16X>;
}
#endif
#ifdef ARCH_POWERPC_ALTIVEC
else if(1)
{
MDFN_printf("SIMD: AltiVec\n");
// AltiVec loop does 16 MACs per iteration.
NumCoeffs = (NumCoeffs + 0xF) &~ 0xF;
Resample_ = &OwlResampler::T_Resample<SIMD_ALTIVEC>;
}
#endif
#ifdef __ARM_NEON__
else if(1)
{
MDFN_printf("SIMD: NEON\n");
// NEON loop does 16 MACs per iteration.
NumCoeffs = (NumCoeffs + 0xF) &~ 0xF;
Resample_ = &OwlResampler::T_Resample<SIMD_NEON>;
}
#endif
else
{
// Default loop does 4 MACs per iteration.
NumCoeffs = (NumCoeffs + 3) &~ 3;
Resample_ = &OwlResampler::T_Resample<SIMD_NONE>;
}
//
// Don't alter NumCoeffs anymore from here on.
//
#if !defined(ARCH_X86) && !defined(ARCH_POWERPC_ALTIVEC) && !defined(__ARM_NEON__)
#warning "OwlResampler is being compiled without SIMD support."
#endif
//
// Adjust cutoff now that NumCoeffs may have been increased.
//
cutoff = std::min<double>(QualityTable[quality].obw * something / input_rate, (std::min<double>(input_rate, output_rate) / input_rate - ((double)k_d / NumCoeffs)));
cutoff *= nyq_fudge;
if(ceil(cutoff) > 1.0)
cutoff = 1.0;
MDFN_printf("Adjusted number of coefficients per phase: %u\n", NumCoeffs);
MDFN_printf("Adjusted nominal cutoff frequency: %f\n", InputRate * cutoff / 2);
assert(NumCoeffs <= OwlBuffer::HRBUF_LEFTOVER_PADDING);
CoeffsBuffer.resize((256 / sizeof(float)) + NumCoeffs * NumPhases);
for(unsigned int i = 0; i < NumPhases; i++)
PInfos[i].Coeffs = (float *)(((uintptr_t)&CoeffsBuffer[0] + 0xFF) &~ 0xFF) + (i * NumCoeffs);
MDFN_printf("Impulse response table memory usage: %zu bytes\n", CoeffsBuffer.size() * sizeof(float));
FilterBuf.reset(new double[NumCoeffs * NumPhases]);
gen_sinc(&FilterBuf[0], NumCoeffs * NumPhases, cutoff / NumPhases, k_beta);
normalize(&FilterBuf[0], NumCoeffs * NumPhases);
#if 0
for(int i = 0; i < NumCoeffs * NumPhases; i++)
fprintf(stderr, "%.20f\n", FilterBuf[i]);
#endif
for(unsigned int phase = 0; phase < NumPhases; phase++)
{
//double sum_d = 0;
//float sum_f4[4] = { 0, 0, 0, 0 };
const unsigned sp = (NumPhases - 1 - (((uint64)phase * Ratio_Dividend) % NumPhases));
const unsigned tp = phase;
for(unsigned int i = 0; i < NumCoeffs; i++)
{
double tmpcod = FilterBuf[i * NumPhases + sp] * NumPhases; // Tasty cod.
PInfos[tp].Coeffs[i] = FilterDenormal(tmpcod);
//sum_d += PInfos[tp].Coeffs[i];
//sum_f4[i % 4] += PInfos[tp].Coeffs[i];
}
#if 0
{
double sf4t = (sum_f4[0] + sum_f4[2]) + (sum_f4[1] + sum_f4[3]);
double sd_div_sf4t = sum_d / sf4t;
MDFN_printf("Phase %4u: sum_d=%.10f, sum_f4t=%.10f, sum_d div sum_f4t=%.10f(*65536=%f, dB=%.8f)\n", sp, sum_d, (double)sf4t, sd_div_sf4t, 65536.0 * sd_div_sf4t, fabs(20 * log10(sum_d / sf4t)));
}
#endif
}
assert(debias_corner < (output_rate / 16));
debias_multiplier = (uint32)(((uint64)1 << 16) * debias_corner / output_rate);
//MDFN_indent(-1);
//abort();
#if 0
{
static float dummy_wave[1024];
static float dummy_coeffs[1024];
int32 dummy_out;
uint32 begin_time = MDFND_GetTime();
for(int i = 0; i < 1024 * 1024; i++)
{
DoMAC_AVX_32X(dummy_wave, dummy_coeffs, 1024, &dummy_out);
//DoMAC_SSE_16X(dummy_wave, dummy_coeffs, 1024, &dummy_out);
}
printf("%u\n", MDFND_GetTime() - begin_time);
abort();
}
#endif
}