BizHawk/waterbox/pcfx/sound/OwlResampler.cpp

/* Mednafen - Multi-system Emulator
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

// Don't pass more than about 40ms worth of audio data to Resample()
// at a time.

#define _GNU_SOURCE
#include "../defs.h"
#include <math.h>
#include <assert.h>
#include <limits.h>
#include "OwlResampler.h"
//#include "../cputest/cputest.h"

#if defined(ARCH_POWERPC_ALTIVEC) && defined(HAVE_ALTIVEC_H)
 #include <altivec.h>
#endif

#ifdef __ARM_NEON__
 #include <arm_neon.h>
#endif

#ifdef __FAST_MATH__
 #error "OwlResampler.cpp not compatible with unsafe math optimizations!"
#endif

OwlBuffer::OwlBuffer()
{
 assert(sizeof(I32_F_Pudding) == 4);
 assert(sizeof(float) == 4);

 memset(HRBuf, 0, sizeof(HRBuf));

 accum = 0;
 filter_state[0] = 0;
 filter_state[1] = 0;

 leftover = 0;

 InputIndex = 0;
 InputPhase = 0;

 debias = 0;
}

OwlBuffer::~OwlBuffer()
{
}

template<unsigned DoExMix, bool Integrate, unsigned IntegrateShift, bool Lowpass, bool Highpass, bool FloatOutput>
static int32 ProcessLoop(unsigned count, int32 a, int32* b, int32* exmix0 = NULL, int32* exmix1 = NULL, unsigned lp_shift = 0, unsigned hp_shift = 0, int64* f_in = NULL)
{
 int64 lp_f;
 int64 hp_f;

 if(Lowpass)
 {
  lp_f = f_in[0];
 }

 if(Highpass)
 {
  hp_f = f_in[1];
 }

 while(count--)
 {
  int32 tmp;

  if(Integrate)
  {
   a += *b;
   tmp = a >> IntegrateShift;
  }
  else
   tmp = *b;

  if(Lowpass)
  {
   lp_f += ((int64)((uint64)(int64)tmp << 16) - lp_f) >> lp_shift;
   tmp = lp_f >> 16;
  }

  if(Highpass)
  {
   hp_f += ((int64)((uint64)(int64)tmp << 16) - hp_f) >> hp_shift;
   tmp = tmp - (hp_f >> 16);
  }

  if(DoExMix >= 1)
  {
   tmp += *exmix0;
   exmix0++;
  }

  if(DoExMix >= 2)
  {
   tmp += *exmix1;
   exmix1++;
  }

  if(FloatOutput)
   *(float*)b = tmp;
  else
   *b = tmp;

  b++;
 }

 if(Lowpass)
  f_in[0] = lp_f;

 if(Highpass)
  f_in[1] = hp_f;

 return(a);
}

void OwlBuffer::ResampleSkipped(unsigned count)
{
 memmove(HRBuf, &HRBuf[count], HRBUF_OVERFLOW_PADDING * sizeof(HRBuf[0]));
 memset(&HRBuf[HRBUF_OVERFLOW_PADDING], 0, count * sizeof(HRBuf[0]));
}

void OwlBuffer::Integrate(unsigned count, unsigned lp_shift, unsigned hp_shift, RavenBuffer* mixin0, RavenBuffer* mixin1)
{
 //lp_shift = hp_shift = 0;
 if(lp_shift != 0 || hp_shift != 0)
 {
  if(mixin0 && mixin1)
   accum = ProcessLoop<2, true, 3, true, true, true>(count, accum, Buf(), mixin0->Buf(), mixin1->Buf(),	lp_shift, hp_shift, filter_state);
  else if(mixin0)
   accum = ProcessLoop<1, true, 3, true, true, true>(count, accum, Buf(), mixin0->Buf(), NULL,		lp_shift, hp_shift, filter_state);
  else
   accum = ProcessLoop<0, true, 3, true, true, true>(count, accum, Buf(), NULL, 	 NULL,		lp_shift, hp_shift, filter_state);
 }
 else
 {
  if(mixin0 && mixin1)
   accum = ProcessLoop<2, true, 3, false, false, true>(count, accum, Buf(), mixin0->Buf(), mixin1->Buf());
  else if(mixin0)
   accum = ProcessLoop<1, true, 3, false, false, true>(count, accum, Buf(), mixin0->Buf());
  else
   accum = ProcessLoop<0, true, 3, false, false, true>(count, accum, Buf());
 }

 if(accum >= 32767 * 256 * 8 || accum <= -32767 * 256 * 8)
 {
  //printf("Possible delta sample loss; accum=%d\n", accum);
 }
}

//
//
//

RavenBuffer::RavenBuffer()
{
 memset(BB, 0, sizeof(BB));

 accum = 0;

 filter_state[0] = 0;
 filter_state[1] = 0;
}


RavenBuffer::~RavenBuffer()
{


}


void RavenBuffer::Process(unsigned count, bool integrate, unsigned lp_shift)
{
 if(integrate)
 {
  if(lp_shift != 0)
   accum = ProcessLoop<0, true, 3, true,  false, false>(count, accum, Buf(), NULL, NULL, lp_shift, 0, filter_state);
  else
   accum = ProcessLoop<0, true, 3, false, false, false>(count, accum, Buf(), NULL, NULL, lp_shift, 0);
 }
 else
 {
  if(lp_shift != 0)
   accum = ProcessLoop<0, false, 0, true,  false, false>(count, accum, Buf(), NULL, NULL, lp_shift, 0, filter_state);
  else
   accum = ProcessLoop<0, false, 0, false, false, false>(count, accum, Buf(), NULL, NULL, lp_shift, 0);
 }
}

void RavenBuffer::Finish(unsigned count)
{
 memmove(BB, &BB[count], OwlBuffer::HRBUF_OVERFLOW_PADDING * sizeof(BB[0]));
 memset(&BB[OwlBuffer::HRBUF_OVERFLOW_PADDING], 0, count * sizeof(BB[0]));
}


static void kaiser_window( double* io, int count, double beta )
{
        int const accuracy = 16; //12;

        double* end = io + count;

        double beta2    = beta * beta * (double) -0.25;
        double to_fract = beta2 / ((double) count * count);
        double i        = 0;
        double rescale = 0; // Doesn't need an initializer, to shut up gcc

        for ( ; io < end; ++io, i += 1 )
        {
                double x = i * i * to_fract - beta2;
                double u = x;
                double k = x + 1;

                double n = 2;
                do
                {
                        u *= x / (n * n);
                        n += 1;
                        k += u;
                }
                while ( k <= u * (1 << accuracy) );

                if ( !i )
                        rescale = 1 / k; // otherwise values get large

                *io *= k * rescale;
        }
}

static void gen_sinc( double* out, int size, double cutoff, double kaiser )
{
	assert( size % 2 == 0 ); // size must be enev

	int const half_size = size / 2;
	double* const mid = &out [half_size];

	// Generate right half of sinc
	for ( int i = 0; i < half_size; i++ )
	{
		double angle = (i * 2 + 1) * (M_PI / 2);
		mid [i] = sin( angle * cutoff ) / angle;
	}

	kaiser_window( mid, half_size, kaiser );

	// Mirror for left half
	for ( int i = 0; i < half_size; i++ )
		out [i] = mid [half_size - 1 - i];
}

static void normalize( double* io, int size, double gain = 1.0 )
{
	double sum = 0;
	for ( int i = 0; i < size; i++ )
		sum += io [i];

	double scale = gain / sum;
	for ( int i = 0; i < size; i++ )
		io [i] *= scale;
}


static INLINE void DoMAC(float *wave, float *coeffs, int32 count, int32 *accum_output)
{
 float acc[4] = { 0, 0, 0, 0 };

 for(int c = 0; MDFN_LIKELY(c < count); c += 4)
 {
  acc[0] += wave[c + 0] * coeffs[c + 0];
  acc[1] += wave[c + 1] * coeffs[c + 1];
  acc[2] += wave[c + 2] * coeffs[c + 2];
  acc[3] += wave[c + 3] * coeffs[c + 3];
 }

 *accum_output = (acc[0] + acc[2]) + (acc[1] + acc[3]);
}


#ifdef ARCH_X86
 #include "OwlResampler_x86.inc"
#endif

#ifdef ARCH_POWERPC_ALTIVEC
 #include "OwlResampler_altivec.inc"
#endif

#ifdef __ARM_NEON__
 #include "OwlResampler_neon.inc"
#endif


template<typename T, unsigned sa>
static T SDP2(T v)
{
 T tmp;

 tmp = (v >> ((sizeof(T) * 8) - 1)) & (((T)1 << sa) - 1);

 return ((v + tmp) >> sa);
}

enum
{
 SIMD_NONE = 0,

#ifdef ARCH_X86
 SIMD_SSE_16X,

#ifdef HAVE_INLINEASM_AVX
 SIMD_AVX_32X,
 SIMD_AVX_32X_P16,
#else
 #warning "Compiling without AVX inline assembly."
#endif
#elif defined(ARCH_POWERPC_ALTIVEC)
 SIMD_ALTIVEC,
#elif defined __ARM_NEON__
 SIMD_NEON
#endif
};

template<unsigned TA_SIMD_Type>
NO_INLINE int32 OwlResampler::T_Resample(OwlBuffer* in, const uint32 in_count, int16* out, const uint32 max_out_count, const bool reverse)
{
	if(reverse)
	{
	 int32* a = in->Buf();
	 int32* b = in->Buf() + in_count - 1;

	 while(MDFN_LIKELY(a < b))
	 {
	  std::swap<int32>(*a, *b);
	  a++;
	  b--;
	 }
	}

	//
	//
	//
	uint32 count = 0;
	int32* I32Out = &IntermediateBuffer[0];
	const uint32 in_count_WLO = in->leftover + in_count;
	const uint32 max = std::max<int64>(0, (int64)in_count_WLO - NumCoeffs);
        uint32 InputPhase = in->InputPhase;
        uint32 InputIndex = in->InputIndex;
	OwlBuffer::I32_F_Pudding* InSamps = in->BufPudding() - in->leftover;
	int32 leftover;

	if(MDFN_UNLIKELY(InputPhase >= NumPhases))
	{
	 fprintf(stderr, "[BUG] InputPhase >= NumPhases\n");	// Save states can also trigger this.
	 InputPhase = 0;
	}

        while(InputIndex < max)
        {
         float* wave = &InSamps[InputIndex].f;
         float* coeffs = &PInfos[InputPhase].Coeffs[0];
         int32 coeff_count = NumCoeffs;

	 switch(TA_SIMD_Type)
  	 {
	  default:
	  case SIMD_NONE:
		DoMAC(wave, coeffs, coeff_count, I32Out);
		break;

#ifdef ARCH_X86
	  case SIMD_SSE_16X:
		DoMAC_SSE_16X(wave, coeffs, coeff_count, I32Out);
		break;
#ifdef HAVE_INLINEASM_AVX
	  case SIMD_AVX_32X:
		DoMAC_AVX_32X(wave, coeffs, coeff_count, I32Out);
		break;

	  case SIMD_AVX_32X_P16:
		DoMAC_AVX_32X_P16(wave, coeffs, coeff_count, I32Out);
		break;
#endif

#elif defined(ARCH_POWERPC_ALTIVEC)
	  case SIMD_ALTIVEC:
		DoMAC_AltiVec(wave, coeffs, coeff_count, I32Out);
		break;
#elif defined __ARM_NEON__
	  case SIMD_NEON:
		DoMAC_NEON(wave, coeffs, coeff_count, I32Out);
		break;
#endif
	 }

         I32Out++;
         count++;

         InputPhase = PInfos[InputPhase].Next;
         InputIndex += PInfos[InputPhase].Step;
        }

#if defined(ARCH_X86) && defined(HAVE_INLINEASM_AVX)
	if(TA_SIMD_Type == SIMD_AVX_32X || TA_SIMD_Type == SIMD_AVX_32X_P16)
	{
	 asm volatile("vzeroupper\n\t" : : :
	 #if defined(__AVX__)
	 "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", "ymm7"
	  #if defined(__x86_64__)
	  , "ymm8", "ymm9", "ymm10", "ymm11", "ymm12", "ymm13", "ymm14", "ymm15"
	  #endif
	 #endif
	 );
	}
#endif

        if(InputIndex > in_count_WLO)
	{
	 leftover = 0;
	 InputIndex -= in_count_WLO;
	}
	else
	{
	 leftover = (int32)in_count_WLO - (int32)InputIndex;
	 InputIndex = 0;
	}

#if 0
	for(uint32 x = 0; x < count; x++)
	{
 	 int s = IntermediateBuffer[x] >> 8;

	 if(s < -32768 || s > 32767)
	 {
	  //printf("Flow: %6d\n", s);
	  if(s < -32768)
	   s = -32768;
	  else if(s > 32767)
	   s = 32767;
	 }
	 out[x * 2] = s;
	}
#else
	{
	 int64 debias = in->debias;

 	 for(uint32 x = 0; x < count; x++)
	 {
 	  int32 sample = IntermediateBuffer[x];
	  int32 s;

          debias += (((int64)((uint64)(int64)sample << 16) - debias) * debias_multiplier) >> 16;
          s = SDP2<int32, 8>(sample - (debias >> 16));
	  if(s < -32768 || s > 32767)
	  {
	   //printf("Flow: %6d\n", s);
	   if(s < -32768)
	    s = -32768;
	   else if(s > 32767)
	    s = 32767;
	  }
	  out[x * 2] = s;
	 }

	 in->debias = debias;
	}
#endif
        memmove(in->Buf() - leftover,
	        in->Buf() + in_count - leftover,
		sizeof(int32) * (leftover + OwlBuffer::HRBUF_OVERFLOW_PADDING));

        memset(in->Buf() + OwlBuffer::HRBUF_OVERFLOW_PADDING, 0, sizeof(int32) * in_count);

	in->leftover = leftover;
	in->InputPhase = InputPhase;
	in->InputIndex = InputIndex;

	return count;
}

void OwlResampler::ResetBufResampState(OwlBuffer* buf)
{
 memset(buf->HRBuf, 0, sizeof(buf->HRBuf[0]) * OwlBuffer::HRBUF_LEFTOVER_PADDING);
 buf->InputPhase = 0;
}


OwlResampler::~OwlResampler()
{

}

//
// Flush denormals, and coefficients that could lead to denormals, to zero.
//
static float FilterDenormal(float v)
{
 union
 {
  float f;
  uint32 i;
 } cat_pun;

 cat_pun.f = v;

 if(((cat_pun.i >> 23) & 0xFF) <= 24)	// Maybe < 24 is more correct?
 {
  MDFN_printf("Small FP coefficient detected: 0x%08x --- raw_sign=%d, raw_exp=0x%02x, raw_mantissa=0x%06x\n", cat_pun.i, cat_pun.i >> 31, (cat_pun.i >> 23) & 0xFF, cat_pun.i & ((1U << 23) - 1));
  return(0);
 }

 return(v);
}

OwlResampler::OwlResampler(double input_rate, double output_rate, double rate_error, double debias_corner, int quality, double nyq_fudge)
{
 std::unique_ptr<double[]> FilterBuf;
 double ratio = (double)output_rate / input_rate;
 double cutoff;
 double required_bandwidth;
 double k_beta;
 double k_d;

 for(int i = 0; i < 256; i++)
 {
  int a = SDP2<int32, 3>(i);
  int b = SDP2<int32, 3>(-i);
  int c = i / (1 << 3);

  assert(a == -b && a == c);
 }

 assert(sizeof(OwlBuffer::I32_F_Pudding) == 4);

 InputRate = input_rate;
 OutputRate = output_rate;
 RateError = rate_error;
 DebiasCorner = debias_corner;
 Quality = quality;

 IntermediateBuffer.resize(OutputRate * 4 / 50);	// *4 for safety padding, / min(50,60), an approximate calculation

 const uint32 cpuext = 0; //cputest_get_flags();

 MDFN_printf("OwlResampler.cpp debug info:\n");
 //MDFN_indent(1);

 // Get the number of phases required, and adjust ratio.
 {
  double s_ratio = (double)input_rate / output_rate;
  double findo = 0;
  uint32 count = 0;
  uint32 findo_i;

  do
  {
   count++;
   findo += s_ratio;
  } while( fabs(1.0 - ((floor(0.5 + findo) / count) / s_ratio)) > rate_error);

  s_ratio = floor(0.5 + findo) / count;
  findo_i = (uint32) floor(0.5 + findo);
  ratio = 1 / s_ratio;
  NumPhases = count;

  PInfos.resize(NumPhases);

  uint32 last_indoo = 0;
  for(unsigned int i = 0; i < NumPhases; i++)
  {
   uint32 index_pos = i * findo_i / NumPhases;

   PInfos[i].Next = (i + 1) % (NumPhases);
   PInfos[i].Step = index_pos - last_indoo;
   last_indoo = index_pos;
  }
  PInfos[0].Step = findo_i - last_indoo;

  Ratio_Dividend = findo_i;
  Ratio_Divisor = NumPhases;

  MDFN_printf("Phases: %d, Output rate: %f, %d %d\n", NumPhases, input_rate * ratio, Ratio_Dividend, Ratio_Divisor);

  MDFN_printf("Desired maximum rate error: %.10f, Actual rate error: %.10f\n", rate_error, fabs((double)input_rate / output_rate * ratio - 1));
 }

 static const struct
 {
  double beta;
  double d;
  double obw;
 } QualityTable[7] =
 {
  {  5.658, 3.62,  0.65 },
  {  6.764, 4.32,  0.70 },
  {  7.865, 5.00,  0.75 },
  {  8.960, 5.70,  0.80 },
  { 10.056, 6.40,  0.85 },
  { 10.056, 6.40,  0.90 },

  { 10.056, 6.40,  0.9333 }, // 1.0 - (6.40 / 96)
 };

 assert(quality >= 0 && quality <= 6);

 k_beta = QualityTable[quality].beta;
 k_d = QualityTable[quality].d;


 //
 // As far as filter frequency response design goes, we clamp the output rate parameter
 // to keep PCE CD and PC-FX CD-DA sample amplitudes from going wild since we're not resampling CD-DA totally properly.
 //
#define OWLRESAMP_FCALC_RATE_CLAMP 128000.0 //192000.0 //96000.0 //48000.0

 // A little SOMETHING to widen the transition band a bit to reduce computational complexity with higher output rates.
 const double something = std::min<double>(OWLRESAMP_FCALC_RATE_CLAMP, (48000.0 + std::min<double>(OWLRESAMP_FCALC_RATE_CLAMP, output_rate)) / 2 / QualityTable[quality].obw);

 //
 // Note: Cutoff calculation is performed again(though slightly differently) down below after the SIMD check.
 //
 cutoff = QualityTable[quality].obw * (std::min<double>(something, std::min<double>(input_rate, output_rate)) / input_rate);

 required_bandwidth = (std::min<double>(OWLRESAMP_FCALC_RATE_CLAMP, std::min<double>(input_rate, output_rate)) / input_rate) - cutoff;

 NumCoeffs = ceil(k_d / required_bandwidth);

 MDFN_printf("Initial number of coefficients per phase: %u\n", NumCoeffs);
 MDFN_printf("Initial nominal cutoff frequency: %f\n", InputRate * cutoff / 2);

 //
 // Put this lower limit BEFORE the SIMD stuff, otherwise the NumCoeffs calculation will be off.
 //
 if(NumCoeffs < 16)
  NumCoeffs = 16;

 if(0)
 {
  abort();	// The sky is falling AAAAAAAAAAAAA
 }
 #ifdef ARCH_X86
 #ifdef HAVE_INLINEASM_AVX
 else if((cpuext & CPUTEST_FLAG_AVX) && (NumCoeffs + 0xF) >= 32)
 {
  MDFN_printf("SIMD: AVX\n");

  // AVX loop can't handle less than 32 MACs properly.
  NumCoeffs = std::max<uint32>(32, NumCoeffs);

  // Past 32 MACs, AVX loop granularity is 16 MACs(with some ugly maaaagic~)
  NumCoeffs = (NumCoeffs + 0xF) &~ 0xF;

  if(NumCoeffs & 0x10)
   Resample_ = &OwlResampler::T_Resample<SIMD_AVX_32X_P16>;
  else
   Resample_ = &OwlResampler::T_Resample<SIMD_AVX_32X>;
 }
 #endif
 else if(cpuext & CPUTEST_FLAG_SSE)
 {
  MDFN_printf("SIMD: SSE\n");

  // SSE loop does 16 MACs per iteration.
  NumCoeffs = (NumCoeffs + 0xF) &~ 0xF;
  Resample_ = &OwlResampler::T_Resample<SIMD_SSE_16X>;
 }
 #endif
 #ifdef ARCH_POWERPC_ALTIVEC
 else if(1)
 {
  MDFN_printf("SIMD: AltiVec\n");

  // AltiVec loop does 16 MACs per iteration.
  NumCoeffs = (NumCoeffs + 0xF) &~ 0xF;
  Resample_ = &OwlResampler::T_Resample<SIMD_ALTIVEC>;
 }
 #endif
 #ifdef __ARM_NEON__
 else if(1)
 {
  MDFN_printf("SIMD: NEON\n");

  // NEON loop does 16 MACs per iteration.
  NumCoeffs = (NumCoeffs + 0xF) &~ 0xF;
  Resample_ = &OwlResampler::T_Resample<SIMD_NEON>;
 }
 #endif
 else
 {
  // Default loop does 4 MACs per iteration.
  NumCoeffs = (NumCoeffs + 3) &~ 3;
  Resample_ = &OwlResampler::T_Resample<SIMD_NONE>;
 }
 //
 // Don't alter NumCoeffs anymore from here on.
 //

 #if !defined(ARCH_X86) && !defined(ARCH_POWERPC_ALTIVEC) && !defined(__ARM_NEON__)
  #warning "OwlResampler is being compiled without SIMD support."
 #endif

 //
 // Adjust cutoff now that NumCoeffs may have been increased.
 //
 cutoff = std::min<double>(QualityTable[quality].obw * something / input_rate, (std::min<double>(input_rate, output_rate) / input_rate - ((double)k_d / NumCoeffs)));

 cutoff *= nyq_fudge;
 if(ceil(cutoff) > 1.0)
  cutoff = 1.0;


 MDFN_printf("Adjusted number of coefficients per phase: %u\n", NumCoeffs);
 MDFN_printf("Adjusted nominal cutoff frequency: %f\n", InputRate * cutoff / 2);

 assert(NumCoeffs <= OwlBuffer::HRBUF_LEFTOVER_PADDING);

 CoeffsBuffer.resize((256 / sizeof(float)) + NumCoeffs * NumPhases);

 for(unsigned int i = 0; i < NumPhases; i++)
  PInfos[i].Coeffs = (float *)(((uintptr_t)&CoeffsBuffer[0] + 0xFF) &~ 0xFF) + (i * NumCoeffs);

 MDFN_printf("Impulse response table memory usage: %zu bytes\n", CoeffsBuffer.size() * sizeof(float));

 FilterBuf.reset(new double[NumCoeffs * NumPhases]);
 gen_sinc(&FilterBuf[0], NumCoeffs * NumPhases, cutoff / NumPhases, k_beta);
 normalize(&FilterBuf[0], NumCoeffs * NumPhases);

 #if 0
 for(int i = 0; i < NumCoeffs * NumPhases; i++)
  fprintf(stderr, "%.20f\n", FilterBuf[i]);
 #endif

 for(unsigned int phase = 0; phase < NumPhases; phase++)
 {
  //double sum_d = 0;
  //float sum_f4[4] = { 0, 0, 0, 0 };

  const unsigned sp = (NumPhases - 1 - (((uint64)phase * Ratio_Dividend) % NumPhases));
  const unsigned tp = phase;

  for(unsigned int i = 0; i < NumCoeffs; i++)
  {
   double tmpcod = FilterBuf[i * NumPhases + sp] * NumPhases;	// Tasty cod.

   PInfos[tp].Coeffs[i] = FilterDenormal(tmpcod);
   //sum_d += PInfos[tp].Coeffs[i];
   //sum_f4[i % 4] += PInfos[tp].Coeffs[i];
  }

#if 0
  {
   double sf4t = (sum_f4[0] + sum_f4[2]) + (sum_f4[1] + sum_f4[3]);
   double sd_div_sf4t = sum_d / sf4t;

   MDFN_printf("Phase %4u: sum_d=%.10f, sum_f4t=%.10f, sum_d div sum_f4t=%.10f(*65536=%f, dB=%.8f)\n", sp, sum_d, (double)sf4t, sd_div_sf4t, 65536.0 * sd_div_sf4t, fabs(20 * log10(sum_d / sf4t)));
  }
#endif
 }

 assert(debias_corner < (output_rate / 16));
 debias_multiplier = (uint32)(((uint64)1 << 16) * debias_corner / output_rate);

 //MDFN_indent(-1);

 //abort();

 #if 0
 {
  static float dummy_wave[1024];
  static float dummy_coeffs[1024];
  int32 dummy_out;
  uint32 begin_time = MDFND_GetTime();

  for(int i = 0; i < 1024 * 1024; i++)
  {
   DoMAC_AVX_32X(dummy_wave, dummy_coeffs, 1024, &dummy_out);
   //DoMAC_SSE_16X(dummy_wave, dummy_coeffs, 1024, &dummy_out);
  }

  printf("%u\n", MDFND_GetTime() - begin_time);
  abort();
 }
 #endif
}