diff --git a/libretro-common/audio/resampler/drivers/sinc_resampler.c b/libretro-common/audio/resampler/drivers/sinc_resampler.c index c4c880cb43..020e6579b8 100644 --- a/libretro-common/audio/resampler/drivers/sinc_resampler.c +++ b/libretro-common/audio/resampler/drivers/sinc_resampler.c @@ -85,16 +85,81 @@ typedef struct rarch_sinc_resampler float kaiser_beta; } rarch_sinc_resampler_t; -#if (defined(__ARM_NEON__) && !defined(DONT_WANT_ARM_ASM_OPTIMIZATIONS)) || defined(HAVE_NEON) -#if TARGET_OS_IPHONE -#else -#ifndef WANT_NEON -#define WANT_NEON -#endif -#endif -#endif +#if (defined(__ARM_NEON__) || defined(HAVE_NEON) +#ifdef DONT_WANT_ARM_ASM_OPTIMIZATIONS +#include -#ifdef WANT_NEON +/* Assumes that taps >= 8, and that taps is a multiple of 8. */ +static void resampler_sinc_process_neon_intrin(void *re_, struct resampler_data *data) +{ + rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; + unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); + + uint32_t ratio = phases / data->ratio; + const float *input = data->data_in; + float *output = data->data_out; + size_t frames = data->input_frames; + size_t out_frames = 0; + + while (frames) + { + while (resamp->time >= phases) + { + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; + + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; + + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + { + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + while (resamp->time < phases) + { + int i; + unsigned phase = resamp->time >> resamp->subphase_bits; + const float *phase_table = resamp->phase_table + phase * taps; + + float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0}; + float32x2_t p3, p4; + + for (i = 0; i < taps; i += 8) + { + float32x4x2_t coeff8 = vld2q_f32(&phase_table[i]); + float32x4x2_t left8 = vld2q_f32(&buffer_l[i]); + float32x4x2_t right8 = vld2q_f32(&buffer_r[i]); + + p1 = vmlaq_f32(p1, left8.val[0], coeff8.val[0]); + p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]); + p1 = vmlaq_f32(p1, left8.val[1], coeff8.val[1]); + p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]); + } + + p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1)); + p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2)); + vst1_f32(output, vpadd_f32(p3, p4)); + + + output += 2; + out_frames++; + resamp->time += ratio; + } + } + } + + data->output_frames = out_frames; +} +#else /* Assumes that taps >= 8, and that taps is a multiple of 8. */ void process_sinc_neon_asm(float *out, const float *left, const float *right, const float *coeff, unsigned taps); @@ -150,6 +215,7 @@ static void resampler_sinc_process_neon(void *re_, struct resampler_data *data) data->output_frames = out_frames; } #endif +#endif #if defined(__AVX__) static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *data) @@ -844,7 +910,7 @@ static void *resampler_sinc_new(const struct resampler_config *config, else #endif { -#if defined(WANT_NEON) +#if (defined(__ARM_NEON__) || defined(HAVE_NEON) re->taps = (re->taps + 7) & ~7; #else re->taps = (re->taps + 3) & ~3; @@ -902,8 +968,12 @@ static void *resampler_sinc_new(const struct resampler_config *config, } else if (mask & RESAMPLER_SIMD_NEON && window_type != SINC_WINDOW_KAISER) { -#if defined(WANT_NEON) +#if (defined(__ARM_NEON__) || defined(HAVE_NEON) +#ifdef DONT_WANT_ARM_ASM_OPTIMIZATIONS + sinc_resampler.process = resampler_sinc_process_neon_intrin; +#else sinc_resampler.process = resampler_sinc_process_neon; +#endif #endif } @@ -922,5 +992,3 @@ retro_resampler_t sinc_resampler = { "sinc", "sinc" }; - -#undef WANT_NEON