diff --git a/libretro-common/audio/resampler/drivers/sinc_resampler.c b/libretro-common/audio/resampler/drivers/sinc_resampler.c index 2b43cb0d59..db30e86b5a 100644 --- a/libretro-common/audio/resampler/drivers/sinc_resampler.c +++ b/libretro-common/audio/resampler/drivers/sinc_resampler.c @@ -161,87 +161,137 @@ static void resampler_sinc_process_avx(void *re_, struct resampler_data *data) size_t frames = data->input_frames; size_t out_frames = 0; - while (frames) + if (resamp->window_type == SINC_WINDOW_KAISER) { - while (frames && resamp->time >= phases) + while (frames) { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - while (resamp->time < phases) - { - unsigned i; - __m256 delta, sum_l, sum_r; - float *delta_table = NULL; - float *phase_table = NULL; - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - unsigned phase = resamp->time >> resamp->subphase_bits; - - phase_table = resamp->phase_table + phase * taps; - - if (resamp->window_type == SINC_WINDOW_KAISER) + while (frames && resamp->time >= phases) { - phase_table = resamp->phase_table + phase * taps * 2; - delta_table = phase_table + taps; - delta = _mm256_set1_ps((float) + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; + + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; + + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + while (resamp->time < phases) + { + unsigned i; + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + unsigned phase = resamp->time >> resamp->subphase_bits; + + float *phase_table = resamp->phase_table + phase * taps * 2; + float *delta_table = phase_table + taps; + __m256 delta = _mm256_set1_ps((float) (resamp->time & resamp->subphase_mask) * resamp->subphase_mod); - } - sum_l = _mm256_setzero_ps(); - sum_r = _mm256_setzero_ps(); + __m256 sum_l = _mm256_setzero_ps(); + __m256 sum_r = _mm256_setzero_ps(); - for (i = 0; i < taps; i += 8) - { - __m256 sinc; - __m256 buf_l = _mm256_loadu_ps(buffer_l + i); - __m256 buf_r = _mm256_loadu_ps(buffer_r + i); - - if (resamp->window_type == SINC_WINDOW_KAISER) + for (i = 0; i < taps; i += 8) { + __m256 buf_l = _mm256_loadu_ps(buffer_l + i); + __m256 buf_r = _mm256_loadu_ps(buffer_r + i); __m256 deltas = _mm256_load_ps(delta_table + i); - sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i), + __m256 sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i), _mm256_mul_ps(deltas, delta)); - } - else - { - sinc = _mm256_load_ps((const float*)phase_table + i); + + sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); + sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); } - sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); - sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); + /* hadd on AVX is weird, and acts on low-lanes + * and high-lanes separately. */ + __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); + __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); + res_l = _mm256_hadd_ps(res_l, res_l); + res_r = _mm256_hadd_ps(res_r, res_r); + res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); + res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); + + /* This is optimized to mov %xmmN, [mem]. + * There doesn't seem to be any _mm256_store_ss intrinsic. */ + _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); + _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); + + output += 2; + out_frames++; + resamp->time += ratio; + } + } + } + else + { + while (frames) + { + while (frames && resamp->time >= phases) + { + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; + + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; + + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; } - /* hadd on AVX is weird, and acts on low-lanes - * and high-lanes separately. */ - __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); - __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); - res_l = _mm256_hadd_ps(res_l, res_l); - res_r = _mm256_hadd_ps(res_r, res_r); - res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); - res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); + while (resamp->time < phases) + { + unsigned i; + __m256 delta; + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps; - /* This is optimized to mov %xmmN, [mem]. - * There doesn't seem to be any _mm256_store_ss intrinsic. */ - _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); - _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); + __m256 sum_l = _mm256_setzero_ps(); + __m256 sum_r = _mm256_setzero_ps(); - output += 2; - out_frames++; - resamp->time += ratio; + for (i = 0; i < taps; i += 8) + { + __m256 buf_l = _mm256_loadu_ps(buffer_l + i); + __m256 buf_r = _mm256_loadu_ps(buffer_r + i); + __m256 sinc = _mm256_load_ps((const float*)phase_table + i); + + sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); + sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); + } + + /* hadd on AVX is weird, and acts on low-lanes + * and high-lanes separately. */ + __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); + __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); + res_l = _mm256_hadd_ps(res_l, res_l); + res_r = _mm256_hadd_ps(res_r, res_r); + res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); + res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); + + /* This is optimized to mov %xmmN, [mem]. + * There doesn't seem to be any _mm256_store_ss intrinsic. */ + _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); + _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); + + output += 2; + out_frames++; + resamp->time += ratio; + } } } @@ -261,99 +311,157 @@ static void resampler_sinc_process_sse(void *re_, struct resampler_data *data) size_t frames = data->input_frames; size_t out_frames = 0; - while (frames) + if (resamp->window_type == SINC_WINDOW_KAISER) { - while (frames && resamp->time >= phases) + while (frames) { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - while (resamp->time < phases) - { - unsigned i; - __m128 sum, sum_l, sum_r, delta; - float *phase_table = NULL; - float *delta_table = NULL; - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - unsigned phase = resamp->time >> resamp->subphase_bits; - - if (resamp->window_type == SINC_WINDOW_KAISER) + while (frames && resamp->time >= phases) { - phase_table = resamp->phase_table + phase * taps * 2; - delta_table = phase_table + taps; - delta = _mm_set1_ps((float) + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; + + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; + + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + while (resamp->time < phases) + { + unsigned i; + __m128 sum; + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps * 2; + float *delta_table = phase_table + taps; + __m128 delta = _mm_set1_ps((float) (resamp->time & resamp->subphase_mask) * resamp->subphase_mod); - } - else - { - phase_table = resamp->phase_table + phase * taps; - } - sum_l = _mm_setzero_ps(); - sum_r = _mm_setzero_ps(); + __m128 sum_l = _mm_setzero_ps(); + __m128 sum_r = _mm_setzero_ps(); - for (i = 0; i < taps; i += 4) - { - __m128 deltas, _sinc; - __m128 buf_l = _mm_loadu_ps(buffer_l + i); - __m128 buf_r = _mm_loadu_ps(buffer_r + i); - - if (resamp->window_type == SINC_WINDOW_KAISER) + for (i = 0; i < taps; i += 4) { - deltas = _mm_load_ps(delta_table + i); - _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i), + __m128 buf_l = _mm_loadu_ps(buffer_l + i); + __m128 buf_r = _mm_loadu_ps(buffer_r + i); + __m128 deltas = _mm_load_ps(delta_table + i); + __m128 _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i), _mm_mul_ps(deltas, delta)); + sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); + sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); } - else - { - _sinc = _mm_load_ps((const float*)phase_table + i); - } - sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); - sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); + + /* Them annoying shuffles. + * sum_l = { l3, l2, l1, l0 } + * sum_r = { r3, r2, r1, r0 } + */ + + sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, + _MM_SHUFFLE(1, 0, 1, 0)), + _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); + + /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } + * sum = { R1, R0, L1, L0 } + */ + + sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); + + /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } + * sum = { X, R, X, L } + */ + + /* Store L */ + _mm_store_ss(output + 0, sum); + + /* movehl { X, R, X, L } == { X, R, X, R } */ + _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum)); + + output += 2; + out_frames++; + resamp->time += ratio; + } + } + } + else + { + while (frames) + { + while (frames && resamp->time >= phases) + { + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; + + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; + + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; } - /* Them annoying shuffles. - * sum_l = { l3, l2, l1, l0 } - * sum_r = { r3, r2, r1, r0 } - */ + while (resamp->time < phases) + { + unsigned i; + __m128 sum; + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps; - sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, - _MM_SHUFFLE(1, 0, 1, 0)), - _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); + __m128 sum_l = _mm_setzero_ps(); + __m128 sum_r = _mm_setzero_ps(); - /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } - * sum = { R1, R0, L1, L0 } - */ + for (i = 0; i < taps; i += 4) + { + __m128 buf_l = _mm_loadu_ps(buffer_l + i); + __m128 buf_r = _mm_loadu_ps(buffer_r + i); + __m128 _sinc = _mm_load_ps((const float*)phase_table + i); + sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); + sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); + } - sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); + /* Them annoying shuffles. + * sum_l = { l3, l2, l1, l0 } + * sum_r = { r3, r2, r1, r0 } + */ - /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } - * sum = { X, R, X, L } - */ + sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, + _MM_SHUFFLE(1, 0, 1, 0)), + _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); - /* Store L */ - _mm_store_ss(output + 0, sum); + /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } + * sum = { R1, R0, L1, L0 } + */ - /* movehl { X, R, X, L } == { X, R, X, R } */ - _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum)); + sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); - output += 2; - out_frames++; - resamp->time += ratio; + /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } + * sum = { X, R, X, L } + */ + + /* Store L */ + _mm_store_ss(output + 0, sum); + + /* movehl { X, R, X, L } == { X, R, X, R } */ + _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum)); + + output += 2; + out_frames++; + resamp->time += ratio; + } } } @@ -372,69 +480,108 @@ static void resampler_sinc_process_c(void *re_, struct resampler_data *data) size_t frames = data->input_frames; size_t out_frames = 0; - while (frames) + if (resamp->window_type == SINC_WINDOW_KAISER) { - while (frames && resamp->time >= phases) + while (frames) { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - while (resamp->time < phases) - { - unsigned i; - float delta = 0.0f; - float sum_l = 0.0f; - float sum_r = 0.0f; - float *phase_table = NULL; - float *delta_table = NULL; - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - unsigned phase = resamp->time >> resamp->subphase_bits; - - if (resamp->window_type == SINC_WINDOW_KAISER) + while (frames && resamp->time >= phases) { - phase_table = resamp->phase_table + phase * taps * 2; - delta_table = phase_table + taps; - delta = (float) + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; + + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; + + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + while (resamp->time < phases) + { + unsigned i; + float sum_l = 0.0f; + float sum_r = 0.0f; + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps * 2; + float *delta_table = phase_table + taps; + float delta = (float) (resamp->time & resamp->subphase_mask) * resamp->subphase_mod; - } - else - { - phase_table = resamp->phase_table + phase * taps; + + for (i = 0; i < taps; i++) + { + float sinc_val = phase_table[i] + delta_table[i] * delta; + + sum_l += buffer_l[i] * sinc_val; + sum_r += buffer_r[i] * sinc_val; + } + + output[0] = sum_l; + output[1] = sum_r; + + output += 2; + out_frames++; + resamp->time += ratio; } - for (i = 0; i < taps; i++) - { - float sinc_val = phase_table[i]; - - if (resamp->window_type == SINC_WINDOW_KAISER) - sinc_val = sinc_val + delta_table[i] * delta; - - sum_l += buffer_l[i] * sinc_val; - sum_r += buffer_r[i] * sinc_val; - } - - output[0] = sum_l; - output[1] = sum_r; - - output += 2; - out_frames++; - resamp->time += ratio; } + } + else + { + while (frames) + { + while (frames && resamp->time >= phases) + { + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; + + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + while (resamp->time < phases) + { + unsigned i; + float sum_l = 0.0f; + float sum_r = 0.0f; + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps; + + for (i = 0; i < taps; i++) + { + float sinc_val = phase_table[i]; + + sum_l += buffer_l[i] * sinc_val; + sum_r += buffer_r[i] * sinc_val; + } + + output[0] = sum_l; + output[1] = sum_r; + + output += 2; + out_frames++; + resamp->time += ratio; + } + + } } data->output_frames = out_frames;