(sinc resampler) style nits
This commit is contained in:
parent
99fe15be46
commit
dc9ffcce66
|
@ -111,6 +111,7 @@ static void resampler_sinc_process_neon_kaiser(void *re_, struct resampler_data
|
||||||
size_t frames = data->input_frames;
|
size_t frames = data->input_frames;
|
||||||
size_t out_frames = 0;
|
size_t out_frames = 0;
|
||||||
unsigned taps = resamp->taps;
|
unsigned taps = resamp->taps;
|
||||||
|
|
||||||
while (frames)
|
while (frames)
|
||||||
{
|
{
|
||||||
while (frames && resamp->time >= phases)
|
while (frames && resamp->time >= phases)
|
||||||
|
@ -135,11 +136,11 @@ static void resampler_sinc_process_neon_kaiser(void *re_, struct resampler_data
|
||||||
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
||||||
while (resamp->time < phases)
|
while (resamp->time < phases)
|
||||||
{
|
{
|
||||||
|
int i;
|
||||||
unsigned phase = resamp->time >> resamp->subphase_bits;
|
unsigned phase = resamp->time >> resamp->subphase_bits;
|
||||||
const float *phase_table = resamp->phase_table + phase * taps * 2;
|
const float *phase_table = resamp->phase_table + phase * taps * 2;
|
||||||
const float *delta_table = phase_table + taps;
|
const float *delta_table = phase_table + taps;
|
||||||
float32x4_t delta = vdupq_n_f32((resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
|
float32x4_t delta = vdupq_n_f32((resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
|
||||||
int i;
|
|
||||||
float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
|
float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
|
||||||
float32x2_t p3, p4;
|
float32x2_t p3, p4;
|
||||||
|
|
||||||
|
@ -258,72 +259,69 @@ static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *
|
||||||
size_t out_frames = 0;
|
size_t out_frames = 0;
|
||||||
unsigned taps = resamp->taps;
|
unsigned taps = resamp->taps;
|
||||||
|
|
||||||
|
while (frames)
|
||||||
{
|
{
|
||||||
while (frames)
|
while (frames && resamp->time >= phases)
|
||||||
{
|
{
|
||||||
while (frames && resamp->time >= phases)
|
/* Push in reverse to make filter more obvious. */
|
||||||
|
if (!resamp->ptr)
|
||||||
|
resamp->ptr = taps;
|
||||||
|
resamp->ptr--;
|
||||||
|
|
||||||
|
resamp->buffer_l[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_l[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->buffer_r[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_r[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->time -= phases;
|
||||||
|
frames--;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
||||||
|
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
||||||
|
while (resamp->time < phases)
|
||||||
{
|
{
|
||||||
/* Push in reverse to make filter more obvious. */
|
int i;
|
||||||
if (!resamp->ptr)
|
__m256 res_l, res_r;
|
||||||
resamp->ptr = taps;
|
unsigned phase = resamp->time >> resamp->subphase_bits;
|
||||||
resamp->ptr--;
|
float *phase_table = resamp->phase_table + phase * taps * 2;
|
||||||
|
float *delta_table = phase_table + taps;
|
||||||
|
__m256 delta = _mm256_set1_ps((float)
|
||||||
|
(resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
|
||||||
|
__m256 sum_l = _mm256_setzero_ps();
|
||||||
|
__m256 sum_r = _mm256_setzero_ps();
|
||||||
|
|
||||||
resamp->buffer_l[resamp->ptr + taps] =
|
for (i = 0; i < (int)taps; i += 8)
|
||||||
resamp->buffer_l[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->buffer_r[resamp->ptr + taps] =
|
|
||||||
resamp->buffer_r[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->time -= phases;
|
|
||||||
frames--;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
|
||||||
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
|
||||||
while (resamp->time < phases)
|
|
||||||
{
|
{
|
||||||
int i;
|
__m256 buf_l = _mm256_loadu_ps(buffer_l + i);
|
||||||
unsigned phase = resamp->time >> resamp->subphase_bits;
|
__m256 buf_r = _mm256_loadu_ps(buffer_r + i);
|
||||||
|
__m256 deltas = _mm256_load_ps(delta_table + i);
|
||||||
|
__m256 sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i),
|
||||||
|
_mm256_mul_ps(deltas, delta));
|
||||||
|
|
||||||
float *phase_table = resamp->phase_table + phase * taps * 2;
|
sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
|
||||||
float *delta_table = phase_table + taps;
|
sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
|
||||||
__m256 delta = _mm256_set1_ps((float)
|
|
||||||
(resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
|
|
||||||
|
|
||||||
__m256 sum_l = _mm256_setzero_ps();
|
|
||||||
__m256 sum_r = _mm256_setzero_ps();
|
|
||||||
|
|
||||||
for (i = 0; i < (int)taps; i += 8)
|
|
||||||
{
|
|
||||||
__m256 buf_l = _mm256_loadu_ps(buffer_l + i);
|
|
||||||
__m256 buf_r = _mm256_loadu_ps(buffer_r + i);
|
|
||||||
__m256 deltas = _mm256_load_ps(delta_table + i);
|
|
||||||
__m256 sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i),
|
|
||||||
_mm256_mul_ps(deltas, delta));
|
|
||||||
|
|
||||||
sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
|
|
||||||
sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* hadd on AVX is weird, and acts on low-lanes
|
|
||||||
* and high-lanes separately. */
|
|
||||||
__m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
|
|
||||||
__m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
|
|
||||||
res_l = _mm256_hadd_ps(res_l, res_l);
|
|
||||||
res_r = _mm256_hadd_ps(res_r, res_r);
|
|
||||||
res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
|
|
||||||
res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
|
|
||||||
|
|
||||||
/* This is optimized to mov %xmmN, [mem].
|
|
||||||
* There doesn't seem to be any _mm256_store_ss intrinsic. */
|
|
||||||
_mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
|
|
||||||
_mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
|
|
||||||
|
|
||||||
output += 2;
|
|
||||||
out_frames++;
|
|
||||||
resamp->time += ratio;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* hadd on AVX is weird, and acts on low-lanes
|
||||||
|
* and high-lanes separately. */
|
||||||
|
res_l = _mm256_hadd_ps(sum_l, sum_l);
|
||||||
|
res_r = _mm256_hadd_ps(sum_r, sum_r);
|
||||||
|
res_l = _mm256_hadd_ps(res_l, res_l);
|
||||||
|
res_r = _mm256_hadd_ps(res_r, res_r);
|
||||||
|
res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
|
||||||
|
res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
|
||||||
|
|
||||||
|
/* This is optimized to mov %xmmN, [mem].
|
||||||
|
* There doesn't seem to be any _mm256_store_ss intrinsic. */
|
||||||
|
_mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
|
||||||
|
_mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
|
||||||
|
|
||||||
|
output += 2;
|
||||||
|
out_frames++;
|
||||||
|
resamp->time += ratio;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -334,76 +332,74 @@ static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *
|
||||||
static void resampler_sinc_process_avx(void *re_, struct resampler_data *data)
|
static void resampler_sinc_process_avx(void *re_, struct resampler_data *data)
|
||||||
{
|
{
|
||||||
rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
|
rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
|
||||||
unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
|
unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
|
||||||
|
|
||||||
uint32_t ratio = phases / data->ratio;
|
uint32_t ratio = phases / data->ratio;
|
||||||
const float *input = data->data_in;
|
const float *input = data->data_in;
|
||||||
float *output = data->data_out;
|
float *output = data->data_out;
|
||||||
size_t frames = data->input_frames;
|
size_t frames = data->input_frames;
|
||||||
size_t out_frames = 0;
|
size_t out_frames = 0;
|
||||||
unsigned taps = resamp->taps;
|
unsigned taps = resamp->taps;
|
||||||
|
|
||||||
|
while (frames)
|
||||||
{
|
{
|
||||||
while (frames)
|
while (frames && resamp->time >= phases)
|
||||||
{
|
{
|
||||||
while (frames && resamp->time >= phases)
|
/* Push in reverse to make filter more obvious. */
|
||||||
|
if (!resamp->ptr)
|
||||||
|
resamp->ptr = taps;
|
||||||
|
resamp->ptr--;
|
||||||
|
|
||||||
|
resamp->buffer_l[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_l[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->buffer_r[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_r[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->time -= phases;
|
||||||
|
frames--;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
||||||
|
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
||||||
|
while (resamp->time < phases)
|
||||||
{
|
{
|
||||||
/* Push in reverse to make filter more obvious. */
|
int i;
|
||||||
if (!resamp->ptr)
|
__m256 delta, res_l, res_r;
|
||||||
resamp->ptr = taps;
|
unsigned phase = resamp->time >> resamp->subphase_bits;
|
||||||
resamp->ptr--;
|
float *phase_table = resamp->phase_table + phase * taps;
|
||||||
|
|
||||||
resamp->buffer_l[resamp->ptr + taps] =
|
__m256 sum_l = _mm256_setzero_ps();
|
||||||
resamp->buffer_l[resamp->ptr] = *input++;
|
__m256 sum_r = _mm256_setzero_ps();
|
||||||
|
|
||||||
resamp->buffer_r[resamp->ptr + taps] =
|
for (i = 0; i < (int)taps; i += 8)
|
||||||
resamp->buffer_r[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->time -= phases;
|
|
||||||
frames--;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
|
||||||
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
|
||||||
while (resamp->time < phases)
|
|
||||||
{
|
{
|
||||||
int i;
|
__m256 buf_l = _mm256_loadu_ps(buffer_l + i);
|
||||||
__m256 delta;
|
__m256 buf_r = _mm256_loadu_ps(buffer_r + i);
|
||||||
unsigned phase = resamp->time >> resamp->subphase_bits;
|
__m256 sinc = _mm256_load_ps((const float*)phase_table + i);
|
||||||
float *phase_table = resamp->phase_table + phase * taps;
|
|
||||||
|
|
||||||
__m256 sum_l = _mm256_setzero_ps();
|
sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
|
||||||
__m256 sum_r = _mm256_setzero_ps();
|
sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
|
||||||
|
|
||||||
for (i = 0; i < (int)taps; i += 8)
|
|
||||||
{
|
|
||||||
__m256 buf_l = _mm256_loadu_ps(buffer_l + i);
|
|
||||||
__m256 buf_r = _mm256_loadu_ps(buffer_r + i);
|
|
||||||
__m256 sinc = _mm256_load_ps((const float*)phase_table + i);
|
|
||||||
|
|
||||||
sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc));
|
|
||||||
sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* hadd on AVX is weird, and acts on low-lanes
|
|
||||||
* and high-lanes separately. */
|
|
||||||
__m256 res_l = _mm256_hadd_ps(sum_l, sum_l);
|
|
||||||
__m256 res_r = _mm256_hadd_ps(sum_r, sum_r);
|
|
||||||
res_l = _mm256_hadd_ps(res_l, res_l);
|
|
||||||
res_r = _mm256_hadd_ps(res_r, res_r);
|
|
||||||
res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
|
|
||||||
res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
|
|
||||||
|
|
||||||
/* This is optimized to mov %xmmN, [mem].
|
|
||||||
* There doesn't seem to be any _mm256_store_ss intrinsic. */
|
|
||||||
_mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
|
|
||||||
_mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
|
|
||||||
|
|
||||||
output += 2;
|
|
||||||
out_frames++;
|
|
||||||
resamp->time += ratio;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* hadd on AVX is weird, and acts on low-lanes
|
||||||
|
* and high-lanes separately. */
|
||||||
|
res_l = _mm256_hadd_ps(sum_l, sum_l);
|
||||||
|
res_r = _mm256_hadd_ps(sum_r, sum_r);
|
||||||
|
res_l = _mm256_hadd_ps(res_l, res_l);
|
||||||
|
res_r = _mm256_hadd_ps(res_r, res_r);
|
||||||
|
res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l);
|
||||||
|
res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r);
|
||||||
|
|
||||||
|
/* This is optimized to mov %xmmN, [mem].
|
||||||
|
* There doesn't seem to be any _mm256_store_ss intrinsic. */
|
||||||
|
_mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0));
|
||||||
|
_mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0));
|
||||||
|
|
||||||
|
output += 2;
|
||||||
|
out_frames++;
|
||||||
|
resamp->time += ratio;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -425,82 +421,80 @@ static void resampler_sinc_process_sse_kaiser(void *re_, struct resampler_data *
|
||||||
size_t out_frames = 0;
|
size_t out_frames = 0;
|
||||||
unsigned taps = resamp->taps;
|
unsigned taps = resamp->taps;
|
||||||
|
|
||||||
|
while (frames)
|
||||||
{
|
{
|
||||||
while (frames)
|
while (frames && resamp->time >= phases)
|
||||||
{
|
{
|
||||||
while (frames && resamp->time >= phases)
|
/* Push in reverse to make filter more obvious. */
|
||||||
|
if (!resamp->ptr)
|
||||||
|
resamp->ptr = taps;
|
||||||
|
resamp->ptr--;
|
||||||
|
|
||||||
|
resamp->buffer_l[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_l[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->buffer_r[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_r[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->time -= phases;
|
||||||
|
frames--;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
||||||
|
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
||||||
|
while (resamp->time < phases)
|
||||||
{
|
{
|
||||||
/* Push in reverse to make filter more obvious. */
|
int i;
|
||||||
if (!resamp->ptr)
|
__m128 sum;
|
||||||
resamp->ptr = taps;
|
unsigned phase = resamp->time >> resamp->subphase_bits;
|
||||||
resamp->ptr--;
|
float *phase_table = resamp->phase_table + phase * taps * 2;
|
||||||
|
float *delta_table = phase_table + taps;
|
||||||
|
__m128 delta = _mm_set1_ps((float)
|
||||||
|
(resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
|
||||||
|
|
||||||
resamp->buffer_l[resamp->ptr + taps] =
|
__m128 sum_l = _mm_setzero_ps();
|
||||||
resamp->buffer_l[resamp->ptr] = *input++;
|
__m128 sum_r = _mm_setzero_ps();
|
||||||
|
|
||||||
resamp->buffer_r[resamp->ptr + taps] =
|
for (i = 0; i < (int)taps; i += 4)
|
||||||
resamp->buffer_r[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->time -= phases;
|
|
||||||
frames--;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
|
||||||
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
|
||||||
while (resamp->time < phases)
|
|
||||||
{
|
{
|
||||||
int i;
|
__m128 buf_l = _mm_loadu_ps(buffer_l + i);
|
||||||
__m128 sum;
|
__m128 buf_r = _mm_loadu_ps(buffer_r + i);
|
||||||
unsigned phase = resamp->time >> resamp->subphase_bits;
|
__m128 deltas = _mm_load_ps(delta_table + i);
|
||||||
float *phase_table = resamp->phase_table + phase * taps * 2;
|
__m128 _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i),
|
||||||
float *delta_table = phase_table + taps;
|
_mm_mul_ps(deltas, delta));
|
||||||
__m128 delta = _mm_set1_ps((float)
|
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
|
||||||
(resamp->time & resamp->subphase_mask) * resamp->subphase_mod);
|
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
|
||||||
|
|
||||||
__m128 sum_l = _mm_setzero_ps();
|
|
||||||
__m128 sum_r = _mm_setzero_ps();
|
|
||||||
|
|
||||||
for (i = 0; i < (int)taps; i += 4)
|
|
||||||
{
|
|
||||||
__m128 buf_l = _mm_loadu_ps(buffer_l + i);
|
|
||||||
__m128 buf_r = _mm_loadu_ps(buffer_r + i);
|
|
||||||
__m128 deltas = _mm_load_ps(delta_table + i);
|
|
||||||
__m128 _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i),
|
|
||||||
_mm_mul_ps(deltas, delta));
|
|
||||||
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
|
|
||||||
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Them annoying shuffles.
|
|
||||||
* sum_l = { l3, l2, l1, l0 }
|
|
||||||
* sum_r = { r3, r2, r1, r0 }
|
|
||||||
*/
|
|
||||||
|
|
||||||
sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
|
|
||||||
_MM_SHUFFLE(1, 0, 1, 0)),
|
|
||||||
_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
|
|
||||||
|
|
||||||
/* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
|
|
||||||
* sum = { R1, R0, L1, L0 }
|
|
||||||
*/
|
|
||||||
|
|
||||||
sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
|
|
||||||
|
|
||||||
/* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
|
|
||||||
* sum = { X, R, X, L }
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Store L */
|
|
||||||
_mm_store_ss(output + 0, sum);
|
|
||||||
|
|
||||||
/* movehl { X, R, X, L } == { X, R, X, R } */
|
|
||||||
_mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
|
|
||||||
|
|
||||||
output += 2;
|
|
||||||
out_frames++;
|
|
||||||
resamp->time += ratio;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Them annoying shuffles.
|
||||||
|
* sum_l = { l3, l2, l1, l0 }
|
||||||
|
* sum_r = { r3, r2, r1, r0 }
|
||||||
|
*/
|
||||||
|
|
||||||
|
sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
|
||||||
|
_MM_SHUFFLE(1, 0, 1, 0)),
|
||||||
|
_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||||
|
|
||||||
|
/* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
|
||||||
|
* sum = { R1, R0, L1, L0 }
|
||||||
|
*/
|
||||||
|
|
||||||
|
sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
|
||||||
|
|
||||||
|
/* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
|
||||||
|
* sum = { X, R, X, L }
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Store L */
|
||||||
|
_mm_store_ss(output + 0, sum);
|
||||||
|
|
||||||
|
/* movehl { X, R, X, L } == { X, R, X, R } */
|
||||||
|
_mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
|
||||||
|
|
||||||
|
output += 2;
|
||||||
|
out_frames++;
|
||||||
|
resamp->time += ratio;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -520,77 +514,75 @@ static void resampler_sinc_process_sse(void *re_, struct resampler_data *data)
|
||||||
size_t out_frames = 0;
|
size_t out_frames = 0;
|
||||||
unsigned taps = resamp->taps;
|
unsigned taps = resamp->taps;
|
||||||
|
|
||||||
|
while (frames)
|
||||||
{
|
{
|
||||||
while (frames)
|
while (frames && resamp->time >= phases)
|
||||||
{
|
{
|
||||||
while (frames && resamp->time >= phases)
|
/* Push in reverse to make filter more obvious. */
|
||||||
|
if (!resamp->ptr)
|
||||||
|
resamp->ptr = taps;
|
||||||
|
resamp->ptr--;
|
||||||
|
|
||||||
|
resamp->buffer_l[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_l[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->buffer_r[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_r[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->time -= phases;
|
||||||
|
frames--;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
||||||
|
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
||||||
|
while (resamp->time < phases)
|
||||||
{
|
{
|
||||||
/* Push in reverse to make filter more obvious. */
|
int i;
|
||||||
if (!resamp->ptr)
|
__m128 sum;
|
||||||
resamp->ptr = taps;
|
unsigned phase = resamp->time >> resamp->subphase_bits;
|
||||||
resamp->ptr--;
|
float *phase_table = resamp->phase_table + phase * taps;
|
||||||
|
|
||||||
resamp->buffer_l[resamp->ptr + taps] =
|
__m128 sum_l = _mm_setzero_ps();
|
||||||
resamp->buffer_l[resamp->ptr] = *input++;
|
__m128 sum_r = _mm_setzero_ps();
|
||||||
|
|
||||||
resamp->buffer_r[resamp->ptr + taps] =
|
for (i = 0; i < (int)taps; i += 4)
|
||||||
resamp->buffer_r[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->time -= phases;
|
|
||||||
frames--;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
|
||||||
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
|
||||||
while (resamp->time < phases)
|
|
||||||
{
|
{
|
||||||
int i;
|
__m128 buf_l = _mm_loadu_ps(buffer_l + i);
|
||||||
__m128 sum;
|
__m128 buf_r = _mm_loadu_ps(buffer_r + i);
|
||||||
unsigned phase = resamp->time >> resamp->subphase_bits;
|
__m128 _sinc = _mm_load_ps((const float*)phase_table + i);
|
||||||
float *phase_table = resamp->phase_table + phase * taps;
|
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
|
||||||
|
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
|
||||||
__m128 sum_l = _mm_setzero_ps();
|
|
||||||
__m128 sum_r = _mm_setzero_ps();
|
|
||||||
|
|
||||||
for (i = 0; i < (int)taps; i += 4)
|
|
||||||
{
|
|
||||||
__m128 buf_l = _mm_loadu_ps(buffer_l + i);
|
|
||||||
__m128 buf_r = _mm_loadu_ps(buffer_r + i);
|
|
||||||
__m128 _sinc = _mm_load_ps((const float*)phase_table + i);
|
|
||||||
sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
|
|
||||||
sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Them annoying shuffles.
|
|
||||||
* sum_l = { l3, l2, l1, l0 }
|
|
||||||
* sum_r = { r3, r2, r1, r0 }
|
|
||||||
*/
|
|
||||||
|
|
||||||
sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
|
|
||||||
_MM_SHUFFLE(1, 0, 1, 0)),
|
|
||||||
_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
|
|
||||||
|
|
||||||
/* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
|
|
||||||
* sum = { R1, R0, L1, L0 }
|
|
||||||
*/
|
|
||||||
|
|
||||||
sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
|
|
||||||
|
|
||||||
/* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
|
|
||||||
* sum = { X, R, X, L }
|
|
||||||
*/
|
|
||||||
|
|
||||||
/* Store L */
|
|
||||||
_mm_store_ss(output + 0, sum);
|
|
||||||
|
|
||||||
/* movehl { X, R, X, L } == { X, R, X, R } */
|
|
||||||
_mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
|
|
||||||
|
|
||||||
output += 2;
|
|
||||||
out_frames++;
|
|
||||||
resamp->time += ratio;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Them annoying shuffles.
|
||||||
|
* sum_l = { l3, l2, l1, l0 }
|
||||||
|
* sum_r = { r3, r2, r1, r0 }
|
||||||
|
*/
|
||||||
|
|
||||||
|
sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
|
||||||
|
_MM_SHUFFLE(1, 0, 1, 0)),
|
||||||
|
_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));
|
||||||
|
|
||||||
|
/* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
|
||||||
|
* sum = { R1, R0, L1, L0 }
|
||||||
|
*/
|
||||||
|
|
||||||
|
sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);
|
||||||
|
|
||||||
|
/* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
|
||||||
|
* sum = { X, R, X, L }
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* Store L */
|
||||||
|
_mm_store_ss(output + 0, sum);
|
||||||
|
|
||||||
|
/* movehl { X, R, X, L } == { X, R, X, R } */
|
||||||
|
_mm_store_ss(output + 1, _mm_movehl_ps(sum, sum));
|
||||||
|
|
||||||
|
output += 2;
|
||||||
|
out_frames++;
|
||||||
|
resamp->time += ratio;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -611,57 +603,54 @@ static void resampler_sinc_process_c_kaiser(void *re_, struct resampler_data *da
|
||||||
size_t out_frames = 0;
|
size_t out_frames = 0;
|
||||||
unsigned taps = resamp->taps;
|
unsigned taps = resamp->taps;
|
||||||
|
|
||||||
|
while (frames)
|
||||||
{
|
{
|
||||||
while (frames)
|
while (frames && resamp->time >= phases)
|
||||||
{
|
{
|
||||||
while (frames && resamp->time >= phases)
|
/* Push in reverse to make filter more obvious. */
|
||||||
|
if (!resamp->ptr)
|
||||||
|
resamp->ptr = taps;
|
||||||
|
resamp->ptr--;
|
||||||
|
|
||||||
|
resamp->buffer_l[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_l[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->buffer_r[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_r[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->time -= phases;
|
||||||
|
frames--;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
||||||
|
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
||||||
|
while (resamp->time < phases)
|
||||||
{
|
{
|
||||||
/* Push in reverse to make filter more obvious. */
|
int i;
|
||||||
if (!resamp->ptr)
|
float sum_l = 0.0f;
|
||||||
resamp->ptr = taps;
|
float sum_r = 0.0f;
|
||||||
resamp->ptr--;
|
unsigned phase = resamp->time >> resamp->subphase_bits;
|
||||||
|
float *phase_table = resamp->phase_table + phase * taps * 2;
|
||||||
|
float *delta_table = phase_table + taps;
|
||||||
|
float delta = (float)
|
||||||
|
(resamp->time & resamp->subphase_mask) * resamp->subphase_mod;
|
||||||
|
|
||||||
resamp->buffer_l[resamp->ptr + taps] =
|
for (i = 0; i < (int)taps; i++)
|
||||||
resamp->buffer_l[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->buffer_r[resamp->ptr + taps] =
|
|
||||||
resamp->buffer_r[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->time -= phases;
|
|
||||||
frames--;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
|
||||||
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
|
||||||
while (resamp->time < phases)
|
|
||||||
{
|
{
|
||||||
int i;
|
float sinc_val = phase_table[i] + delta_table[i] * delta;
|
||||||
float sum_l = 0.0f;
|
|
||||||
float sum_r = 0.0f;
|
|
||||||
unsigned phase = resamp->time >> resamp->subphase_bits;
|
|
||||||
float *phase_table = resamp->phase_table + phase * taps * 2;
|
|
||||||
float *delta_table = phase_table + taps;
|
|
||||||
float delta = (float)
|
|
||||||
(resamp->time & resamp->subphase_mask) * resamp->subphase_mod;
|
|
||||||
|
|
||||||
for (i = 0; i < (int)taps; i++)
|
sum_l += buffer_l[i] * sinc_val;
|
||||||
{
|
sum_r += buffer_r[i] * sinc_val;
|
||||||
float sinc_val = phase_table[i] + delta_table[i] * delta;
|
|
||||||
|
|
||||||
sum_l += buffer_l[i] * sinc_val;
|
|
||||||
sum_r += buffer_r[i] * sinc_val;
|
|
||||||
}
|
|
||||||
|
|
||||||
output[0] = sum_l;
|
|
||||||
output[1] = sum_r;
|
|
||||||
|
|
||||||
output += 2;
|
|
||||||
out_frames++;
|
|
||||||
resamp->time += ratio;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
output[0] = sum_l;
|
||||||
|
output[1] = sum_r;
|
||||||
|
|
||||||
|
output += 2;
|
||||||
|
out_frames++;
|
||||||
|
resamp->time += ratio;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -680,54 +669,51 @@ static void resampler_sinc_process_c(void *re_, struct resampler_data *data)
|
||||||
size_t out_frames = 0;
|
size_t out_frames = 0;
|
||||||
unsigned taps = resamp->taps;
|
unsigned taps = resamp->taps;
|
||||||
|
|
||||||
|
while (frames)
|
||||||
{
|
{
|
||||||
while (frames)
|
while (frames && resamp->time >= phases)
|
||||||
{
|
{
|
||||||
while (frames && resamp->time >= phases)
|
/* Push in reverse to make filter more obvious. */
|
||||||
|
if (!resamp->ptr)
|
||||||
|
resamp->ptr = taps;
|
||||||
|
resamp->ptr--;
|
||||||
|
|
||||||
|
resamp->buffer_l[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_l[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->buffer_r[resamp->ptr + taps] =
|
||||||
|
resamp->buffer_r[resamp->ptr] = *input++;
|
||||||
|
|
||||||
|
resamp->time -= phases;
|
||||||
|
frames--;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
||||||
|
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
||||||
|
while (resamp->time < phases)
|
||||||
{
|
{
|
||||||
/* Push in reverse to make filter more obvious. */
|
int i;
|
||||||
if (!resamp->ptr)
|
float sum_l = 0.0f;
|
||||||
resamp->ptr = taps;
|
float sum_r = 0.0f;
|
||||||
resamp->ptr--;
|
unsigned phase = resamp->time >> resamp->subphase_bits;
|
||||||
|
float *phase_table = resamp->phase_table + phase * taps;
|
||||||
|
|
||||||
resamp->buffer_l[resamp->ptr + taps] =
|
for (i = 0; i < (int)taps; i++)
|
||||||
resamp->buffer_l[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->buffer_r[resamp->ptr + taps] =
|
|
||||||
resamp->buffer_r[resamp->ptr] = *input++;
|
|
||||||
|
|
||||||
resamp->time -= phases;
|
|
||||||
frames--;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
const float *buffer_l = resamp->buffer_l + resamp->ptr;
|
|
||||||
const float *buffer_r = resamp->buffer_r + resamp->ptr;
|
|
||||||
while (resamp->time < phases)
|
|
||||||
{
|
{
|
||||||
int i;
|
float sinc_val = phase_table[i];
|
||||||
float sum_l = 0.0f;
|
|
||||||
float sum_r = 0.0f;
|
|
||||||
unsigned phase = resamp->time >> resamp->subphase_bits;
|
|
||||||
float *phase_table = resamp->phase_table + phase * taps;
|
|
||||||
|
|
||||||
for (i = 0; i < (int)taps; i++)
|
sum_l += buffer_l[i] * sinc_val;
|
||||||
{
|
sum_r += buffer_r[i] * sinc_val;
|
||||||
float sinc_val = phase_table[i];
|
|
||||||
|
|
||||||
sum_l += buffer_l[i] * sinc_val;
|
|
||||||
sum_r += buffer_r[i] * sinc_val;
|
|
||||||
}
|
|
||||||
|
|
||||||
output[0] = sum_l;
|
|
||||||
output[1] = sum_r;
|
|
||||||
|
|
||||||
output += 2;
|
|
||||||
out_frames++;
|
|
||||||
resamp->time += ratio;
|
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
|
output[0] = sum_l;
|
||||||
|
output[1] = sum_r;
|
||||||
|
|
||||||
|
output += 2;
|
||||||
|
out_frames++;
|
||||||
|
resamp->time += ratio;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -748,10 +734,10 @@ static void sinc_init_table_kaiser(rarch_sinc_resampler_t *resamp,
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
/* Kaiser window function - need to normalize w(0) to 1.0f */
|
/* Kaiser window function - need to normalize w(0) to 1.0f */
|
||||||
float kaiser_beta = resamp->kaiser_beta;
|
float kaiser_beta = resamp->kaiser_beta;
|
||||||
double window_mod = besseli0(kaiser_beta);
|
double window_mod = besseli0(kaiser_beta);
|
||||||
int stride = calculate_delta ? 2 : 1;
|
int stride = calculate_delta ? 2 : 1;
|
||||||
double sidelobes = taps / 2.0;
|
double sidelobes = taps / 2.0;
|
||||||
|
|
||||||
for (i = 0; i < phases; i++)
|
for (i = 0; i < phases; i++)
|
||||||
{
|
{
|
||||||
|
@ -774,7 +760,6 @@ static void sinc_init_table_kaiser(rarch_sinc_resampler_t *resamp,
|
||||||
{
|
{
|
||||||
int phase;
|
int phase;
|
||||||
int p;
|
int p;
|
||||||
|
|
||||||
for (p = 0; p < phases - 1; p++)
|
for (p = 0; p < phases - 1; p++)
|
||||||
{
|
{
|
||||||
for (j = 0; j < taps; j++)
|
for (j = 0; j < taps; j++)
|
||||||
|
@ -810,9 +795,9 @@ static void sinc_init_table_lanczos(
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
/* Lanczos window function - need to normalize w(0) to 1.0f */
|
/* Lanczos window function - need to normalize w(0) to 1.0f */
|
||||||
double window_mod = 1.0;
|
double window_mod = 1.0;
|
||||||
int stride = calculate_delta ? 2 : 1;
|
int stride = calculate_delta ? 2 : 1;
|
||||||
double sidelobes = taps / 2.0;
|
double sidelobes = taps / 2.0;
|
||||||
|
|
||||||
for (i = 0; i < phases; i++)
|
for (i = 0; i < phases; i++)
|
||||||
{
|
{
|
||||||
|
@ -832,8 +817,8 @@ static void sinc_init_table_lanczos(
|
||||||
|
|
||||||
if (calculate_delta)
|
if (calculate_delta)
|
||||||
{
|
{
|
||||||
int phase;
|
|
||||||
int p;
|
int p;
|
||||||
|
int phase;
|
||||||
|
|
||||||
for (p = 0; p < phases - 1; p++)
|
for (p = 0; p < phases - 1; p++)
|
||||||
{
|
{
|
||||||
|
@ -939,21 +924,21 @@ static void *resampler_sinc_new(const struct resampler_config *config,
|
||||||
/* Be SIMD-friendly. */
|
/* Be SIMD-friendly. */
|
||||||
#if defined(__AVX__)
|
#if defined(__AVX__)
|
||||||
if (enable_avx)
|
if (enable_avx)
|
||||||
re->taps = (re->taps + 7) & ~7;
|
re->taps = (re->taps + 7) & ~7;
|
||||||
else
|
else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
#if (defined(__ARM_NEON__) || defined(HAVE_NEON))
|
#if (defined(__ARM_NEON__) || defined(HAVE_NEON))
|
||||||
re->taps = (re->taps + 7) & ~7;
|
re->taps = (re->taps + 7) & ~7;
|
||||||
#else
|
#else
|
||||||
re->taps = (re->taps + 3) & ~3;
|
re->taps = (re->taps + 3) & ~3;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
phase_elems = ((1 << re->phase_bits) * re->taps);
|
phase_elems = ((1 << re->phase_bits) * re->taps);
|
||||||
if (window_type == SINC_WINDOW_KAISER)
|
if (window_type == SINC_WINDOW_KAISER)
|
||||||
phase_elems = phase_elems * 2;
|
phase_elems = phase_elems * 2;
|
||||||
elems = phase_elems + 4 * re->taps;
|
elems = phase_elems + 4 * re->taps;
|
||||||
|
|
||||||
re->main_buffer = (float*)memalign_alloc(128, sizeof(float) * elems);
|
re->main_buffer = (float*)memalign_alloc(128, sizeof(float) * elems);
|
||||||
if (!re->main_buffer)
|
if (!re->main_buffer)
|
||||||
|
|
Loading…
Reference in New Issue