[APU] Add AVX intrinsic variants for conversion

This commit is contained in:
Joel Linn 2021-06-08 03:36:07 +02:00 committed by Triang3l
parent cd631fc447
commit 0ad939b2f1
1 changed files with 66 additions and 0 deletions

View File

@ -13,11 +13,76 @@
#include <cstdint> #include <cstdint>
#include "xenia/base/byte_order.h" #include "xenia/base/byte_order.h"
#include "xenia/base/platform.h"
namespace xe { namespace xe {
namespace apu { namespace apu {
namespace conversion { namespace conversion {
#if XE_ARCH_AMD64
// Converts six sequential channels of byte-swapped 32-bit samples into
// interleaved 6-channel frames.
//
// input:  channel-major layout, sample `s` of channel `c` is read from
//         input[c * ch_sample_count + s].
// output: frame-major layout, sample `s` of channel `c` is written to
//         output[s * 6 + c].
// Every 32-bit word has its byte order reversed on the way through; the words
// are handled as raw integers, so no floating point arithmetic is performed.
inline void sequential_6_BE_to_interleaved_6_LE(float* output,
                                                const float* input,
                                                size_t ch_sample_count) {
  // One base pointer per source channel.
  const uint32_t* const ch0 = reinterpret_cast<const uint32_t*>(input);
  const uint32_t* const ch1 = ch0 + ch_sample_count;
  const uint32_t* const ch2 = ch1 + ch_sample_count;
  const uint32_t* const ch3 = ch2 + ch_sample_count;
  const uint32_t* const ch4 = ch3 + ch_sample_count;
  const uint32_t* const ch5 = ch4 + ch_sample_count;

  // Shuffle control that reverses the four bytes inside each 32-bit lane.
  const __m128i reverse_lane_bytes =
      _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);

  // `frame` always points at the six output words of the current sample.
  uint32_t* frame = reinterpret_cast<uint32_t*>(output);
  for (size_t i = 0; i != ch_sample_count; ++i, frame += 6) {
    // Gather everything for this frame before writing anything out.
    // _mm_set_epi32 takes its arguments from the highest lane to the lowest,
    // so channel 0 ends up in lane 0.
    const __m128i first_four_be =
        _mm_set_epi32(ch3[i], ch2[i], ch1[i], ch0[i]);
    const uint32_t fifth_be = ch4[i];
    const uint32_t sixth_be = ch5[i];

    // Channels 0-3: swap all four words at once, then one unaligned store.
    _mm_storeu_si128(reinterpret_cast<__m128i*>(frame),
                     _mm_shuffle_epi8(first_four_be, reverse_lane_bytes));
    // Channels 4-5: swapped one word at a time.
    frame[4] = xe::byte_swap(fifth_be);
    frame[5] = xe::byte_swap(sixth_be);
  }
}
inline void sequential_6_BE_to_interleaved_2_LE(float* output,
const float* input,
size_t ch_sample_count) {
assert_true(ch_sample_count % 4 == 0);
const uint32_t* in = reinterpret_cast<const uint32_t*>(input);
uint32_t* out = reinterpret_cast<uint32_t*>(output);
const __m128i byte_swap_shuffle =
_mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
const __m128 half = _mm_set1_ps(0.5f);
const __m128 two_fifths = _mm_set1_ps(1.0f / 2.5f);
// put center on left and right, discard low frequency
for (size_t sample = 0; sample < ch_sample_count; sample += 4) {
// load 4 samples from 6 channels each
__m128 fl = _mm_loadu_ps(&input[0 * ch_sample_count + sample]);
__m128 fr = _mm_loadu_ps(&input[1 * ch_sample_count + sample]);
__m128 fc = _mm_loadu_ps(&input[2 * ch_sample_count + sample]);
__m128 bl = _mm_loadu_ps(&input[4 * ch_sample_count + sample]);
__m128 br = _mm_loadu_ps(&input[5 * ch_sample_count + sample]);
// byte swap
fl = _mm_castsi128_ps(
_mm_shuffle_epi8(_mm_castps_si128(fl), byte_swap_shuffle));
fr = _mm_castsi128_ps(
_mm_shuffle_epi8(_mm_castps_si128(fr), byte_swap_shuffle));
fc = _mm_castsi128_ps(
_mm_shuffle_epi8(_mm_castps_si128(fc), byte_swap_shuffle));
bl = _mm_castsi128_ps(
_mm_shuffle_epi8(_mm_castps_si128(bl), byte_swap_shuffle));
br = _mm_castsi128_ps(
_mm_shuffle_epi8(_mm_castps_si128(br), byte_swap_shuffle));
__m128 center_halved = _mm_mul_ps(fc, half);
__m128 left = _mm_add_ps(_mm_add_ps(fl, bl), center_halved);
__m128 right = _mm_add_ps(_mm_add_ps(fr, br), center_halved);
left = _mm_mul_ps(left, two_fifths);
right = _mm_mul_ps(right, two_fifths);
_mm_storeu_ps(&output[sample * 2], _mm_unpacklo_ps(left, right));
_mm_storeu_ps(&output[(sample + 2) * 2], _mm_unpackhi_ps(left, right));
}
}
#else
inline void sequential_6_BE_to_interleaved_6_LE(float* output, inline void sequential_6_BE_to_interleaved_6_LE(float* output,
const float* input, const float* input,
size_t ch_sample_count) { size_t ch_sample_count) {
@ -45,6 +110,7 @@ inline void sequential_6_BE_to_interleaved_2_LE(float* output,
output[sample * 2 + 1] = (fr + br + center_halved) * (1.0f / 2.5f); output[sample * 2 + 1] = (fr + br + center_halved) * (1.0f / 2.5f);
} }
} }
#endif
} // namespace conversion } // namespace conversion
} // namespace apu } // namespace apu