From 0ad939b2f10571b03a8652dc1df8a5e444bbac13 Mon Sep 17 00:00:00 2001
From: Joel Linn
Date: Tue, 8 Jun 2021 03:36:07 +0200
Subject: [PATCH] [APU] Add AVX intrinsic variants for conversion

---
 src/xenia/apu/conversion.h | 66 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/src/xenia/apu/conversion.h b/src/xenia/apu/conversion.h
index 2a487af78..211243348 100644
--- a/src/xenia/apu/conversion.h
+++ b/src/xenia/apu/conversion.h
@@ -13,11 +13,76 @@
 
 #include <cstdint>
 #include "xenia/base/byte_order.h"
+#include "xenia/base/platform.h"
 
 namespace xe {
 namespace apu {
 namespace conversion {
 
+#if XE_ARCH_AMD64
+inline void sequential_6_BE_to_interleaved_6_LE(float* output,
+                                                const float* input,
+                                                size_t ch_sample_count) {
+  const uint32_t* in = reinterpret_cast<const uint32_t*>(input);
+  uint32_t* out = reinterpret_cast<uint32_t*>(output);
+  const __m128i byte_swap_shuffle =
+      _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
+  for (size_t sample = 0; sample < ch_sample_count; sample++) {
+    __m128i sample0 = _mm_set_epi32(
+        in[3 * ch_sample_count + sample], in[2 * ch_sample_count + sample],
+        in[1 * ch_sample_count + sample], in[0 * ch_sample_count + sample]);
+    uint32_t sample1 = in[4 * ch_sample_count + sample];
+    uint32_t sample2 = in[5 * ch_sample_count + sample];
+    sample0 = _mm_shuffle_epi8(sample0, byte_swap_shuffle);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&out[sample * 6]), sample0);
+    sample1 = xe::byte_swap(sample1);
+    out[sample * 6 + 4] = sample1;
+    sample2 = xe::byte_swap(sample2);
+    out[sample * 6 + 5] = sample2;
+  }
+}
+
+inline void sequential_6_BE_to_interleaved_2_LE(float* output,
+                                                const float* input,
+                                                size_t ch_sample_count) {
+  assert_true(ch_sample_count % 4 == 0);
+  const uint32_t* in = reinterpret_cast<const uint32_t*>(input);
+  uint32_t* out = reinterpret_cast<uint32_t*>(output);
+  const __m128i byte_swap_shuffle =
+      _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
+  const __m128 half = _mm_set1_ps(0.5f);
+  const __m128 two_fifths = _mm_set1_ps(1.0f / 2.5f);
+
+  // put center on left and right, discard low frequency
+  for (size_t sample = 0; sample < ch_sample_count; sample += 4) {
+    // load 4 samples from 6 channels each
+    __m128 fl = _mm_loadu_ps(&input[0 * ch_sample_count + sample]);
+    __m128 fr = _mm_loadu_ps(&input[1 * ch_sample_count + sample]);
+    __m128 fc = _mm_loadu_ps(&input[2 * ch_sample_count + sample]);
+    __m128 bl = _mm_loadu_ps(&input[4 * ch_sample_count + sample]);
+    __m128 br = _mm_loadu_ps(&input[5 * ch_sample_count + sample]);
+    // byte swap
+    fl = _mm_castsi128_ps(
+        _mm_shuffle_epi8(_mm_castps_si128(fl), byte_swap_shuffle));
+    fr = _mm_castsi128_ps(
+        _mm_shuffle_epi8(_mm_castps_si128(fr), byte_swap_shuffle));
+    fc = _mm_castsi128_ps(
+        _mm_shuffle_epi8(_mm_castps_si128(fc), byte_swap_shuffle));
+    bl = _mm_castsi128_ps(
+        _mm_shuffle_epi8(_mm_castps_si128(bl), byte_swap_shuffle));
+    br = _mm_castsi128_ps(
+        _mm_shuffle_epi8(_mm_castps_si128(br), byte_swap_shuffle));
+
+    __m128 center_halved = _mm_mul_ps(fc, half);
+    __m128 left = _mm_add_ps(_mm_add_ps(fl, bl), center_halved);
+    __m128 right = _mm_add_ps(_mm_add_ps(fr, br), center_halved);
+    left = _mm_mul_ps(left, two_fifths);
+    right = _mm_mul_ps(right, two_fifths);
+    _mm_storeu_ps(&output[sample * 2], _mm_unpacklo_ps(left, right));
+    _mm_storeu_ps(&output[(sample + 2) * 2], _mm_unpackhi_ps(left, right));
+  }
+}
+#else
 inline void sequential_6_BE_to_interleaved_6_LE(float* output,
                                                 const float* input,
                                                 size_t ch_sample_count) {
@@ -45,6 +110,7 @@ inline void sequential_6_BE_to_interleaved_2_LE(float* output,
     output[sample * 2 + 1] = (fr + br + center_halved) * (1.0f / 2.5f);
   }
 }
+#endif
 
 }  // namespace conversion
 }  // namespace apu
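
Below is a minimal standalone harness, a sketch rather than part of the patch, showing how the two helpers are called and spot-checking the SIMD 5.1-to-stereo downmix against the scalar formula it mirrors (center mixed into both sides at half gain, LFE dropped, result scaled by 1/2.5). It assumes it is built inside the xenia tree so that "xenia/apu/conversion.h" resolves on a little-endian x86-64 host; the helper flip_float, the constants kChannels/kSamples, and the test signal are invented here for illustration, and the FL/FR/FC/LFE/BL/BR channel order is inferred from the variable names and the "discard low frequency" comment in the patch.

// Editor's sketch: standalone check for the conversion helpers above.
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

#include "xenia/apu/conversion.h"

namespace {

// Reverse a float's byte order; used only to fabricate big-endian test
// input on a little-endian host.
float flip_float(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  bits = (bits >> 24) | ((bits >> 8) & 0x0000FF00u) |
         ((bits << 8) & 0x00FF0000u) | (bits << 24);
  std::memcpy(&v, &bits, sizeof(v));
  return v;
}

}  // namespace

int main() {
  constexpr size_t kChannels = 6;
  // Must be a multiple of 4: the SIMD downmix asserts ch_sample_count % 4 == 0.
  constexpr size_t kSamples = 64;

  // "sequential" layout: all samples of channel 0, then channel 1, and so on,
  // stored big-endian to match the _BE_ naming of the helpers.
  std::vector<float> input(kChannels * kSamples);
  for (size_t ch = 0; ch < kChannels; ++ch) {
    for (size_t i = 0; i < kSamples; ++i) {
      const float value =
          0.1f * static_cast<float>(ch) + 0.001f * static_cast<float>(i);
      input[ch * kSamples + i] = flip_float(value);
    }
  }

  std::vector<float> interleaved(kChannels * kSamples);
  std::vector<float> stereo(2 * kSamples);
  xe::apu::conversion::sequential_6_BE_to_interleaved_6_LE(
      interleaved.data(), input.data(), kSamples);
  xe::apu::conversion::sequential_6_BE_to_interleaved_2_LE(
      stereo.data(), input.data(), kSamples);

  int mismatches = 0;
  for (size_t i = 0; i < kSamples; ++i) {
    // The 6-channel interleave is a pure byte swap, so values round-trip exactly.
    for (size_t ch = 0; ch < kChannels; ++ch) {
      const float expect =
          0.1f * static_cast<float>(ch) + 0.001f * static_cast<float>(i);
      if (interleaved[i * kChannels + ch] != expect) ++mismatches;
    }
    // Scalar reference for the downmix: L = (FL + BL + FC * 0.5) / 2.5,
    // R = (FR + BR + FC * 0.5) / 2.5, channel 3 (LFE) discarded.
    const float s = 0.001f * static_cast<float>(i);
    const float fl = 0.1f * 0 + s, fr = 0.1f * 1 + s, fc = 0.1f * 2 + s;
    const float bl = 0.1f * 4 + s, br = 0.1f * 5 + s;
    const float expect_l = (fl + bl + fc * 0.5f) * (1.0f / 2.5f);
    const float expect_r = (fr + br + fc * 0.5f) * (1.0f / 2.5f);
    if (std::fabs(stereo[i * 2 + 0] - expect_l) > 1e-6f ||
        std::fabs(stereo[i * 2 + 1] - expect_r) > 1e-6f) {
      ++mismatches;
    }
  }
  std::printf("%d mismatches across %zu frames\n", mismatches, kSamples);
  return mismatches == 0 ? 0 : 1;
}

Because the vector path adds in the same order as the scalar fallback ((front + back) + center/2, then the 1/2.5 scale), the SSE and scalar builds should agree to within rounding, so a tight tolerance is enough here.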