From b17d6f5088976b5aed12e45d44bf5c73d91bc41d Mon Sep 17 00:00:00 2001
From: DrChat
Date: Wed, 14 Feb 2018 20:28:34 -0600
Subject: [PATCH] [Base] Enable aligned copy and swap routines

---
 src/xenia/base/memory.cc | 88 ++++++++++++++++++++++++++++++++++++----
 1 file changed, 80 insertions(+), 8 deletions(-)

diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc
index 1179ecc1f..3ddef8113 100644
--- a/src/xenia/base/memory.cc
+++ b/src/xenia/base/memory.cc
@@ -24,8 +24,20 @@ void copy_128_aligned(void* dest, const void* src, size_t count) {
 }
 
 #if XE_ARCH_AMD64
-void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) {
-  return copy_and_swap_16_unaligned(dest, src, count);
+void copy_and_swap_16_aligned(void* dest_ptr, const void* src_ptr,
+                              size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
+  size_t i;
+  for (i = 0; i + 8 <= count; i += 8) {
+    __m128i input = _mm_load_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output =
+        _mm_or_si128(_mm_slli_epi16(input, 8), _mm_srli_epi16(input, 8));
+    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // Handle residual elements.
+    dest[i] = byte_swap(src[i]);
+  }
 }
 
 void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
@@ -44,8 +56,31 @@ void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
   }
 }
 
-void copy_and_swap_32_aligned(void* dest, const void* src, size_t count) {
-  return copy_and_swap_32_unaligned(dest, src, count);
+void copy_and_swap_32_aligned(void* dest_ptr, const void* src_ptr,
+                              size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
+  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+  size_t i;
+  for (i = 0; i + 4 <= count; i += 4) {
+    __m128i input = _mm_load_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    // Do the four shifts.
+    __m128i byte1 = _mm_slli_epi32(input, 24);
+    __m128i byte2 = _mm_slli_epi32(input, 8);
+    __m128i byte3 = _mm_srli_epi32(input, 8);
+    __m128i byte4 = _mm_srli_epi32(input, 24);
+    // OR bytes together.
+    __m128i output = _mm_or_si128(byte1, byte4);
+    byte2 = _mm_and_si128(byte2, byte2mask);
+    output = _mm_or_si128(output, byte2);
+    byte3 = _mm_and_si128(byte3, byte3mask);
+    output = _mm_or_si128(output, byte3);
+    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // Handle residual elements.
+    dest[i] = byte_swap(src[i]);
+  }
 }
 
 void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
@@ -75,8 +110,33 @@ void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
   }
 }
 
-void copy_and_swap_64_aligned(void* dest, const void* src, size_t count) {
-  return copy_and_swap_64_unaligned(dest, src, count);
+void copy_and_swap_64_aligned(void* dest_ptr, const void* src_ptr,
+                              size_t count) {
+  auto dest = reinterpret_cast<uint64_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint64_t*>(src_ptr);
+  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+  size_t i;
+  for (i = 0; i + 2 <= count; i += 2) {
+    __m128i input = _mm_load_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    // Do the four shifts.
+    __m128i byte1 = _mm_slli_epi32(input, 24);
+    __m128i byte2 = _mm_slli_epi32(input, 8);
+    __m128i byte3 = _mm_srli_epi32(input, 8);
+    __m128i byte4 = _mm_srli_epi32(input, 24);
+    // OR bytes together.
+    __m128i output = _mm_or_si128(byte1, byte4);
+    byte2 = _mm_and_si128(byte2, byte2mask);
+    output = _mm_or_si128(output, byte2);
+    byte3 = _mm_and_si128(byte3, byte3mask);
+    output = _mm_or_si128(output, byte3);
+    // Reorder the two words.
+    output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
+    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // Handle residual elements.
+    dest[i] = byte_swap(src[i]);
+  }
 }
 
 void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr,
@@ -108,8 +168,20 @@ void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr,
   }
 }
 
-void copy_and_swap_16_in_32_aligned(void* dest, const void* src, size_t count) {
-  return copy_and_swap_16_in_32_unaligned(dest, src, count);
+void copy_and_swap_16_in_32_aligned(void* dest_ptr, const void* src_ptr,
+                                    size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
+  size_t i;
+  for (i = 0; i + 4 <= count; i += 4) {
+    __m128i input = _mm_load_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output =
+        _mm_or_si128(_mm_slli_epi32(input, 16), _mm_srli_epi32(input, 16));
+    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // Handle residual elements.
+    dest[i] = (src[i] >> 16) | (src[i] << 16);
+  }
 }
 
 void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
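Not part of the patch: a minimal standalone sketch, assuming SSE2 and a 16-byte-aligned buffer, that exercises the same shift-and-mask swap used by copy_and_swap_32_aligned above and checks it against a scalar byte swap. The buffer contents and the main() harness are illustrative only.

// Standalone verification sketch (illustrative; not part of memory.cc).
#include <cstdint>
#include <cstdio>
#include <emmintrin.h>  // SSE2 intrinsics

int main() {
  alignas(16) uint32_t src[4] = {0x11223344u, 0xAABBCCDDu, 0x01020304u,
                                 0x0A0B0C0Du};
  alignas(16) uint32_t dest[4];

  // Same four shifts, two masks, and ORs as copy_and_swap_32_aligned.
  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
  __m128i input = _mm_load_si128(reinterpret_cast<const __m128i*>(src));
  __m128i byte1 = _mm_slli_epi32(input, 24);
  __m128i byte2 = _mm_and_si128(_mm_slli_epi32(input, 8), byte2mask);
  __m128i byte3 = _mm_and_si128(_mm_srli_epi32(input, 8), byte3mask);
  __m128i byte4 = _mm_srli_epi32(input, 24);
  __m128i output =
      _mm_or_si128(_mm_or_si128(byte1, byte4), _mm_or_si128(byte2, byte3));
  _mm_store_si128(reinterpret_cast<__m128i*>(dest), output);

  for (int i = 0; i < 4; ++i) {
    uint32_t v = src[i];
    uint32_t expected = (v << 24) | ((v << 8) & 0x00FF0000u) |
                        ((v >> 8) & 0x0000FF00u) | (v >> 24);
    std::printf("%08X -> %08X (%s)\n", v, dest[i],
                dest[i] == expected ? "ok" : "MISMATCH");
  }
  return 0;
}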