diff --git a/src/xenia/base/memory_generic.cc b/src/xenia/base/memory_generic.cc
index b25327770..e5c2d0957 100644
--- a/src/xenia/base/memory_generic.cc
+++ b/src/xenia/base/memory_generic.cc
@@ -44,10 +44,12 @@ void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
 void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
                                 size_t count) {
   size_t i;
+  __m128i input, output;
+
   for (i = 0; i + 8 <= count; i += 8) {
-    __m128i s = _mm_loadu_si128((__m128i*)&src[i]);
-    __m128i d = _mm_or_si128(_mm_slli_epi16(s, 8), _mm_srli_epi16(s, 8));
-    _mm_storeu_si128((__m128i*)&dest[i], d);
+    input = _mm_loadu_si128((__m128i*)&src[i]);
+    output = _mm_or_si128(_mm_slli_epi16(input, 8), _mm_srli_epi16(input, 8));
+    _mm_storeu_si128((__m128i*)&dest[i], output);
   }
   for (; i < count; ++i) {  // handle residual elements
     dest[i] = byte_swap(src[i]);
@@ -62,7 +64,31 @@ void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
 
 void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
                                 size_t count) {
-  for (size_t i = 0; i < count; ++i) {
+  size_t i;
+  __m128i input, byte1, byte2, byte3, byte4, output;
+  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
+  __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+
+  for (i = 0; i + 4 <= count; i += 4) {
+    input = _mm_loadu_si128((__m128i*)&src[i]);
+
+    // Do the four shifts
+    byte1 = _mm_slli_epi32(input, 24);
+    byte2 = _mm_slli_epi32(input, 8);
+    byte3 = _mm_srli_epi32(input, 8);
+    byte4 = _mm_srli_epi32(input, 24);
+
+    // Or bytes together
+    output = _mm_or_si128(byte1, byte4);
+    byte2 = _mm_and_si128(byte2, byte2mask);
+    output = _mm_or_si128(output, byte2);
+    byte3 = _mm_and_si128(byte3, byte3mask);
+    output = _mm_or_si128(output, byte3);
+
+    _mm_storeu_si128((__m128i*)&dest[i], output);
+  }
+
+  for (; i < count; ++i) {  // handle residual elements
     dest[i] = byte_swap(src[i]);
   }
 }