diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h
index 9fa28ee74..210a9c069 100644
--- a/src/xenia/base/memory.h
+++ b/src/xenia/base/memory.h
@@ -46,6 +46,8 @@ void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
                               size_t count);
 void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
                                 size_t count);
+void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
+                                    size_t count);
 
 template <typename T>
 void copy_and_swap(T* dest, const T* src, size_t count) {
diff --git a/src/xenia/base/memory_generic.cc b/src/xenia/base/memory_generic.cc
index 5cade5caa..2e4d28a80 100644
--- a/src/xenia/base/memory_generic.cc
+++ b/src/xenia/base/memory_generic.cc
@@ -132,4 +132,26 @@ void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
   }
 }
 
+// Copies |count| 32-bit words from |src| to |dest|, exchanging the upper and
+// lower 16-bit halves of every word (0xAABBCCDD -> 0xCCDDAABB).
+// Unaligned SSE2 loads/stores are used, so neither pointer has to be 16-byte
+// aligned even though the name says "aligned".
+void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
+                                    size_t count) {
+  size_t i = 0;
+  // Vector path: four words per iteration.
+  for (; i + 4 <= count; i += 4) {
+    const __m128i input =
+        _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    // Rotating each 32-bit lane by 16 swaps its two halves.
+    const __m128i output =
+        _mm_or_si128(_mm_slli_epi32(input, 16), _mm_srli_epi32(input, 16));
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  // Scalar path for the remaining 0-3 words.
+  for (; i < count; ++i) {
+    dest[i] = (src[i] >> 16) | (src[i] << 16);
+  }
+}
+
 }  // namespace xe
diff --git a/src/xenia/gpu/gl4/texture_cache.cc b/src/xenia/gpu/gl4/texture_cache.cc
index c3f076aca..0c7074e56 100644
--- a/src/xenia/gpu/gl4/texture_cache.cc
+++ b/src/xenia/gpu/gl4/texture_cache.cc
@@ -687,13 +687,10 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
                                    reinterpret_cast<const uint32_t*>(src),
                                    length / 4);
       break;
-    case Endian::k16in32:
-      // TODO(benvanik): make more efficient.
-      /*for (uint32_t i = 0; i < length; i += 4, src += 4, dest += 4) {
-        uint32_t value = *(uint32_t*)src;
-        *(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16);
-      }*/
-      assert_always("16in32 not supported");
+    case Endian::k16in32:  // Swap high and low 16 bits within a 32 bit word
+      xe::copy_and_swap_16_in_32_aligned(reinterpret_cast<uint32_t*>(dest),
+                                         reinterpret_cast<const uint32_t*>(src),
+                                         length / 4);  // bytes -> 32-bit words
       break;
     default:
     case Endian::kUnspecified: