gl4: swap high and low 16 bits within a 32 bit word using SSE intrinsics

This commit is contained in:
raven02 2015-07-03 21:01:20 +08:00
parent fd50209760
commit 0f7f2f2491
3 changed files with 20 additions and 7 deletions

View File

@ -46,6 +46,8 @@ void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
size_t count);
void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
size_t count);
// Copies `count` 32-bit words from `src` to `dest`, exchanging the high and
// low 16-bit halves of every word. `count` is a number of 32-bit words, not
// a number of bytes.
void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
size_t count);
template <typename T>
void copy_and_swap(T* dest, const T* src, size_t count) {

View File

@ -132,4 +132,18 @@ void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
}
}
// Copies `count` 32-bit words from `src` to `dest`, exchanging the upper and
// lower 16-bit halves of every word (0xAABBCCDD becomes 0xCCDDAABB).
//
// `count` is measured in 32-bit words, not bytes. Words are processed four at
// a time with SSE2; any leftover words (count % 4) go through a scalar loop.
// Despite the `_aligned` suffix, the unaligned load/store intrinsics are used
// here.
void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
                                    size_t count) {
  constexpr size_t kWordsPerVector = 4;  // four 32-bit lanes per __m128i
  size_t index = 0;

  // Vector path: rotate each 32-bit lane by 16 bits (shift left | shift right).
  while (index + kWordsPerVector <= count) {
    const __m128i words =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + index));
    const __m128i low_to_high = _mm_slli_epi32(words, 16);
    const __m128i high_to_low = _mm_srli_epi32(words, 16);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + index),
                     _mm_or_si128(low_to_high, high_to_low));
    index += kWordsPerVector;
  }

  // Scalar path for the words that do not fill a whole vector.
  while (index < count) {
    const uint32_t word = src[index];
    dest[index] = (word >> 16) | (word << 16);
    ++index;
  }
}
} // namespace xe

View File

@ -687,13 +687,10 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
reinterpret_cast<const uint32_t*>(src),
length / 4);
break;
case Endian::k16in32:
// TODO(benvanik): make more efficient.
/*for (uint32_t i = 0; i < length; i += 4, src += 4, dest += 4) {
uint32_t value = *(uint32_t*)src;
*(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16);
}*/
assert_always("16in32 not supported");
case Endian::k16in32: // Swap high and low 16 bits within a 32 bit word
xe::copy_and_swap_16_in_32_aligned(reinterpret_cast<uint32_t*>(dest),
reinterpret_cast<const uint32_t*>(src),
length);
break;
default:
case Endian::kUnspecified: