gl4: swap high and low 16 bits within a 32 bit word using SSE intrinsics
This commit is contained in:
parent
fd50209760
commit
0f7f2f2491
|
@ -46,6 +46,8 @@ void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
|
|||
size_t count);
|
||||
void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
|
||||
size_t count);
|
||||
// Copies `count` 32-bit words from `src` to `dest`, exchanging the high and
// low 16-bit halves of each word (e.g. 0xAAAABBBB becomes 0xBBBBAAAA).
// `count` is a number of uint32_t elements, not a byte length.
void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
|
||||
size_t count);
|
||||
|
||||
template <typename T>
|
||||
void copy_and_swap(T* dest, const T* src, size_t count) {
|
||||
|
|
|
@ -132,4 +132,18 @@ void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
|
|||
}
|
||||
}
|
||||
|
||||
// Copies `count` 32-bit words from `src` to `dest`, exchanging the high and
// low 16-bit halves of every word (e.g. 0xAAAABBBB becomes 0xBBBBAAAA).
//
// `count` is a number of uint32_t elements, not a byte length. Despite the
// `_aligned` suffix, the unaligned SSE2 load/store intrinsics are used, so the
// code itself places no 16-byte alignment requirement on either pointer.
void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
                                    size_t count) {
  constexpr size_t kWordsPerVector = 4;  // 128 bits / 32 bits per word.

  size_t pos = 0;

  // Bulk path: process four words per 128-bit vector while a full vector
  // remains.
  for (; count - pos >= kWordsPerVector; pos += kWordsPerVector) {
    const __m128i words =
        _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + pos));
    // Each 32-bit lane is shifted independently, so OR-ing the two shifted
    // copies rotates every word by 16 bits.
    const __m128i low_to_high = _mm_slli_epi32(words, 16);
    const __m128i high_to_low = _mm_srli_epi32(words, 16);
    _mm_storeu_si128(reinterpret_cast<__m128i*>(dest + pos),
                     _mm_or_si128(low_to_high, high_to_low));
  }

  // Scalar tail: the remaining 0-3 words that do not fill a vector.
  while (pos < count) {
    const uint32_t word = src[pos];
    dest[pos] = (word >> 16) | (word << 16);
    ++pos;
  }
}
|
||||
|
||||
} // namespace xe
|
||||
|
|
|
@ -687,13 +687,10 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
|
|||
reinterpret_cast<const uint32_t*>(src),
|
||||
length / 4);
|
||||
break;
|
||||
case Endian::k16in32:
|
||||
// TODO(benvanik): make more efficient.
|
||||
/*for (uint32_t i = 0; i < length; i += 4, src += 4, dest += 4) {
|
||||
uint32_t value = *(uint32_t*)src;
|
||||
*(uint32_t*)dest = ((value >> 16) & 0xFFFF) | (value << 16);
|
||||
}*/
|
||||
assert_always("16in32 not supported");
|
||||
case Endian::k16in32: // Swap high and low 16 bits within a 32 bit word
|
||||
xe::copy_and_swap_16_in_32_aligned(reinterpret_cast<uint32_t*>(dest),
|
||||
reinterpret_cast<const uint32_t*>(src),
|
||||
length);
|
||||
break;
|
||||
default:
|
||||
case Endian::kUnspecified:
|
||||
|
|
Loading…
Reference in New Issue