Making memory API less error prone; fixes buffer/constant uploads.

Ben Vanik 2016-02-20 19:19:29 -08:00
parent fad5ad7f64
commit cd02cdfc70
7 changed files with 60 additions and 85 deletions
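
The heart of the change is the reshaped xe::copy_and_swap_* API: the typed-pointer signatures become void*, so call sites stop sprinkling reinterpret_cast and byte/element mix-ups have fewer places to hide. A condensed before/after of the signature shape (drawn directly from the memory.h diff below):

// Before: callers must cast both pointers to the element type.
void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
                              size_t count);

// After: raw pointers in; the element width is implied by the name, and
// count remains a count of elements, not bytes.
void copy_and_swap_16_aligned(void* dest, const void* src, size_t count);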


@@ -22,109 +22,99 @@ void copy_128_aligned(void* dest, const void* src, size_t count) {
   std::memcpy(dest, src, count * 16);
 }
 
-void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
-                              size_t count) {
+void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) {
   return copy_and_swap_16_unaligned(dest, src, count);
 }
 
-void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
+void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
                                 size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
   size_t i;
-  __m128i input, output;
   for (i = 0; i + 8 <= count; i += 8) {
-    input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-    output = _mm_or_si128(_mm_slli_epi16(input, 8), _mm_srli_epi16(input, 8));
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output =
+        _mm_or_si128(_mm_slli_epi16(input, 8), _mm_srli_epi16(input, 8));
     _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
   }
   for (; i < count; ++i) {  // handle residual elements
     dest[i] = byte_swap(src[i]);
   }
 }
 
-void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
-                              size_t count) {
+void copy_and_swap_32_aligned(void* dest, const void* src, size_t count) {
   return copy_and_swap_32_unaligned(dest, src, count);
 }
 
-void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
+void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
                                 size_t count) {
-  size_t i;
-  __m128i input, byte1, byte2, byte3, byte4, output;
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
   __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
   __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+  size_t i;
   for (i = 0; i + 4 <= count; i += 4) {
-    input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-
-    // Do the four shifts
-    byte1 = _mm_slli_epi32(input, 24);
-    byte2 = _mm_slli_epi32(input, 8);
-    byte3 = _mm_srli_epi32(input, 8);
-    byte4 = _mm_srli_epi32(input, 24);
-
-    // Or bytes together
-    output = _mm_or_si128(byte1, byte4);
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    // Do the four shifts.
+    __m128i byte1 = _mm_slli_epi32(input, 24);
+    __m128i byte2 = _mm_slli_epi32(input, 8);
+    __m128i byte3 = _mm_srli_epi32(input, 8);
+    __m128i byte4 = _mm_srli_epi32(input, 24);
+    // OR bytes together.
+    __m128i output = _mm_or_si128(byte1, byte4);
     byte2 = _mm_and_si128(byte2, byte2mask);
     output = _mm_or_si128(output, byte2);
     byte3 = _mm_and_si128(byte3, byte3mask);
     output = _mm_or_si128(output, byte3);
     _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
   }
   for (; i < count; ++i) {  // handle residual elements
     dest[i] = byte_swap(src[i]);
   }
 }
 
-void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
-                              size_t count) {
+void copy_and_swap_64_aligned(void* dest, const void* src, size_t count) {
   return copy_and_swap_64_unaligned(dest, src, count);
 }
 
-void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
+void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr,
                                 size_t count) {
-  size_t i;
-  __m128i input, byte1, byte2, byte3, byte4, output;
+  auto dest = reinterpret_cast<uint64_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint64_t*>(src_ptr);
  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
   __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
+  size_t i;
   for (i = 0; i + 2 <= count; i += 2) {
-    input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-
-    // Do the four shifts
-    byte1 = _mm_slli_epi32(input, 24);
-    byte2 = _mm_slli_epi32(input, 8);
-    byte3 = _mm_srli_epi32(input, 8);
-    byte4 = _mm_srli_epi32(input, 24);
-
-    // Or bytes together
-    output = _mm_or_si128(byte1, byte4);
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    // Do the four shifts.
+    __m128i byte1 = _mm_slli_epi32(input, 24);
+    __m128i byte2 = _mm_slli_epi32(input, 8);
+    __m128i byte3 = _mm_srli_epi32(input, 8);
+    __m128i byte4 = _mm_srli_epi32(input, 24);
+    // OR bytes together.
+    __m128i output = _mm_or_si128(byte1, byte4);
     byte2 = _mm_and_si128(byte2, byte2mask);
     output = _mm_or_si128(output, byte2);
     byte3 = _mm_and_si128(byte3, byte3mask);
     output = _mm_or_si128(output, byte3);
-
-    // Reorder the two words
+    // Reorder the two words.
     output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
     _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
   }
   for (; i < count; ++i) {  // handle residual elements
     dest[i] = byte_swap(src[i]);
   }
 }
 
-void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
+void copy_and_swap_16_in_32_aligned(void* dest_ptr, const void* src_ptr,
                                     size_t count) {
+  auto dest = reinterpret_cast<uint64_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint64_t*>(src_ptr);
   size_t i;
-  __m128i input, output;
   for (i = 0; i + 4 <= count; i += 4) {
-    input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-    output = _mm_or_si128(_mm_slli_epi32(input, 16), _mm_srli_epi32(input, 16));
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output =
+        _mm_or_si128(_mm_slli_epi32(input, 16), _mm_srli_epi32(input, 16));
     _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
   }
   for (; i < count; ++i) {  // handle residual elements
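
For readers decoding the SSE loops above: each 128-bit iteration of the 32-bit swap applies the same shift/mask/OR byte reversal to four lanes at once. A scalar model of what one lane computes (an illustrative sketch, not code from the commit):

#include <cstdint>

// One 32-bit lane of the SSE path: 0xAABBCCDD -> 0xDDCCBBAA.
inline uint32_t swap32_scalar(uint32_t x) {
  uint32_t byte1 = x << 24;                 // lowest byte to the top
  uint32_t byte2 = (x << 8) & 0x00FF0000u;  // second byte up one slot
  uint32_t byte3 = (x >> 8) & 0x0000FF00u;  // third byte down one slot
  uint32_t byte4 = x >> 24;                 // highest byte to the bottom
  return byte1 | byte2 | byte3 | byte4;
}

The 64-bit variant reuses the same per-lane reversal and then exchanges adjacent 32-bit words with _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1)).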


@@ -123,20 +123,13 @@ inline void* low_address(void* address) {
 void copy_128_aligned(void* dest, const void* src, size_t count);
 
-void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
-                              size_t count);
-void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
-                                size_t count);
-void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
-                              size_t count);
-void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
-                                size_t count);
-void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
-                              size_t count);
-void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
-                                size_t count);
-void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
-                                    size_t count);
+void copy_and_swap_16_aligned(void* dest, const void* src, size_t count);
+void copy_and_swap_16_unaligned(void* dest, const void* src, size_t count);
+void copy_and_swap_32_aligned(void* dest, const void* src, size_t count);
+void copy_and_swap_32_unaligned(void* dest, const void* src, size_t count);
+void copy_and_swap_64_aligned(void* dest, const void* src, size_t count);
+void copy_and_swap_64_unaligned(void* dest, const void* src, size_t count);
+void copy_and_swap_16_in_32_aligned(void* dest, const void* src, size_t count);
 
 template <typename T>
 void copy_and_swap(T* dest, const T* src, size_t count) {
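
The typed copy_and_swap<T> template above survives the reshape, so homogeneous arrays keep a cast-free path while the new void* variants serve raw byte buffers. A usage sketch (data and sizes are illustrative; assumes the declarations above are in scope):

#include <cstdint>

void swap_demo(const uint32_t (&words_be)[6]) {
  uint32_t words_le[6];
  // Typed template: element type deduced, no casts.
  xe::copy_and_swap(words_le, words_be, 6);

  // void* variant: destination is an untyped staging area; the "32" in the
  // name fixes the element width, and 6 is still an element count.
  uint8_t staging[sizeof(words_be)];
  xe::copy_and_swap_32_unaligned(staging, words_be, 6);
}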


@@ -1019,8 +1019,7 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader,
       1,  // max z
   };
   assert_true(endianness == Endian::k8in16);
-  xe::copy_and_swap_16_aligned(
-      reinterpret_cast<uint16_t*>(memory_->TranslatePhysical(address)), extents,
-      xe::countof(extents));
+  xe::copy_and_swap_16_aligned(memory_->TranslatePhysical(address), extents,
+                               xe::countof(extents));
   trace_writer_.WriteMemoryWrite(CpuToGpu(address), sizeof(extents));
   return true;
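
One convention worth noting in this call: the count argument is in elements, not bytes. xe::countof(extents) yields the number of uint16_t entries, while the trace write on the next line records sizeof(extents) bytes. A toy restatement (array values hypothetical, not the real extents):

#include <cstdint>

void extents_demo() {
  uint16_t extents[6] = {0, 0x2000, 0, 0x2000, 0, 1};  // hypothetical values
  uint16_t swapped[6];
  // 6 elements are swapped; the same array measures 12 bytes via sizeof().
  xe::copy_and_swap_16_aligned(swapped, extents, xe::countof(extents));
}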


@@ -1410,7 +1410,7 @@ GL4CommandProcessor::UpdateStatus GL4CommandProcessor::PopulateVertexBuffers() {
       // as we copy and only if it differs from the previous value committing
       // it (and if it matches just discard and reuse).
       xe::copy_and_swap_32_aligned(
-          reinterpret_cast<uint32_t*>(allocation.host_ptr),
+          allocation.host_ptr,
           memory_->TranslatePhysical<const uint32_t*>(fetch->address << 2),
           valid_range / 4);


@@ -662,19 +662,13 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
                  size_t length) {
   switch (endianness) {
     case Endian::k8in16:
-      xe::copy_and_swap_16_aligned(reinterpret_cast<uint16_t*>(dest),
-                                   reinterpret_cast<const uint16_t*>(src),
-                                   length / 2);
+      xe::copy_and_swap_16_aligned(dest, src, length / 2);
       break;
     case Endian::k8in32:
-      xe::copy_and_swap_32_aligned(reinterpret_cast<uint32_t*>(dest),
-                                   reinterpret_cast<const uint32_t*>(src),
-                                   length / 4);
+      xe::copy_and_swap_32_aligned(dest, src, length / 4);
       break;
     case Endian::k16in32:  // Swap high and low 16 bits within a 32 bit word
-      xe::copy_and_swap_16_in_32_aligned(reinterpret_cast<uint32_t*>(dest),
-                                         reinterpret_cast<const uint32_t*>(src),
-                                         length);
+      xe::copy_and_swap_16_in_32_aligned(dest, src, length);
       break;
     default:
     case Endian::kUnspecified:
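
For reference, the three GPU endian modes handled above differ only in swap granularity; for a single 32-bit word the effects are as follows (a sketch with an arbitrary example value):

#include <cstdint>

void endian_demo() {
  uint32_t w = 0xAABBCCDDu;
  // k8in16: swap bytes within each 16-bit half -> 0xBBAADDCC.
  uint32_t r8in16 = ((w & 0x00FF00FFu) << 8) | ((w & 0xFF00FF00u) >> 8);
  // k8in32: reverse all four bytes -> 0xDDCCBBAA.
  uint32_t r8in32 = (w << 24) | ((w << 8) & 0x00FF0000u) |
                    ((w >> 8) & 0x0000FF00u) | (w >> 24);
  // k16in32: swap the 16-bit halves, bytes within them untouched -> 0xCCDDAABB.
  uint32_t r16in32 = (w << 16) | (w >> 16);
  (void)r8in16;
  (void)r8in32;
  (void)r16in32;
}

This also explains the length arithmetic at the call sites: length / 2 counts 16-bit elements for k8in16, and length / 4 counts 32-bit elements for k8in32.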


@@ -290,13 +290,13 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
   if (format == IndexFormat::kInt16) {
     // Endian::k8in16, swap half-words.
     xe::copy_and_swap_16_aligned(
-        reinterpret_cast<uint16_t*>(transient_buffer_data_) + offset,
-        reinterpret_cast<const uint16_t*>(source_ptr), source_length / 2);
+        reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
+        source_length / 2);
   } else if (format == IndexFormat::kInt32) {
     // Endian::k8in32, swap words.
     xe::copy_and_swap_32_aligned(
-        reinterpret_cast<uint32_t*>(transient_buffer_data_) + offset,
-        reinterpret_cast<const uint32_t*>(source_ptr), source_length / 4);
+        reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
+        source_length / 4);
   }
 
   return {transient_index_buffer_, offset};

@@ -317,8 +317,8 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
   // TODO(benvanik): memcpy then use compute shaders to swap?
   // Endian::k8in32, swap words.
   xe::copy_and_swap_32_aligned(
-      reinterpret_cast<uint32_t*>(transient_buffer_data_) + offset,
-      reinterpret_cast<const uint32_t*>(source_ptr), source_length / 4);
+      reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
+      source_length / 4);
 
   return {transient_vertex_buffer_, offset};
 }
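
These two hunks are the "fixes buffer/constant uploads" half of the commit: offset is a byte offset into the transient buffer, and adding it to a uint16_t* or uint32_t* scaled it by the element size, landing uploads 2x or 4x past the intended spot. A minimal sketch of the arithmetic (function names hypothetical):

#include <cstddef>
#include <cstdint>

void* old_target(void* base, size_t byte_offset) {
  // Pointer math scales by sizeof(uint16_t): yields base + 2 * byte_offset.
  return reinterpret_cast<uint16_t*>(base) + byte_offset;
}

void* new_target(void* base, size_t byte_offset) {
  // uint8_t math keeps the offset in bytes: yields base + byte_offset.
  return reinterpret_cast<uint8_t*>(base) + byte_offset;
}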


@@ -343,8 +343,7 @@ void VdSwap(lpvoid_t buffer_ptr,  // ptr into primary ringbuffer
             lpunknown_t unk8, unknown_t unk9) {
   gpu::xenos::xe_gpu_texture_fetch_t fetch;
   xe::copy_and_swap_32_unaligned(
-      reinterpret_cast<uint32_t*>(&fetch),
-      reinterpret_cast<uint32_t*>(fetch_ptr.host_address()), 6);
+      &fetch, reinterpret_cast<uint32_t*>(fetch_ptr.host_address()), 6);
 
   auto color_format = gpu::ColorFormat(color_format_ptr.value());
   auto color_space = *color_space_ptr;