From cd02cdfc70584d98e42e1f6bbb1580217c9e75d3 Mon Sep 17 00:00:00 2001
From: Ben Vanik
Date: Sat, 20 Feb 2016 19:19:29 -0800
Subject: [PATCH] Making memory API less error prone; fixes buffer/constant
 uploads.

---
 src/xenia/base/memory.cc                    | 90 +++++++++------------
 src/xenia/base/memory.h                     | 21 ++---
 src/xenia/gpu/command_processor.cc          |  5 +-
 src/xenia/gpu/gl4/gl4_command_processor.cc  |  2 +-
 src/xenia/gpu/gl4/texture_cache.cc          | 12 +--
 src/xenia/gpu/vulkan/buffer_cache.cc        | 12 +--
 src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc |  3 +-
 7 files changed, 60 insertions(+), 85 deletions(-)

diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc
index f83b01d72..223ebf379 100644
--- a/src/xenia/base/memory.cc
+++ b/src/xenia/base/memory.cc
@@ -22,109 +22,99 @@ void copy_128_aligned(void* dest, const void* src, size_t count) {
   std::memcpy(dest, src, count * 16);
 }
 
-void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
-                              size_t count) {
+void copy_and_swap_16_aligned(void* dest, const void* src, size_t count) {
   return copy_and_swap_16_unaligned(dest, src, count);
 }
 
-void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
+void copy_and_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
                                 size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
   size_t i;
-  __m128i input, output;
   for (i = 0; i + 8 <= count; i += 8) {
-    input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-    output = _mm_or_si128(_mm_slli_epi16(input, 8), _mm_srli_epi16(input, 8));
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output =
+        _mm_or_si128(_mm_slli_epi16(input, 8), _mm_srli_epi16(input, 8));
     _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
   }
-
   for (; i < count; ++i) {  // handle residual elements
     dest[i] = byte_swap(src[i]);
   }
 }
 
-void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
-                              size_t count) {
+void copy_and_swap_32_aligned(void* dest, const void* src, size_t count) {
   return copy_and_swap_32_unaligned(dest, src, count);
 }
 
-void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
+void copy_and_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
                                 size_t count) {
-  size_t i;
-  __m128i input, byte1, byte2, byte3, byte4, output;
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
   __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
   __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
+  size_t i;
   for (i = 0; i + 4 <= count; i += 4) {
-    input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-
-    // Do the four shifts
-    byte1 = _mm_slli_epi32(input, 24);
-    byte2 = _mm_slli_epi32(input, 8);
-    byte3 = _mm_srli_epi32(input, 8);
-    byte4 = _mm_srli_epi32(input, 24);
-
-    // Or bytes together
-    output = _mm_or_si128(byte1, byte4);
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    // Do the four shifts.
+    __m128i byte1 = _mm_slli_epi32(input, 24);
+    __m128i byte2 = _mm_slli_epi32(input, 8);
+    __m128i byte3 = _mm_srli_epi32(input, 8);
+    __m128i byte4 = _mm_srli_epi32(input, 24);
+    // OR bytes together.
+    __m128i output = _mm_or_si128(byte1, byte4);
     byte2 = _mm_and_si128(byte2, byte2mask);
     output = _mm_or_si128(output, byte2);
     byte3 = _mm_and_si128(byte3, byte3mask);
     output = _mm_or_si128(output, byte3);
-
     _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
   }
-
   for (; i < count; ++i) {  // handle residual elements
     dest[i] = byte_swap(src[i]);
   }
 }
 
-void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
-                              size_t count) {
+void copy_and_swap_64_aligned(void* dest, const void* src, size_t count) {
   return copy_and_swap_64_unaligned(dest, src, count);
 }
 
-void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
+void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr,
                                 size_t count) {
-  size_t i;
-  __m128i input, byte1, byte2, byte3, byte4, output;
+  auto dest = reinterpret_cast<uint64_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint64_t*>(src_ptr);
  __m128i byte2mask = _mm_set1_epi32(0x00FF0000);
   __m128i byte3mask = _mm_set1_epi32(0x0000FF00);
-
+  size_t i;
   for (i = 0; i + 2 <= count; i += 2) {
-    input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-
-    // Do the four shifts
-    byte1 = _mm_slli_epi32(input, 24);
-    byte2 = _mm_slli_epi32(input, 8);
-    byte3 = _mm_srli_epi32(input, 8);
-    byte4 = _mm_srli_epi32(input, 24);
-
-    // Or bytes together
-    output = _mm_or_si128(byte1, byte4);
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    // Do the four shifts.
+    __m128i byte1 = _mm_slli_epi32(input, 24);
+    __m128i byte2 = _mm_slli_epi32(input, 8);
+    __m128i byte3 = _mm_srli_epi32(input, 8);
+    __m128i byte4 = _mm_srli_epi32(input, 24);
+    // OR bytes together.
+    __m128i output = _mm_or_si128(byte1, byte4);
     byte2 = _mm_and_si128(byte2, byte2mask);
     output = _mm_or_si128(output, byte2);
     byte3 = _mm_and_si128(byte3, byte3mask);
     output = _mm_or_si128(output, byte3);
-
-    // Reorder the two words
+    // Reorder the two words.
     output = _mm_shuffle_epi32(output, _MM_SHUFFLE(2, 3, 0, 1));
-
     _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
   }
-
   for (; i < count; ++i) {  // handle residual elements
     dest[i] = byte_swap(src[i]);
   }
 }
 
-void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
+void copy_and_swap_16_in_32_aligned(void* dest_ptr, const void* src_ptr,
                                     size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
   size_t i;
-  __m128i input, output;
   for (i = 0; i + 4 <= count; i += 4) {
-    input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
-    output = _mm_or_si128(_mm_slli_epi32(input, 16), _mm_srli_epi32(input, 16));
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output =
+        _mm_or_si128(_mm_slli_epi32(input, 16), _mm_srli_epi32(input, 16));
     _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
   }
   for (; i < count; ++i) {  // handle residual elements
diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h
index 183843416..c35bfb1db 100644
--- a/src/xenia/base/memory.h
+++ b/src/xenia/base/memory.h
@@ -123,20 +123,13 @@ inline void* low_address(void* address) {
 
 void copy_128_aligned(void* dest, const void* src, size_t count);
 
-void copy_and_swap_16_aligned(uint16_t* dest, const uint16_t* src,
-                              size_t count);
-void copy_and_swap_16_unaligned(uint16_t* dest, const uint16_t* src,
-                                size_t count);
-void copy_and_swap_32_aligned(uint32_t* dest, const uint32_t* src,
-                              size_t count);
-void copy_and_swap_32_unaligned(uint32_t* dest, const uint32_t* src,
-                                size_t count);
-void copy_and_swap_64_aligned(uint64_t* dest, const uint64_t* src,
-                              size_t count);
-void copy_and_swap_64_unaligned(uint64_t* dest, const uint64_t* src,
-                                size_t count);
-void copy_and_swap_16_in_32_aligned(uint32_t* dest, const uint32_t* src,
-                                    size_t count);
+void copy_and_swap_16_aligned(void* dest, const void* src, size_t count);
+void copy_and_swap_16_unaligned(void* dest, const void* src, size_t count);
+void copy_and_swap_32_aligned(void* dest, const void* src, size_t count);
+void copy_and_swap_32_unaligned(void* dest, const void* src, size_t count);
+void copy_and_swap_64_aligned(void* dest, const void* src, size_t count);
+void copy_and_swap_64_unaligned(void* dest, const void* src, size_t count);
+void copy_and_swap_16_in_32_aligned(void* dest, const void* src, size_t count);
 
 template <typename T>
 void copy_and_swap(T* dest, const T* src, size_t count) {
diff --git a/src/xenia/gpu/command_processor.cc b/src/xenia/gpu/command_processor.cc
index 545679d6b..14e381b6d 100644
--- a/src/xenia/gpu/command_processor.cc
+++ b/src/xenia/gpu/command_processor.cc
@@ -1019,9 +1019,8 @@ bool CommandProcessor::ExecutePacketType3_EVENT_WRITE_EXT(RingBuffer* reader,
       1,  // max z
   };
   assert_true(endianness == Endian::k8in16);
-  xe::copy_and_swap_16_aligned(
-      reinterpret_cast<uint16_t*>(memory_->TranslatePhysical(address)), extents,
-      xe::countof(extents));
+  xe::copy_and_swap_16_aligned(memory_->TranslatePhysical(address), extents,
+                               xe::countof(extents));
   trace_writer_.WriteMemoryWrite(CpuToGpu(address), sizeof(extents));
   return true;
 }
diff --git a/src/xenia/gpu/gl4/gl4_command_processor.cc b/src/xenia/gpu/gl4/gl4_command_processor.cc
index 2305b38b4..aaed5c0c6 100644
--- a/src/xenia/gpu/gl4/gl4_command_processor.cc
+++ b/src/xenia/gpu/gl4/gl4_command_processor.cc
@@ -1410,7 +1410,7 @@ GL4CommandProcessor::UpdateStatus GL4CommandProcessor::PopulateVertexBuffers() {
       // as we copy and only if it differs from the previous value committing
       // it (and if it matches just discard and reuse).
       xe::copy_and_swap_32_aligned(
-          reinterpret_cast<uint32_t*>(allocation.host_ptr),
+          allocation.host_ptr,
           memory_->TranslatePhysical(fetch->address << 2),
           valid_range / 4);
diff --git a/src/xenia/gpu/gl4/texture_cache.cc b/src/xenia/gpu/gl4/texture_cache.cc
index 0e1132218..4a8917e71 100644
--- a/src/xenia/gpu/gl4/texture_cache.cc
+++ b/src/xenia/gpu/gl4/texture_cache.cc
@@ -662,19 +662,13 @@ void TextureSwap(Endian endianness, void* dest, const void* src,
                  size_t length) {
   switch (endianness) {
     case Endian::k8in16:
-      xe::copy_and_swap_16_aligned(reinterpret_cast<uint16_t*>(dest),
-                                   reinterpret_cast<const uint16_t*>(src),
-                                   length / 2);
+      xe::copy_and_swap_16_aligned(dest, src, length / 2);
       break;
     case Endian::k8in32:
-      xe::copy_and_swap_32_aligned(reinterpret_cast<uint32_t*>(dest),
-                                   reinterpret_cast<const uint32_t*>(src),
-                                   length / 4);
+      xe::copy_and_swap_32_aligned(dest, src, length / 4);
       break;
     case Endian::k16in32:  // Swap high and low 16 bits within a 32 bit word
-      xe::copy_and_swap_16_in_32_aligned(reinterpret_cast<uint32_t*>(dest),
-                                         reinterpret_cast<const uint32_t*>(src),
-                                         length);
+      xe::copy_and_swap_16_in_32_aligned(dest, src, length);
       break;
     default:
     case Endian::kUnspecified:
diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc
index 32c2cef4b..1def6d26f 100644
--- a/src/xenia/gpu/vulkan/buffer_cache.cc
+++ b/src/xenia/gpu/vulkan/buffer_cache.cc
@@ -290,13 +290,13 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
   if (format == IndexFormat::kInt16) {
     // Endian::k8in16, swap half-words.
     xe::copy_and_swap_16_aligned(
-        reinterpret_cast<uint16_t*>(transient_buffer_data_) + offset,
-        reinterpret_cast<const uint16_t*>(source_ptr), source_length / 2);
+        reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
+        source_length / 2);
   } else if (format == IndexFormat::kInt32) {
     // Endian::k8in32, swap words.
     xe::copy_and_swap_32_aligned(
-        reinterpret_cast<uint32_t*>(transient_buffer_data_) + offset,
-        reinterpret_cast<const uint32_t*>(source_ptr), source_length / 4);
+        reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
+        source_length / 4);
   }
 
   return {transient_index_buffer_, offset};
@@ -317,8 +317,8 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
   // TODO(benvanik): memcpy then use compute shaders to swap?
   // Endian::k8in32, swap words.
   xe::copy_and_swap_32_aligned(
-      reinterpret_cast<uint32_t*>(transient_buffer_data_) + offset,
-      reinterpret_cast<const uint32_t*>(source_ptr), source_length / 4);
+      reinterpret_cast<uint8_t*>(transient_buffer_data_) + offset, source_ptr,
+      source_length / 4);
 
   return {transient_vertex_buffer_, offset};
 }
diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc
index 895e85455..e979cb62a 100644
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_video.cc
@@ -343,8 +343,7 @@ void VdSwap(lpvoid_t buffer_ptr,  // ptr into primary ringbuffer
             lpunknown_t unk8, unknown_t unk9) {
   gpu::xenos::xe_gpu_texture_fetch_t fetch;
   xe::copy_and_swap_32_unaligned(
-      reinterpret_cast<uint32_t*>(&fetch),
-      reinterpret_cast<uint32_t*>(fetch_ptr.host_address()), 6);
+      &fetch, reinterpret_cast<uint32_t*>(fetch_ptr.host_address()), 6);
   auto color_format = gpu::ColorFormat(color_format_ptr.value());
   auto color_space = *color_space_ptr;
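
Note (not part of the patch): a minimal sketch of what a call site looks like
after this change. The helper name upload_words and its parameters are made up
for illustration; only the xe::copy_and_swap_32_unaligned(void*, const void*,
size_t) signature comes from the patch itself.

    // Hypothetical call site: byte-swap byte_length bytes of big-endian guest
    // data into a host staging buffer at a byte offset.
    #include <cstddef>
    #include <cstdint>

    #include "xenia/base/memory.h"

    void upload_words(uint8_t* staging, size_t byte_offset,
                      const void* guest_src, size_t byte_length) {
      // Before this patch the destination had to be cast to uint32_t*, which
      // made it easy to add a *byte* offset to a *word* pointer (the upload
      // bug this commit fixes). With void* parameters the offset arithmetic
      // stays in uint8_t* space and the source needs no cast at all.
      xe::copy_and_swap_32_unaligned(staging + byte_offset, guest_src,
                                     byte_length / 4);
    }

Since the aligned variants currently just forward to the unaligned ones, the
unaligned form is the safe default whenever the offset is not known to be
16-byte aligned.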