diff --git a/src/xenia/base/memory.cc b/src/xenia/base/memory.cc
index 223ebf379..534b63f9c 100644
--- a/src/xenia/base/memory.cc
+++ b/src/xenia/base/memory.cc
@@ -106,8 +106,12 @@ void copy_and_swap_64_unaligned(void* dest_ptr, const void* src_ptr,
   }
 }
 
-void copy_and_swap_16_in_32_aligned(void* dest_ptr, const void* src_ptr,
-                                    size_t count) {
+void copy_and_swap_16_in_32_aligned(void* dest, const void* src, size_t count) {
+  return copy_and_swap_16_in_32_unaligned(dest, src, count);
+}
+
+void copy_and_swap_16_in_32_unaligned(void* dest_ptr, const void* src_ptr,
+                                      size_t count) {
   auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
   auto src = reinterpret_cast<const uint32_t*>(src_ptr);
   size_t i;
diff --git a/src/xenia/base/memory.h b/src/xenia/base/memory.h
index c35bfb1db..4b8a99237 100644
--- a/src/xenia/base/memory.h
+++ b/src/xenia/base/memory.h
@@ -130,6 +130,8 @@ void copy_and_swap_32_unaligned(void* dest, const void* src, size_t count);
 void copy_and_swap_64_aligned(void* dest, const void* src, size_t count);
 void copy_and_swap_64_unaligned(void* dest, const void* src, size_t count);
 void copy_and_swap_16_in_32_aligned(void* dest, const void* src, size_t count);
+void copy_and_swap_16_in_32_unaligned(void* dest, const void* src,
+                                      size_t count);
 
 template <typename T>
 void copy_and_swap(T* dest, const T* src, size_t count) {
diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc
index 12d87899d..bcd2e98c9 100644
--- a/src/xenia/gpu/vulkan/buffer_cache.cc
+++ b/src/xenia/gpu/vulkan/buffer_cache.cc
@@ -25,9 +25,9 @@ using xe::ui::vulkan::CheckResult;
 constexpr VkDeviceSize kConstantRegisterUniformRange =
     512 * 4 * 4 + 8 * 4 + 32 * 4;
 
-BufferCache::BufferCache(RegisterFile* register_file,
+BufferCache::BufferCache(RegisterFile* register_file, Memory* memory,
                          ui::vulkan::VulkanDevice* device, size_t capacity)
-    : register_file_(register_file), device_(*device) {
+    : register_file_(register_file), memory_(memory), device_(*device) {
   transient_buffer_ = std::make_unique<ui::vulkan::CircularBuffer>(
       device, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
                   VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
@@ -229,15 +229,22 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadConstantRegisters(
 }
 
 std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
-    const void* source_ptr, size_t source_length, IndexFormat format,
+    uint32_t source_addr, uint32_t source_length, IndexFormat format,
     VkFence fence) {
+  auto offset = FindCachedTransientData(source_addr, source_length);
+  if (offset != VK_WHOLE_SIZE) {
+    return {transient_buffer_->gpu_buffer(), offset};
+  }
+
   // Allocate space in the buffer for our data.
-  auto offset = AllocateTransientData(source_length, fence);
+  offset = AllocateTransientData(source_length, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {nullptr, VK_WHOLE_SIZE};
   }
 
+  const void* source_ptr = memory_->TranslatePhysical(source_addr);
+
   // Copy data into the buffer.
   // TODO(benvanik): get min/max indices and pass back?
   // TODO(benvanik): memcpy then use compute shaders to swap?
@@ -251,28 +258,41 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
                                  source_ptr, source_length / 4);
   }
 
+  CacheTransientData(source_addr, source_length, offset);
   return {transient_buffer_->gpu_buffer(), offset};
 }
 
 std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
-    const void* source_ptr, size_t source_length, Endian endian,
+    uint32_t source_addr, uint32_t source_length, Endian endian,
     VkFence fence) {
+  auto offset = FindCachedTransientData(source_addr, source_length);
+  if (offset != VK_WHOLE_SIZE) {
+    return {transient_buffer_->gpu_buffer(), offset};
+  }
+
   // Allocate space in the buffer for our data.
-  auto offset = AllocateTransientData(source_length, fence);
+  offset = AllocateTransientData(source_length, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {nullptr, VK_WHOLE_SIZE};
   }
 
+  const void* source_ptr = memory_->TranslatePhysical(source_addr);
+
   // Copy data into the buffer.
   // TODO(benvanik): memcpy then use compute shaders to swap?
-  assert_true(endian == Endian::k8in32);
   if (endian == Endian::k8in32) {
     // Endian::k8in32, swap words.
     xe::copy_and_swap_32_aligned(transient_buffer_->host_base() + offset,
                                  source_ptr, source_length / 4);
+  } else if (endian == Endian::k16in32) {
+    xe::copy_and_swap_16_in_32_aligned(transient_buffer_->host_base() + offset,
+                                       source_ptr, source_length / 4);
+  } else {
+    assert_always();
   }
 
+  CacheTransientData(source_addr, source_length, offset);
   return {transient_buffer_->gpu_buffer(), offset};
 }
 
@@ -304,6 +324,24 @@ VkDeviceSize BufferCache::TryAllocateTransientData(VkDeviceSize length,
   return VK_WHOLE_SIZE;
 }
 
+VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
+                                                  uint32_t guest_length) {
+  uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
+  auto it = transient_cache_.find(key);
+  if (it != transient_cache_.end()) {
+    return it->second;
+  }
+
+  return VK_WHOLE_SIZE;
+}
+
+void BufferCache::CacheTransientData(uint32_t guest_address,
+                                     uint32_t guest_length,
+                                     VkDeviceSize offset) {
+  uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
+  transient_cache_[key] = offset;
+}
+
 void BufferCache::Flush(VkCommandBuffer command_buffer) {
   // If we are flushing a big enough chunk queue up an event.
   // We don't want to do this for everything but often enough so that we won't
@@ -331,7 +369,10 @@ void BufferCache::InvalidateCache() {
 
 void BufferCache::ClearCache() { transient_cache_.clear(); }
 
-void BufferCache::Scavenge() { transient_buffer_->Scavenge(); }
+void BufferCache::Scavenge() {
+  transient_cache_.clear();
+  transient_buffer_->Scavenge();
+}
 
 }  // namespace vulkan
 }  // namespace gpu
diff --git a/src/xenia/gpu/vulkan/buffer_cache.h b/src/xenia/gpu/vulkan/buffer_cache.h
index bde7d5cb2..8740c45f6 100644
--- a/src/xenia/gpu/vulkan/buffer_cache.h
+++ b/src/xenia/gpu/vulkan/buffer_cache.h
@@ -13,11 +12,12 @@
 #include "xenia/gpu/register_file.h"
 #include "xenia/gpu/shader.h"
 #include "xenia/gpu/xenos.h"
+#include "xenia/memory.h"
 #include "xenia/ui/vulkan/circular_buffer.h"
 #include "xenia/ui/vulkan/vulkan.h"
 #include "xenia/ui/vulkan/vulkan_device.h"
 
-#include <unordered_map>
+#include <map>
 
 namespace xe {
 namespace gpu {
@@ -28,8 +29,8 @@ namespace vulkan {
 // transient data like shader constants.
 class BufferCache {
  public:
-  BufferCache(RegisterFile* register_file, ui::vulkan::VulkanDevice* device,
-              size_t capacity);
+  BufferCache(RegisterFile* register_file, Memory* memory,
+              ui::vulkan::VulkanDevice* device, size_t capacity);
   ~BufferCache();
 
   // Descriptor set containing the dynamic uniform buffer used for constant
@@ -60,8 +61,8 @@ class BufferCache {
   // recently uploaded data or cached copies.
   // Returns a buffer and offset that can be used with vkCmdBindIndexBuffer.
   // Size will be VK_WHOLE_SIZE if the data could not be uploaded (OOM).
-  std::pair<VkBuffer, VkDeviceSize> UploadIndexBuffer(const void* source_ptr,
-                                                      size_t source_length,
+  std::pair<VkBuffer, VkDeviceSize> UploadIndexBuffer(uint32_t source_addr,
+                                                      uint32_t source_length,
                                                       IndexFormat format,
                                                       VkFence fence);
 
@@ -69,8 +70,8 @@ class BufferCache {
   // recently uploaded data or cached copies.
   // Returns a buffer and offset that can be used with vkCmdBindVertexBuffers.
   // Size will be VK_WHOLE_SIZE if the data could not be uploaded (OOM).
-  std::pair<VkBuffer, VkDeviceSize> UploadVertexBuffer(const void* source_ptr,
-                                                       size_t source_length,
+  std::pair<VkBuffer, VkDeviceSize> UploadVertexBuffer(uint32_t source_addr,
+                                                       uint32_t source_length,
                                                        Endian endian,
                                                        VkFence fence);
 
@@ -99,8 +100,16 @@ class BufferCache {
   // Tries to allocate a block of memory in the transient buffer.
   // Returns VK_WHOLE_SIZE if requested amount of memory is not available.
   VkDeviceSize TryAllocateTransientData(VkDeviceSize length, VkFence fence);
+  // Finds a block of data in the transient buffer sourced from the specified
+  // guest address and length.
+  VkDeviceSize FindCachedTransientData(uint32_t guest_address,
+                                       uint32_t guest_length);
+  // Adds a block of data to the frame cache.
+  void CacheTransientData(uint32_t guest_address, uint32_t guest_length,
+                          VkDeviceSize offset);
 
   RegisterFile* register_file_ = nullptr;
+  Memory* memory_ = nullptr;
   VkDevice device_ = nullptr;
 
   VkDeviceMemory gpu_memory_pool_ = nullptr;
@@ -108,7 +117,7 @@ class BufferCache {
   // Staging ringbuffer we cycle through fast. Used for data we don't
   // plan on keeping past the current frame.
   std::unique_ptr<ui::vulkan::CircularBuffer> transient_buffer_ = nullptr;
-  std::unordered_map<uint64_t, VkDeviceSize> transient_cache_;
+  std::map<uint64_t, VkDeviceSize> transient_cache_;
 
   VkDescriptorPool descriptor_pool_ = nullptr;
   VkDescriptorSetLayout descriptor_set_layout_ = nullptr;
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
index 2d08fc35a..fd23af343 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@@ -73,8 +73,8 @@ bool VulkanCommandProcessor::SetupContext() {
       *device_, device_->queue_family_index(), VK_COMMAND_BUFFER_LEVEL_PRIMARY);
 
   // Initialize the state machine caches.
-  buffer_cache_ = std::make_unique<BufferCache>(register_file_, device_,
-                                                kDefaultBufferCacheCapacity);
+  buffer_cache_ = std::make_unique<BufferCache>(
+      register_file_, memory_, device_, kDefaultBufferCacheCapacity);
   texture_cache_ = std::make_unique<TextureCache>(memory_, register_file_,
                                                   &trace_writer_, device_);
   pipeline_cache_ = std::make_unique<PipelineCache>(
@@ -696,13 +696,12 @@ bool VulkanCommandProcessor::PopulateIndexBuffer(
   trace_writer_.WriteMemoryRead(info.guest_base, info.length);
 
   // Upload (or get a cached copy of) the buffer.
-  const void* source_ptr =
-      memory_->TranslatePhysical<void*>(info.guest_base);
-  size_t source_length =
+  uint32_t source_addr = info.guest_base;
+  uint32_t source_length =
       info.count * (info.format == IndexFormat::kInt32 ? sizeof(uint32_t)
                                                        : sizeof(uint16_t));
   auto buffer_ref = buffer_cache_->UploadIndexBuffer(
-      source_ptr, source_length, info.format, current_batch_fence_);
+      source_addr, source_length, info.format, current_batch_fence_);
   if (buffer_ref.second == VK_WHOLE_SIZE) {
     // Failed to upload buffer.
     return false;
@@ -764,11 +763,9 @@ bool VulkanCommandProcessor::PopulateVertexBuffers(
     trace_writer_.WriteMemoryRead(physical_address, valid_range);
 
     // Upload (or get a cached copy of) the buffer.
-    const void* source_ptr =
-        memory_->TranslatePhysical<void*>(physical_address);
-    size_t source_length = valid_range;
+    uint32_t source_length = uint32_t(valid_range);
     auto buffer_ref = buffer_cache_->UploadVertexBuffer(
-        source_ptr, source_length, static_cast<Endian>(fetch->endian),
+        physical_address, source_length, static_cast<Endian>(fetch->endian),
         current_batch_fence_);
     if (buffer_ref.second == VK_WHOLE_SIZE) {
       // Failed to upload buffer.