Merge branch 'vtx_cache'

2018-02-18 16:49:32 -06:00 · 2018-02-18 16:49:32 -06:00 · 79308b477f
parent 7fa27343c7 71e5e967ec
commit 79308b477f
12 changed files with 247 additions and 82 deletions
--- a/src/xenia/cpu/mmio_handler.cc
+++ b/src/xenia/cpu/mmio_handler.cc
@ -240,12 +240,20 @@ bool MMIOHandler::IsRangeWatched(uint32_t physical_address, size_t length) {
  for (auto it = access_watches_.begin(); it != access_watches_.end(); ++it) {
    auto entry = *it;
    if ((entry->address <= physical_address &&
-         entry->address + entry->length > physical_address) ||
-        (entry->address >= physical_address &&
-         entry->address < physical_address + length)) {
-      // This watch lies within the range.
+         entry->address + entry->length > physical_address + length)) {
+      // This range lies entirely within this watch.
      return true;
    }
+
+    // TODO(DrChat): Check if the range is partially covered, and subtract the
+    // covered portion if it is.
+    if ((entry->address <= physical_address &&
+         entry->address + entry->length > physical_address)) {
+      // The beginning of range lies partially within this watch.
+    } else if ((entry->address < physical_address + length &&
+                entry->address + entry->length > physical_address + length)) {
+      // The ending of this range lies partially within this watch.
+    }
  }

  return false;
--- a/src/xenia/cpu/mmio_handler.h
+++ b/src/xenia/cpu/mmio_handler.h
@ -77,7 +77,7 @@ class MMIOHandler {
  // Fires and clears any access watches that overlap this range.
  void InvalidateRange(uint32_t physical_address, size_t length);

-  // Returns true if /any/ part of this range is watched.
+  // Returns true if /all/ of this range is watched.
  bool IsRangeWatched(uint32_t physical_address, size_t length);

 protected:
--- a/src/xenia/gpu/register_table.inc
+++ b/src/xenia/gpu/register_table.inc
@ -47,6 +47,10 @@ XE_GPU_REGISTER(0x0D04, kDword, SQ_EO_RT)

 XE_GPU_REGISTER(0x0C85, kDword, PA_CL_ENHANCE)

+// Set with WAIT_UNTIL = WAIT_3D_IDLECLEAN
+XE_GPU_REGISTER(0x0E00, kDword, UNKNOWN_0E00)
+XE_GPU_REGISTER(0x0E40, kDword, UNKNOWN_0E40)
+
 XE_GPU_REGISTER(0x0E42, kDword, UNKNOWN_0E42)

 XE_GPU_REGISTER(0x0F01, kDword, RB_BC_CONTROL)
--- a/src/xenia/gpu/vulkan/buffer_cache.cc
+++ b/src/xenia/gpu/vulkan/buffer_cache.cc
@ -16,10 +16,80 @@
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/vulkan/vulkan_gpu_flags.h"

+#include "third_party/vulkan/vk_mem_alloc.h"
+
 namespace xe {
 namespace gpu {
 namespace vulkan {

+#if XE_ARCH_AMD64
+// Copies |count| 16-bit values from src_ptr to dest_ptr, byte-swapping each
+// one and replacing every swapped value equal to cmp_value with 0xFFFF.
+// Eight values at a time go through unaligned SSE loads/stores; whatever is
+// left over is handled by a scalar loop with the same semantics.
+void copy_cmp_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint16_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
+  // Shuffle control that exchanges the two bytes of every 16-bit lane.
+  __m128i shufmask =
+      _mm_set_epi8(0x0E, 0x0F, 0x0C, 0x0D, 0x0A, 0x0B, 0x08, 0x09, 0x06, 0x07,
+                   0x04, 0x05, 0x02, 0x03, 0x00, 0x01);
+  __m128i cmpval = _mm_set1_epi16(cmp_value);

+  size_t i;
+  for (i = 0; i + 8 <= count; i += 8) {
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output = _mm_shuffle_epi8(input, shufmask);

+    // Lanes equal to cmp_value compare to all ones; OR-ing the mask in turns
+    // exactly those lanes into 0xFFFF and leaves the others untouched.
+    __m128i mask = _mm_cmpeq_epi16(output, cmpval);
+    output = _mm_or_si128(output, mask);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // handle residual elements
+    // The tail must apply the same replacement as the vector loop, otherwise
+    // a reset index among the last (count % 8) values is copied unchanged.
+    uint16_t value = byte_swap(src[i]);
+    dest[i] = value == cmp_value ? 0xFFFF : value;
+  }
+}
+
+// Copies |count| 32-bit values from src_ptr to dest_ptr, byte-swapping each
+// one and replacing every swapped value equal to cmp_value with 0xFFFFFFFF.
+// Four values at a time go through unaligned SSE loads/stores; whatever is
+// left over is handled by a scalar loop with the same semantics.
+void copy_cmp_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint32_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
+  // Shuffle control that reverses the four bytes of every 32-bit lane.
+  __m128i shufmask =
+      _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
+                   0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
+  __m128i cmpval = _mm_set1_epi32(cmp_value);

+  size_t i;
+  for (i = 0; i + 4 <= count; i += 4) {
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output = _mm_shuffle_epi8(input, shufmask);

+    // Lanes equal to cmp_value compare to all ones; OR-ing the mask in turns
+    // exactly those lanes into 0xFFFFFFFF and leaves the others untouched.
+    __m128i mask = _mm_cmpeq_epi32(output, cmpval);
+    output = _mm_or_si128(output, mask);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // handle residual elements
+    // The tail must apply the same replacement as the vector loop, otherwise
+    // a reset index among the last (count % 4) values is copied unchanged.
+    uint32_t value = byte_swap(src[i]);
+    dest[i] = value == cmp_value ? 0xFFFFFFFF : value;
+  }
+}
+#else
+// Portable fallback for builds without XE_ARCH_AMD64: copies |count| 16-bit
+// values from src_ptr to dest_ptr, byte-swapping each one and replacing every
+// swapped value equal to cmp_value with 0xFFFF.
+// The name has to match the XE_ARCH_AMD64 variant because UploadIndexBuffer
+// calls copy_cmp_swap_16_unaligned regardless of architecture.
+void copy_cmp_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint16_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
+  for (size_t i = 0; i < count; ++i) {
+    uint16_t value = byte_swap(src[i]);
+    dest[i] = value == cmp_value ? 0xFFFF : value;
+  }
+}
+
+// Portable fallback for builds without XE_ARCH_AMD64: copies |count| 32-bit
+// values from src_ptr to dest_ptr, byte-swapping each one and replacing every
+// swapped value equal to cmp_value with 0xFFFFFFFF.
+// The name has to match the XE_ARCH_AMD64 variant because UploadIndexBuffer
+// calls copy_cmp_swap_32_unaligned regardless of architecture.
+void copy_cmp_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint32_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
+  for (size_t i = 0; i < count; ++i) {
+    uint32_t value = byte_swap(src[i]);
+    dest[i] = value == cmp_value ? 0xFFFFFFFF : value;
+  }
+}
+#endif
+
 using xe::ui::vulkan::CheckResult;

 constexpr VkDeviceSize kConstantRegisterUniformRange =
@ -32,7 +102,7 @@ BufferCache::BufferCache(RegisterFile* register_file, Memory* memory,
      device_,
      VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
          VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-      capacity);
+      capacity, 4096);
 }

 BufferCache::~BufferCache() { Shutdown(); }
@ -47,6 +117,15 @@ VkResult BufferCache::Initialize() {
    return status;
  }

+  // Create a memory allocator for buffers.
+  VmaAllocatorCreateInfo alloc_info = {
+      0, *device_, *device_, 0, 0, nullptr, nullptr,
+  };
+  status = vmaCreateAllocator(&alloc_info, &mem_allocator_);
+  if (status != VK_SUCCESS) {
+    return status;
+  }
+
  // Descriptor pool used for all of our cached descriptors.
  // In the steady state we don't allocate anything, so these are all manually
  // managed.
@ -150,28 +229,23 @@ VkResult BufferCache::Initialize() {
 }

 void BufferCache::Shutdown() {
+  if (mem_allocator_) {
+    vmaDestroyAllocator(mem_allocator_);
+    mem_allocator_ = nullptr;
+  }
+
  if (transient_descriptor_set_) {
    vkFreeDescriptorSets(*device_, descriptor_pool_, 1,
                         &transient_descriptor_set_);
    transient_descriptor_set_ = nullptr;
  }

-  if (descriptor_set_layout_) {
-    vkDestroyDescriptorSetLayout(*device_, descriptor_set_layout_, nullptr);
-    descriptor_set_layout_ = nullptr;
-  }
-
-  if (descriptor_pool_) {
-    vkDestroyDescriptorPool(*device_, descriptor_pool_, nullptr);
-    descriptor_pool_ = nullptr;
-  }
+  VK_SAFE_DESTROY(vkDestroyDescriptorSetLayout, *device_,
+                  descriptor_set_layout_, nullptr);
+  VK_SAFE_DESTROY(vkDestroyDescriptorPool, *device_, descriptor_pool_, nullptr);

  transient_buffer_->Shutdown();
-
-  if (gpu_memory_pool_) {
-    vkFreeMemory(*device_, gpu_memory_pool_, nullptr);
-    gpu_memory_pool_ = nullptr;
-  }
+  VK_SAFE_DESTROY(vkFreeMemory, *device_, gpu_memory_pool_, nullptr);
 }

 std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
@ -278,13 +352,8 @@ std::pair<VkDeviceSize, VkDeviceSize> BufferCache::UploadConstantRegisters(
 std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
    VkCommandBuffer command_buffer, uint32_t source_addr,
    uint32_t source_length, IndexFormat format, VkFence fence) {
-  auto offset = FindCachedTransientData(source_addr, source_length);
-  if (offset != VK_WHOLE_SIZE) {
-    return {transient_buffer_->gpu_buffer(), offset};
-  }
-
  // Allocate space in the buffer for our data.
-  offset = AllocateTransientData(source_length, fence);
+  auto offset = AllocateTransientData(source_length, fence);
  if (offset == VK_WHOLE_SIZE) {
    // OOM.
    return {nullptr, VK_WHOLE_SIZE};
@ -292,9 +361,27 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(

  const void* source_ptr = memory_->TranslatePhysical(source_addr);

-  // Copy data into the buffer.
-  // TODO(benvanik): get min/max indices and pass back?
+  uint32_t prim_reset_index =
+      register_file_->values[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32;
+  bool prim_reset_enabled =
+      !!(register_file_->values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & (1 << 21));
+
+  // Copy data into the buffer. If primitive reset is enabled, translate any
+  // primitive reset indices to something Vulkan understands.
  // TODO(benvanik): memcpy then use compute shaders to swap?
+  if (prim_reset_enabled) {
+    if (format == IndexFormat::kInt16) {
+      // Endian::k8in16, swap half-words.
+      copy_cmp_swap_16_unaligned(
+          transient_buffer_->host_base() + offset, source_ptr,
+          static_cast<uint16_t>(prim_reset_index), source_length / 2);
+    } else if (format == IndexFormat::kInt32) {
+      // Endian::k8in32, swap words.
+      copy_cmp_swap_32_unaligned(transient_buffer_->host_base() + offset,
+                                 source_ptr, prim_reset_index,
+                                 source_length / 4);
+    }
+  } else {
    if (format == IndexFormat::kInt16) {
      // Endian::k8in16, swap half-words.
      xe::copy_and_swap_16_unaligned(transient_buffer_->host_base() + offset,
@ -304,6 +391,7 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
      xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
                                     source_ptr, source_length / 4);
    }
+  }

  transient_buffer_->Flush(offset, source_length);

@ -323,7 +411,6 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
                       VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1,
                       &barrier, 0, nullptr);

-  CacheTransientData(source_addr, source_length, offset);
  return {transient_buffer_->gpu_buffer(), offset};
 }

@ -335,29 +422,41 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
    return {transient_buffer_->gpu_buffer(), offset};
  }

+  // Slow path :)
+  // Expand the region up to the allocation boundary
+  auto physical_heap = memory_->GetPhysicalHeap();
+  uint32_t upload_base = source_addr;
+  uint32_t upload_size = source_length;
+
+  // Ping the memory subsystem for allocation size.
+  // TODO(DrChat): Artifacting occurring in GripShift with this enabled.
+  // physical_heap->QueryBaseAndSize(&upload_base, &upload_size);
+  assert(upload_base <= source_addr);
+  uint32_t source_offset = source_addr - upload_base;
+
  // Allocate space in the buffer for our data.
-  offset = AllocateTransientData(source_length, fence);
+  offset = AllocateTransientData(upload_size, fence);
  if (offset == VK_WHOLE_SIZE) {
    // OOM.
    return {nullptr, VK_WHOLE_SIZE};
  }

-  const void* source_ptr = memory_->TranslatePhysical(source_addr);
+  const void* upload_ptr = memory_->TranslatePhysical(upload_base);

  // Copy data into the buffer.
  // TODO(benvanik): memcpy then use compute shaders to swap?
  if (endian == Endian::k8in32) {
    // Endian::k8in32, swap words.
    xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
-                                   source_ptr, source_length / 4);
+                                   upload_ptr, source_length / 4);
  } else if (endian == Endian::k16in32) {
    xe::copy_and_swap_16_in_32_unaligned(
-        transient_buffer_->host_base() + offset, source_ptr, source_length / 4);
+        transient_buffer_->host_base() + offset, upload_ptr, source_length / 4);
  } else {
    assert_always();
  }

-  transient_buffer_->Flush(offset, source_length);
+  transient_buffer_->Flush(offset, upload_size);

  // Append a barrier to the command buffer.
  VkBufferMemoryBarrier barrier = {
@ -369,14 +468,14 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
      VK_QUEUE_FAMILY_IGNORED,
      transient_buffer_->gpu_buffer(),
      offset,
-      source_length,
+      upload_size,
  };
  vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_HOST_BIT,
                       VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1,
                       &barrier, 0, nullptr);

-  CacheTransientData(source_addr, source_length, offset);
-  return {transient_buffer_->gpu_buffer(), offset};
+  CacheTransientData(upload_base, upload_size, offset);
+  return {transient_buffer_->gpu_buffer(), offset + source_offset};
 }

 VkDeviceSize BufferCache::AllocateTransientData(VkDeviceSize length,
@ -409,10 +508,22 @@ VkDeviceSize BufferCache::TryAllocateTransientData(VkDeviceSize length,

 VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
                                                  uint32_t guest_length) {
-  uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
-  auto it = transient_cache_.find(key);
-  if (it != transient_cache_.end()) {
-    return it->second;
+  if (transient_cache_.empty()) {
+    // Short-circuit exit.
+    return VK_WHOLE_SIZE;
+  }
+
+  // Find the first element > guest_address
+  auto it = transient_cache_.upper_bound(guest_address);
+  if (it != transient_cache_.begin()) {
+    // it = first element <= guest_address
+    --it;
+
+    if ((it->first + it->second.first) >= (guest_address + guest_length)) {
+      // This data is contained within some existing transient data.
+      auto source_offset = static_cast<VkDeviceSize>(guest_address - it->first);
+      return it->second.second + source_offset;
+    }
  }

  return VK_WHOLE_SIZE;
@ -421,8 +532,17 @@ VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
 void BufferCache::CacheTransientData(uint32_t guest_address,
                                     uint32_t guest_length,
                                     VkDeviceSize offset) {
-  uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
-  transient_cache_[key] = offset;
+  transient_cache_[guest_address] = {guest_length, offset};
+
+  // Erase any entries contained within
+  auto it = transient_cache_.upper_bound(guest_address);
+  while (it != transient_cache_.end()) {
+    if ((guest_address + guest_length) >= (it->first + it->second.first)) {
+      it = transient_cache_.erase(it);
+    } else {
+      break;
+    }
+  }
 }

 void BufferCache::Flush(VkCommandBuffer command_buffer) {
--- a/src/xenia/gpu/vulkan/buffer_cache.h
+++ b/src/xenia/gpu/vulkan/buffer_cache.h
@ -18,6 +18,8 @@
 #include "xenia/ui/vulkan/vulkan.h"
 #include "xenia/ui/vulkan/vulkan_device.h"

+#include "third_party/vulkan/vk_mem_alloc.h"
+
 #include <map>

 namespace xe {
@ -95,6 +97,15 @@ class BufferCache {
  void Scavenge();

 private:
+  // This represents an uploaded vertex buffer.
+  struct VertexBuffer {
+    // Presumably the guest address the buffer contents were read from —
+    // TODO confirm against the code that fills this in.
+    uint32_t guest_address;
+    // Presumably the length of the buffer in bytes — TODO confirm.
+    uint32_t size;

+    // Allocation handle type from vk_mem_alloc.h.
+    VmaAllocation alloc;
+    // Allocation details from vk_mem_alloc.h; presumably describes |alloc|.
+    VmaAllocationInfo alloc_info;
+  };
+
  // Allocates a block of memory in the transient buffer.
  // When memory is not available fences are checked and space is reclaimed.
  // Returns VK_WHOLE_SIZE if requested amount of memory is not available.
@ -115,11 +126,12 @@ class BufferCache {
  ui::vulkan::VulkanDevice* device_ = nullptr;

  VkDeviceMemory gpu_memory_pool_ = nullptr;
+  VmaAllocator mem_allocator_ = nullptr;

  // Staging ringbuffer we cycle through fast. Used for data we don't
  // plan on keeping past the current frame.
  std::unique_ptr<ui::vulkan::CircularBuffer> transient_buffer_ = nullptr;
-  std::map<uint64_t, VkDeviceSize> transient_cache_;
+  std::map<uint32_t, std::pair<uint32_t, VkDeviceSize>> transient_cache_;

  VkDescriptorPool descriptor_pool_ = nullptr;
  VkDescriptorSetLayout descriptor_set_layout_ = nullptr;
--- a/src/xenia/gpu/vulkan/pipeline_cache.cc
+++ b/src/xenia/gpu/vulkan/pipeline_cache.cc
@ -1202,16 +1202,12 @@ PipelineCache::UpdateStatus PipelineCache::UpdateInputAssemblyState(
  //   glProvokingVertex(GL_FIRST_VERTEX_CONVENTION);
  // }

+  // Primitive restart index is handled in the buffer cache.
  if (regs.pa_su_sc_mode_cntl & (1 << 21)) {
    state_info.primitiveRestartEnable = VK_TRUE;
  } else {
    state_info.primitiveRestartEnable = VK_FALSE;
  }
-  // TODO(benvanik): no way to specify in Vulkan?
-  assert_true(regs.multi_prim_ib_reset_index == 0xFFFF ||
-              regs.multi_prim_ib_reset_index == 0xFFFFFF ||
-              regs.multi_prim_ib_reset_index == 0xFFFFFFFF);
-  // glPrimitiveRestartIndex(regs.multi_prim_ib_reset_index);

  return UpdateStatus::kMismatch;
 }
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@ -860,14 +860,13 @@ bool VulkanCommandProcessor::PopulateVertexBuffers(
    // TODO: Make the buffer cache ... actually cache buffers. We can have
    // a list of buffers that were cached, and store those in chunks in a
    // multiple of the host's page size.
-    // WRITE WATCHES: We need to invalidate vertex buffers if they're written
-    // to. Since most vertex buffers aren't aligned to a page boundary, this
-    // means a watch may cover more than one vertex buffer.
-    // We need to maintain a list of write watches, and what memory ranges
-    // they cover. If a vertex buffer lies within a write watch's range, assign
-    // it to the watch. If there's partial alignment where a buffer lies within
-    // one watch and outside of it, should we create a new watch or extend the
-    // existing watch?
+    // So, we need to track all vertex buffers in a sorted map, and track all
+    // write watches in a sorted map. When a vertex buffer is uploaded, track
+    // all untracked pages with 1-page write watches. In the callback,
+    // invalidate any overlapping vertex buffers.
+    //
+    // We would keep the old transient buffer as a staging buffer, and upload
+    // to a GPU-only buffer that tracks all cached vertex buffers.
    auto buffer_ref = buffer_cache_->UploadVertexBuffer(
        current_setup_buffer_, physical_address, source_length,
        static_cast<Endian>(fetch->endian), current_batch_fence_);
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc
@ -273,14 +273,11 @@ dword_result_t NtQueryVirtualMemory(
    return X_STATUS_INVALID_PARAMETER;
  }

-  memory_basic_information_ptr->base_address =
-      static_cast<uint32_t>(alloc_info.base_address);
-  memory_basic_information_ptr->allocation_base =
-      static_cast<uint32_t>(alloc_info.allocation_base);
+  memory_basic_information_ptr->base_address = alloc_info.base_address;
+  memory_basic_information_ptr->allocation_base = alloc_info.allocation_base;
  memory_basic_information_ptr->allocation_protect =
      ToXdkProtectFlags(alloc_info.allocation_protect);
-  memory_basic_information_ptr->region_size =
-      static_cast<uint32_t>(alloc_info.region_size);
+  memory_basic_information_ptr->region_size = alloc_info.region_size;
  uint32_t x_state = 0;
  if (alloc_info.state & kMemoryAllocationReserve) {
    x_state |= X_MEM_RESERVE;
@ -290,7 +287,7 @@ dword_result_t NtQueryVirtualMemory(
  }
  memory_basic_information_ptr->state = x_state;
  memory_basic_information_ptr->protect = ToXdkProtectFlags(alloc_info.protect);
-  memory_basic_information_ptr->type = alloc_info.type;
+  memory_basic_information_ptr->type = X_MEM_PRIVATE;

  return X_STATUS_SUCCESS;
 }
--- a/src/xenia/memory.cc
+++ b/src/xenia/memory.cc
@ -339,6 +339,8 @@ BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
  }
 }

+VirtualHeap* Memory::GetPhysicalHeap() {
+  // Non-owning pointer to the heap stored in heaps_.physical.
+  VirtualHeap* physical_heap = &heaps_.physical;
+  return physical_heap;
+}
+
 void Memory::Zero(uint32_t address, uint32_t size) {
  std::memset(TranslateVirtual(address), 0, size);
 }
@ -1096,16 +1098,19 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
  out_info->region_size = 0;
  out_info->state = 0;
  out_info->protect = 0;
-  out_info->type = 0;
  if (start_page_entry.state) {
    // Committed/reserved region.
    out_info->allocation_base = start_page_entry.base_address * page_size_;
    out_info->allocation_protect = start_page_entry.allocation_protect;
+    out_info->allocation_size = start_page_entry.region_page_count * page_size_;
    out_info->state = start_page_entry.state;
    out_info->protect = start_page_entry.current_protect;
-    out_info->type = 0x20000;
+
+    // Scan forward and report the size of the region matching the initial
+    // base address's attributes.
    for (uint32_t page_number = start_page_number;
-         page_number < start_page_number + start_page_entry.region_page_count;
+         page_number <
+         start_page_entry.base_address + start_page_entry.region_page_count;
         ++page_number) {
      auto page_entry = page_table_[page_number];
      if (page_entry.base_address != start_page_entry.base_address ||
@ -1144,6 +1149,20 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
  return true;
 }

+// Looks up the page table entry for the page containing *in_out_address.
+// On success, *in_out_address is overwritten with that entry's base_address
+// and *out_size with its region_page_count, both scaled by the page size, and
+// true is returned. If the address maps outside the page table, *out_size is
+// set to 0, *in_out_address is left untouched and false is returned.
+// NOTE(review): page_number is computed relative to heap_base_, but the
+// returned base does not add heap_base_ back — confirm this is intended for
+// heaps whose base is non-zero.
+bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
+  uint32_t page_number = (*in_out_address - heap_base_) / page_size_;
+  // Valid indices are [0, size()); size() itself is already past the end.
+  if (page_number >= page_table_.size()) {
+    XELOGE("BaseHeap::QueryBaseAndSize base page out of range");
+    *out_size = 0;
+    return false;
+  }
+  // The page table entry is read while holding the global critical region.
+  auto global_lock = global_critical_region_.Acquire();
+  auto page_entry = page_table_[page_number];
+  *in_out_address = (page_entry.base_address * page_size_);
+  *out_size = (page_entry.region_page_count * page_size_);
+  return true;
+}
+
 bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
  uint32_t page_number = (address - heap_base_) / page_size_;
  if (page_number > page_table_.size()) {
--- a/src/xenia/memory.h
+++ b/src/xenia/memory.h
@ -56,6 +56,8 @@ struct HeapAllocationInfo {
  uint32_t allocation_base;
  // The memory protection option when the region was initially allocated.
  uint32_t allocation_protect;
+  // The size specified when the region was initially allocated, in bytes.
+  uint32_t allocation_size;
  // The size of the region beginning at the base address in which all pages
  // have identical attributes, in bytes.
  uint32_t region_size;
@ -63,8 +65,6 @@ struct HeapAllocationInfo {
  uint32_t state;
  // The access protection of the pages in the region.
  uint32_t protect;
-  // The type of pages in the region (private).
-  uint32_t type;
 };

 // Describes a single page in the page table.
@ -144,6 +144,9 @@ class BaseHeap {
  // Queries the size of the region containing the given address.
  bool QuerySize(uint32_t address, uint32_t* out_size);

+  // Queries the base and size of a region containing the given address.
+  bool QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size);
+
  // Queries the current protection mode of the region containing the given
  // address.
  bool QueryProtect(uint32_t address, uint32_t* out_protect);
@ -332,6 +335,9 @@ class Memory {
  // Gets the heap with the given properties.
  BaseHeap* LookupHeapByType(bool physical, uint32_t page_size);

+  // Gets the physical base heap.
+  VirtualHeap* GetPhysicalHeap();
+
  // Dumps a map of all allocated memory to the log.
  void DumpMap();

--- a/src/xenia/ui/vulkan/circular_buffer.cc
+++ b/src/xenia/ui/vulkan/circular_buffer.cc
@ -42,7 +42,7 @@ CircularBuffer::CircularBuffer(VulkanDevice* device, VkBufferUsageFlags usage,

  VkMemoryRequirements reqs;
  vkGetBufferMemoryRequirements(*device_, gpu_buffer_, &reqs);
-  alignment_ = reqs.alignment;
+  alignment_ = xe::round_up(alignment, reqs.alignment);
 }
 CircularBuffer::~CircularBuffer() { Shutdown(); }

--- a/src/xenia/ui/vulkan/vulkan_util.h
+++ b/src/xenia/ui/vulkan/vulkan_util.h
@ -26,10 +26,14 @@ namespace ui {
 namespace vulkan {

 #define VK_SAFE_DESTROY(fn, dev, obj, alloc) \
+                                             \
+  do {                                       \
    if (obj) {                               \
      fn(dev, obj, alloc);                   \
      obj = nullptr;                         \
-  }
+    }                                        \
+                                             \
+  } while (0)

 class Fence {
 public: