diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc
index f12cb65d9..33bdaf889 100644
--- a/src/xenia/cpu/mmio_handler.cc
+++ b/src/xenia/cpu/mmio_handler.cc
@@ -240,12 +240,20 @@ bool MMIOHandler::IsRangeWatched(uint32_t physical_address, size_t length) {
   for (auto it = access_watches_.begin(); it != access_watches_.end(); ++it) {
     auto entry = *it;
     if ((entry->address <= physical_address &&
-         entry->address + entry->length > physical_address) ||
-        (entry->address >= physical_address &&
-         entry->address < physical_address + length)) {
-      // This watch lies within the range.
+         entry->address + entry->length > physical_address + length)) {
+      // This range lies entirely within this watch.
       return true;
     }
+
+    // TODO(DrChat): Check if the range is partially covered, and subtract the
+    // covered portion if it is.
+    if ((entry->address <= physical_address &&
+         entry->address + entry->length > physical_address)) {
+      // The beginning of the range lies partially within this watch.
+    } else if ((entry->address < physical_address + length &&
+                entry->address + entry->length > physical_address + length)) {
+      // The ending of this range lies partially within this watch.
+    }
   }

   return false;
diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h
index e68a2e276..e61cd1c20 100644
--- a/src/xenia/cpu/mmio_handler.h
+++ b/src/xenia/cpu/mmio_handler.h
@@ -77,7 +77,7 @@ class MMIOHandler {
   // Fires and clears any access watches that overlap this range.
   void InvalidateRange(uint32_t physical_address, size_t length);

-  // Returns true if /any/ part of this range is watched.
+  // Returns true if /all/ of this range is watched.
   bool IsRangeWatched(uint32_t physical_address, size_t length);

  protected:
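
The IsRangeWatched() change above also changes the function's contract: the old predicate answered "does any part of this range overlap a watch?", the new one answers "does a single watch cover the whole range?", which is why the header comment flips from /any/ to /all/. Because the end comparison uses a strict >, a range that ends exactly at a watch's end is conservatively reported as not fully covered. A minimal sketch of the two predicates (hypothetical helper names, not part of the change):

    #include <cstddef>
    #include <cstdint>

    // The queried range is [start, start + len); the watch covers [w_start, w_start + w_len).
    bool FullyContained(uint32_t start, size_t len, uint32_t w_start, size_t w_len) {
      return w_start <= start && w_start + w_len >= start + len;
    }

    bool AnyOverlap(uint32_t start, size_t len, uint32_t w_start, size_t w_len) {
      return w_start < start + len && start < w_start + w_len;
    }
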
diff --git a/src/xenia/gpu/register_table.inc b/src/xenia/gpu/register_table.inc
index 9aa649742..3ab07abd5 100644
--- a/src/xenia/gpu/register_table.inc
+++ b/src/xenia/gpu/register_table.inc
@@ -47,6 +47,10 @@
 XE_GPU_REGISTER(0x0D04, kDword, SQ_EO_RT)
 XE_GPU_REGISTER(0x0C85, kDword, PA_CL_ENHANCE)

+// Set with WAIT_UNTIL = WAIT_3D_IDLECLEAN
+XE_GPU_REGISTER(0x0E00, kDword, UNKNOWN_0E00)
+XE_GPU_REGISTER(0x0E40, kDword, UNKNOWN_0E40)
+
 XE_GPU_REGISTER(0x0E42, kDword, UNKNOWN_0E42)
 XE_GPU_REGISTER(0x0F01, kDword, RB_BC_CONTROL)
diff --git a/src/xenia/gpu/vulkan/buffer_cache.cc b/src/xenia/gpu/vulkan/buffer_cache.cc
index 72b2f2d39..64dc3121b 100644
--- a/src/xenia/gpu/vulkan/buffer_cache.cc
+++ b/src/xenia/gpu/vulkan/buffer_cache.cc
@@ -16,10 +16,80 @@
 #include "xenia/gpu/gpu_flags.h"
 #include "xenia/gpu/vulkan/vulkan_gpu_flags.h"

+#include "third_party/vulkan/vk_mem_alloc.h"
+
 namespace xe {
 namespace gpu {
 namespace vulkan {

+#if XE_ARCH_AMD64
+void copy_cmp_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint16_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
+  __m128i shufmask =
+      _mm_set_epi8(0x0E, 0x0F, 0x0C, 0x0D, 0x0A, 0x0B, 0x08, 0x09, 0x06, 0x07,
+                   0x04, 0x05, 0x02, 0x03, 0x00, 0x01);
+  __m128i cmpval = _mm_set1_epi16(cmp_value);
+
+  size_t i;
+  for (i = 0; i + 8 <= count; i += 8) {
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output = _mm_shuffle_epi8(input, shufmask);
+
+    __m128i mask = _mm_cmpeq_epi16(output, cmpval);
+    output = _mm_or_si128(output, mask);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // handle residual elements
+    dest[i] = byte_swap(src[i]);
+  }
+}
+
+void copy_cmp_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint32_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
+  __m128i shufmask =
+      _mm_set_epi8(0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B, 0x04, 0x05,
+                   0x06, 0x07, 0x00, 0x01, 0x02, 0x03);
+  __m128i cmpval = _mm_set1_epi32(cmp_value);
+
+  size_t i;
+  for (i = 0; i + 4 <= count; i += 4) {
+    __m128i input = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[i]));
+    __m128i output = _mm_shuffle_epi8(input, shufmask);
+
+    __m128i mask = _mm_cmpeq_epi32(output, cmpval);
+    output = _mm_or_si128(output, mask);
+    _mm_storeu_si128(reinterpret_cast<__m128i*>(&dest[i]), output);
+  }
+  for (; i < count; ++i) {  // handle residual elements
+    dest[i] = byte_swap(src[i]);
+  }
+}
+#else
+void copy_cmp_swap_16_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint16_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint16_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint16_t*>(src_ptr);
+  for (size_t i = 0; i < count; ++i) {
+    uint16_t value = byte_swap(src[i]);
+    dest[i] = value == cmp_value ? 0xFFFF : value;
+  }
+}
+
+void copy_cmp_swap_32_unaligned(void* dest_ptr, const void* src_ptr,
+                                uint32_t cmp_value, size_t count) {
+  auto dest = reinterpret_cast<uint32_t*>(dest_ptr);
+  auto src = reinterpret_cast<const uint32_t*>(src_ptr);
+  for (size_t i = 0; i < count; ++i) {
+    uint32_t value = byte_swap(src[i]);
+    dest[i] = value == cmp_value ? 0xFFFFFFFF : value;
+  }
+}
+#endif
+
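
The copy_cmp_swap_* helpers above fold primitive-reset translation into the usual big-endian index swap: after byte-swapping, any index equal to cmp_value is forced to all ones (0xFFFF or 0xFFFFFFFF), the fixed restart value Vulkan uses for the corresponding index type. In the SSE path this falls out of _mm_cmpeq_epi16/_mm_cmpeq_epi32 producing an all-ones mask in the matching lanes, which is then ORed into the output. A hypothetical call, with illustrative values and an eight-element count so the whole array goes through the SSE loop rather than the byte-swap-only tail:

    // Big-endian 16-bit guest indices, read as raw halfwords on a little-endian host;
    // the guest's primitive-reset index is 0x1234.
    uint16_t guest[8] = {0x0000, 0x0100, 0x3412, 0x0200,
                         0x0300, 0x0400, 0x0500, 0x0600};
    uint16_t host[8] = {};
    copy_cmp_swap_16_unaligned(host, guest, 0x1234, 8);
    // host == {0x0000, 0x0001, 0xFFFF, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006}:
    // every index is byte-swapped, and the one that matched 0x1234 became 0xFFFF.
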
 using xe::ui::vulkan::CheckResult;

 constexpr VkDeviceSize kConstantRegisterUniformRange =
@@ -32,7 +102,7 @@ BufferCache::BufferCache(RegisterFile* register_file, Memory* memory,
       device_, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT |
                    VK_BUFFER_USAGE_INDEX_BUFFER_BIT |
                    VK_BUFFER_USAGE_VERTEX_BUFFER_BIT,
-      capacity);
+      capacity, 4096);
 }

 BufferCache::~BufferCache() { Shutdown(); }
@@ -47,6 +117,15 @@ VkResult BufferCache::Initialize() {
     return status;
   }

+  // Create a memory allocator for textures.
+  VmaAllocatorCreateInfo alloc_info = {
+      0, *device_, *device_, 0, 0, nullptr, nullptr,
+  };
+  status = vmaCreateAllocator(&alloc_info, &mem_allocator_);
+  if (status != VK_SUCCESS) {
+    return status;
+  }
+
   // Descriptor pool used for all of our cached descriptors.
   // In the steady state we don't allocate anything, so these are all manually
   // managed.
@@ -150,28 +229,23 @@ VkResult BufferCache::Initialize() {
 }

 void BufferCache::Shutdown() {
+  if (mem_allocator_) {
+    vmaDestroyAllocator(mem_allocator_);
+    mem_allocator_ = nullptr;
+  }
+
   if (transient_descriptor_set_) {
     vkFreeDescriptorSets(*device_, descriptor_pool_, 1,
                          &transient_descriptor_set_);
     transient_descriptor_set_ = nullptr;
   }

-  if (descriptor_set_layout_) {
-    vkDestroyDescriptorSetLayout(*device_, descriptor_set_layout_, nullptr);
-    descriptor_set_layout_ = nullptr;
-  }
-
-  if (descriptor_pool_) {
-    vkDestroyDescriptorPool(*device_, descriptor_pool_, nullptr);
-    descriptor_pool_ = nullptr;
-  }
+  VK_SAFE_DESTROY(vkDestroyDescriptorSetLayout, *device_,
+                  descriptor_set_layout_, nullptr);
+  VK_SAFE_DESTROY(vkDestroyDescriptorPool, *device_, descriptor_pool_, nullptr);

   transient_buffer_->Shutdown();
-
-  if (gpu_memory_pool_) {
-    vkFreeMemory(*device_, gpu_memory_pool_, nullptr);
-    gpu_memory_pool_ = nullptr;
-  }
+  VK_SAFE_DESTROY(vkFreeMemory, *device_, gpu_memory_pool_, nullptr);
 }

 std::pair<VkDescriptorSet, VkDeviceSize> BufferCache::UploadConstantRegisters(
@@ -278,13 +352,8 @@ std::pair<VkDescriptorSet, VkDeviceSize> BufferCache::UploadConstantRegisters(
 std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
     VkCommandBuffer command_buffer, uint32_t source_addr,
     uint32_t source_length, IndexFormat format, VkFence fence) {
-  auto offset = FindCachedTransientData(source_addr, source_length);
-  if (offset != VK_WHOLE_SIZE) {
-    return {transient_buffer_->gpu_buffer(), offset};
-  }
-
   // Allocate space in the buffer for our data.
-  offset = AllocateTransientData(source_length, fence);
+  auto offset = AllocateTransientData(source_length, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {nullptr, VK_WHOLE_SIZE};
@@ -292,17 +361,36 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(

   const void* source_ptr = memory_->TranslatePhysical(source_addr);

-  // Copy data into the buffer.
-  // TODO(benvanik): get min/max indices and pass back?
+  uint32_t prim_reset_index =
+      register_file_->values[XE_GPU_REG_VGT_MULTI_PRIM_IB_RESET_INDX].u32;
+  bool prim_reset_enabled =
+      !!(register_file_->values[XE_GPU_REG_PA_SU_SC_MODE_CNTL].u32 & (1 << 21));
+
+  // Copy data into the buffer. If primitive reset is enabled, translate any
+  // primitive reset indices to something Vulkan understands.
   // TODO(benvanik): memcpy then use compute shaders to swap?
-  if (format == IndexFormat::kInt16) {
-    // Endian::k8in16, swap half-words.
-    xe::copy_and_swap_16_unaligned(transient_buffer_->host_base() + offset,
-                                   source_ptr, source_length / 2);
-  } else if (format == IndexFormat::kInt32) {
-    // Endian::k8in32, swap words.
-    xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
-                                   source_ptr, source_length / 4);
+  if (prim_reset_enabled) {
+    if (format == IndexFormat::kInt16) {
+      // Endian::k8in16, swap half-words.
+      copy_cmp_swap_16_unaligned(
+          transient_buffer_->host_base() + offset, source_ptr,
+          static_cast<uint16_t>(prim_reset_index), source_length / 2);
+    } else if (format == IndexFormat::kInt32) {
+      // Endian::k8in32, swap words.
+      copy_cmp_swap_32_unaligned(transient_buffer_->host_base() + offset,
+                                 source_ptr, prim_reset_index,
+                                 source_length / 4);
+    }
+  } else {
+    if (format == IndexFormat::kInt16) {
+      // Endian::k8in16, swap half-words.
+      xe::copy_and_swap_16_unaligned(transient_buffer_->host_base() + offset,
+                                     source_ptr, source_length / 2);
+    } else if (format == IndexFormat::kInt32) {
+      // Endian::k8in32, swap words.
+      xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
+                                     source_ptr, source_length / 4);
+    }
   }

   transient_buffer_->Flush(offset, source_length);
@@ -323,7 +411,6 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadIndexBuffer(
                        VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1,
                        &barrier, 0, nullptr);

-  CacheTransientData(source_addr, source_length, offset);
   return {transient_buffer_->gpu_buffer(), offset};
 }

@@ -335,29 +422,41 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
     return {transient_buffer_->gpu_buffer(), offset};
   }

+  // Slow path :)
+  // Expand the region up to the allocation boundary
+  auto physical_heap = memory_->GetPhysicalHeap();
+  uint32_t upload_base = source_addr;
+  uint32_t upload_size = source_length;
+
+  // Ping the memory subsystem for allocation size.
+  // TODO(DrChat): Artifacting occurring in GripShift with this enabled.
+  // physical_heap->QueryBaseAndSize(&upload_base, &upload_size);
+  assert(upload_base <= source_addr);
+  uint32_t source_offset = source_addr - upload_base;
+
   // Allocate space in the buffer for our data.
-  offset = AllocateTransientData(source_length, fence);
+  offset = AllocateTransientData(upload_size, fence);
   if (offset == VK_WHOLE_SIZE) {
     // OOM.
     return {nullptr, VK_WHOLE_SIZE};
   }

-  const void* source_ptr = memory_->TranslatePhysical(source_addr);
+  const void* upload_ptr = memory_->TranslatePhysical(upload_base);

   // Copy data into the buffer.
   // TODO(benvanik): memcpy then use compute shaders to swap?
   if (endian == Endian::k8in32) {
     // Endian::k8in32, swap words.
     xe::copy_and_swap_32_unaligned(transient_buffer_->host_base() + offset,
-                                   source_ptr, source_length / 4);
+                                   upload_ptr, source_length / 4);
   } else if (endian == Endian::k16in32) {
     xe::copy_and_swap_16_in_32_unaligned(
-        transient_buffer_->host_base() + offset, source_ptr, source_length / 4);
+        transient_buffer_->host_base() + offset, upload_ptr, source_length / 4);
   } else {
     assert_always();
   }

-  transient_buffer_->Flush(offset, source_length);
+  transient_buffer_->Flush(offset, upload_size);

   // Append a barrier to the command buffer.
   VkBufferMemoryBarrier barrier = {
@@ -369,14 +468,14 @@ std::pair<VkBuffer, VkDeviceSize> BufferCache::UploadVertexBuffer(
       VK_QUEUE_FAMILY_IGNORED,
       transient_buffer_->gpu_buffer(),
       offset,
-      source_length,
+      upload_size,
   };
   vkCmdPipelineBarrier(command_buffer, VK_PIPELINE_STAGE_HOST_BIT,
                        VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, 0, nullptr, 1,
                        &barrier, 0, nullptr);

-  CacheTransientData(source_addr, source_length, offset);
-  return {transient_buffer_->gpu_buffer(), offset};
+  CacheTransientData(upload_base, upload_size, offset);
+  return {transient_buffer_->gpu_buffer(), offset + source_offset};
 }

 VkDeviceSize BufferCache::AllocateTransientData(VkDeviceSize length,
@@ -409,10 +508,22 @@ VkDeviceSize BufferCache::TryAllocateTransientData(VkDeviceSize length,

 VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
                                                   uint32_t guest_length) {
-  uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
-  auto it = transient_cache_.find(key);
-  if (it != transient_cache_.end()) {
-    return it->second;
+  if (transient_cache_.empty()) {
+    // Short-circuit exit.
+    return VK_WHOLE_SIZE;
+  }
+
+  // Find the first element > guest_address
+  auto it = transient_cache_.upper_bound(guest_address);
+  if (it != transient_cache_.begin()) {
+    // it = first element <= guest_address
+    --it;
+
+    if ((it->first + it->second.first) >= (guest_address + guest_length)) {
+      // This data is contained within some existing transient data.
+      auto source_offset = static_cast<VkDeviceSize>(guest_address - it->first);
+      return it->second.second + source_offset;
+    }
   }

   return VK_WHOLE_SIZE;
@@ -421,8 +532,17 @@ VkDeviceSize BufferCache::FindCachedTransientData(uint32_t guest_address,
 void BufferCache::CacheTransientData(uint32_t guest_address,
                                      uint32_t guest_length,
                                      VkDeviceSize offset) {
-  uint64_t key = uint64_t(guest_length) << 32 | uint64_t(guest_address);
-  transient_cache_[key] = offset;
+  transient_cache_[guest_address] = {guest_length, offset};
+
+  // Erase any entries contained within
+  auto it = transient_cache_.upper_bound(guest_address);
+  while (it != transient_cache_.end()) {
+    if ((guest_address + guest_length) >= (it->first + it->second.first)) {
+      it = transient_cache_.erase(it);
+    } else {
+      break;
+    }
+  }
 }

 void BufferCache::Flush(VkCommandBuffer command_buffer) {
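
The transient-cache rework above replaces the old (length << 32 | address) key, which only ever hit on an exact repeat of the same upload, with a map ordered by guest address whose value holds {guest_length, buffer offset}. FindCachedTransientData() asks upper_bound() for the first entry starting past the queried address and steps back one entry, which is the cached range with the greatest start address not beyond the query; if that range also covers the end of the query, the data is already resident and the offset into it can be returned. CacheTransientData() correspondingly erases any following entries that the new range swallows. A self-contained sketch of the lookup pattern (uint64_t stands in for VkDeviceSize so it builds without the Vulkan headers):

    #include <cassert>
    #include <cstdint>
    #include <map>
    #include <utility>

    int main() {
      // guest_address -> {guest_length, offset into the transient buffer}
      std::map<uint32_t, std::pair<uint32_t, uint64_t>> cache;
      cache[0x1000] = {0x100, 0x40};  // 256 guest bytes cached at buffer offset 0x40

      uint32_t addr = 0x1040, len = 0x20;  // query falls inside the cached range
      auto it = cache.upper_bound(addr);   // first entry starting beyond addr
      assert(it != cache.begin());
      --it;                                // entry with the greatest start <= addr
      assert(it->first + it->second.first >= addr + len);  // fully contained
      uint64_t offset = it->second.second + (addr - it->first);
      assert(offset == 0x80);  // cached offset 0x40 plus 0x40 into the range
      return 0;
    }
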
diff --git a/src/xenia/gpu/vulkan/buffer_cache.h b/src/xenia/gpu/vulkan/buffer_cache.h
index ffaa8b8fd..2f321f26f 100644
--- a/src/xenia/gpu/vulkan/buffer_cache.h
+++ b/src/xenia/gpu/vulkan/buffer_cache.h
@@ -18,6 +18,8 @@
 #include "xenia/ui/vulkan/vulkan.h"
 #include "xenia/ui/vulkan/vulkan_device.h"

+#include "third_party/vulkan/vk_mem_alloc.h"
+
 #include <map>

 namespace xe {
@@ -95,6 +97,15 @@ class BufferCache {
   void Scavenge();

  private:
+  // This represents an uploaded vertex buffer.
+  struct VertexBuffer {
+    uint32_t guest_address;
+    uint32_t size;
+
+    VmaAllocation alloc;
+    VmaAllocationInfo alloc_info;
+  };
+
   // Allocates a block of memory in the transient buffer.
   // When memory is not available fences are checked and space is reclaimed.
   // Returns VK_WHOLE_SIZE if requested amount of memory is not available.
@@ -115,11 +126,12 @@ class BufferCache {

   ui::vulkan::VulkanDevice* device_ = nullptr;
   VkDeviceMemory gpu_memory_pool_ = nullptr;
+  VmaAllocator mem_allocator_ = nullptr;

   // Staging ringbuffer we cycle through fast. Used for data we don't
   // plan on keeping past the current frame.
   std::unique_ptr<ui::vulkan::CircularBuffer> transient_buffer_ = nullptr;
-  std::map<uint64_t, VkDeviceSize> transient_cache_;
+  std::map<uint32_t, std::pair<uint32_t, VkDeviceSize>> transient_cache_;

   VkDescriptorPool descriptor_pool_ = nullptr;
   VkDescriptorSetLayout descriptor_set_layout_ = nullptr;
diff --git a/src/xenia/gpu/vulkan/pipeline_cache.cc b/src/xenia/gpu/vulkan/pipeline_cache.cc
index 6aace62fe..f36755e79 100644
--- a/src/xenia/gpu/vulkan/pipeline_cache.cc
+++ b/src/xenia/gpu/vulkan/pipeline_cache.cc
@@ -1202,16 +1202,12 @@ PipelineCache::UpdateStatus PipelineCache::UpdateInputAssemblyState(
   //   glProvokingVertex(GL_FIRST_VERTEX_CONVENTION);
   // }

+  // Primitive restart index is handled in the buffer cache.
   if (regs.pa_su_sc_mode_cntl & (1 << 21)) {
     state_info.primitiveRestartEnable = VK_TRUE;
   } else {
     state_info.primitiveRestartEnable = VK_FALSE;
   }

-  // TODO(benvanik): no way to specify in Vulkan?
-  assert_true(regs.multi_prim_ib_reset_index == 0xFFFF ||
-              regs.multi_prim_ib_reset_index == 0xFFFFFF ||
-              regs.multi_prim_ib_reset_index == 0xFFFFFFFF);
-  // glPrimitiveRestartIndex(regs.multi_prim_ib_reset_index);
   return UpdateStatus::kMismatch;
 }
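
For context on why the assert could simply be removed: core Vulkan only exposes an enable flag for primitive restart; there is no counterpart to glPrimitiveRestartIndex(), and the restart value is implicitly all ones for whichever index type is bound. The pipeline side is therefore limited to roughly the following (a sketch, not the emulator's actual state setup), which is why the guest's reset index is now translated when the index buffer is uploaded instead:

    VkPipelineInputAssemblyStateCreateInfo input_assembly = {};
    input_assembly.sType =
        VK_STRUCTURE_TYPE_PIPELINE_INPUT_ASSEMBLY_STATE_CREATE_INFO;
    input_assembly.topology = VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP;
    input_assembly.primitiveRestartEnable = VK_TRUE;  // no field for the index value
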
diff --git a/src/xenia/gpu/vulkan/vulkan_command_processor.cc b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
index 9bab7fda5..6112d9a3d 100644
--- a/src/xenia/gpu/vulkan/vulkan_command_processor.cc
+++ b/src/xenia/gpu/vulkan/vulkan_command_processor.cc
@@ -860,14 +860,13 @@ bool VulkanCommandProcessor::PopulateVertexBuffers(
     // TODO: Make the buffer cache ... actually cache buffers. We can have
     // a list of buffers that were cached, and store those in chunks in a
     // multiple of the host's page size.
-    // WRITE WATCHES: We need to invalidate vertex buffers if they're written
-    // to. Since most vertex buffers aren't aligned to a page boundary, this
-    // means a watch may cover more than one vertex buffer.
-    // We need to maintain a list of write watches, and what memory ranges
-    // they cover. If a vertex buffer lies within a write watch's range, assign
-    // it to the watch. If there's partial alignment where a buffer lies within
-    // one watch and outside of it, should we create a new watch or extend the
-    // existing watch?
+    // So, we need to track all vertex buffers in a sorted map, and track all
+    // write watches in a sorted map. When a vertex buffer is uploaded, track
+    // all untracked pages with 1-page write watches. In the callback,
+    // invalidate any overlapping vertex buffers.
+    //
+    // We would keep the old transient buffer as a staging buffer, and upload
+    // to a GPU-only buffer that tracks all cached vertex buffers.
     auto buffer_ref = buffer_cache_->UploadVertexBuffer(
         current_setup_buffer_, physical_address, source_length,
         static_cast<Endian>(fetch->endian), current_batch_fence_);
diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc
index e397e53eb..0ef461683 100644
--- a/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc
+++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_memory.cc
@@ -273,14 +273,11 @@ dword_result_t NtQueryVirtualMemory(
     return X_STATUS_INVALID_PARAMETER;
   }

-  memory_basic_information_ptr->base_address =
-      static_cast<uint32_t>(alloc_info.base_address);
-  memory_basic_information_ptr->allocation_base =
-      static_cast<uint32_t>(alloc_info.allocation_base);
+  memory_basic_information_ptr->base_address = alloc_info.base_address;
+  memory_basic_information_ptr->allocation_base = alloc_info.allocation_base;
   memory_basic_information_ptr->allocation_protect =
       ToXdkProtectFlags(alloc_info.allocation_protect);
-  memory_basic_information_ptr->region_size =
-      static_cast<uint32_t>(alloc_info.region_size);
+  memory_basic_information_ptr->region_size = alloc_info.region_size;
   uint32_t x_state = 0;
   if (alloc_info.state & kMemoryAllocationReserve) {
     x_state |= X_MEM_RESERVE;
@@ -290,7 +287,7 @@ dword_result_t NtQueryVirtualMemory(
   }
   memory_basic_information_ptr->state = x_state;
   memory_basic_information_ptr->protect = ToXdkProtectFlags(alloc_info.protect);
-  memory_basic_information_ptr->type = alloc_info.type;
+  memory_basic_information_ptr->type = X_MEM_PRIVATE;

   return X_STATUS_SUCCESS;
 }
diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc
index 87f135682..cb2553fe1 100644
--- a/src/xenia/memory.cc
+++ b/src/xenia/memory.cc
@@ -339,6 +339,8 @@ BaseHeap* Memory::LookupHeapByType(bool physical, uint32_t page_size) {
   }
 }

+VirtualHeap* Memory::GetPhysicalHeap() { return &heaps_.physical; }
+
 void Memory::Zero(uint32_t address, uint32_t size) {
   std::memset(TranslateVirtual(address), 0, size);
 }
@@ -1096,16 +1098,19 @@ bool BaseHeap::QueryRegionInfo(uint32_t base_address,
   out_info->region_size = 0;
   out_info->state = 0;
   out_info->protect = 0;
-  out_info->type = 0;
   if (start_page_entry.state) {
     // Committed/reserved region.
     out_info->allocation_base = start_page_entry.base_address * page_size_;
     out_info->allocation_protect = start_page_entry.allocation_protect;
+    out_info->allocation_size = start_page_entry.region_page_count * page_size_;
     out_info->state = start_page_entry.state;
     out_info->protect = start_page_entry.current_protect;
-    out_info->type = 0x20000;
+
+    // Scan forward and report the size of the region matching the initial
+    // base address's attributes.
     for (uint32_t page_number = start_page_number;
-         page_number < start_page_number + start_page_entry.region_page_count;
+         page_number <
+             start_page_entry.base_address + start_page_entry.region_page_count;
          ++page_number) {
       auto page_entry = page_table_[page_number];
       if (page_entry.base_address != start_page_entry.base_address ||
@@ -1144,6 +1149,20 @@ bool BaseHeap::QuerySize(uint32_t address, uint32_t* out_size) {
   return true;
 }

+bool BaseHeap::QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size) {
+  uint32_t page_number = (*in_out_address - heap_base_) / page_size_;
+  if (page_number > page_table_.size()) {
+    XELOGE("BaseHeap::QuerySize base page out of range");
+    *out_size = 0;
+    return false;
+  }
+  auto global_lock = global_critical_region_.Acquire();
+  auto page_entry = page_table_[page_number];
+  *in_out_address = (page_entry.base_address * page_size_);
+  *out_size = (page_entry.region_page_count * page_size_);
+  return true;
+}
+
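
QueryBaseAndSize() takes the address by pointer and rewrites both outputs: on success, *in_out_address becomes the base of the page region containing the original address and *out_size becomes that region's size in bytes. This is the query the (currently commented-out) call in UploadVertexBuffer relies on to widen an upload to its whole allocation. A hypothetical use mirroring that call site, with source_addr and source_length standing in for the vertex fetch parameters:

    uint32_t upload_base = source_addr;    // an address somewhere inside an allocation
    uint32_t upload_size = source_length;  // length of the vertex data itself
    if (physical_heap->QueryBaseAndSize(&upload_base, &upload_size)) {
      // upload_base (<= source_addr) is now the region base, upload_size covers the
      // whole region, and the vertex data begins (source_addr - upload_base) bytes in.
    }
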
 bool BaseHeap::QueryProtect(uint32_t address, uint32_t* out_protect) {
   uint32_t page_number = (address - heap_base_) / page_size_;
   if (page_number > page_table_.size()) {
diff --git a/src/xenia/memory.h b/src/xenia/memory.h
index 7ca057a1d..4309ded05 100644
--- a/src/xenia/memory.h
+++ b/src/xenia/memory.h
@@ -56,6 +56,8 @@ struct HeapAllocationInfo {
   uint32_t allocation_base;
   // The memory protection option when the region was initially allocated.
   uint32_t allocation_protect;
+  // The size specified when the region was initially allocated, in bytes.
+  uint32_t allocation_size;
   // The size of the region beginning at the base address in which all pages
   // have identical attributes, in bytes.
   uint32_t region_size;
@@ -63,8 +65,6 @@
   uint32_t state;
   // The access protection of the pages in the region.
   uint32_t protect;
-  // The type of pages in the region (private).
-  uint32_t type;
 };

 // Describes a single page in the page table.
@@ -144,6 +144,9 @@ class BaseHeap {
   // Queries the size of the region containing the given address.
   bool QuerySize(uint32_t address, uint32_t* out_size);

+  // Queries the base and size of a region containing the given address.
+  bool QueryBaseAndSize(uint32_t* in_out_address, uint32_t* out_size);
+
   // Queries the current protection mode of the region containing the given
   // address.
   bool QueryProtect(uint32_t address, uint32_t* out_protect);
@@ -332,6 +335,9 @@ class Memory {
   // Gets the heap with the given properties.
   BaseHeap* LookupHeapByType(bool physical, uint32_t page_size);

+  // Gets the physical base heap.
+  VirtualHeap* GetPhysicalHeap();
+
   // Dumps a map of all allocated memory to the log.
   void DumpMap();
diff --git a/src/xenia/ui/vulkan/circular_buffer.cc b/src/xenia/ui/vulkan/circular_buffer.cc
index 0347413cc..05f3f7917 100644
--- a/src/xenia/ui/vulkan/circular_buffer.cc
+++ b/src/xenia/ui/vulkan/circular_buffer.cc
@@ -42,7 +42,7 @@ CircularBuffer::CircularBuffer(VulkanDevice* device, VkBufferUsageFlags usage,
   VkMemoryRequirements reqs;
   vkGetBufferMemoryRequirements(*device_, gpu_buffer_, &reqs);
-  alignment_ = reqs.alignment;
+  alignment_ = xe::round_up(alignment, reqs.alignment);
 }

 CircularBuffer::~CircularBuffer() { Shutdown(); }
diff --git a/src/xenia/ui/vulkan/vulkan_util.h b/src/xenia/ui/vulkan/vulkan_util.h
index 62419429e..ba93ff132 100644
--- a/src/xenia/ui/vulkan/vulkan_util.h
+++ b/src/xenia/ui/vulkan/vulkan_util.h
@@ -26,10 +26,14 @@
 namespace ui {
 namespace vulkan {

 #define VK_SAFE_DESTROY(fn, dev, obj, alloc) \
-  if (obj) {                                 \
-    fn(dev, obj, alloc);                     \
-    obj = nullptr;                           \
-  }
+  \
+  do {                                       \
+    if (obj) {                               \
+      fn(dev, obj, alloc);                   \
+      obj = nullptr;                         \
+    }                                        \
+  \
+  } while (0)

 class Fence {
  public: