From 19121130a3a2e9d081cd53ebb00a2a5d7d606e81 Mon Sep 17 00:00:00 2001
From: Triang3l
Date: Tue, 6 Oct 2020 21:32:44 +0300
Subject: [PATCH] [GPU] SharedMemory: common sparse memory allocation

---
 src/xenia/base/bit_range.h                 | 106 ++++++++++++++++
 src/xenia/gpu/d3d12/d3d12_shared_memory.cc | 137 +++++++++------------
 src/xenia/gpu/d3d12/d3d12_shared_memory.h  |  15 +--
 src/xenia/gpu/shared_memory.cc             | 101 +++++++++++++--
 src/xenia/gpu/shared_memory.h              |  41 +++---
 5 files changed, 287 insertions(+), 113 deletions(-)
 create mode 100644 src/xenia/base/bit_range.h

diff --git a/src/xenia/base/bit_range.h b/src/xenia/base/bit_range.h
new file mode 100644
index 000000000..462d5e2cd
--- /dev/null
+++ b/src/xenia/base/bit_range.h
@@ -0,0 +1,106 @@
+/**
+ ******************************************************************************
+ * Xenia : Xbox 360 Emulator Research Project                                 *
+ ******************************************************************************
+ * Copyright 2019 Ben Vanik. All rights reserved.                             *
+ * Released under the BSD license - see LICENSE in the root for more details. *
+ ******************************************************************************
+ */
+
+#ifndef XENIA_BASE_BIT_RANGE_H_
+#define XENIA_BASE_BIT_RANGE_H_
+
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+
+#include "xenia/base/math.h"
+
+namespace xe {
+namespace bit_range {
+
+// Provided length is in bits since the first. Returns the {start, length} of
+// the range in bits, with length == 0 if not found.
+template <typename Block>
+std::pair<size_t, size_t> NextUnsetRange(const Block* bits, size_t first,
+                                         size_t length) {
+  if (!length) {
+    return std::make_pair(size_t(first), size_t(0));
+  }
+  size_t last = first + length - 1;
+  const size_t block_bits = sizeof(Block) * CHAR_BIT;
+  size_t block_first = first / block_bits;
+  size_t block_last = last / block_bits;
+  size_t range_start = SIZE_MAX;
+  for (size_t i = block_first; i <= block_last; ++i) {
+    Block block = bits[i];
+    // Ignore bits in the block outside the specified range by considering
+    // them set.
+    if (i == block_first) {
+      block |= (Block(1) << (first & (block_bits - 1))) - 1;
+    }
+    if (i == block_last && (last & (block_bits - 1)) != block_bits - 1) {
+      block |= ~((Block(1) << ((last & (block_bits - 1)) + 1)) - 1);
+    }
+    while (true) {
+      uint32_t block_bit;
+      if (range_start == SIZE_MAX) {
+        // Check if need to open a new range.
+        if (!xe::bit_scan_forward(~block, &block_bit)) {
+          break;
+        }
+        range_start = i * block_bits + block_bit;
+      } else {
+        // Check if need to close the range.
+        // Ignore the set bits before the beginning of the range.
+        Block block_bits_set_from_start = block;
+        if (i == range_start / block_bits) {
+          block_bits_set_from_start &=
+              ~((Block(1) << (range_start & (block_bits - 1))) - 1);
+        }
+        if (!xe::bit_scan_forward(block_bits_set_from_start, &block_bit)) {
+          break;
+        }
+        return std::make_pair(range_start,
+                              (i * block_bits) + block_bit - range_start);
+      }
+    }
+  }
+  if (range_start != SIZE_MAX) {
+    return std::make_pair(range_start, last + size_t(1) - range_start);
+  }
+  return std::make_pair(first + length, size_t(0));
+}
+
+template <typename Block>
+void SetRange(Block* bits, size_t first, size_t length) {
+  if (!length) {
+    return;
+  }
+  size_t last = first + length - 1;
+  const size_t block_bits = sizeof(Block) * CHAR_BIT;
+  size_t block_first = first / block_bits;
+  size_t block_last = last / block_bits;
+  Block set_first = ~((Block(1) << (first & (block_bits - 1))) - 1);
+  Block set_last = ~Block(0);
+  if ((last & (block_bits - 1)) != (block_bits - 1)) {
+    set_last &= (Block(1) << ((last & (block_bits - 1)) + 1)) - 1;
+  }
+  if (block_first == block_last) {
+    bits[block_first] |= set_first & set_last;
+    return;
+  }
+  bits[block_first] |= set_first;
+  if (block_first + 1 < block_last) {
+    std::memset(bits + block_first + 1, UCHAR_MAX,
+                (block_last - (block_first + 1)) * sizeof(Block));
+  }
+  bits[block_last] |= set_last;
+}
+
+}  // namespace bit_range
+}  // namespace xe
+
+#endif  // XENIA_BASE_BIT_RANGE_H_
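For reference, a minimal standalone usage sketch of the new xe::bit_range helpers (illustration only, not part of the patch):

#include <cstdint>
#include <cstdio>
#include <utility>

#include "xenia/base/bit_range.h"

int main() {
  // 128-bit bitmap stored as two 64-bit blocks, with bits 0..2 already set.
  uint64_t bits[2] = {0b111, 0};
  // Search bits 0..127 for the first contiguous unset range.
  std::pair<size_t, size_t> range = xe::bit_range::NextUnsetRange(bits, 0, 128);
  // Prints "first unset range: offset 3, length 125".
  std::printf("first unset range: offset %zu, length %zu\n", range.first,
              range.second);
  // Mark bits 3..66 as set, crossing the 64-bit block boundary.
  xe::bit_range::SetRange(bits, 3, 64);
  range = xe::bit_range::NextUnsetRange(bits, 0, 128);
  // Prints "first unset range: offset 67, length 61".
  std::printf("first unset range: offset %zu, length %zu\n", range.first,
              range.second);
  return 0;
}

NextUnsetRange scans only the bits [first, first + length) and treats everything outside that window as set; SetRange handles ranges spanning multiple blocks, filling whole middle blocks with memset.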
diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc
index 2c74c4da8..992f9aed5 100644
--- a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc
+++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc
@@ -17,7 +17,6 @@
 #include "xenia/base/cvar.h"
 #include "xenia/base/logging.h"
 #include "xenia/base/math.h"
-#include "xenia/base/profiling.h"
 #include "xenia/gpu/d3d12/d3d12_command_processor.h"
 #include "xenia/ui/d3d12/d3d12_util.h"
@@ -43,26 +42,35 @@ D3D12SharedMemory::~D3D12SharedMemory() { Shutdown(true); }
 
 bool D3D12SharedMemory::Initialize() {
   InitializeCommon();
-  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
-  auto device = provider.GetDevice();
+  const ui::d3d12::D3D12Provider& provider =
+      command_processor_.GetD3D12Context().GetD3D12Provider();
+  ID3D12Device* device = provider.GetDevice();
   D3D12_RESOURCE_DESC buffer_desc;
   ui::d3d12::util::FillBufferResourceDesc(
       buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS);
   buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
-  if (AreTiledResourcesUsed()) {
+  if (cvars::d3d12_tiled_shared_memory &&
+      provider.GetTiledResourcesTier() !=
+          D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED &&
+      !provider.GetGraphicsAnalysis()) {
     if (FAILED(device->CreateReservedResource(
             &buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) {
-      XELOGE("Shared memory: Failed to create the 512 MB tiled buffer");
+      XELOGE("Shared memory: Failed to create the {} MB tiled buffer",
+             kBufferSize >> 20);
       Shutdown();
      return false;
     }
+    static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16));
+    InitializeSparseHostGpuMemory(
+        std::max(kHostGpuMemoryOptimalSparseAllocationLog2, uint32_t(16)));
   } else {
     XELOGGPU(
         "Direct3D 12 tiled resources are not used for shared memory "
         "emulation - video memory usage may increase significantly "
-        "because a full 512 MB buffer will be created!");
-    if (provider.GetGraphicsAnalysis() != nullptr) {
+        "because a full {} MB buffer will be created!",
+        kBufferSize >> 20);
+    if (provider.GetGraphicsAnalysis()) {
       // As of October 8th, 2018, PIX doesn't support tiled buffers.
       // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
       XELOGGPU(
@@ -73,7 +81,8 @@ bool D3D12SharedMemory::Initialize() {
             &ui::d3d12::util::kHeapPropertiesDefault,
             provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_,
             nullptr, IID_PPV_ARGS(&buffer_)))) {
-      XELOGE("Shared memory: Failed to create the 512 MB buffer");
+      XELOGE("Shared memory: Failed to create the {} MB buffer",
+             kBufferSize >> 20);
       Shutdown();
       return false;
     }
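To make the granularity choice above concrete: with the 512 MB buffer (kBufferSizeLog2 = 29, per the log messages in this file) and the 4 MB optimal sparse allocation, the bookkeeping sizes work out as in this standalone sanity check (constant names are local to the snippet, derived from the patch):

#include <cstddef>
#include <cstdint>

constexpr uint32_t kBufferSizeLog2 = 29;   // 512 MB shared memory buffer.
constexpr uint32_t kGranularityLog2 = 22;  // max(22, 16) chosen in Initialize.

// One bit tracks each 4 MB allocation unit of the buffer.
constexpr uint32_t kAllocationCount = 1u << (kBufferSizeLog2 - kGranularityLog2);
static_assert(kAllocationCount == 128);

// InitializeSparseHostGpuMemory packs those bits into 64-bit blocks, so the
// whole residency bitmap is just two uint64_t values.
constexpr size_t kBitmapBlocks =
    size_t(1) << ((kBufferSizeLog2 - kGranularityLog2) - 6);
static_assert(kBitmapBlocks == 2);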
@@ -161,13 +170,10 @@ void D3D12SharedMemory::Shutdown(bool from_destructor) {
 
   // First free the buffer to detach it from the heaps.
   ui::d3d12::util::ReleaseAndNull(buffer_);
-  if (AreTiledResourcesUsed()) {
-    for (uint32_t i = 0; i < xe::countof(heaps_); ++i) {
-      ui::d3d12::util::ReleaseAndNull(heaps_[i]);
-    }
-    heap_count_ = 0;
-    COUNT_profile_set("gpu/shared_memory/used_mb", 0);
+  for (ID3D12Heap* heap : buffer_tiled_heaps_) {
+    heap->Release();
   }
+  buffer_tiled_heaps_.clear();
 
   // If calling from the destructor, the SharedMemory destructor will call
   // ShutdownCommon.
@@ -180,26 +186,12 @@ void D3D12SharedMemory::ClearCache() {
   SharedMemory::ClearCache();
 
   upload_buffer_pool_->ClearCache();
-
-  // TODO(Triang3l): Unmap and destroy heaps.
 }
 
 void D3D12SharedMemory::CompletedSubmissionUpdated() {
   upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission());
 }
 
-bool D3D12SharedMemory::AreTiledResourcesUsed() const {
-  if (!cvars::d3d12_tiled_shared_memory) {
-    return false;
-  }
-  auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
-  // As of October 8th, 2018, PIX doesn't support tiled buffers.
-  // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed.
-  return provider.GetTiledResourcesTier() !=
-             D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED &&
-         provider.GetGraphicsAnalysis() == nullptr;
-}
-
 void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer(
     D3D12_RESOURCE_STATES new_state) {
   if (buffer_state_ == new_state) {
@@ -321,11 +313,6 @@ bool D3D12SharedMemory::InitializeTraceSubmitDownloads() {
   command_processor_.SubmitBarriers();
   uint32_t download_buffer_offset = 0;
   for (auto& download_range : trace_download_ranges()) {
-    if (!EnsureHostGpuMemoryAllocated(download_range.first,
-                                      download_range.second)) {
-      download_range.second = 0;
-      continue;
-    }
     command_list.D3DCopyBufferRegion(
         trace_download_buffer_, download_buffer_offset, buffer_,
         download_range.first, download_range.second);
@@ -362,52 +349,50 @@ void D3D12SharedMemory::ResetTraceDownload() {
   ReleaseTraceDownloadRanges();
 }
 
-bool D3D12SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start,
-                                                     uint32_t length) {
-  if (!length || !AreTiledResourcesUsed()) {
+bool D3D12SharedMemory::AllocateSparseHostGpuMemoryRange(
+    uint32_t offset_allocations, uint32_t length_allocations) {
+  if (!length_allocations) {
     return true;
   }
-  uint32_t heap_first = start >> kHeapSizeLog2;
-  uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2;
-  assert_true(heap_first < xe::countof(heaps_));
-  assert_true(heap_last < xe::countof(heaps_));
-  for (uint32_t i = heap_first; i <= heap_last; ++i) {
-    if (heaps_[i] != nullptr) {
-      continue;
-    }
-    auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider();
-    auto device = provider.GetDevice();
-    auto direct_queue = provider.GetDirectQueue();
-    D3D12_HEAP_DESC heap_desc = {};
-    heap_desc.SizeInBytes = kHeapSize;
-    heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
-    heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS |
-                      provider.GetHeapFlagCreateNotZeroed();
-    if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) {
-      XELOGE("Shared memory: Failed to create a tile heap");
-      return false;
-    }
-    ++heap_count_;
-    COUNT_profile_set("gpu/shared_memory/used_mb",
-                      heap_count_ << kHeapSizeLog2 >> 20);
-    D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates;
-    region_start_coordinates.X =
-        (i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    region_start_coordinates.Y = 0;
-    region_start_coordinates.Z = 0;
-    region_start_coordinates.Subresource = 0;
-    D3D12_TILE_REGION_SIZE region_size;
-    region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    region_size.UseBox = FALSE;
-    D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE;
-    UINT heap_range_start_offset = 0;
-    UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
-    direct_queue->UpdateTileMappings(
-        buffer_, 1, &region_start_coordinates, &region_size, heaps_[i], 1,
-        &range_flags, &heap_range_start_offset, &range_tile_count,
-        D3D12_TILE_MAPPING_FLAG_NONE);
-    command_processor_.NotifyQueueOperationsDoneDirectly();
+
+  uint32_t offset_bytes = offset_allocations
+                          << host_gpu_memory_sparse_granularity_log2();
+  uint32_t length_bytes = length_allocations
+                          << host_gpu_memory_sparse_granularity_log2();
+
+  const ui::d3d12::D3D12Provider& provider =
+      command_processor_.GetD3D12Context().GetD3D12Provider();
+  ID3D12Device* device = provider.GetDevice();
+  ID3D12CommandQueue* direct_queue = provider.GetDirectQueue();
+
+  D3D12_HEAP_DESC heap_desc = {};
+  heap_desc.SizeInBytes = length_bytes;
+  heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT;
+  heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS |
+                    provider.GetHeapFlagCreateNotZeroed();
+  ID3D12Heap* heap;
+  if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heap)))) {
+    XELOGE("Shared memory: Failed to create a tile heap");
+    return false;
   }
+  buffer_tiled_heaps_.push_back(heap);
+
+  D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates;
+  region_start_coordinates.X =
+      offset_bytes / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+  region_start_coordinates.Y = 0;
+  region_start_coordinates.Z = 0;
+  region_start_coordinates.Subresource = 0;
+  D3D12_TILE_REGION_SIZE region_size;
+  region_size.NumTiles = length_bytes / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES;
+  region_size.UseBox = FALSE;
+  D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE;
+  UINT heap_range_start_offset = 0;
+  direct_queue->UpdateTileMappings(
+      buffer_, 1, &region_start_coordinates, &region_size, heap, 1,
+      &range_flags, &heap_range_start_offset, &region_size.NumTiles,
+      D3D12_TILE_MAPPING_FLAG_NONE);
+  command_processor_.NotifyQueueOperationsDoneDirectly();
   return true;
 }
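The offset/length contract of the new virtual is in allocation units, and one call now maps the whole requested range through a single heap, where the old code created one fixed-size heap per 4 MB region. A worked example of the mapping math above (values assumed: 4 MB granularity from Initialize, 64 KB D3D12 tiles):

#include <cstdint>

constexpr uint32_t kGranularityLog2 = 22;       // From Initialize: max(22, 16).
constexpr uint32_t kTileSizeInBytes = 1 << 16;  // 64 KB D3D12 tile.

constexpr uint32_t offset_allocations = 2, length_allocations = 3;
constexpr uint32_t offset_bytes = offset_allocations << kGranularityLog2;
constexpr uint32_t length_bytes = length_allocations << kGranularityLog2;
static_assert(offset_bytes == 8 << 20);                 // 8 MB into the buffer.
static_assert(length_bytes == 12 << 20);                // One 12 MB heap.
static_assert(offset_bytes / kTileSizeInBytes == 128);  // region X = tile 128.
static_assert(length_bytes / kTileSizeInBytes == 192);  // NumTiles = 192.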
diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h
index c66e5578d..6620cecaa 100644
--- a/src/xenia/gpu/d3d12/d3d12_shared_memory.h
+++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h
@@ -87,33 +87,24 @@ class D3D12SharedMemory : public SharedMemory {
   void InitializeTraceCompleteDownloads();
 
  protected:
-  bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override;
+  bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
+                                        uint32_t length_allocations) override;
 
   bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
                         upload_page_ranges) override;
 
  private:
-  bool AreTiledResourcesUsed() const;
-
   D3D12CommandProcessor& command_processor_;
   TraceWriter& trace_writer_;
 
   // The 512 MB tiled buffer.
   ID3D12Resource* buffer_ = nullptr;
   D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0;
+  std::vector<ID3D12Heap*> buffer_tiled_heaps_;
   D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
   bool buffer_uav_writes_commit_needed_ = false;
   void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state);
 
-  static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16));
-  static constexpr uint32_t kHeapSizeLog2 =
-      std::max(kOptimalAllocationLog2, uint32_t(16));
-  static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2;
-  // Resident portions of the tiled buffer.
-  ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {};
-  // Number of the heaps currently resident, for profiling.
-  uint32_t heap_count_ = 0;
-
   // Non-shader-visible buffer descriptor heap for faster binding (via copying
   // rather than creation).
   enum class BufferDescriptorIndex : uint32_t {
diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc
index 4951eacea..ca3dcf4f0 100644
--- a/src/xenia/gpu/shared_memory.cc
+++ b/src/xenia/gpu/shared_memory.cc
@@ -13,6 +13,7 @@
 #include <vector>
 
 #include "xenia/base/assert.h"
+#include "xenia/base/bit_range.h"
 #include "xenia/base/math.h"
 #include "xenia/base/memory.h"
 #include "xenia/base/profiling.h"
@@ -36,6 +37,15 @@ void SharedMemory::InitializeCommon() {
       MemoryInvalidationCallbackThunk, this);
 }
 
+void SharedMemory::InitializeSparseHostGpuMemory(uint32_t granularity_log2) {
+  assert_true(granularity_log2 <= kBufferSizeLog2);
+  assert_true(host_gpu_memory_sparse_granularity_log2_ == UINT32_MAX);
+  host_gpu_memory_sparse_granularity_log2_ = granularity_log2;
+  host_gpu_memory_sparse_allocated_.resize(
+      size_t(1) << (std::max(kBufferSizeLog2 - granularity_log2, uint32_t(6)) -
+                    6));
+}
+
 void SharedMemory::ShutdownCommon() {
   ReleaseTraceDownloadRanges();
 
@@ -61,6 +71,19 @@ void SharedMemory::ShutdownCommon() {
         memory_invalidation_callback_handle_);
     memory_invalidation_callback_handle_ = nullptr;
   }
+
+  if (host_gpu_memory_sparse_used_bytes_) {
+    host_gpu_memory_sparse_used_bytes_ = 0;
+    COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_used_mb", 0);
+  }
+  if (host_gpu_memory_sparse_allocations_) {
+    host_gpu_memory_sparse_allocations_ = 0;
+    COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_allocations",
+                      0);
+  }
+  host_gpu_memory_sparse_allocated_.clear();
+  host_gpu_memory_sparse_allocated_.shrink_to_fit();
+  host_gpu_memory_sparse_granularity_log2_ = UINT32_MAX;
 }
 
 void SharedMemory::ClearCache() {
@@ -244,6 +267,14 @@ void SharedMemory::RangeWrittenByGpu(uint32_t start, uint32_t length) {
   MakeRangeValid(start, length, true);
 }
 
+bool SharedMemory::AllocateSparseHostGpuMemoryRange(
+    uint32_t offset_allocations, uint32_t length_allocations) {
+  assert_always(
+      "Sparse host GPU memory allocation has been initialized, but the "
+      "implementation doesn't provide AllocateSparseHostGpuMemoryRange");
+  return false;
+}
+
 void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length,
                                   bool written_by_gpu) {
   if (length == 0 || start >= kBufferSize) {
@@ -316,7 +347,6 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) {
   if (start > kBufferSize || (kBufferSize - start) < length) {
     return false;
   }
-  uint32_t last = start + length - 1;
 
   SCOPE_profile_cpu_f("gpu");
 
@@ -506,10 +536,14 @@ void SharedMemory::PrepareForTraceDownload() {
       } else {
         uint32_t gpu_written_range_length =
             gpu_written_page - gpu_written_range_start;
-        trace_download_ranges_.push_back(
-            std::make_pair(gpu_written_range_start << page_size_log2_,
-                           gpu_written_range_length << page_size_log2_));
-        trace_download_page_count_ += gpu_written_range_length;
+        if (EnsureHostGpuMemoryAllocated(
+                gpu_written_range_start << page_size_log2_,
+                gpu_written_range_length << page_size_log2_)) {
+          trace_download_ranges_.push_back(
+              std::make_pair(gpu_written_range_start << page_size_log2_,
+                             gpu_written_range_length << page_size_log2_));
+          trace_download_page_count_ += gpu_written_range_length;
+        }
         gpu_written_range_start = UINT32_MAX;
       }
       uint64_t gpu_written_block_mask =
@@ -524,10 +558,14 @@ void SharedMemory::PrepareForTraceDownload() {
   }
   if (gpu_written_range_start != UINT32_MAX) {
     uint32_t gpu_written_range_length = page_count - gpu_written_range_start;
-    trace_download_ranges_.push_back(
-        std::make_pair(gpu_written_range_start << page_size_log2_,
-                       gpu_written_range_length << page_size_log2_));
-    trace_download_page_count_ += gpu_written_range_length;
+    if (EnsureHostGpuMemoryAllocated(
+            gpu_written_range_start << page_size_log2_,
+            gpu_written_range_length << page_size_log2_)) {
+      trace_download_ranges_.push_back(
+          std::make_pair(gpu_written_range_start << page_size_log2_,
+                         gpu_written_range_length << page_size_log2_));
+      trace_download_page_count_ += gpu_written_range_length;
+    }
   }
 }
 
@@ -537,5 +575,50 @@ void SharedMemory::ReleaseTraceDownloadRanges() {
   trace_download_page_count_ = 0;
 }
 
+bool SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start,
+                                                uint32_t length) {
+  if (host_gpu_memory_sparse_granularity_log2_ == UINT32_MAX) {
+    return true;
+  }
+  if (!length) {
+    return true;
+  }
+  if (start > kBufferSize || (kBufferSize - start) < length) {
+    return false;
+  }
+  uint32_t page_first = start >> page_size_log2_;
+  uint32_t page_last = (start + length - 1) >> page_size_log2_;
+  uint32_t allocation_first = page_first << page_size_log2_ >>
+                              host_gpu_memory_sparse_granularity_log2_;
+  uint32_t allocation_last = page_last << page_size_log2_ >>
+                             host_gpu_memory_sparse_granularity_log2_;
+  while (true) {
+    std::pair<size_t, size_t> allocation_range = xe::bit_range::NextUnsetRange(
+        host_gpu_memory_sparse_allocated_.data(), allocation_first,
+        allocation_last - allocation_first + 1);
+    if (!allocation_range.second) {
+      break;
+    }
+    if (!AllocateSparseHostGpuMemoryRange(uint32_t(allocation_range.first),
+                                          uint32_t(allocation_range.second))) {
+      return false;
+    }
+    xe::bit_range::SetRange(host_gpu_memory_sparse_allocated_.data(),
+                            allocation_range.first, allocation_range.second);
+    ++host_gpu_memory_sparse_allocations_;
+    COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_allocations",
+                      host_gpu_memory_sparse_allocations_);
+    host_gpu_memory_sparse_used_bytes_ +=
+        uint32_t(allocation_range.second)
+        << host_gpu_memory_sparse_granularity_log2_;
+    COUNT_profile_set(
+        "gpu/shared_memory/host_gpu_memory_sparse_used_mb",
+        (host_gpu_memory_sparse_used_bytes_ + ((1 << 20) - 1)) >> 20);
+    allocation_first =
+        uint32_t(allocation_range.first + allocation_range.second);
+  }
+  return true;
+}
+
 }  // namespace gpu
 }  // namespace xe
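The loop in EnsureHostGpuMemoryAllocated only ever asks the backend for the holes in the residency bitmap, one contiguous range per call. A toy, self-contained illustration of that pattern (FakeAllocate stands in for a backend's AllocateSparseHostGpuMemoryRange override and is not patch code):

#include <cstdint>
#include <cstdio>
#include <utility>

#include "xenia/base/bit_range.h"

// Stands in for the backend allocation override.
static bool FakeAllocate(size_t first, size_t count) {
  std::printf("allocating %zu allocation(s) at %zu\n", count, first);
  return true;
}

int main() {
  uint64_t allocated[2] = {};
  // Pretend allocation units 4..7 are already resident.
  xe::bit_range::SetRange(allocated, 4, 4);
  // Request units 2..9: produces two backend calls, for 2..3 and for 8..9,
  // skipping the already-resident 4..7.
  size_t first = 2, last = 9;
  while (true) {
    std::pair<size_t, size_t> range =
        xe::bit_range::NextUnsetRange(allocated, first, last - first + 1);
    if (!range.second) {
      break;
    }
    if (!FakeAllocate(range.first, range.second)) {
      return 1;
    }
    xe::bit_range::SetRange(allocated, range.first, range.second);
    first = range.first + range.second;
  }
  return 0;
}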
diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h
index 6dae85909..496836a38 100644
--- a/src/xenia/gpu/shared_memory.h
+++ b/src/xenia/gpu/shared_memory.h
@@ -93,6 +93,7 @@ class SharedMemory {
   SharedMemory(Memory& memory);
   // Call in implementation-specific initialization.
   void InitializeCommon();
+  void InitializeSparseHostGpuMemory(uint32_t granularity_log2);
   // Call last in implementation-specific shutdown, also callable from the
   // destructor.
   void ShutdownCommon();
@@ -103,33 +104,35 @@ class SharedMemory {
   // Sparse allocations are 4 MB, so not too many of them are allocated, but
   // also not to waste too much memory for padding (with 16 MB there's too
   // much).
-  static constexpr uint32_t kOptimalAllocationLog2 = 22;
-  static_assert(kOptimalAllocationLog2 <= kBufferSizeLog2);
+  static constexpr uint32_t kHostGpuMemoryOptimalSparseAllocationLog2 = 22;
+  static_assert(kHostGpuMemoryOptimalSparseAllocationLog2 <= kBufferSizeLog2);
 
   Memory& memory() const { return memory_; }
 
   uint32_t page_size_log2() const { return page_size_log2_; }
 
+  uint32_t host_gpu_memory_sparse_granularity_log2() const {
+    return host_gpu_memory_sparse_granularity_log2_;
+  }
+
+  virtual bool AllocateSparseHostGpuMemoryRange(uint32_t offset_allocations,
+                                                uint32_t length_allocations);
+
   // Mark the memory range as updated and protect it.
   void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu);
 
-  // Ensures the host GPU memory backing the range is accessible by host GPU
-  // drawing / computations / copying, but doesn't upload anything.
-  virtual bool EnsureHostGpuMemoryAllocated(uint32_t start,
-                                            uint32_t length) = 0;
-
-  // Uploads a range of host pages - only called if
-  // EnsureHostGpuMemoryAllocated succeeded. While uploading, MakeRangeValid
-  // must be called for each successfully uploaded range as early as possible,
-  // before the memcpy, to make sure invalidation that happened during the
-  // CPU -> GPU memcpy isn't missed (upload_page_ranges is in pages because of
-  // this - MakeRangeValid has page granularity).
+  // Uploads a range of host pages - only called if host GPU sparse memory
+  // allocation succeeded if needed. While uploading, MakeRangeValid must be
+  // called for each successfully uploaded range as early as possible, before
+  // the memcpy, to make sure invalidation that happened during the CPU -> GPU
+  // memcpy isn't missed (upload_page_ranges is in pages because of this -
+  // MakeRangeValid has page granularity). upload_page_ranges are sorted in
+  // ascending address order, so front and back can be used to determine the
+  // overall bounds of pages to be uploaded.
   virtual bool UploadRanges(
       const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
 
-  // Mutable so the implementation can skip ranges by setting their "second"
-  // value to 0 if needed.
-  std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
+  const std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
     return trace_download_ranges_;
   }
   uint32_t trace_download_page_count() const {
@@ -153,6 +156,12 @@ class SharedMemory {
   // touched pages of the buffer of this size will be invalidated).
   uint32_t page_size_log2_;
 
+  bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length);
+  uint32_t host_gpu_memory_sparse_granularity_log2_ = UINT32_MAX;
+  std::vector<uint64_t> host_gpu_memory_sparse_allocated_;
+  uint32_t host_gpu_memory_sparse_allocations_ = 0;
+  uint32_t host_gpu_memory_sparse_used_bytes_ = 0;
+
   void* memory_invalidation_callback_handle_ = nullptr;
   void* memory_data_provider_handle_ = nullptr;
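Since allocation tracking now lives in the common SharedMemory, a non-D3D12 backend only has to implement the single AllocateSparseHostGpuMemoryRange virtual. Purely as an illustration of the intended shape - not code from this patch - a Vulkan backend using sparse binding might wrap something like the helper below, converting allocation units to bytes via host_gpu_memory_sparse_granularity_log2() first (all names here are hypothetical):

#include <cstdint>
#include <vector>

#include <vulkan/vulkan.h>

// Binds newly allocated memory to [offset_bytes, offset_bytes + length_bytes)
// of a buffer created with VK_BUFFER_CREATE_SPARSE_BINDING_BIT. The caller
// owns synchronization with other queue submissions and keeps the returned
// allocation alive until shutdown.
bool AllocateSparseRange(VkDevice device, VkQueue sparse_binding_queue,
                         VkBuffer buffer, uint32_t memory_type_index,
                         VkDeviceSize offset_bytes, VkDeviceSize length_bytes,
                         std::vector<VkDeviceMemory>& out_allocations) {
  VkMemoryAllocateInfo allocate_info = {};
  allocate_info.sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO;
  allocate_info.allocationSize = length_bytes;
  allocate_info.memoryTypeIndex = memory_type_index;
  VkDeviceMemory memory;
  if (vkAllocateMemory(device, &allocate_info, nullptr, &memory) !=
      VK_SUCCESS) {
    return false;
  }
  out_allocations.push_back(memory);

  VkSparseMemoryBind bind = {};
  bind.resourceOffset = offset_bytes;
  bind.size = length_bytes;
  bind.memory = memory;
  bind.memoryOffset = 0;

  VkSparseBufferMemoryBindInfo buffer_bind = {};
  buffer_bind.buffer = buffer;
  buffer_bind.bindCount = 1;
  buffer_bind.pBinds = &bind;

  VkBindSparseInfo bind_sparse_info = {};
  bind_sparse_info.sType = VK_STRUCTURE_TYPE_BIND_SPARSE_INFO;
  bind_sparse_info.bufferBindCount = 1;
  bind_sparse_info.pBufferBinds = &buffer_bind;

  return vkQueueBindSparse(sparse_binding_queue, 1, &bind_sparse_info,
                           VK_NULL_HANDLE) == VK_SUCCESS;
}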