diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 1275fdd0f..4b1fb4ac0 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -1161,7 +1161,7 @@ bool D3D12CommandProcessor::SetupContext() { } shared_memory_ = - std::make_unique<SharedMemory>(*this, *memory_, trace_writer_); + std::make_unique<D3D12SharedMemory>(*this, *memory_, trace_writer_); if (!shared_memory_->Initialize()) { XELOGE("Failed to initialize shared memory"); return false; @@ -2259,7 +2259,7 @@ bool D3D12CommandProcessor::IssueDraw(xenos::PrimitiveType primitive_type, // Invalidate textures in memexported memory and watch for changes. for (uint32_t i = 0; i < memexport_range_count; ++i) { const MemExportRange& memexport_range = memexport_ranges[i]; - shared_memory_->RangeWrittenByGPU( + shared_memory_->RangeWrittenByGpu( memexport_range.base_address_dwords << 2, memexport_range.size_dwords << 2); } diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 58015cdb4..e2677029b 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -20,11 +20,11 @@ #include "xenia/base/assert.h" #include "xenia/gpu/command_processor.h" #include "xenia/gpu/d3d12/d3d12_graphics_system.h" +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" #include "xenia/gpu/d3d12/deferred_command_list.h" #include "xenia/gpu/d3d12/pipeline_cache.h" #include "xenia/gpu/d3d12/primitive_converter.h" #include "xenia/gpu/d3d12/render_target_cache.h" -#include "xenia/gpu/d3d12/shared_memory.h" #include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/dxbc_shader_translator.h" #include "xenia/gpu/xenos.h" @@ -471,7 +471,7 @@ class D3D12CommandProcessor : public CommandProcessor { ID3D12RootSignature* root_signature_bindless_vs_ = nullptr; ID3D12RootSignature* root_signature_bindless_ds_ = nullptr; - std::unique_ptr<SharedMemory> shared_memory_; + std::unique_ptr<D3D12SharedMemory> shared_memory_; std::unique_ptr<PipelineCache> pipeline_cache_; diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.cc b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc new file mode 100644 index 000000000..2c74c4da8 --- /dev/null +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.cc @@ -0,0 +1,459 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" + +#include +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/cvar.h" +#include "xenia/base/logging.h" +#include "xenia/base/math.h" +#include "xenia/base/profiling.h" +#include "xenia/gpu/d3d12/d3d12_command_processor.h" +#include "xenia/ui/d3d12/d3d12_util.h" + +DEFINE_bool(d3d12_tiled_shared_memory, true, + "Enable tiled resources for shared memory emulation.
Disabling " + "them greatly increases video memory usage - a 512 MB buffer is " + "created - but allows graphics debuggers that don't support tiled " + "resources to work.", + "D3D12"); + +namespace xe { +namespace gpu { +namespace d3d12 { + +D3D12SharedMemory::D3D12SharedMemory(D3D12CommandProcessor& command_processor, + Memory& memory, TraceWriter& trace_writer) + : SharedMemory(memory), + command_processor_(command_processor), + trace_writer_(trace_writer) {} + +D3D12SharedMemory::~D3D12SharedMemory() { Shutdown(true); } + +bool D3D12SharedMemory::Initialize() { + InitializeCommon(); + + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + + D3D12_RESOURCE_DESC buffer_desc; + ui::d3d12::util::FillBufferResourceDesc( + buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); + buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST; + if (AreTiledResourcesUsed()) { + if (FAILED(device->CreateReservedResource( + &buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) { + XELOGE("Shared memory: Failed to create the 512 MB tiled buffer"); + Shutdown(); + return false; + } + } else { + XELOGGPU( + "Direct3D 12 tiled resources are not used for shared memory " + "emulation - video memory usage may increase significantly " + "because a full 512 MB buffer will be created!"); + if (provider.GetGraphicsAnalysis() != nullptr) { + // As of October 8th, 2018, PIX doesn't support tiled buffers. + // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed. + XELOGGPU( + "This is caused by PIX being attached, which doesn't support tiled " + "resources yet."); + } + if (FAILED(device->CreateCommittedResource( + &ui::d3d12::util::kHeapPropertiesDefault, + provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_, + nullptr, IID_PPV_ARGS(&buffer_)))) { + XELOGE("Shared memory: Failed to create the 512 MB buffer"); + Shutdown(); + return false; + } + } + buffer_gpu_address_ = buffer_->GetGPUVirtualAddress(); + buffer_uav_writes_commit_needed_ = false; + + D3D12_DESCRIPTOR_HEAP_DESC buffer_descriptor_heap_desc; + buffer_descriptor_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; + buffer_descriptor_heap_desc.NumDescriptors = + uint32_t(BufferDescriptorIndex::kCount); + buffer_descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; + buffer_descriptor_heap_desc.NodeMask = 0; + if (FAILED(device->CreateDescriptorHeap( + &buffer_descriptor_heap_desc, + IID_PPV_ARGS(&buffer_descriptor_heap_)))) { + XELOGE( + "Failed to create the descriptor heap for shared memory buffer views"); + Shutdown(); + return false; + } + buffer_descriptor_heap_start_ = + buffer_descriptor_heap_->GetCPUDescriptorHandleForHeapStart(); + ui::d3d12::util::CreateBufferRawSRV( + device, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kRawSRV)), + buffer_, kBufferSize); + ui::d3d12::util::CreateBufferTypedSRV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32UintSRV)), + buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2); + ui::d3d12::util::CreateBufferTypedSRV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32G32UintSRV)), + buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3); + ui::d3d12::util::CreateBufferTypedSRV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32G32B32A32UintSRV)), + buffer_, 
DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4); + ui::d3d12::util::CreateBufferRawUAV( + device, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kRawUAV)), + buffer_, kBufferSize); + ui::d3d12::util::CreateBufferTypedUAV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32UintUAV)), + buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2); + ui::d3d12::util::CreateBufferTypedUAV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32G32UintUAV)), + buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3); + ui::d3d12::util::CreateBufferTypedUAV( + device, + provider.OffsetViewDescriptor( + buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kR32G32B32A32UintUAV)), + buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4); + + upload_buffer_pool_ = std::make_unique( + provider, xe::align(ui::d3d12::D3D12UploadBufferPool::kDefaultPageSize, + size_t(1) << page_size_log2())); + + return true; +} + +void D3D12SharedMemory::Shutdown(bool from_destructor) { + ResetTraceDownload(); + + upload_buffer_pool_.reset(); + + ui::d3d12::util::ReleaseAndNull(buffer_descriptor_heap_); + + // First free the buffer to detach it from the heaps. + ui::d3d12::util::ReleaseAndNull(buffer_); + + if (AreTiledResourcesUsed()) { + for (uint32_t i = 0; i < xe::countof(heaps_); ++i) { + ui::d3d12::util::ReleaseAndNull(heaps_[i]); + } + heap_count_ = 0; + COUNT_profile_set("gpu/shared_memory/used_mb", 0); + } + + // If calling from the destructor, the SharedMemory destructor will call + // ShutdownCommon. + if (!from_destructor) { + ShutdownCommon(); + } +} + +void D3D12SharedMemory::ClearCache() { + SharedMemory::ClearCache(); + + upload_buffer_pool_->ClearCache(); + + // TODO(Triang3l): Unmap and destroy heaps. +} + +void D3D12SharedMemory::CompletedSubmissionUpdated() { + upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission()); +} + +bool D3D12SharedMemory::AreTiledResourcesUsed() const { + if (!cvars::d3d12_tiled_shared_memory) { + return false; + } + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + // As of October 8th, 2018, PIX doesn't support tiled buffers. + // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed. + return provider.GetTiledResourcesTier() != + D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED && + provider.GetGraphicsAnalysis() == nullptr; +} + +void D3D12SharedMemory::CommitUAVWritesAndTransitionBuffer( + D3D12_RESOURCE_STATES new_state) { + if (buffer_state_ == new_state) { + if (new_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS && + buffer_uav_writes_commit_needed_) { + command_processor_.PushUAVBarrier(buffer_); + buffer_uav_writes_commit_needed_ = false; + } + return; + } + command_processor_.PushTransitionBarrier(buffer_, buffer_state_, new_state); + buffer_state_ = new_state; + // "UAV -> anything" transition commits the writes implicitly. 
+ buffer_uav_writes_commit_needed_ = false; +} + +void D3D12SharedMemory::WriteRawSRVDescriptor( + D3D12_CPU_DESCRIPTOR_HANDLE handle) { + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + device->CopyDescriptorsSimple( + 1, handle, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kRawSRV)), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); +} + +void D3D12SharedMemory::WriteRawUAVDescriptor( + D3D12_CPU_DESCRIPTOR_HANDLE handle) { + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + device->CopyDescriptorsSimple( + 1, handle, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(BufferDescriptorIndex::kRawUAV)), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); +} + +void D3D12SharedMemory::WriteUintPow2SRVDescriptor( + D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { + BufferDescriptorIndex descriptor_index; + switch (element_size_bytes_pow2) { + case 2: + descriptor_index = BufferDescriptorIndex::kR32UintSRV; + break; + case 3: + descriptor_index = BufferDescriptorIndex::kR32G32UintSRV; + break; + case 4: + descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintSRV; + break; + default: + assert_unhandled_case(element_size_bytes_pow2); + return; + } + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + device->CopyDescriptorsSimple( + 1, handle, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(descriptor_index)), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); +} + +void D3D12SharedMemory::WriteUintPow2UAVDescriptor( + D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { + BufferDescriptorIndex descriptor_index; + switch (element_size_bytes_pow2) { + case 2: + descriptor_index = BufferDescriptorIndex::kR32UintUAV; + break; + case 3: + descriptor_index = BufferDescriptorIndex::kR32G32UintUAV; + break; + case 4: + descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintUAV; + break; + default: + assert_unhandled_case(element_size_bytes_pow2); + return; + } + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + device->CopyDescriptorsSimple( + 1, handle, + provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, + uint32_t(descriptor_index)), + D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); +} + +bool D3D12SharedMemory::InitializeTraceSubmitDownloads() { + ResetTraceDownload(); + PrepareForTraceDownload(); + uint32_t download_page_count = trace_download_page_count(); + // Request downloading of GPU-written memory. 
+ if (!download_page_count) { + return false; + } + D3D12_RESOURCE_DESC download_buffer_desc; + ui::d3d12::util::FillBufferResourceDesc( + download_buffer_desc, download_page_count << page_size_log2(), + D3D12_RESOURCE_FLAG_NONE); + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + if (FAILED(device->CreateCommittedResource( + &ui::d3d12::util::kHeapPropertiesReadback, + provider.GetHeapFlagCreateNotZeroed(), &download_buffer_desc, + D3D12_RESOURCE_STATE_COPY_DEST, nullptr, + IID_PPV_ARGS(&trace_download_buffer_)))) { + XELOGE( + "Shared memory: Failed to create a {} KB GPU-written memory download " + "buffer for frame tracing", + download_page_count << page_size_log2() >> 10); + ResetTraceDownload(); + return false; + } + auto& command_list = command_processor_.GetDeferredCommandList(); + UseAsCopySource(); + command_processor_.SubmitBarriers(); + uint32_t download_buffer_offset = 0; + for (auto& download_range : trace_download_ranges()) { + if (!EnsureHostGpuMemoryAllocated(download_range.first, + download_range.second)) { + download_range.second = 0; + continue; + } + command_list.D3DCopyBufferRegion( + trace_download_buffer_, download_buffer_offset, buffer_, + download_range.first, download_range.second); + download_buffer_offset += download_range.second; + } + return true; +} + +void D3D12SharedMemory::InitializeTraceCompleteDownloads() { + if (!trace_download_buffer_) { + return; + } + void* download_mapping; + if (SUCCEEDED(trace_download_buffer_->Map(0, nullptr, &download_mapping))) { + uint32_t download_buffer_offset = 0; + for (auto download_range : trace_download_ranges()) { + trace_writer_.WriteMemoryRead( + download_range.first, download_range.second, + reinterpret_cast<const uint8_t*>(download_mapping) + + download_buffer_offset); + download_buffer_offset += download_range.second; + } + D3D12_RANGE download_write_range = {}; + trace_download_buffer_->Unmap(0, &download_write_range); + } else { + XELOGE( + "Failed to map the GPU-written memory download buffer for frame " + "tracing"); + } + ResetTraceDownload(); +} + +void D3D12SharedMemory::ResetTraceDownload() { + ui::d3d12::util::ReleaseAndNull(trace_download_buffer_); + ReleaseTraceDownloadRanges(); +} + +bool D3D12SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start, + uint32_t length) { + if (!length || !AreTiledResourcesUsed()) { + return true; + } + uint32_t heap_first = start >> kHeapSizeLog2; + uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2; + assert_true(heap_first < xe::countof(heaps_)); + assert_true(heap_last < xe::countof(heaps_)); + for (uint32_t i = heap_first; i <= heap_last; ++i) { + if (heaps_[i] != nullptr) { + continue; + } + auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); + auto device = provider.GetDevice(); + auto direct_queue = provider.GetDirectQueue(); + D3D12_HEAP_DESC heap_desc = {}; + heap_desc.SizeInBytes = kHeapSize; + heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT; + heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS | + provider.GetHeapFlagCreateNotZeroed(); + if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) { + XELOGE("Shared memory: Failed to create a tile heap"); + return false; + } + ++heap_count_; + COUNT_profile_set("gpu/shared_memory/used_mb", + heap_count_ << kHeapSizeLog2 >> 20); + D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates; + region_start_coordinates.X = + (i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + region_start_coordinates.Y = 0; + region_start_coordinates.Z = 0; +
region_start_coordinates.Subresource = 0; + D3D12_TILE_REGION_SIZE region_size; + region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + region_size.UseBox = FALSE; + D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE; + UINT heap_range_start_offset = 0; + UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; + direct_queue->UpdateTileMappings( + buffer_, 1, &region_start_coordinates, &region_size, heaps_[i], 1, + &range_flags, &heap_range_start_offset, &range_tile_count, + D3D12_TILE_MAPPING_FLAG_NONE); + command_processor_.NotifyQueueOperationsDoneDirectly(); + } + return true; +} + +bool D3D12SharedMemory::UploadRanges( + const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) { + if (upload_page_ranges.empty()) { + return true; + } + CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST); + command_processor_.SubmitBarriers(); + auto& command_list = command_processor_.GetDeferredCommandList(); + for (auto upload_range : upload_page_ranges) { + uint32_t upload_range_start = upload_range.first; + uint32_t upload_range_length = upload_range.second; + trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2(), + upload_range_length << page_size_log2()); + while (upload_range_length != 0) { + ID3D12Resource* upload_buffer; + size_t upload_buffer_offset, upload_buffer_size; + uint8_t* upload_buffer_mapping = upload_buffer_pool_->RequestPartial( + command_processor_.GetCurrentSubmission(), + upload_range_length << page_size_log2(), + size_t(1) << page_size_log2(), &upload_buffer, &upload_buffer_offset, + &upload_buffer_size, nullptr); + if (upload_buffer_mapping == nullptr) { + XELOGE("Shared memory: Failed to get an upload buffer"); + return false; + } + MakeRangeValid(upload_range_start << page_size_log2(), + uint32_t(upload_buffer_size), false); + std::memcpy( + upload_buffer_mapping, + memory().TranslatePhysical(upload_range_start << page_size_log2()), + upload_buffer_size); + command_list.D3DCopyBufferRegion( + buffer_, upload_range_start << page_size_log2(), upload_buffer, + UINT64(upload_buffer_offset), UINT64(upload_buffer_size)); + uint32_t upload_buffer_pages = + uint32_t(upload_buffer_size >> page_size_log2()); + upload_range_start += upload_buffer_pages; + upload_range_length -= upload_buffer_pages; + } + } + return true; +} + +} // namespace d3d12 +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/d3d12/d3d12_shared_memory.h b/src/xenia/gpu/d3d12/d3d12_shared_memory.h new file mode 100644 index 000000000..c66e5578d --- /dev/null +++ b/src/xenia/gpu/d3d12/d3d12_shared_memory.h @@ -0,0 +1,145 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details.
* + ****************************************************************************** + */ + +#ifndef XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_ +#define XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_ + +#include +#include +#include +#include + +#include "xenia/gpu/shared_memory.h" +#include "xenia/gpu/trace_writer.h" +#include "xenia/memory.h" +#include "xenia/ui/d3d12/d3d12_api.h" +#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h" + +namespace xe { +namespace gpu { +namespace d3d12 { + +class D3D12CommandProcessor; + +class D3D12SharedMemory : public SharedMemory { + public: + D3D12SharedMemory(D3D12CommandProcessor& command_processor, Memory& memory, + TraceWriter& trace_writer); + ~D3D12SharedMemory() override; + + bool Initialize(); + void Shutdown(bool from_destructor = false); + void ClearCache() override; + + ID3D12Resource* GetBuffer() const { return buffer_; } + D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const { + return buffer_gpu_address_; + } + + void CompletedSubmissionUpdated(); + + // RequestRange may transition the buffer to copy destination - call it before + // UseForReading or UseForWriting. + + // Makes the buffer usable for vertices, indices and texture untiling. + inline void UseForReading() { + // Vertex fetch is also allowed in pixel shaders. + CommitUAVWritesAndTransitionBuffer( + D3D12_RESOURCE_STATE_INDEX_BUFFER | + D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | + D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); + } + // Makes the buffer usable for texture tiling after a resolve. + inline void UseForWriting() { + CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS); + } + // Makes the buffer usable as a source for copy commands. + inline void UseAsCopySource() { + CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE); + } + // Must be called when doing draws/dispatches modifying data within the shared + // memory buffer as a UAV, to make sure that when UseForWriting is called the + // next time, a UAV barrier will be done, and subsequent overlapping UAV + // writes and reads are ordered. + inline void MarkUAVWritesCommitNeeded() { + if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { + buffer_uav_writes_commit_needed_ = true; + } + } + + void WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); + void WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); + // Due to the Nvidia 128 megatexel limitation, the smallest supported formats + // are 32-bit. + void WriteUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, + uint32_t element_size_bytes_pow2); + void WriteUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, + uint32_t element_size_bytes_pow2); + + // Returns true if any downloads were submitted to the command processor. + bool InitializeTraceSubmitDownloads(); + void InitializeTraceCompleteDownloads(); + + protected: + bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override; + + bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>& + upload_page_ranges) override; + + private: + bool AreTiledResourcesUsed() const; + + D3D12CommandProcessor& command_processor_; + TraceWriter& trace_writer_; + + // The 512 MB tiled buffer.
+ ID3D12Resource* buffer_ = nullptr; + D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0; + D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST; + bool buffer_uav_writes_commit_needed_ = false; + void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state); + + static_assert(D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES == (1 << 16)); + static constexpr uint32_t kHeapSizeLog2 = + std::max(kOptimalAllocationLog2, uint32_t(16)); + static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2; + // Resident portions of the tiled buffer. + ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {}; + // Number of the heaps currently resident, for profiling. + uint32_t heap_count_ = 0; + + // Non-shader-visible buffer descriptor heap for faster binding (via copying + // rather than creation). + enum class BufferDescriptorIndex : uint32_t { + kRawSRV, + kR32UintSRV, + kR32G32UintSRV, + kR32G32B32A32UintSRV, + kRawUAV, + kR32UintUAV, + kR32G32UintUAV, + kR32G32B32A32UintUAV, + + kCount, + }; + ID3D12DescriptorHeap* buffer_descriptor_heap_ = nullptr; + D3D12_CPU_DESCRIPTOR_HANDLE buffer_descriptor_heap_start_; + + std::unique_ptr<ui::d3d12::D3D12UploadBufferPool> upload_buffer_pool_; + + // Created temporarily, only for downloading. + ID3D12Resource* trace_download_buffer_ = nullptr; + void ResetTraceDownload(); +}; + +} // namespace d3d12 +} // namespace gpu +} // namespace xe + +#endif  // XENIA_GPU_D3D12_D3D12_SHARED_MEMORY_H_ diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index a43ad90d3..939cffd65 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -1085,7 +1085,7 @@ bool RenderTargetCache::UpdateRenderTargets(const D3D12Shader* pixel_shader) { } bool RenderTargetCache::Resolve(const Memory& memory, - SharedMemory& shared_memory, + D3D12SharedMemory& shared_memory, TextureCache& texture_cache, uint32_t& written_address_out, uint32_t& written_length_out) { diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index cf575dcdf..0def0d25c 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -15,7 +15,7 @@ #include "xenia/base/cvar.h" #include "xenia/gpu/d3d12/d3d12_shader.h" -#include "xenia/gpu/d3d12/shared_memory.h" +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" #include "xenia/gpu/d3d12/texture_cache.h" #include "xenia/gpu/draw_util.h" #include "xenia/gpu/register_file.h" @@ -277,11 +277,11 @@ class RenderTargetCache { // register values, and also clears the EDRAM buffer if needed. Must be in a // frame for calling.
- bool Resolve(const Memory& memory, SharedMemory& shared_memory, + bool Resolve(const Memory& memory, D3D12SharedMemory& shared_memory, TextureCache& texture_cache, uint32_t& written_address_out, uint32_t& written_length_out); - bool Resolve(SharedMemory* shared_memory, TextureCache* texture_cache, + bool Resolve(D3D12SharedMemory* shared_memory, TextureCache* texture_cache, Memory* memory, uint32_t& written_address_out, uint32_t& written_length_out); // Flushes the render targets to EDRAM and unbinds them, for instance, when diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc deleted file mode 100644 index c24bb970d..000000000 --- a/src/xenia/gpu/d3d12/shared_memory.cc +++ /dev/null @@ -1,959 +0,0 @@ -/** - ****************************************************************************** - * Xenia : Xbox 360 Emulator Research Project * - ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * - * Released under the BSD license - see LICENSE in the root for more details. * - ****************************************************************************** - */ - -#include "xenia/gpu/d3d12/shared_memory.h" - -#include -#include -#include -#include - -#include "xenia/base/assert.h" -#include "xenia/base/cvar.h" -#include "xenia/base/logging.h" -#include "xenia/base/math.h" -#include "xenia/base/memory.h" -#include "xenia/base/profiling.h" -#include "xenia/gpu/d3d12/d3d12_command_processor.h" -#include "xenia/ui/d3d12/d3d12_util.h" - -DEFINE_bool(d3d12_tiled_shared_memory, true, - "Enable tiled resources for shared memory emulation. Disabling " - "them greatly increases video memory usage - a 512 MB buffer is " - "created - but allows graphics debuggers that don't support tiled " - "resources to work.", - "D3D12"); - -namespace xe { -namespace gpu { -namespace d3d12 { - -SharedMemory::SharedMemory(D3D12CommandProcessor& command_processor, - Memory& memory, TraceWriter& trace_writer) - : command_processor_(command_processor), - memory_(memory), - trace_writer_(trace_writer) { - page_size_log2_ = xe::log2_ceil(uint32_t(xe::memory::page_size())); - page_count_ = kBufferSize >> page_size_log2_; -} - -SharedMemory::~SharedMemory() { Shutdown(); } - -bool SharedMemory::Initialize() { - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - - D3D12_RESOURCE_DESC buffer_desc; - ui::d3d12::util::FillBufferResourceDesc( - buffer_desc, kBufferSize, D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS); - buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST; - if (AreTiledResourcesUsed()) { - if (FAILED(device->CreateReservedResource( - &buffer_desc, buffer_state_, nullptr, IID_PPV_ARGS(&buffer_)))) { - XELOGE("Shared memory: Failed to create the 512 MB tiled buffer"); - Shutdown(); - return false; - } - } else { - XELOGGPU( - "Direct3D 12 tiled resources are not used for shared memory " - "emulation - video memory usage may increase significantly " - "because a full 512 MB buffer will be created!"); - if (provider.GetGraphicsAnalysis() != nullptr) { - // As of October 8th, 2018, PIX doesn't support tiled buffers. - // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed. 
- XELOGGPU( - "This is caused by PIX being attached, which doesn't support tiled " - "resources yet."); - } - if (FAILED(device->CreateCommittedResource( - &ui::d3d12::util::kHeapPropertiesDefault, - provider.GetHeapFlagCreateNotZeroed(), &buffer_desc, buffer_state_, - nullptr, IID_PPV_ARGS(&buffer_)))) { - XELOGE("Shared memory: Failed to create the 512 MB buffer"); - Shutdown(); - return false; - } - } - buffer_gpu_address_ = buffer_->GetGPUVirtualAddress(); - buffer_uav_writes_commit_needed_ = false; - - std::memset(heaps_, 0, sizeof(heaps_)); - heap_count_ = 0; - - D3D12_DESCRIPTOR_HEAP_DESC buffer_descriptor_heap_desc; - buffer_descriptor_heap_desc.Type = D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV; - buffer_descriptor_heap_desc.NumDescriptors = - uint32_t(BufferDescriptorIndex::kCount); - buffer_descriptor_heap_desc.Flags = D3D12_DESCRIPTOR_HEAP_FLAG_NONE; - buffer_descriptor_heap_desc.NodeMask = 0; - if (FAILED(device->CreateDescriptorHeap( - &buffer_descriptor_heap_desc, - IID_PPV_ARGS(&buffer_descriptor_heap_)))) { - XELOGE( - "Failed to create the descriptor heap for shared memory buffer views"); - Shutdown(); - return false; - } - buffer_descriptor_heap_start_ = - buffer_descriptor_heap_->GetCPUDescriptorHandleForHeapStart(); - ui::d3d12::util::CreateBufferRawSRV( - device, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kRawSRV)), - buffer_, kBufferSize); - ui::d3d12::util::CreateBufferTypedSRV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32UintSRV)), - buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2); - ui::d3d12::util::CreateBufferTypedSRV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32G32UintSRV)), - buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3); - ui::d3d12::util::CreateBufferTypedSRV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32G32B32A32UintSRV)), - buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4); - ui::d3d12::util::CreateBufferRawUAV( - device, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kRawUAV)), - buffer_, kBufferSize); - ui::d3d12::util::CreateBufferTypedUAV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32UintUAV)), - buffer_, DXGI_FORMAT_R32_UINT, kBufferSize >> 2); - ui::d3d12::util::CreateBufferTypedUAV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32G32UintUAV)), - buffer_, DXGI_FORMAT_R32G32_UINT, kBufferSize >> 3); - ui::d3d12::util::CreateBufferTypedUAV( - device, - provider.OffsetViewDescriptor( - buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kR32G32B32A32UintUAV)), - buffer_, DXGI_FORMAT_R32G32B32A32_UINT, kBufferSize >> 4); - - system_page_flags_.clear(); - system_page_flags_.resize((page_count_ + 63) / 64); - - upload_buffer_pool_ = std::make_unique( - provider, xe::align(ui::d3d12::D3D12UploadBufferPool::kDefaultPageSize, - size_t(1) << page_size_log2_)); - - memory_invalidation_callback_handle_ = - memory_.RegisterPhysicalMemoryInvalidationCallback( - MemoryInvalidationCallbackThunk, this); - - ResetTraceGPUWrittenBuffer(); - - return true; -} - -void SharedMemory::Shutdown() { - ResetTraceGPUWrittenBuffer(); - - FireWatches(0, (kBufferSize - 1) >> page_size_log2_, false); - 
assert_true(global_watches_.empty()); - // No watches now, so no references to the pools accessible by guest threads - - // safe not to enter the global critical region. - watch_node_first_free_ = nullptr; - watch_node_current_pool_allocated_ = 0; - for (WatchNode* pool : watch_node_pools_) { - delete[] pool; - } - watch_node_pools_.clear(); - watch_range_first_free_ = nullptr; - watch_range_current_pool_allocated_ = 0; - for (WatchRange* pool : watch_range_pools_) { - delete[] pool; - } - watch_range_pools_.clear(); - - if (memory_invalidation_callback_handle_ != nullptr) { - memory_.UnregisterPhysicalMemoryInvalidationCallback( - memory_invalidation_callback_handle_); - memory_invalidation_callback_handle_ = nullptr; - } - - upload_buffer_pool_.reset(); - - ui::d3d12::util::ReleaseAndNull(buffer_descriptor_heap_); - - // First free the buffer to detach it from the heaps. - ui::d3d12::util::ReleaseAndNull(buffer_); - - if (AreTiledResourcesUsed()) { - for (uint32_t i = 0; i < xe::countof(heaps_); ++i) { - ui::d3d12::util::ReleaseAndNull(heaps_[i]); - } - heap_count_ = 0; - COUNT_profile_set("gpu/shared_memory/used_mb", 0); - } -} - -void SharedMemory::ClearCache() { - upload_buffer_pool_->ClearCache(); - - // Keeping GPU-written data, so "invalidated by GPU". - FireWatches(0, (kBufferSize - 1) >> page_size_log2_, true); - // No watches now, so no references to the pools accessible by guest threads - - // safe not to enter the global critical region. - watch_node_first_free_ = nullptr; - watch_node_current_pool_allocated_ = 0; - for (WatchNode* pool : watch_node_pools_) { - delete[] pool; - } - watch_node_pools_.clear(); - watch_range_first_free_ = nullptr; - watch_range_current_pool_allocated_ = 0; - for (WatchRange* pool : watch_range_pools_) { - delete[] pool; - } - watch_range_pools_.clear(); - - { - auto global_lock = global_critical_region_.Acquire(); - for (SystemPageFlagsBlock& block : system_page_flags_) { - block.valid = block.valid_and_gpu_written; - } - } - - // TODO(Triang3l): Unmap and destroy heaps. 
-} - -void SharedMemory::CompletedSubmissionUpdated() { - upload_buffer_pool_->Reclaim(command_processor_.GetCompletedSubmission()); -} - -SharedMemory::GlobalWatchHandle SharedMemory::RegisterGlobalWatch( - GlobalWatchCallback callback, void* callback_context) { - GlobalWatch* watch = new GlobalWatch; - watch->callback = callback; - watch->callback_context = callback_context; - - auto global_lock = global_critical_region_.Acquire(); - global_watches_.push_back(watch); - - return reinterpret_cast(watch); -} - -void SharedMemory::UnregisterGlobalWatch(GlobalWatchHandle handle) { - auto watch = reinterpret_cast(handle); - - { - auto global_lock = global_critical_region_.Acquire(); - auto it = std::find(global_watches_.begin(), global_watches_.end(), watch); - assert_false(it == global_watches_.end()); - if (it != global_watches_.end()) { - global_watches_.erase(it); - } - } - - delete watch; -} - -SharedMemory::WatchHandle SharedMemory::WatchMemoryRange( - uint32_t start, uint32_t length, WatchCallback callback, - void* callback_context, void* callback_data, uint64_t callback_argument) { - if (length == 0 || start >= kBufferSize) { - return nullptr; - } - length = std::min(length, kBufferSize - start); - uint32_t watch_page_first = start >> page_size_log2_; - uint32_t watch_page_last = (start + length - 1) >> page_size_log2_; - uint32_t bucket_first = - watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2; - uint32_t bucket_last = - watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2; - - auto global_lock = global_critical_region_.Acquire(); - - // Allocate the range. - WatchRange* range = watch_range_first_free_; - if (range != nullptr) { - watch_range_first_free_ = range->next_free; - } else { - if (watch_range_pools_.empty() || - watch_range_current_pool_allocated_ >= kWatchRangePoolSize) { - watch_range_pools_.push_back(new WatchRange[kWatchRangePoolSize]); - watch_range_current_pool_allocated_ = 0; - } - range = &(watch_range_pools_.back()[watch_range_current_pool_allocated_++]); - } - range->callback = callback; - range->callback_context = callback_context; - range->callback_data = callback_data; - range->callback_argument = callback_argument; - range->page_first = watch_page_first; - range->page_last = watch_page_last; - - // Allocate and link the nodes. - WatchNode* node_previous = nullptr; - for (uint32_t i = bucket_first; i <= bucket_last; ++i) { - WatchNode* node = watch_node_first_free_; - if (node != nullptr) { - watch_node_first_free_ = node->next_free; - } else { - if (watch_node_pools_.empty() || - watch_node_current_pool_allocated_ >= kWatchNodePoolSize) { - watch_node_pools_.push_back(new WatchNode[kWatchNodePoolSize]); - watch_node_current_pool_allocated_ = 0; - } - node = &(watch_node_pools_.back()[watch_node_current_pool_allocated_++]); - } - node->range = range; - node->range_node_next = nullptr; - if (node_previous != nullptr) { - node_previous->range_node_next = node; - } else { - range->node_first = node; - } - node_previous = node; - node->bucket_node_previous = nullptr; - node->bucket_node_next = watch_buckets_[i]; - if (watch_buckets_[i] != nullptr) { - watch_buckets_[i]->bucket_node_previous = node; - } - watch_buckets_[i] = node; - } - - return reinterpret_cast(range); -} - -void SharedMemory::UnwatchMemoryRange(WatchHandle handle) { - if (handle == nullptr) { - // Could be a zero length range. 
- return; - } - auto global_lock = global_critical_region_.Acquire(); - UnlinkWatchRange(reinterpret_cast(handle)); -} - -bool SharedMemory::EnsureTilesResident(uint32_t start, uint32_t length) { - if (length == 0) { - // Some texture is empty, for example - safe to draw in this case. - return true; - } - if (start > kBufferSize || (kBufferSize - start) < length) { - return false; - } - - if (!AreTiledResourcesUsed()) { - return true; - } - - uint32_t heap_first = start >> kHeapSizeLog2; - uint32_t heap_last = (start + length - 1) >> kHeapSizeLog2; - for (uint32_t i = heap_first; i <= heap_last; ++i) { - if (heaps_[i] != nullptr) { - continue; - } - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - auto direct_queue = provider.GetDirectQueue(); - D3D12_HEAP_DESC heap_desc = {}; - heap_desc.SizeInBytes = kHeapSize; - heap_desc.Properties.Type = D3D12_HEAP_TYPE_DEFAULT; - heap_desc.Flags = D3D12_HEAP_FLAG_ALLOW_ONLY_BUFFERS | - provider.GetHeapFlagCreateNotZeroed(); - if (FAILED(device->CreateHeap(&heap_desc, IID_PPV_ARGS(&heaps_[i])))) { - XELOGE("Shared memory: Failed to create a tile heap"); - return false; - } - ++heap_count_; - COUNT_profile_set("gpu/shared_memory/used_mb", - heap_count_ << kHeapSizeLog2 >> 20); - D3D12_TILED_RESOURCE_COORDINATE region_start_coordinates; - region_start_coordinates.X = - (i << kHeapSizeLog2) / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - region_start_coordinates.Y = 0; - region_start_coordinates.Z = 0; - region_start_coordinates.Subresource = 0; - D3D12_TILE_REGION_SIZE region_size; - region_size.NumTiles = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - region_size.UseBox = FALSE; - D3D12_TILE_RANGE_FLAGS range_flags = D3D12_TILE_RANGE_FLAG_NONE; - UINT heap_range_start_offset = 0; - UINT range_tile_count = kHeapSize / D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES; - direct_queue->UpdateTileMappings( - buffer_, 1, ®ion_start_coordinates, ®ion_size, heaps_[i], 1, - &range_flags, &heap_range_start_offset, &range_tile_count, - D3D12_TILE_MAPPING_FLAG_NONE); - command_processor_.NotifyQueueOperationsDoneDirectly(); - } - return true; -} - -bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { - if (length == 0) { - // Some texture is empty, for example - safe to draw in this case. - return true; - } - if (start > kBufferSize || (kBufferSize - start) < length) { - return false; - } - uint32_t last = start + length - 1; - - auto& command_list = command_processor_.GetDeferredCommandList(); - -#if FINE_GRAINED_DRAW_SCOPES - SCOPE_profile_cpu_f("gpu"); -#endif // FINE_GRAINED_DRAW_SCOPES - - // Ensure all tile heaps are present. - if (!EnsureTilesResident(start, length)) { - return false; - } - - // Upload and protect used ranges. 
- GetRangesToUpload(start >> page_size_log2_, last >> page_size_log2_); - if (upload_ranges_.size() == 0) { - return true; - } - CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_DEST); - command_processor_.SubmitBarriers(); - for (auto upload_range : upload_ranges_) { - uint32_t upload_range_start = upload_range.first; - uint32_t upload_range_length = upload_range.second; - trace_writer_.WriteMemoryRead(upload_range_start << page_size_log2_, - upload_range_length << page_size_log2_); - while (upload_range_length != 0) { - ID3D12Resource* upload_buffer; - size_t upload_buffer_offset, upload_buffer_size; - uint8_t* upload_buffer_mapping = upload_buffer_pool_->RequestPartial( - command_processor_.GetCurrentSubmission(), - upload_range_length << page_size_log2_, size_t(1) << page_size_log2_, - &upload_buffer, &upload_buffer_offset, &upload_buffer_size, nullptr); - if (upload_buffer_mapping == nullptr) { - XELOGE("Shared memory: Failed to get an upload buffer"); - return false; - } - MakeRangeValid(upload_range_start << page_size_log2_, - uint32_t(upload_buffer_size), false); - std::memcpy( - upload_buffer_mapping, - memory_.TranslatePhysical(upload_range_start << page_size_log2_), - upload_buffer_size); - command_list.D3DCopyBufferRegion( - buffer_, upload_range_start << page_size_log2_, upload_buffer, - UINT64(upload_buffer_offset), UINT64(upload_buffer_size)); - uint32_t upload_buffer_pages = - uint32_t(upload_buffer_size >> page_size_log2_); - upload_range_start += upload_buffer_pages; - upload_range_length -= upload_buffer_pages; - } - } - - return true; -} - -void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last, - bool invalidated_by_gpu) { - uint32_t address_first = page_first << page_size_log2_; - uint32_t address_last = - (page_last << page_size_log2_) + ((1 << page_size_log2_) - 1); - uint32_t bucket_first = address_first >> kWatchBucketSizeLog2; - uint32_t bucket_last = address_last >> kWatchBucketSizeLog2; - - auto global_lock = global_critical_region_.Acquire(); - - // Fire global watches. - for (const auto global_watch : global_watches_) { - global_watch->callback(global_watch->callback_context, address_first, - address_last, invalidated_by_gpu); - } - - // Fire per-range watches. - for (uint32_t i = bucket_first; i <= bucket_last; ++i) { - WatchNode* node = watch_buckets_[i]; - while (node != nullptr) { - WatchRange* range = node->range; - // Store the next node now since when the callback is triggered, the links - // will be broken. - node = node->bucket_node_next; - if (page_first <= range->page_last && page_last >= range->page_first) { - range->callback(range->callback_context, range->callback_data, - range->callback_argument, invalidated_by_gpu); - UnlinkWatchRange(range); - } - } - } -} - -void SharedMemory::RangeWrittenByGPU(uint32_t start, uint32_t length) { - if (length == 0 || start >= kBufferSize) { - return; - } - length = std::min(length, kBufferSize - start); - uint32_t end = start + length - 1; - uint32_t page_first = start >> page_size_log2_; - uint32_t page_last = end >> page_size_log2_; - - // Trigger modification callbacks so, for instance, resolved data is loaded to - // the texture. - FireWatches(page_first, page_last, true); - - // Mark the range as valid (so pages are not reuploaded until modified by the - // CPU) and watch it so the CPU can reuse it and this will be caught. 
- MakeRangeValid(start, length, true); -} - -bool SharedMemory::AreTiledResourcesUsed() const { - if (!cvars::d3d12_tiled_shared_memory) { - return false; - } - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - // As of October 8th, 2018, PIX doesn't support tiled buffers. - // FIXME(Triang3l): Re-enable tiled resources with PIX once fixed. - return provider.GetTiledResourcesTier() != - D3D12_TILED_RESOURCES_TIER_NOT_SUPPORTED && - provider.GetGraphicsAnalysis() == nullptr; -} - -void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length, - bool written_by_gpu) { - if (length == 0 || start >= kBufferSize) { - return; - } - length = std::min(length, kBufferSize - start); - uint32_t last = start + length - 1; - uint32_t valid_page_first = start >> page_size_log2_; - uint32_t valid_page_last = last >> page_size_log2_; - uint32_t valid_block_first = valid_page_first >> 6; - uint32_t valid_block_last = valid_page_last >> 6; - - { - auto global_lock = global_critical_region_.Acquire(); - - for (uint32_t i = valid_block_first; i <= valid_block_last; ++i) { - uint64_t valid_bits = UINT64_MAX; - if (i == valid_block_first) { - valid_bits &= ~((1ull << (valid_page_first & 63)) - 1); - } - if (i == valid_block_last && (valid_page_last & 63) != 63) { - valid_bits &= (1ull << ((valid_page_last & 63) + 1)) - 1; - } - SystemPageFlagsBlock& block = system_page_flags_[i]; - block.valid |= valid_bits; - if (written_by_gpu) { - block.valid_and_gpu_written |= valid_bits; - } else { - block.valid_and_gpu_written &= ~valid_bits; - } - } - } - - if (memory_invalidation_callback_handle_) { - memory_.EnablePhysicalMemoryAccessCallbacks( - valid_page_first << page_size_log2_, - (valid_page_last - valid_page_first + 1) << page_size_log2_, true, - false); - } -} - -void SharedMemory::UnlinkWatchRange(WatchRange* range) { - uint32_t bucket = - range->page_first << page_size_log2_ >> kWatchBucketSizeLog2; - WatchNode* node = range->node_first; - while (node != nullptr) { - WatchNode* node_next = node->range_node_next; - if (node->bucket_node_previous != nullptr) { - node->bucket_node_previous->bucket_node_next = node->bucket_node_next; - } else { - watch_buckets_[bucket] = node->bucket_node_next; - } - if (node->bucket_node_next != nullptr) { - node->bucket_node_next->bucket_node_previous = node->bucket_node_previous; - } - node->next_free = watch_node_first_free_; - watch_node_first_free_ = node; - node = node_next; - ++bucket; - } - range->next_free = watch_range_first_free_; - watch_range_first_free_ = range; -} - -void SharedMemory::GetRangesToUpload(uint32_t request_page_first, - uint32_t request_page_last) { - upload_ranges_.clear(); - request_page_last = std::min(request_page_last, page_count_ - 1u); - if (request_page_first > request_page_last) { - return; - } - uint32_t request_block_first = request_page_first >> 6; - uint32_t request_block_last = request_page_last >> 6; - - auto global_lock = global_critical_region_.Acquire(); - - uint32_t range_start = UINT32_MAX; - for (uint32_t i = request_block_first; i <= request_block_last; ++i) { - uint64_t block_valid = system_page_flags_[i].valid; - // Consider pages in the block outside the requested range valid. 
- if (i == request_block_first) { - block_valid |= (1ull << (request_page_first & 63)) - 1; - } - if (i == request_block_last && (request_page_last & 63) != 63) { - block_valid |= ~((1ull << ((request_page_last & 63) + 1)) - 1); - } - - while (true) { - uint32_t block_page; - if (range_start == UINT32_MAX) { - // Check if need to open a new range. - if (!xe::bit_scan_forward(~block_valid, &block_page)) { - break; - } - range_start = (i << 6) + block_page; - } else { - // Check if need to close the range. - // Ignore the valid pages before the beginning of the range. - uint64_t block_valid_from_start = block_valid; - if (i == (range_start >> 6)) { - block_valid_from_start &= ~((1ull << (range_start & 63)) - 1); - } - if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) { - break; - } - upload_ranges_.push_back( - std::make_pair(range_start, (i << 6) + block_page - range_start)); - // In the next interation within this block, consider this range valid - // since it has been queued for upload. - block_valid |= (1ull << block_page) - 1; - range_start = UINT32_MAX; - } - } - } - if (range_start != UINT32_MAX) { - upload_ranges_.push_back( - std::make_pair(range_start, request_page_last + 1 - range_start)); - } -} - -std::pair SharedMemory::MemoryInvalidationCallbackThunk( - void* context_ptr, uint32_t physical_address_start, uint32_t length, - bool exact_range) { - return reinterpret_cast(context_ptr) - ->MemoryInvalidationCallback(physical_address_start, length, exact_range); -} - -std::pair SharedMemory::MemoryInvalidationCallback( - uint32_t physical_address_start, uint32_t length, bool exact_range) { - if (length == 0 || physical_address_start >= kBufferSize) { - return std::make_pair(uint32_t(0), UINT32_MAX); - } - length = std::min(length, kBufferSize - physical_address_start); - uint32_t physical_address_last = physical_address_start + (length - 1); - - uint32_t page_first = physical_address_start >> page_size_log2_; - uint32_t page_last = physical_address_last >> page_size_log2_; - assert_true(page_first < page_count_ && page_last < page_count_); - uint32_t block_first = page_first >> 6; - uint32_t block_last = page_last >> 6; - - auto global_lock = global_critical_region_.Acquire(); - - if (!exact_range) { - // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be - // invalidated - if no GPU-written data nearby that was not intended to be - // invalidated since it's not in sync with CPU memory and can't be - // reuploaded. It's a lot cheaper to upload some excess data than to catch - // access violations - with 4 KB callbacks, the original Doom runs at 4 FPS - // on Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per - // frame, but with 256 KB it's 0.7 ms. 
- if (page_first & 63) { - uint64_t gpu_written_start = - system_page_flags_[block_first].valid_and_gpu_written; - gpu_written_start &= (1ull << (page_first & 63)) - 1; - page_first = - (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); - } - if ((page_last & 63) != 63) { - uint64_t gpu_written_end = - system_page_flags_[block_last].valid_and_gpu_written; - gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1); - page_last = (page_last & ~uint32_t(63)) + - (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); - } - } - - for (uint32_t i = block_first; i <= block_last; ++i) { - uint64_t invalidate_bits = UINT64_MAX; - if (i == block_first) { - invalidate_bits &= ~((1ull << (page_first & 63)) - 1); - } - if (i == block_last && (page_last & 63) != 63) { - invalidate_bits &= (1ull << ((page_last & 63) + 1)) - 1; - } - SystemPageFlagsBlock& block = system_page_flags_[i]; - block.valid &= ~invalidate_bits; - block.valid_and_gpu_written &= ~invalidate_bits; - } - - FireWatches(page_first, page_last, false); - - return std::make_pair(page_first << page_size_log2_, - (page_last - page_first + 1) << page_size_log2_); -} - -void SharedMemory::CommitUAVWritesAndTransitionBuffer( - D3D12_RESOURCE_STATES new_state) { - if (buffer_state_ == new_state) { - if (new_state == D3D12_RESOURCE_STATE_UNORDERED_ACCESS && - buffer_uav_writes_commit_needed_) { - command_processor_.PushUAVBarrier(buffer_); - buffer_uav_writes_commit_needed_ = false; - } - return; - } - command_processor_.PushTransitionBarrier(buffer_, buffer_state_, new_state); - buffer_state_ = new_state; - // "UAV -> anything" transition commits the writes implicitly. - buffer_uav_writes_commit_needed_ = false; -} - -void SharedMemory::WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle) { - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - device->CopyDescriptorsSimple( - 1, handle, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kRawSRV)), - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); -} - -void SharedMemory::WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle) { - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - device->CopyDescriptorsSimple( - 1, handle, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(BufferDescriptorIndex::kRawUAV)), - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); -} - -void SharedMemory::WriteUintPow2SRVDescriptor( - D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { - BufferDescriptorIndex descriptor_index; - switch (element_size_bytes_pow2) { - case 2: - descriptor_index = BufferDescriptorIndex::kR32UintSRV; - break; - case 3: - descriptor_index = BufferDescriptorIndex::kR32G32UintSRV; - break; - case 4: - descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintSRV; - break; - default: - assert_unhandled_case(element_size_bytes_pow2); - return; - } - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - device->CopyDescriptorsSimple( - 1, handle, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(descriptor_index)), - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); -} - -void SharedMemory::WriteUintPow2UAVDescriptor( - D3D12_CPU_DESCRIPTOR_HANDLE handle, uint32_t element_size_bytes_pow2) { - BufferDescriptorIndex descriptor_index; - switch (element_size_bytes_pow2) { - case 2: - descriptor_index = 
BufferDescriptorIndex::kR32UintUAV; - break; - case 3: - descriptor_index = BufferDescriptorIndex::kR32G32UintUAV; - break; - case 4: - descriptor_index = BufferDescriptorIndex::kR32G32B32A32UintUAV; - break; - default: - assert_unhandled_case(element_size_bytes_pow2); - return; - } - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - device->CopyDescriptorsSimple( - 1, handle, - provider.OffsetViewDescriptor(buffer_descriptor_heap_start_, - uint32_t(descriptor_index)), - D3D12_DESCRIPTOR_HEAP_TYPE_CBV_SRV_UAV); -} - -bool SharedMemory::InitializeTraceSubmitDownloads() { - // Invalidate the entire memory CPU->GPU memory copy so all the history - // doesn't have to be written into every frame trace, and collect the list of - // ranges with data modified on the GPU. - ResetTraceGPUWrittenBuffer(); - uint32_t gpu_written_page_count = 0; - - { - auto global_lock = global_critical_region_.Acquire(); - uint32_t fire_watches_range_start = UINT32_MAX; - uint32_t gpu_written_range_start = UINT32_MAX; - for (uint32_t i = 0; i < system_page_flags_.size(); ++i) { - SystemPageFlagsBlock& page_flags_block = system_page_flags_[i]; - uint64_t previously_valid_block = page_flags_block.valid; - uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written; - page_flags_block.valid = gpu_written_block; - - // Fire watches on the invalidated pages. - uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block; - uint64_t fire_watches_break_block = ~fire_watches_block; - while (true) { - uint32_t fire_watches_block_page; - if (!xe::bit_scan_forward(fire_watches_range_start == UINT32_MAX - ? fire_watches_block - : fire_watches_break_block, - &fire_watches_block_page)) { - break; - } - uint32_t fire_watches_page = (i << 6) + fire_watches_block_page; - if (fire_watches_range_start == UINT32_MAX) { - fire_watches_range_start = fire_watches_page; - } else { - FireWatches(fire_watches_range_start, fire_watches_page - 1, false); - fire_watches_range_start = UINT32_MAX; - } - uint64_t fire_watches_block_mask = - ~((1ull << fire_watches_block_page) - 1); - fire_watches_block &= fire_watches_block_mask; - fire_watches_break_block &= fire_watches_block_mask; - } - - // Add to the GPU-written ranges. - uint64_t gpu_written_break_block = ~gpu_written_block; - while (true) { - uint32_t gpu_written_block_page; - if (!xe::bit_scan_forward(gpu_written_range_start == UINT32_MAX - ? 
gpu_written_block - : gpu_written_break_block, - &gpu_written_block_page)) { - break; - } - uint32_t gpu_written_page = (i << 6) + gpu_written_block_page; - if (gpu_written_range_start == UINT32_MAX) { - gpu_written_range_start = gpu_written_page; - } else { - uint32_t gpu_written_range_length = - gpu_written_page - gpu_written_range_start; - trace_gpu_written_ranges_.push_back( - std::make_pair(gpu_written_range_start << page_size_log2_, - gpu_written_range_length << page_size_log2_)); - gpu_written_page_count += gpu_written_range_length; - gpu_written_range_start = UINT32_MAX; - } - uint64_t gpu_written_block_mask = - ~((1ull << gpu_written_block_page) - 1); - gpu_written_block &= gpu_written_block_mask; - gpu_written_break_block &= gpu_written_block_mask; - } - } - if (fire_watches_range_start != UINT32_MAX) { - FireWatches(fire_watches_range_start, page_count_ - 1, false); - } - if (gpu_written_range_start != UINT32_MAX) { - uint32_t gpu_written_range_length = page_count_ - gpu_written_range_start; - trace_gpu_written_ranges_.push_back( - std::make_pair(gpu_written_range_start << page_size_log2_, - gpu_written_range_length << page_size_log2_)); - gpu_written_page_count += gpu_written_range_length; - } - } - - // Request downloading of GPU-written memory. - if (!gpu_written_page_count) { - return false; - } - D3D12_RESOURCE_DESC gpu_written_buffer_desc; - ui::d3d12::util::FillBufferResourceDesc( - gpu_written_buffer_desc, gpu_written_page_count << page_size_log2_, - D3D12_RESOURCE_FLAG_NONE); - auto& provider = command_processor_.GetD3D12Context().GetD3D12Provider(); - auto device = provider.GetDevice(); - if (FAILED(device->CreateCommittedResource( - &ui::d3d12::util::kHeapPropertiesReadback, - provider.GetHeapFlagCreateNotZeroed(), &gpu_written_buffer_desc, - D3D12_RESOURCE_STATE_COPY_DEST, nullptr, - IID_PPV_ARGS(&trace_gpu_written_buffer_)))) { - XELOGE( - "Shared memory: Failed to create a {} KB GPU-written memory download " - "buffer for frame tracing", - gpu_written_page_count << page_size_log2_ >> 10); - ResetTraceGPUWrittenBuffer(); - return false; - } - auto& command_list = command_processor_.GetDeferredCommandList(); - UseAsCopySource(); - command_processor_.SubmitBarriers(); - uint32_t gpu_written_buffer_offset = 0; - for (auto& gpu_written_submit_range : trace_gpu_written_ranges_) { - // For cases like resolution scale, when the data may not be actually - // written, just marked as valid. 
- if (!EnsureTilesResident(gpu_written_submit_range.first, - gpu_written_submit_range.second)) { - gpu_written_submit_range.second = 0; - continue; - } - command_list.D3DCopyBufferRegion( - trace_gpu_written_buffer_, gpu_written_buffer_offset, buffer_, - gpu_written_submit_range.first, gpu_written_submit_range.second); - gpu_written_buffer_offset += gpu_written_submit_range.second; - } - return true; -} - -void SharedMemory::InitializeTraceCompleteDownloads() { - if (!trace_gpu_written_buffer_) { - return; - } - void* download_mapping; - if (SUCCEEDED( - trace_gpu_written_buffer_->Map(0, nullptr, &download_mapping))) { - uint32_t gpu_written_buffer_offset = 0; - for (auto gpu_written_submit_range : trace_gpu_written_ranges_) { - trace_writer_.WriteMemoryRead( - gpu_written_submit_range.first, gpu_written_submit_range.second, - reinterpret_cast(download_mapping) + - gpu_written_buffer_offset); - } - D3D12_RANGE download_write_range = {}; - trace_gpu_written_buffer_->Unmap(0, &download_write_range); - } else { - XELOGE( - "Failed to map the GPU-written memory download buffer for frame " - "tracing"); - } - ResetTraceGPUWrittenBuffer(); -} - -void SharedMemory::ResetTraceGPUWrittenBuffer() { - trace_gpu_written_ranges_.clear(); - trace_gpu_written_ranges_.shrink_to_fit(); - ui::d3d12::util::ReleaseAndNull(trace_gpu_written_buffer_); -} - -} // namespace d3d12 -} // namespace gpu -} // namespace xe diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index 7228b9b3a..de0568ccf 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -832,7 +832,7 @@ const TextureCache::LoadModeInfo TextureCache::load_mode_info_[] = { TextureCache::TextureCache(D3D12CommandProcessor& command_processor, const RegisterFile& register_file, bool bindless_resources_used, - SharedMemory& shared_memory) + D3D12SharedMemory& shared_memory) : command_processor_(command_processor), register_file_(register_file), bindless_resources_used_(bindless_resources_used), @@ -1604,7 +1604,7 @@ void TextureCache::MarkRangeAsResolved(uint32_t start_unscaled, // Invalidate textures. Toggling individual textures between scaled and // unscaled also relies on invalidation through shared memory. 
- shared_memory_.RangeWrittenByGPU(start_unscaled, length_unscaled); + shared_memory_.RangeWrittenByGpu(start_unscaled, length_unscaled); } bool TextureCache::EnsureScaledResolveBufferResident(uint32_t start_unscaled, diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 1047cabd0..1345d8faf 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -17,7 +17,7 @@ #include "xenia/base/mutex.h" #include "xenia/gpu/d3d12/d3d12_shader.h" -#include "xenia/gpu/d3d12/shared_memory.h" +#include "xenia/gpu/d3d12/d3d12_shared_memory.h" #include "xenia/gpu/register_file.h" #include "xenia/gpu/texture_info.h" #include "xenia/gpu/xenos.h" @@ -169,7 +169,7 @@ class TextureCache { TextureCache(D3D12CommandProcessor& command_processor, const RegisterFile& register_file, bool bindless_resources_used, - SharedMemory& shared_memory); + D3D12SharedMemory& shared_memory); ~TextureCache(); bool Initialize(bool edram_rov_used); @@ -546,7 +546,7 @@ class TextureCache { D3D12CommandProcessor& command_processor_; const RegisterFile& register_file_; bool bindless_resources_used_; - SharedMemory& shared_memory_; + D3D12SharedMemory& shared_memory_; static const LoadModeInfo load_mode_info_[]; ID3D12RootSignature* load_root_signature_ = nullptr; diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc new file mode 100644 index 000000000..4951eacea --- /dev/null +++ b/src/xenia/gpu/shared_memory.cc @@ -0,0 +1,541 @@ +/** + ****************************************************************************** + * Xenia : Xbox 360 Emulator Research Project * + ****************************************************************************** + * Copyright 2020 Ben Vanik. All rights reserved. * + * Released under the BSD license - see LICENSE in the root for more details. * + ****************************************************************************** + */ + +#include "xenia/gpu/shared_memory.h" + +#include +#include + +#include "xenia/base/assert.h" +#include "xenia/base/math.h" +#include "xenia/base/memory.h" +#include "xenia/base/profiling.h" +#include "xenia/memory.h" + +namespace xe { +namespace gpu { + +SharedMemory::SharedMemory(Memory& memory) : memory_(memory) { + page_size_log2_ = xe::log2_ceil(uint32_t(xe::memory::page_size())); +} + +SharedMemory::~SharedMemory() { ShutdownCommon(); } + +void SharedMemory::InitializeCommon() { + system_page_flags_.clear(); + system_page_flags_.resize(((kBufferSize >> page_size_log2_) + 63) / 64); + + memory_invalidation_callback_handle_ = + memory_.RegisterPhysicalMemoryInvalidationCallback( + MemoryInvalidationCallbackThunk, this); +} + +void SharedMemory::ShutdownCommon() { + ReleaseTraceDownloadRanges(); + + FireWatches(0, (kBufferSize - 1) >> page_size_log2_, false); + assert_true(global_watches_.empty()); + // No watches now, so no references to the pools accessible by guest threads - + // safe not to enter the global critical region. 
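The validity tracking set up in InitializeCommon above packs one bit per system page into the 64-bit blocks of system_page_flags_, and every routine that follows (MakeRangeValid, the invalidation callback, PrepareForTraceDownload) turns page numbers into a block index plus a bit mask the same way. A minimal standalone sketch of that mapping, assuming 4 KB system pages; the names and the constant here are illustrative, not part of the patch:

#include <cstdint>

// Illustrative only - mirrors how the shared memory code addresses the
// 64-page blocks of system_page_flags_. kPageSizeLog2 is an assumption
// (4 KB system pages).
constexpr uint32_t kPageSizeLog2 = 12;

// Block index and single-page bit for a physical address.
inline uint32_t PageBlock(uint32_t address) {
  return (address >> kPageSizeLog2) >> 6;
}
inline uint64_t PageBit(uint32_t address) {
  return uint64_t(1) << ((address >> kPageSizeLog2) & 63);
}

// Mask of the pages [page_first, page_last] inside one 64-page block - the
// same masking MakeRangeValid applies to the first and last blocks of a range.
inline uint64_t BlockPageMask(uint32_t page_first, uint32_t page_last) {
  uint64_t mask = ~uint64_t(0);
  mask &= ~((uint64_t(1) << (page_first & 63)) - 1);
  if ((page_last & 63) != 63) {
    mask &= (uint64_t(1) << ((page_last & 63) + 1)) - 1;
  }
  return mask;
}

With 4 KB pages the 512 MB space is 131072 pages, so system_page_flags_ holds 2048 of these 64-bit blocks per bit field.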
+  watch_node_first_free_ = nullptr;
+  watch_node_current_pool_allocated_ = 0;
+  for (WatchNode* pool : watch_node_pools_) {
+    delete[] pool;
+  }
+  watch_node_pools_.clear();
+  watch_range_first_free_ = nullptr;
+  watch_range_current_pool_allocated_ = 0;
+  for (WatchRange* pool : watch_range_pools_) {
+    delete[] pool;
+  }
+  watch_range_pools_.clear();
+
+  if (memory_invalidation_callback_handle_ != nullptr) {
+    memory_.UnregisterPhysicalMemoryInvalidationCallback(
+        memory_invalidation_callback_handle_);
+    memory_invalidation_callback_handle_ = nullptr;
+  }
+}
+
+void SharedMemory::ClearCache() {
+  // Keeping GPU-written data, so "invalidated by GPU".
+  FireWatches(0, (kBufferSize - 1) >> page_size_log2_, true);
+  // No watches now, so no references to the pools accessible by guest threads -
+  // safe not to enter the global critical region.
+  watch_node_first_free_ = nullptr;
+  watch_node_current_pool_allocated_ = 0;
+  for (WatchNode* pool : watch_node_pools_) {
+    delete[] pool;
+  }
+  watch_node_pools_.clear();
+  watch_range_first_free_ = nullptr;
+  watch_range_current_pool_allocated_ = 0;
+  for (WatchRange* pool : watch_range_pools_) {
+    delete[] pool;
+  }
+  watch_range_pools_.clear();
+
+  {
+    auto global_lock = global_critical_region_.Acquire();
+    for (SystemPageFlagsBlock& block : system_page_flags_) {
+      block.valid = block.valid_and_gpu_written;
+    }
+  }
+}
+
+SharedMemory::GlobalWatchHandle SharedMemory::RegisterGlobalWatch(
+    GlobalWatchCallback callback, void* callback_context) {
+  GlobalWatch* watch = new GlobalWatch;
+  watch->callback = callback;
+  watch->callback_context = callback_context;
+
+  auto global_lock = global_critical_region_.Acquire();
+  global_watches_.push_back(watch);
+
+  return reinterpret_cast<GlobalWatchHandle>(watch);
+}
+
+void SharedMemory::UnregisterGlobalWatch(GlobalWatchHandle handle) {
+  auto watch = reinterpret_cast<GlobalWatch*>(handle);
+
+  {
+    auto global_lock = global_critical_region_.Acquire();
+    auto it = std::find(global_watches_.begin(), global_watches_.end(), watch);
+    assert_false(it == global_watches_.end());
+    if (it != global_watches_.end()) {
+      global_watches_.erase(it);
+    }
+  }
+
+  delete watch;
+}
+
+SharedMemory::WatchHandle SharedMemory::WatchMemoryRange(
+    uint32_t start, uint32_t length, WatchCallback callback,
+    void* callback_context, void* callback_data, uint64_t callback_argument) {
+  if (length == 0 || start >= kBufferSize) {
+    return nullptr;
+  }
+  length = std::min(length, kBufferSize - start);
+  uint32_t watch_page_first = start >> page_size_log2_;
+  uint32_t watch_page_last = (start + length - 1) >> page_size_log2_;
+  uint32_t bucket_first =
+      watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
+  uint32_t bucket_last =
+      watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;
+
+  auto global_lock = global_critical_region_.Acquire();
+
+  // Allocate the range.
+  WatchRange* range = watch_range_first_free_;
+  if (range != nullptr) {
+    watch_range_first_free_ = range->next_free;
+  } else {
+    if (watch_range_pools_.empty() ||
+        watch_range_current_pool_allocated_ >= kWatchRangePoolSize) {
+      watch_range_pools_.push_back(new WatchRange[kWatchRangePoolSize]);
+      watch_range_current_pool_allocated_ = 0;
+    }
+    range = &(watch_range_pools_.back()[watch_range_current_pool_allocated_++]);
+  }
+  range->callback = callback;
+  range->callback_context = callback_context;
+  range->callback_data = callback_data;
+  range->callback_argument = callback_argument;
+  range->page_first = watch_page_first;
+  range->page_last = watch_page_last;
+
+  // Allocate and link the nodes.
+  WatchNode* node_previous = nullptr;
+  for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
+    WatchNode* node = watch_node_first_free_;
+    if (node != nullptr) {
+      watch_node_first_free_ = node->next_free;
+    } else {
+      if (watch_node_pools_.empty() ||
+          watch_node_current_pool_allocated_ >= kWatchNodePoolSize) {
+        watch_node_pools_.push_back(new WatchNode[kWatchNodePoolSize]);
+        watch_node_current_pool_allocated_ = 0;
+      }
+      node = &(watch_node_pools_.back()[watch_node_current_pool_allocated_++]);
+    }
+    node->range = range;
+    node->range_node_next = nullptr;
+    if (node_previous != nullptr) {
+      node_previous->range_node_next = node;
+    } else {
+      range->node_first = node;
+    }
+    node_previous = node;
+    node->bucket_node_previous = nullptr;
+    node->bucket_node_next = watch_buckets_[i];
+    if (watch_buckets_[i] != nullptr) {
+      watch_buckets_[i]->bucket_node_previous = node;
+    }
+    watch_buckets_[i] = node;
+  }
+
+  return reinterpret_cast<WatchHandle>(range);
+}
+
+void SharedMemory::UnwatchMemoryRange(WatchHandle handle) {
+  if (handle == nullptr) {
+    // Could be a zero length range.
+    return;
+  }
+  auto global_lock = global_critical_region_.Acquire();
+  UnlinkWatchRange(reinterpret_cast<WatchRange*>(handle));
+}
+
+void SharedMemory::FireWatches(uint32_t page_first, uint32_t page_last,
+                               bool invalidated_by_gpu) {
+  uint32_t address_first = page_first << page_size_log2_;
+  uint32_t address_last =
+      (page_last << page_size_log2_) + ((1 << page_size_log2_) - 1);
+  uint32_t bucket_first = address_first >> kWatchBucketSizeLog2;
+  uint32_t bucket_last = address_last >> kWatchBucketSizeLog2;
+
+  auto global_lock = global_critical_region_.Acquire();
+
+  // Fire global watches.
+  for (const auto global_watch : global_watches_) {
+    global_watch->callback(global_watch->callback_context, address_first,
+                           address_last, invalidated_by_gpu);
+  }
+
+  // Fire per-range watches.
+  for (uint32_t i = bucket_first; i <= bucket_last; ++i) {
+    WatchNode* node = watch_buckets_[i];
+    while (node != nullptr) {
+      WatchRange* range = node->range;
+      // Store the next node now since when the callback is triggered, the links
+      // will be broken.
+      node = node->bucket_node_next;
+      if (page_first <= range->page_last && page_last >= range->page_first) {
+        range->callback(range->callback_context, range->callback_data,
+                        range->callback_argument, invalidated_by_gpu);
+        UnlinkWatchRange(range);
+      }
+    }
+  }
+}
+
+void SharedMemory::RangeWrittenByGpu(uint32_t start, uint32_t length) {
+  if (length == 0 || start >= kBufferSize) {
+    return;
+  }
+  length = std::min(length, kBufferSize - start);
+  uint32_t end = start + length - 1;
+  uint32_t page_first = start >> page_size_log2_;
+  uint32_t page_last = end >> page_size_log2_;
+
+  // Trigger modification callbacks so, for instance, resolved data is loaded to
+  // the texture.
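As an aside on the watch machinery above: WatchMemoryRange and FireWatches split the address space into fixed-size buckets (kWatchBucketSizeLog2, declared elsewhere in the class), a range gets one WatchNode linked into every bucket it overlaps, so firing only walks the buckets the touched pages fall into, and a fired range is unlinked immediately - watches are one-shot, so a consumer such as a cached texture re-registers after it reuploads the invalidated data. A rough sketch of the bucket arithmetic; the bucket size below is an assumed value for illustration, not taken from the patch:

#include <cstdint>

// Assumed values for illustration; the real constants live in SharedMemory.
constexpr uint32_t kBufferSizeLog2 = 29;       // 512 MB shared memory space
constexpr uint32_t kWatchBucketSizeLog2 = 22;  // assumed 4 MB buckets
constexpr uint32_t kWatchBucketCount =
    1u << (kBufferSizeLog2 - kWatchBucketSizeLog2);  // 128 with these values

// Buckets overlapped by the byte range [start, start + length) - essentially
// how WatchMemoryRange picks bucket_first / bucket_last before linking nodes.
inline void WatchBucketsForRange(uint32_t start, uint32_t length,
                                 uint32_t& bucket_first,
                                 uint32_t& bucket_last) {
  bucket_first = start >> kWatchBucketSizeLog2;
  bucket_last = (start + length - 1) >> kWatchBucketSizeLog2;
}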
+ FireWatches(page_first, page_last, true); + + // Mark the range as valid (so pages are not reuploaded until modified by the + // CPU) and watch it so the CPU can reuse it and this will be caught. + MakeRangeValid(start, length, true); +} + +void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length, + bool written_by_gpu) { + if (length == 0 || start >= kBufferSize) { + return; + } + length = std::min(length, kBufferSize - start); + uint32_t last = start + length - 1; + uint32_t valid_page_first = start >> page_size_log2_; + uint32_t valid_page_last = last >> page_size_log2_; + uint32_t valid_block_first = valid_page_first >> 6; + uint32_t valid_block_last = valid_page_last >> 6; + + { + auto global_lock = global_critical_region_.Acquire(); + + for (uint32_t i = valid_block_first; i <= valid_block_last; ++i) { + uint64_t valid_bits = UINT64_MAX; + if (i == valid_block_first) { + valid_bits &= ~((uint64_t(1) << (valid_page_first & 63)) - 1); + } + if (i == valid_block_last && (valid_page_last & 63) != 63) { + valid_bits &= (uint64_t(1) << ((valid_page_last & 63) + 1)) - 1; + } + SystemPageFlagsBlock& block = system_page_flags_[i]; + block.valid |= valid_bits; + if (written_by_gpu) { + block.valid_and_gpu_written |= valid_bits; + } else { + block.valid_and_gpu_written &= ~valid_bits; + } + } + } + + if (memory_invalidation_callback_handle_) { + memory().EnablePhysicalMemoryAccessCallbacks( + valid_page_first << page_size_log2_, + (valid_page_last - valid_page_first + 1) << page_size_log2_, true, + false); + } +} + +void SharedMemory::UnlinkWatchRange(WatchRange* range) { + uint32_t bucket = + range->page_first << page_size_log2_ >> kWatchBucketSizeLog2; + WatchNode* node = range->node_first; + while (node != nullptr) { + WatchNode* node_next = node->range_node_next; + if (node->bucket_node_previous != nullptr) { + node->bucket_node_previous->bucket_node_next = node->bucket_node_next; + } else { + watch_buckets_[bucket] = node->bucket_node_next; + } + if (node->bucket_node_next != nullptr) { + node->bucket_node_next->bucket_node_previous = node->bucket_node_previous; + } + node->next_free = watch_node_first_free_; + watch_node_first_free_ = node; + node = node_next; + ++bucket; + } + range->next_free = watch_range_first_free_; + watch_range_first_free_ = range; +} + +bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { + if (!length) { + // Some texture or buffer is empty, for example - safe to draw in this case. + return true; + } + if (start > kBufferSize || (kBufferSize - start) < length) { + return false; + } + uint32_t last = start + length - 1; + + SCOPE_profile_cpu_f("gpu"); + + if (!EnsureHostGpuMemoryAllocated(start, length)) { + return false; + } + + uint32_t page_first = start >> page_size_log2_; + uint32_t page_last = (start + length - 1) >> page_size_log2_; + + upload_ranges_.clear(); + uint32_t block_first = page_first >> 6; + uint32_t block_last = page_last >> 6; + uint32_t range_start = UINT32_MAX; + { + auto global_lock = global_critical_region_.Acquire(); + for (uint32_t i = block_first; i <= block_last; ++i) { + uint64_t block_valid = system_page_flags_[i].valid; + // Consider pages in the block outside the requested range valid. + if (i == block_first) { + block_valid |= (uint64_t(1) << (page_first & 63)) - 1; + } + if (i == block_last && (page_last & 63) != 63) { + block_valid |= ~((uint64_t(1) << ((page_last & 63) + 1)) - 1); + } + + while (true) { + uint32_t block_page; + if (range_start == UINT32_MAX) { + // Check if need to open a new range. 
+        if (!xe::bit_scan_forward(~block_valid, &block_page)) {
+          break;
+        }
+        range_start = (i << 6) + block_page;
+      } else {
+        // Check if need to close the range.
+        // Ignore the valid pages before the beginning of the range.
+        uint64_t block_valid_from_start = block_valid;
+        if (i == (range_start >> 6)) {
+          block_valid_from_start &=
+              ~((uint64_t(1) << (range_start & 63)) - 1);
+        }
+        if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) {
+          break;
+        }
+        upload_ranges_.push_back(
+            std::make_pair(range_start, (i << 6) + block_page - range_start));
+        // In the next iteration within this block, consider this range valid
+        // since it has been queued for upload.
+        block_valid |= (uint64_t(1) << block_page) - 1;
+        range_start = UINT32_MAX;
+      }
+    }
+  }
+  }
+  if (range_start != UINT32_MAX) {
+    upload_ranges_.push_back(
+        std::make_pair(range_start, page_last + 1 - range_start));
+  }
+  if (upload_ranges_.empty()) {
+    return true;
+  }
+
+  return UploadRanges(upload_ranges_);
+}
+
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallbackThunk(
+    void* context_ptr, uint32_t physical_address_start, uint32_t length,
+    bool exact_range) {
+  return reinterpret_cast<SharedMemory*>(context_ptr)
+      ->MemoryInvalidationCallback(physical_address_start, length, exact_range);
+}
+
+std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallback(
+    uint32_t physical_address_start, uint32_t length, bool exact_range) {
+  if (length == 0 || physical_address_start >= kBufferSize) {
+    return std::make_pair(uint32_t(0), UINT32_MAX);
+  }
+  length = std::min(length, kBufferSize - physical_address_start);
+  uint32_t physical_address_last = physical_address_start + (length - 1);
+
+  uint32_t page_first = physical_address_start >> page_size_log2_;
+  uint32_t page_last = physical_address_last >> page_size_log2_;
+  uint32_t block_first = page_first >> 6;
+  uint32_t block_last = page_last >> 6;
+
+  auto global_lock = global_critical_region_.Acquire();
+
+  if (!exact_range) {
+    // Check if a somewhat wider range (up to 256 KB with 4 KB pages) can be
+    // invalidated - if no GPU-written data nearby that was not intended to be
+    // invalidated since it's not in sync with CPU memory and can't be
+    // reuploaded. It's a lot cheaper to upload some excess data than to catch
+    // access violations - with 4 KB callbacks, the original Doom runs at 4 FPS
+    // on Intel Core i7-3770, with 64 KB the CPU game code takes 3 ms to run per
+    // frame, but with 256 KB it's 0.7 ms.
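The comment above describes what the code right after it implements: an inexact invalidation is widened towards the boundaries of the surrounding 64-page block, but never across a page that still holds GPU-written (non-reuploadable) data, using leading/trailing zero counts on the valid_and_gpu_written bits. A standalone sketch of the two clamps, with portable stand-ins for xe::lzcnt and xe::tzcnt; the names here are illustrative, not from the patch:

#include <algorithm>
#include <cstdint>

// Portable stand-ins for xe::lzcnt / xe::tzcnt on 64-bit values (both return
// 64 when the value is zero).
inline uint32_t LeadingZeros64(uint64_t v) {
  uint32_t n = 64;
  while (v) {
    v >>= 1;
    --n;
  }
  return n;
}
inline uint32_t TrailingZeros64(uint64_t v) {
  if (!v) {
    return 64;
  }
  uint32_t n = 0;
  while (!(v & 1)) {
    v >>= 1;
    ++n;
  }
  return n;
}

// gpu_written_bits is the valid_and_gpu_written word of the 64-page block
// containing the page. Widen the start of the invalidated range down towards
// the block start, stopping just above the nearest GPU-written page below it.
inline uint32_t WidenFirstPage(uint32_t page_first, uint64_t gpu_written_bits) {
  if (!(page_first & 63)) {
    return page_first;  // already at a block boundary
  }
  uint64_t below = gpu_written_bits & ((uint64_t(1) << (page_first & 63)) - 1);
  return (page_first & ~uint32_t(63)) + (64 - LeadingZeros64(below));
}

// Widen the end of the range up towards the block end, stopping just below the
// nearest GPU-written page above it.
inline uint32_t WidenLastPage(uint32_t page_last, uint64_t gpu_written_bits) {
  if ((page_last & 63) == 63) {
    return page_last;  // already at a block boundary
  }
  uint64_t above =
      gpu_written_bits & ~((uint64_t(1) << ((page_last & 63) + 1)) - 1);
  return (page_last & ~uint32_t(63)) +
         (std::max(TrailingZeros64(above), uint32_t(1)) - 1);
}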
+ if (page_first & 63) { + uint64_t gpu_written_start = + system_page_flags_[block_first].valid_and_gpu_written; + gpu_written_start &= (uint64_t(1) << (page_first & 63)) - 1; + page_first = + (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); + } + if ((page_last & 63) != 63) { + uint64_t gpu_written_end = + system_page_flags_[block_last].valid_and_gpu_written; + gpu_written_end &= ~((uint64_t(1) << ((page_last & 63) + 1)) - 1); + page_last = (page_last & ~uint32_t(63)) + + (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); + } + } + + for (uint32_t i = block_first; i <= block_last; ++i) { + uint64_t invalidate_bits = UINT64_MAX; + if (i == block_first) { + invalidate_bits &= ~((uint64_t(1) << (page_first & 63)) - 1); + } + if (i == block_last && (page_last & 63) != 63) { + invalidate_bits &= (uint64_t(1) << ((page_last & 63) + 1)) - 1; + } + SystemPageFlagsBlock& block = system_page_flags_[i]; + block.valid &= ~invalidate_bits; + block.valid_and_gpu_written &= ~invalidate_bits; + } + + FireWatches(page_first, page_last, false); + + return std::make_pair(page_first << page_size_log2_, + (page_last - page_first + 1) << page_size_log2_); +} + +void SharedMemory::PrepareForTraceDownload() { + ReleaseTraceDownloadRanges(); + assert_true(trace_download_ranges_.empty()); + assert_zero(trace_download_page_count_); + + // Invalidate the entire memory CPU->GPU memory copy so all the history + // doesn't have to be written into every frame trace, and collect the list of + // ranges with data modified on the GPU. + + uint32_t fire_watches_range_start = UINT32_MAX; + uint32_t gpu_written_range_start = UINT32_MAX; + auto global_lock = global_critical_region_.Acquire(); + for (uint32_t i = 0; i < system_page_flags_.size(); ++i) { + SystemPageFlagsBlock& page_flags_block = system_page_flags_[i]; + uint64_t previously_valid_block = page_flags_block.valid; + uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written; + page_flags_block.valid = gpu_written_block; + + // Fire watches on the invalidated pages. + uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block; + uint64_t fire_watches_break_block = ~fire_watches_block; + while (true) { + uint32_t fire_watches_block_page; + if (!xe::bit_scan_forward(fire_watches_range_start == UINT32_MAX + ? fire_watches_block + : fire_watches_break_block, + &fire_watches_block_page)) { + break; + } + uint32_t fire_watches_page = (i << 6) + fire_watches_block_page; + if (fire_watches_range_start == UINT32_MAX) { + fire_watches_range_start = fire_watches_page; + } else { + FireWatches(fire_watches_range_start, fire_watches_page - 1, false); + fire_watches_range_start = UINT32_MAX; + } + uint64_t fire_watches_block_mask = + ~((uint64_t(1) << fire_watches_block_page) - 1); + fire_watches_block &= fire_watches_block_mask; + fire_watches_break_block &= fire_watches_block_mask; + } + + // Add to the GPU-written ranges. + uint64_t gpu_written_break_block = ~gpu_written_block; + while (true) { + uint32_t gpu_written_block_page; + if (!xe::bit_scan_forward(gpu_written_range_start == UINT32_MAX + ? 
gpu_written_block + : gpu_written_break_block, + &gpu_written_block_page)) { + break; + } + uint32_t gpu_written_page = (i << 6) + gpu_written_block_page; + if (gpu_written_range_start == UINT32_MAX) { + gpu_written_range_start = gpu_written_page; + } else { + uint32_t gpu_written_range_length = + gpu_written_page - gpu_written_range_start; + trace_download_ranges_.push_back( + std::make_pair(gpu_written_range_start << page_size_log2_, + gpu_written_range_length << page_size_log2_)); + trace_download_page_count_ += gpu_written_range_length; + gpu_written_range_start = UINT32_MAX; + } + uint64_t gpu_written_block_mask = + ~((uint64_t(1) << gpu_written_block_page) - 1); + gpu_written_block &= gpu_written_block_mask; + gpu_written_break_block &= gpu_written_block_mask; + } + } + uint32_t page_count = kBufferSize >> page_size_log2_; + if (fire_watches_range_start != UINT32_MAX) { + FireWatches(fire_watches_range_start, page_count - 1, false); + } + if (gpu_written_range_start != UINT32_MAX) { + uint32_t gpu_written_range_length = page_count - gpu_written_range_start; + trace_download_ranges_.push_back( + std::make_pair(gpu_written_range_start << page_size_log2_, + gpu_written_range_length << page_size_log2_)); + trace_download_page_count_ += gpu_written_range_length; + } +} + +void SharedMemory::ReleaseTraceDownloadRanges() { + trace_download_ranges_.clear(); + trace_download_ranges_.shrink_to_fit(); + trace_download_page_count_ = 0; +} + +} // namespace gpu +} // namespace xe diff --git a/src/xenia/gpu/d3d12/shared_memory.h b/src/xenia/gpu/shared_memory.h similarity index 61% rename from src/xenia/gpu/d3d12/shared_memory.h rename to src/xenia/gpu/shared_memory.h index 86a55b2b7..6dae85909 100644 --- a/src/xenia/gpu/d3d12/shared_memory.h +++ b/src/xenia/gpu/shared_memory.h @@ -2,49 +2,32 @@ ****************************************************************************** * Xenia : Xbox 360 Emulator Research Project * ****************************************************************************** - * Copyright 2018 Ben Vanik. All rights reserved. * + * Copyright 2020 Ben Vanik. All rights reserved. * * Released under the BSD license - see LICENSE in the root for more details. * ****************************************************************************** */ -#ifndef XENIA_GPU_D3D12_SHARED_MEMORY_H_ -#define XENIA_GPU_D3D12_SHARED_MEMORY_H_ +#ifndef XENIA_GPU_SHARED_MEMORY_H_ +#define XENIA_GPU_SHARED_MEMORY_H_ -#include +#include #include #include #include "xenia/base/mutex.h" -#include "xenia/gpu/trace_writer.h" #include "xenia/memory.h" -#include "xenia/ui/d3d12/d3d12_api.h" -#include "xenia/ui/d3d12/d3d12_upload_buffer_pool.h" namespace xe { namespace gpu { -namespace d3d12 { - -class D3D12CommandProcessor; // Manages memory for unconverted textures, resolve targets, vertex and index // buffers that can be accessed from shaders with Xenon physical addresses, with // system page size granularity. class SharedMemory { public: - SharedMemory(D3D12CommandProcessor& command_processor, Memory& memory, - TraceWriter& trace_writer); - ~SharedMemory(); - - bool Initialize(); - void Shutdown(); - void ClearCache(); - - ID3D12Resource* GetBuffer() const { return buffer_; } - D3D12_GPU_VIRTUAL_ADDRESS GetGPUAddress() const { - return buffer_gpu_address_; - } - - void CompletedSubmissionUpdated(); + virtual ~SharedMemory(); + // Call in the implementation-specific ClearCache. 
+ virtual void ClearCache(); typedef void (*GlobalWatchCallback)(void* context, uint32_t address_first, uint32_t address_last, @@ -86,10 +69,8 @@ class SharedMemory { void UnwatchMemoryRange(WatchHandle handle); // Checks if the range has been updated, uploads new data if needed and - // ensures the buffer tiles backing the range are resident. May transition the - // tiled buffer to copy destination - call this before UseForReading or - // UseForWriting. Returns true if the range has been fully updated and is - // usable. + // ensures the host GPU memory backing the range are resident. Returns true if + // the range has been fully updated and is usable. bool RequestRange(uint32_t start, uint32_t length); // Marks the range and, if not exact_range, potentially its surroundings @@ -106,124 +87,83 @@ class SharedMemory { // be called, to make sure, if the GPU writes don't overwrite *everything* in // the pages they touch, the CPU data is properly loaded to the unmodified // regions in those pages. - void RangeWrittenByGPU(uint32_t start, uint32_t length); + void RangeWrittenByGpu(uint32_t start, uint32_t length); - // Makes the buffer usable for vertices, indices and texture untiling. - inline void UseForReading() { - // Vertex fetch is also allowed in pixel shaders. - CommitUAVWritesAndTransitionBuffer( - D3D12_RESOURCE_STATE_INDEX_BUFFER | - D3D12_RESOURCE_STATE_NON_PIXEL_SHADER_RESOURCE | - D3D12_RESOURCE_STATE_PIXEL_SHADER_RESOURCE); - } - // Makes the buffer usable for texture tiling after a resolve. - inline void UseForWriting() { - CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_UNORDERED_ACCESS); - } - // Makes the buffer usable as a source for copy commands. - inline void UseAsCopySource() { - CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATE_COPY_SOURCE); - } - // Must be called when doing draws/dispatches modifying data within the shared - // memory buffer as a UAV, to make sure that when UseForWriting is called the - // next time, a UAV barrier will be done, and subsequent overlapping UAV - // writes and reads are ordered. - inline void MarkUAVWritesCommitNeeded() { - if (buffer_state_ == D3D12_RESOURCE_STATE_UNORDERED_ACCESS) { - buffer_uav_writes_commit_needed_ = true; - } - } + protected: + SharedMemory(Memory& memory); + // Call in implementation-specific initialization. + void InitializeCommon(); + // Call last in implementation-specific shutdown, also callable from the + // destructor. + void ShutdownCommon(); - void WriteRawSRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); - void WriteRawUAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle); - // Due to the Nvidia 128 megatexel limitation, the smallest supported formats - // are 32-bit. - void WriteUintPow2SRVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, - uint32_t element_size_bytes_pow2); - void WriteUintPow2UAVDescriptor(D3D12_CPU_DESCRIPTOR_HANDLE handle, - uint32_t element_size_bytes_pow2); + static constexpr uint32_t kBufferSizeLog2 = 29; + static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2; - // Returns true if any downloads were submitted to the command processor. - bool InitializeTraceSubmitDownloads(); - void InitializeTraceCompleteDownloads(); + // Sparse allocations are 4 MB, so not too many of them are allocated, but + // also not to waste too much memory for padding (with 16 MB there's too + // much). 
+  static constexpr uint32_t kOptimalAllocationLog2 = 22;
+  static_assert(kOptimalAllocationLog2 <= kBufferSizeLog2);
- private:
-  bool AreTiledResourcesUsed() const;
+  Memory& memory() const { return memory_; }
+
+  uint32_t page_size_log2() const { return page_size_log2_; }
   // Mark the memory range as updated and protect it.
   void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu);
-  D3D12CommandProcessor& command_processor_;
+  // Ensures the host GPU memory backing the range is accessible by host GPU
+  // drawing / computations / copying, but doesn't upload anything.
+  virtual bool EnsureHostGpuMemoryAllocated(uint32_t start,
+                                            uint32_t length) = 0;
+
+  // Uploads a range of host pages - only called if EnsureHostGpuMemoryAllocated
+  // succeeded. While uploading, MakeRangeValid must be called for each
+  // successfully uploaded range as early as possible, before the memcpy, to
+  // make sure invalidation that happened during the CPU -> GPU memcpy isn't
+  // missed (upload_page_ranges is in pages because of this - MakeRangeValid has
+  // page granularity).
+  virtual bool UploadRanges(
+      const std::vector<std::pair<uint32_t, uint32_t>>& upload_page_ranges) = 0;
+
+  // Mutable so the implementation can skip ranges by setting their "second"
+  // value to 0 if needed.
+  std::vector<std::pair<uint32_t, uint32_t>>& trace_download_ranges() {
+    return trace_download_ranges_;
+  }
+  uint32_t trace_download_page_count() const {
+    return trace_download_page_count_;
+  }
+  // Fills trace_download_ranges() and trace_download_page_count() with
+  // GPU-written ranges that need to be downloaded, and also invalidates
+  // non-GPU-written ranges so only the needed data - not all the collected
+  // data - will be written in the trace. trace_download_page_count() will be 0
+  // if nothing to download.
+  void PrepareForTraceDownload();
+  // Release memory used for trace download ranges, to be called after
+  // downloading or in cases when download is dropped.
+  void ReleaseTraceDownloadRanges();
+
+ private:
   Memory& memory_;
-  TraceWriter& trace_writer_;
-
-  // The 512 MB tiled buffer.
-  static constexpr uint32_t kBufferSizeLog2 = 29;
-  static constexpr uint32_t kBufferSize = 1 << kBufferSizeLog2;
-  ID3D12Resource* buffer_ = nullptr;
-  D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0;
-  D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST;
-  bool buffer_uav_writes_commit_needed_ = false;
-  void CommitUAVWritesAndTransitionBuffer(D3D12_RESOURCE_STATES new_state);
-
-  // Heaps are 4 MB, so not too many of them are allocated, but also not to
-  // waste too much memory for padding (with 16 MB there's too much).
-  static constexpr uint32_t kHeapSizeLog2 = 22;
-  static constexpr uint32_t kHeapSize = 1 << kHeapSizeLog2;
-  static_assert((kHeapSize % D3D12_TILED_RESOURCE_TILE_SIZE_IN_BYTES) == 0,
-                "Heap size must be a multiple of Direct3D tile size");
-  // Resident portions of the tiled buffer.
-  ID3D12Heap* heaps_[kBufferSize >> kHeapSizeLog2] = {};
-  // Number of the heaps currently resident, for profiling.
-  uint32_t heap_count_ = 0;
   // Log2 of invalidation granularity (the system page size, but the dependency
   // on it is not hard - the access callback takes a range as an argument, and
   // touched pages of the buffer of this size will be invalidated).
   uint32_t page_size_log2_;
-  // Total buffer page count.
-  uint32_t page_count_;
-
-  // Ensures the buffer tiles backing the range are resident, but doesn't upload
-  // anything.
-  bool EnsureTilesResident(uint32_t start, uint32_t length);
-
-  // Non-shader-visible buffer descriptor heap for faster binding (via copying
-  // rather than creation).
-  enum class BufferDescriptorIndex : uint32_t {
-    kRawSRV,
-    kR32UintSRV,
-    kR32G32UintSRV,
-    kR32G32B32A32UintSRV,
-    kRawUAV,
-    kR32UintUAV,
-    kR32G32UintUAV,
-    kR32G32B32A32UintUAV,
-
-    kCount,
-  };
-  ID3D12DescriptorHeap* buffer_descriptor_heap_ = nullptr;
-  D3D12_CPU_DESCRIPTOR_HANDLE buffer_descriptor_heap_start_;
-
-  // First page and length in pages.
-  typedef std::pair<uint32_t, uint32_t> UploadRange;
-  // Ranges that need to be uploaded, generated by GetRangesToUpload (a
-  // persistently allocated vector).
-  std::vector<UploadRange> upload_ranges_;
-  void GetRangesToUpload(uint32_t request_page_first,
-                         uint32_t request_page_last);
-  std::unique_ptr upload_buffer_pool_;
-
-  // GPU-written memory downloading for traces.
-  // Start page, length in pages.
-  std::vector<std::pair<uint32_t, uint32_t>> trace_gpu_written_ranges_;
-  // Created temporarily, only for downloading.
-  ID3D12Resource* trace_gpu_written_buffer_ = nullptr;
-  void ResetTraceGPUWrittenBuffer();
   void* memory_invalidation_callback_handle_ = nullptr;
   void* memory_data_provider_handle_ = nullptr;
+  // Ranges that need to be uploaded, generated by GetRangesToUpload (a
+  // persistently allocated vector).
+  std::vector<std::pair<uint32_t, uint32_t>> upload_ranges_;
+
+  // GPU-written memory downloading for traces.
+  std::vector<std::pair<uint32_t, uint32_t>> trace_download_ranges_;
+  uint32_t trace_download_page_count_ = 0;
+
   // Mutex between the guest memory subsystem and the command processor, to be
   // locked when checking or updating validity of pages/ranges and when firing
   // watches.
@@ -309,8 +249,7 @@ class SharedMemory {
   void UnlinkWatchRange(WatchRange* range);
 };

-}  // namespace d3d12
 }  // namespace gpu
 }  // namespace xe

-#endif  // XENIA_GPU_D3D12_SHARED_MEMORY_H_
+#endif  // XENIA_GPU_SHARED_MEMORY_H_
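With this split, SharedMemory is API-agnostic: a backend such as the D3D12SharedMemory added earlier in this patch supplies residency and the actual CPU->GPU copies by overriding the two pure virtuals, honoring the contract described above that MakeRangeValid runs before each copy so a racing CPU write re-invalidates the pages instead of being lost. A minimal hypothetical backend, purely to illustrate that contract - the class, its behavior, and the comments about where the copies would go are invented for this sketch, not part of the patch:

#include <cstdint>
#include <utility>
#include <vector>

#include "xenia/gpu/shared_memory.h"

namespace xe {
namespace gpu {

// Hypothetical backend, only to show the shape of the new interface; the real
// implementation in this patch is D3D12SharedMemory. A real backend would also
// call InitializeCommon() / ShutdownCommon() from its Initialize / Shutdown.
class ExampleSharedMemory : public SharedMemory {
 public:
  explicit ExampleSharedMemory(Memory& memory) : SharedMemory(memory) {}

 protected:
  bool EnsureHostGpuMemoryAllocated(uint32_t start, uint32_t length) override {
    // A tiled / sparse backend would map allocations covering
    // [start, start + length) here; a committed-buffer backend can just
    // return true.
    return true;
  }

  bool UploadRanges(const std::vector<std::pair<uint32_t, uint32_t>>&
                        upload_page_ranges) override {
    for (const auto& range : upload_page_ranges) {
      uint32_t start = range.first << page_size_log2();
      uint32_t length = range.second << page_size_log2();
      // Mark the range valid before copying so an invalidation that happens
      // during the copy is not missed.
      MakeRangeValid(start, length, false);
      // The backend-specific copy of guest memory into the host GPU buffer
      // would go here.
    }
    return true;
  }
};

}  // namespace gpu
}  // namespace xe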