diff --git a/src/xenia/cpu/mmio_handler.cc b/src/xenia/cpu/mmio_handler.cc index 9eba0cac8..4891ab1bc 100644 --- a/src/xenia/cpu/mmio_handler.cc +++ b/src/xenia/cpu/mmio_handler.cc @@ -11,6 +11,7 @@ #include #include +#include #include "xenia/base/assert.h" #include "xenia/base/byte_order.h" @@ -281,6 +282,14 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { if (ex->code() != Exception::Code::kAccessViolation) { return false; } + Exception::AccessViolationOperation operation = + ex->access_violation_operation(); + if (operation != Exception::AccessViolationOperation::kRead && + operation != Exception::AccessViolationOperation::kWrite) { + // Data Execution Prevention or something else uninteresting. + return false; + } + bool is_write = operation == Exception::AccessViolationOperation::kWrite; if (ex->fault_address() < uint64_t(virtual_membase_) || ex->fault_address() > uint64_t(memory_end_)) { // Quick kill anything outside our mapping. @@ -304,32 +313,23 @@ bool MMIOHandler::ExceptionCallback(Exception* ex) { } if (!range) { // Recheck if the pages are still protected (race condition - another thread - // clears the writewatch we just hit). + // clears the watch we just hit). // Do this under the lock so we don't introduce another race condition. auto lock = global_critical_region_.Acquire(); memory::PageAccess cur_access; size_t page_length = memory::page_size(); memory::QueryProtect(fault_host_address, page_length, cur_access); - if (cur_access != memory::PageAccess::kReadOnly && - cur_access != memory::PageAccess::kNoAccess) { - // Another thread has cleared this write watch. Abort. + if (cur_access != memory::PageAccess::kNoAccess && + (!is_write || cur_access != memory::PageAccess::kReadOnly)) { + // Another thread has cleared this watch. Abort. return true; } - // The address is not found within any range, so either a write watch or an // actual access violation. if (access_violation_callback_) { - switch (ex->access_violation_operation()) { - case Exception::AccessViolationOperation::kRead: - return access_violation_callback_(access_violation_callback_context_, - fault_host_address, false); - case Exception::AccessViolationOperation::kWrite: - return access_violation_callback_(access_violation_callback_context_, - fault_host_address, true); - default: - // Data Execution Prevention or something else uninteresting. - break; - } + return access_violation_callback_(std::move(lock), + access_violation_callback_context_, + fault_host_address, is_write); } return false; } diff --git a/src/xenia/cpu/mmio_handler.h b/src/xenia/cpu/mmio_handler.h index 1bff31216..fdf202e1c 100644 --- a/src/xenia/cpu/mmio_handler.h +++ b/src/xenia/cpu/mmio_handler.h @@ -11,6 +11,7 @@ #define XENIA_CPU_MMIO_HANDLER_H_ #include +#include #include #include "xenia/base/mutex.h" @@ -44,12 +45,13 @@ class MMIOHandler { typedef uint32_t (*HostToGuestVirtual)(const void* context, const void* host_address); - typedef bool (*AccessViolationCallback)(void* context, void* host_address, - bool is_write); + typedef bool (*AccessViolationCallback)( + std::unique_lock global_lock_locked_once, + void* context, void* host_address, bool is_write); - // access_violation_callback is called in global_critical_region, so if - // multiple threads trigger an access violation in the same page, the callback - // will be called only once. 
+ // access_violation_callback is called with global_critical_region locked once + // on the thread, so if multiple threads trigger an access violation in the + // same page, the callback will be called only once. static std::unique_ptr Install( uint8_t* virtual_membase, uint8_t* physical_membase, uint8_t* membase_end, HostToGuestVirtual host_to_guest_virtual, diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc index 0be3e5b5d..432a6ea11 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc @@ -87,8 +87,8 @@ void D3D12CommandProcessor::RequestFrameTrace(const std::wstring& root_path) { void D3D12CommandProcessor::TracePlaybackWroteMemory(uint32_t base_ptr, uint32_t length) { - shared_memory_->MemoryWriteCallback(base_ptr, length, true); - primitive_converter_->MemoryWriteCallback(base_ptr, length, true); + shared_memory_->MemoryInvalidationCallback(base_ptr, length, true); + primitive_converter_->MemoryInvalidationCallback(base_ptr, length, true); } void D3D12CommandProcessor::RestoreEDRAMSnapshot(const void* snapshot) { @@ -866,6 +866,7 @@ bool D3D12CommandProcessor::SetupContext() { if (FAILED(gamma_ramp_upload_->Map( 0, nullptr, reinterpret_cast(&gamma_ramp_upload_mapping_)))) { XELOGE("Failed to map the gamma ramp upload buffer"); + gamma_ramp_upload_mapping_ = nullptr; return false; } @@ -1827,42 +1828,24 @@ bool D3D12CommandProcessor::IssueCopy() { return true; } -void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { -#if FINE_GRAINED_DRAW_SCOPES - SCOPE_profile_cpu_f("gpu"); -#endif // FINE_GRAINED_DRAW_SCOPES - - bool is_opening_frame = is_guest_command && !frame_open_; - if (submission_open_ && !is_opening_frame) { - return; +void D3D12CommandProcessor::CheckSubmissionFence(uint64_t await_submission) { + assert_true(await_submission <= submission_current_); + if (await_submission == submission_current_) { + assert_true(submission_open_); + EndSubmission(false); } - // Check the fence - needed for all kinds of submissions (to reclaim transient - // resources early) and specifically for frames (not to queue too many). + uint64_t submission_completed_before = submission_completed_; submission_completed_ = submission_fence_->GetCompletedValue(); - if (is_opening_frame) { - // Await the availability of the current frame. - uint64_t frame_current_last_submission = - closed_frame_submissions_[frame_current_ % kQueueFrames]; - if (frame_current_last_submission > submission_completed_) { - submission_fence_->SetEventOnCompletion( - frame_current_last_submission, submission_fence_completion_event_); - WaitForSingleObject(submission_fence_completion_event_, INFINITE); - submission_completed_ = submission_fence_->GetCompletedValue(); - } - // Update the completed frame index, also obtaining the actual completed - // frame number (since the CPU may be actually less than 3 frames behind) - // before reclaiming resources tracked with the frame number. 
- frame_completed_ = - std::max(frame_current_, uint64_t(kQueueFrames)) - kQueueFrames; - for (uint64_t frame = frame_completed_ + 1; frame < frame_current_; - ++frame) { - if (closed_frame_submissions_[frame % kQueueFrames] > - submission_completed_) { - break; - } - frame_completed_ = frame; - } + if (submission_completed_ < await_submission) { + submission_fence_->SetEventOnCompletion(await_submission, + submission_fence_completion_event_); + WaitForSingleObject(submission_fence_completion_event_, INFINITE); + submission_completed_ = submission_fence_->GetCompletedValue(); + } + if (submission_completed_ <= submission_completed_before) { + // Not updated - no need to reclaim or download things. + return; } // Reclaim command allocators. @@ -1898,6 +1881,46 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { } buffers_for_deletion_.erase(buffers_for_deletion_.begin(), erase_buffers_end); + shared_memory_->CompletedSubmissionUpdated(); + + render_target_cache_->CompletedSubmissionUpdated(); + + primitive_converter_->CompletedSubmissionUpdated(); +} + +void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { +#if FINE_GRAINED_DRAW_SCOPES + SCOPE_profile_cpu_f("gpu"); +#endif // FINE_GRAINED_DRAW_SCOPES + + bool is_opening_frame = is_guest_command && !frame_open_; + if (submission_open_ && !is_opening_frame) { + return; + } + + // Check the fence - needed for all kinds of submissions (to reclaim transient + // resources early) and specifically for frames (not to queue too many), and + // await the availability of the current frame. + CheckSubmissionFence( + is_opening_frame + ? closed_frame_submissions_[frame_current_ % kQueueFrames] + : 0); + if (is_opening_frame) { + // Update the completed frame index, also obtaining the actual completed + // frame number (since the CPU may be actually less than 3 frames behind) + // before reclaiming resources tracked with the frame number. + frame_completed_ = + std::max(frame_current_, uint64_t(kQueueFrames)) - kQueueFrames; + for (uint64_t frame = frame_completed_ + 1; frame < frame_current_; + ++frame) { + if (closed_frame_submissions_[frame % kQueueFrames] > + submission_completed_) { + break; + } + frame_completed_ = frame; + } + } + if (!submission_open_) { submission_open_ = true; @@ -1920,8 +1943,6 @@ void D3D12CommandProcessor::BeginSubmission(bool is_guest_command) { current_sampler_heap_ = nullptr; primitive_topology_ = D3D_PRIMITIVE_TOPOLOGY_UNDEFINED; - shared_memory_->BeginSubmission(); - render_target_cache_->BeginSubmission(); primitive_converter_->BeginSubmission(); diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.h b/src/xenia/gpu/d3d12/d3d12_command_processor.h index 4a1050d61..3c42e67bb 100644 --- a/src/xenia/gpu/d3d12/d3d12_command_processor.h +++ b/src/xenia/gpu/d3d12/d3d12_command_processor.h @@ -229,6 +229,9 @@ class D3D12CommandProcessor : public CommandProcessor { // frame. EndSubmission(true) will close the frame no matter whether the // submission has already been closed. + // Rechecks submission number and reclaims per-submission resources. Pass 0 as + // the submission to await to simply check status. + void CheckSubmissionFence(uint64_t await_submission); // If is_guest_command is true, a new full frame - with full cleanup of // resources and, if needed, starting capturing - is opened if pending (as // opposed to simply resuming after mid-frame synchronization). 
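For context on the submission-fence split above: CheckSubmissionFence() now owns the await-and-reclaim logic that used to live in BeginSubmission(), so it can also be called just to refresh the completed value (await_submission == 0). The following minimal sketch illustrates that pattern in isolation; SubmissionTracker and its members are hypothetical illustration names, not part of this change, and only the standard D3D12 fence API is assumed.

// Sketch of the await-and-reclaim pattern behind CheckSubmissionFence().
#include <windows.h>
#include <d3d12.h>
#include <cstdint>

class SubmissionTracker {
 public:
  SubmissionTracker(ID3D12CommandQueue* queue, ID3D12Fence* fence)
      : queue_(queue), fence_(fence) {
    completion_event_ = CreateEventW(nullptr, FALSE, FALSE, nullptr);
  }
  ~SubmissionTracker() { CloseHandle(completion_event_); }

  // Signals the fence for the submission that has just been recorded and
  // returns its number; the number can later be passed to Await().
  uint64_t SignalSubmission() {
    queue_->Signal(fence_, submission_current_);
    return submission_current_++;
  }

  // Pass 0 to only refresh the completed value without blocking - this mirrors
  // CheckSubmissionFence(0) in the change above.
  void Await(uint64_t submission) {
    submission_completed_ = fence_->GetCompletedValue();
    if (submission_completed_ < submission) {
      fence_->SetEventOnCompletion(submission, completion_event_);
      WaitForSingleObject(completion_event_, INFINITE);
      submission_completed_ = fence_->GetCompletedValue();
    }
    // Per-submission resources (command allocators, upload pool pages) whose
    // last use is <= submission_completed_ can be reclaimed at this point.
  }

  uint64_t submission_completed() const { return submission_completed_; }

 private:
  ID3D12CommandQueue* queue_;
  ID3D12Fence* fence_;
  HANDLE completion_event_;
  uint64_t submission_current_ = 1;  // 1-based so 0 can mean "don't await".
  uint64_t submission_completed_ = 0;
};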
diff --git a/src/xenia/gpu/d3d12/primitive_converter.cc b/src/xenia/gpu/d3d12/primitive_converter.cc index a3dfec579..72bcfa545 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.cc +++ b/src/xenia/gpu/d3d12/primitive_converter.cc @@ -124,16 +124,18 @@ bool PrimitiveConverter::Initialize() { static_ib_gpu_address_ = static_ib_->GetGPUVirtualAddress(); memory_regions_invalidated_.store(0ull, std::memory_order_relaxed); - physical_write_watch_handle_ = - memory_->RegisterPhysicalWriteWatch(MemoryWriteCallbackThunk, this); + memory_invalidation_callback_handle_ = + memory_->RegisterPhysicalMemoryInvalidationCallback( + MemoryInvalidationCallbackThunk, this); return true; } void PrimitiveConverter::Shutdown() { - if (physical_write_watch_handle_ != nullptr) { - memory_->UnregisterPhysicalWriteWatch(physical_write_watch_handle_); - physical_write_watch_handle_ = nullptr; + if (memory_invalidation_callback_handle_ != nullptr) { + memory_->UnregisterPhysicalMemoryInvalidationCallback( + memory_invalidation_callback_handle_); + memory_invalidation_callback_handle_ = nullptr; } ui::d3d12::util::ReleaseAndNull(static_ib_); ui::d3d12::util::ReleaseAndNull(static_ib_upload_); @@ -142,24 +144,25 @@ void PrimitiveConverter::Shutdown() { void PrimitiveConverter::ClearCache() { buffer_pool_->ClearCache(); } +void PrimitiveConverter::CompletedSubmissionUpdated() { + if (static_ib_upload_ && command_processor_->GetCompletedSubmission() >= + static_ib_upload_submission_) { + // Completely uploaded - release the upload buffer. + static_ib_upload_->Release(); + static_ib_upload_ = nullptr; + } +} + void PrimitiveConverter::BeginSubmission() { // Got a command list now - upload and transition the static index buffer if // needed. - if (static_ib_upload_) { - if (static_ib_upload_submission_ == UINT64_MAX) { - // Not uploaded yet - upload. - command_processor_->GetDeferredCommandList()->D3DCopyResource( - static_ib_, static_ib_upload_); - command_processor_->PushTransitionBarrier( - static_ib_, D3D12_RESOURCE_STATE_COPY_DEST, - D3D12_RESOURCE_STATE_INDEX_BUFFER); - static_ib_upload_submission_ = command_processor_->GetCurrentSubmission(); - } else if (command_processor_->GetCompletedSubmission() >= - static_ib_upload_submission_) { - // Completely uploaded - release the upload buffer. - static_ib_upload_->Release(); - static_ib_upload_ = nullptr; - } + if (static_ib_upload_ && static_ib_upload_submission_ == UINT64_MAX) { + command_processor_->GetDeferredCommandList()->D3DCopyResource( + static_ib_, static_ib_upload_); + command_processor_->PushTransitionBarrier( + static_ib_, D3D12_RESOURCE_STATE_COPY_DEST, + D3D12_RESOURCE_STATE_INDEX_BUFFER); + static_ib_upload_submission_ = command_processor_->GetCurrentSubmission(); } } @@ -706,7 +709,7 @@ void* PrimitiveConverter::AllocateIndices( return mapping + simd_offset; } -std::pair PrimitiveConverter::MemoryWriteCallback( +std::pair PrimitiveConverter::MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range) { // 1 bit = (512 / 64) MB = 8 MB. Invalidate a region of this size. 
uint32_t bit_index_first = physical_address_start >> 23; @@ -719,11 +722,12 @@ std::pair PrimitiveConverter::MemoryWriteCallback( return std::make_pair(0, UINT32_MAX); } -std::pair PrimitiveConverter::MemoryWriteCallbackThunk( +std::pair +PrimitiveConverter::MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range) { return reinterpret_cast(context_ptr) - ->MemoryWriteCallback(physical_address_start, length, exact_range); + ->MemoryInvalidationCallback(physical_address_start, length, exact_range); } D3D12_GPU_VIRTUAL_ADDRESS PrimitiveConverter::GetStaticIndexBuffer( diff --git a/src/xenia/gpu/d3d12/primitive_converter.h b/src/xenia/gpu/d3d12/primitive_converter.h index f45c36146..57a3067b2 100644 --- a/src/xenia/gpu/d3d12/primitive_converter.h +++ b/src/xenia/gpu/d3d12/primitive_converter.h @@ -46,6 +46,7 @@ class PrimitiveConverter { void Shutdown(); void ClearCache(); + void CompletedSubmissionUpdated(); void BeginSubmission(); void BeginFrame(); @@ -83,7 +84,7 @@ class PrimitiveConverter { uint32_t& index_count_out) const; // Callback for invalidating buffers mid-frame. - std::pair MemoryWriteCallback( + std::pair MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range); void InitializeTrace(); @@ -96,7 +97,7 @@ class PrimitiveConverter { uint32_t simd_offset, D3D12_GPU_VIRTUAL_ADDRESS& gpu_address_out); - static std::pair MemoryWriteCallbackThunk( + static std::pair MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); @@ -176,7 +177,7 @@ class PrimitiveConverter { // the cache. uint64_t memory_regions_used_; std::atomic memory_regions_invalidated_ = 0; - void* physical_write_watch_handle_ = nullptr; + void* memory_invalidation_callback_handle_ = nullptr; uint32_t system_page_size_; }; diff --git a/src/xenia/gpu/d3d12/render_target_cache.cc b/src/xenia/gpu/d3d12/render_target_cache.cc index df83d5df9..22c5cde59 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.cc +++ b/src/xenia/gpu/d3d12/render_target_cache.cc @@ -455,12 +455,14 @@ void RenderTargetCache::ClearCache() { edram_snapshot_restore_pool_.reset(); } -void RenderTargetCache::BeginSubmission() { +void RenderTargetCache::CompletedSubmissionUpdated() { if (edram_snapshot_restore_pool_) { edram_snapshot_restore_pool_->Reclaim( command_processor_->GetCompletedSubmission()); } +} +void RenderTargetCache::BeginSubmission() { // With the ROV, a submission does not always end in a resolve (for example, // when memexport readback happens) or something else that would surely submit // the UAV barrier, so we need to preserve the `current_` variables. @@ -1417,8 +1419,8 @@ bool RenderTargetCache::ResolveCopy(SharedMemory* shared_memory, return false; } } else { - if (!shared_memory->MakeTilesResident(dest_modified_start, - dest_modified_length)) { + if (!shared_memory->EnsureTilesResident(dest_modified_start, + dest_modified_length)) { return false; } } diff --git a/src/xenia/gpu/d3d12/render_target_cache.h b/src/xenia/gpu/d3d12/render_target_cache.h index e227a7b60..db1826ac5 100644 --- a/src/xenia/gpu/d3d12/render_target_cache.h +++ b/src/xenia/gpu/d3d12/render_target_cache.h @@ -257,6 +257,7 @@ class RenderTargetCache { void Shutdown(); void ClearCache(); + void CompletedSubmissionUpdated(); void BeginSubmission(); void EndFrame(); // Called in the beginning of a draw call - may bind pipelines. 
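The MemoryInvalidationCallback contract used by PrimitiveConverter above (and by SharedMemory below) is: given a physical range being invalidated, mark any dependent cached data dirty, and return the range that may safely stop being watched - (0, UINT32_MAX) when the handler does not care which pages stay protected. Below is a rough self-contained sketch of such a handler, tracking validity at a coarse 8 MB granularity like PrimitiveConverter does; CoarseCache is a hypothetical name, and only the callback signature matches the new Memory::PhysicalMemoryInvalidationCallback type.

#include <atomic>
#include <cstdint>
#include <utility>

class CoarseCache {
 public:
  // Returns the physical range that may safely be unwatched; (0, UINT32_MAX)
  // means any page may be unwatched as far as this cache is concerned, since
  // invalidation is tracked far more coarsely than per page. Assumes addresses
  // within the 512 MB physical space.
  std::pair<uint32_t, uint32_t> MemoryInvalidationCallback(
      uint32_t physical_address_start, uint32_t length, bool exact_range) {
    if (length != 0) {
      // 1 bit = 8 MB of the 512 MB physical space, as in PrimitiveConverter.
      uint32_t bit_first = physical_address_start >> 23;
      uint32_t bit_last = (physical_address_start + length - 1) >> 23;
      uint64_t bits = ~((uint64_t(1) << bit_first) - 1);
      if (bit_last < 63) {
        bits &= (uint64_t(1) << (bit_last + 1)) - 1;
      }
      regions_invalidated_.fetch_or(bits, std::memory_order_relaxed);
    }
    return std::make_pair(uint32_t(0), UINT32_MAX);
  }

  static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
      void* context_ptr, uint32_t physical_address_start, uint32_t length,
      bool exact_range) {
    return reinterpret_cast<CoarseCache*>(context_ptr)
        ->MemoryInvalidationCallback(physical_address_start, length,
                                     exact_range);
  }

 private:
  std::atomic<uint64_t> regions_invalidated_{0};
};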
diff --git a/src/xenia/gpu/d3d12/shared_memory.cc b/src/xenia/gpu/d3d12/shared_memory.cc index 1b80cb271..a88537672 100644 --- a/src/xenia/gpu/d3d12/shared_memory.cc +++ b/src/xenia/gpu/d3d12/shared_memory.cc @@ -11,6 +11,7 @@ #include #include +#include #include #include "xenia/base/assert.h" @@ -49,11 +50,6 @@ SharedMemory::SharedMemory(D3D12CommandProcessor* command_processor, trace_writer_(trace_writer) { page_size_log2_ = xe::log2_ceil(uint32_t(xe::memory::page_size())); page_count_ = kBufferSize >> page_size_log2_; - uint32_t page_bitmap_length = page_count_ >> 6; - assert_true(page_bitmap_length != 0); - - // Two interleaved bit arrays. - valid_and_gpu_written_pages_.resize(page_bitmap_length << 1); } SharedMemory::~SharedMemory() { Shutdown(); } @@ -125,14 +121,16 @@ bool SharedMemory::Initialize() { uint32_t(BufferDescriptorIndex::kRawUAV)), buffer_, kBufferSize); - std::memset(valid_and_gpu_written_pages_.data(), 0, - valid_and_gpu_written_pages_.size() * sizeof(uint64_t)); + system_page_flags_.clear(); + system_page_flags_.resize((page_count_ + 63) / 64); - upload_buffer_pool_ = - std::make_unique(device, 4 * 1024 * 1024); + upload_buffer_pool_ = std::make_unique( + device, + xe::align(uint32_t(4 * 1024 * 1024), uint32_t(1) << page_size_log2_)); - physical_write_watch_handle_ = - memory_->RegisterPhysicalWriteWatch(MemoryWriteCallbackThunk, this); + memory_invalidation_callback_handle_ = + memory_->RegisterPhysicalMemoryInvalidationCallback( + MemoryInvalidationCallbackThunk, this); ResetTraceGPUWrittenBuffer(); @@ -144,9 +142,10 @@ void SharedMemory::Shutdown() { // TODO(Triang3l): Do something in case any watches are still registered. - if (physical_write_watch_handle_ != nullptr) { - memory_->UnregisterPhysicalWriteWatch(physical_write_watch_handle_); - physical_write_watch_handle_ = nullptr; + if (memory_invalidation_callback_handle_ != nullptr) { + memory_->UnregisterPhysicalMemoryInvalidationCallback( + memory_invalidation_callback_handle_); + memory_invalidation_callback_handle_ = nullptr; } upload_buffer_pool_.reset(); @@ -165,7 +164,7 @@ void SharedMemory::Shutdown() { } } -void SharedMemory::BeginSubmission() { +void SharedMemory::CompletedSubmissionUpdated() { upload_buffer_pool_->Reclaim(command_processor_->GetCompletedSubmission()); } @@ -273,7 +272,7 @@ void SharedMemory::UnwatchMemoryRange(WatchHandle handle) { UnlinkWatchRange(reinterpret_cast(handle)); } -bool SharedMemory::MakeTilesResident(uint32_t start, uint32_t length) { +bool SharedMemory::EnsureTilesResident(uint32_t start, uint32_t length) { if (length == 0) { // Some texture is empty, for example - safe to draw in this case. return true; @@ -347,7 +346,7 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { #endif // FINE_GRAINED_DRAW_SCOPES // Ensure all tile heaps are present. 
- if (!MakeTilesResident(start, length)) { + if (!EnsureTilesResident(start, length)) { return false; } @@ -375,7 +374,8 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length) { return false; } uint32_t upload_buffer_pages = upload_buffer_size >> page_size_log2_; - MakeRangeValid(upload_range_start, upload_buffer_pages, false); + MakeRangeValid(upload_range_start << page_size_log2_, + upload_buffer_pages << page_size_log2_, false); std::memcpy( upload_buffer_mapping, memory_->TranslatePhysical(upload_range_start << page_size_log2_), @@ -439,7 +439,7 @@ void SharedMemory::RangeWrittenByGPU(uint32_t start, uint32_t length) { // Mark the range as valid (so pages are not reuploaded until modified by the // CPU) and watch it so the CPU can reuse it and this will be caught. - MakeRangeValid(page_first, page_last - page_first + 1, true); + MakeRangeValid(start, length, true); } bool SharedMemory::AreTiledResourcesUsed() const { @@ -453,14 +453,15 @@ bool SharedMemory::AreTiledResourcesUsed() const { provider->GetGraphicsAnalysis() == nullptr; } -void SharedMemory::MakeRangeValid(uint32_t valid_page_first, - uint32_t valid_page_count, +void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu) { - if (valid_page_first >= page_count_ || valid_page_count == 0) { + if (length == 0 || start >= kBufferSize) { return; } - valid_page_count = std::min(valid_page_count, page_count_ - valid_page_first); - uint32_t valid_page_last = valid_page_first + valid_page_count - 1; + length = std::min(length, kBufferSize - start); + uint32_t last = start + length - 1; + uint32_t valid_page_first = start >> page_size_log2_; + uint32_t valid_page_last = last >> page_size_log2_; uint32_t valid_block_first = valid_page_first >> 6; uint32_t valid_block_last = valid_page_last >> 6; @@ -475,18 +476,21 @@ void SharedMemory::MakeRangeValid(uint32_t valid_page_first, if (i == valid_block_last && (valid_page_last & 63) != 63) { valid_bits &= (1ull << ((valid_page_last & 63) + 1)) - 1; } - valid_and_gpu_written_pages_[i << 1] |= valid_bits; + SystemPageFlagsBlock& block = system_page_flags_[i]; + block.valid |= valid_bits; if (written_by_gpu) { - valid_and_gpu_written_pages_[(i << 1) + 1] |= valid_bits; + block.valid_and_gpu_written |= valid_bits; } else { - valid_and_gpu_written_pages_[(i << 1) + 1] &= ~valid_bits; + block.valid_and_gpu_written &= ~valid_bits; } } } - if (physical_write_watch_handle_) { - memory_->WatchPhysicalMemoryWrite(valid_page_first << page_size_log2_, - valid_page_count << page_size_log2_); + if (memory_invalidation_callback_handle_) { + memory_->EnablePhysicalMemoryAccessCallbacks( + valid_page_first << page_size_log2_, + (valid_page_last - valid_page_first + 1) << page_size_log2_, true, + false); } } @@ -527,7 +531,7 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first, uint32_t range_start = UINT32_MAX; for (uint32_t i = request_block_first; i <= request_block_last; ++i) { - uint64_t block_valid = valid_and_gpu_written_pages_[i << 1]; + uint64_t block_valid = system_page_flags_[i].valid; // Consider pages in the block outside the requested range valid. 
if (i == request_block_first) { block_valid |= (1ull << (request_page_first & 63)) - 1; @@ -569,17 +573,23 @@ void SharedMemory::GetRangesToUpload(uint32_t request_page_first, } } -std::pair SharedMemory::MemoryWriteCallbackThunk( +std::pair SharedMemory::MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range) { return reinterpret_cast(context_ptr) - ->MemoryWriteCallback(physical_address_start, length, exact_range); + ->MemoryInvalidationCallback(physical_address_start, length, exact_range); } -std::pair SharedMemory::MemoryWriteCallback( +std::pair SharedMemory::MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range) { + if (length == 0 || physical_address_start >= kBufferSize) { + return std::make_pair(uint32_t(0), UINT32_MAX); + } + length = std::min(length, kBufferSize - physical_address_start); + uint32_t physical_address_last = physical_address_start + (length - 1); + uint32_t page_first = physical_address_start >> page_size_log2_; - uint32_t page_last = (physical_address_start + length - 1) >> page_size_log2_; + uint32_t page_last = physical_address_last >> page_size_log2_; assert_true(page_first < page_count_ && page_last < page_count_); uint32_t block_first = page_first >> 6; uint32_t block_last = page_last >> 6; @@ -596,14 +606,14 @@ std::pair SharedMemory::MemoryWriteCallback( // frame, but with 256 KB it's 0.7 ms. if (page_first & 63) { uint64_t gpu_written_start = - valid_and_gpu_written_pages_[(block_first << 1) + 1]; + system_page_flags_[block_first].valid_and_gpu_written; gpu_written_start &= (1ull << (page_first & 63)) - 1; page_first = (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start)); } if ((page_last & 63) != 63) { uint64_t gpu_written_end = - valid_and_gpu_written_pages_[(block_last << 1) + 1]; + system_page_flags_[block_last].valid_and_gpu_written; gpu_written_end &= ~((1ull << ((page_last & 63) + 1)) - 1); page_last = (page_last & ~uint32_t(63)) + (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1); @@ -618,8 +628,9 @@ std::pair SharedMemory::MemoryWriteCallback( if (i == block_last && (page_last & 63) != 63) { invalidate_bits &= (1ull << ((page_last & 63) + 1)) - 1; } - valid_and_gpu_written_pages_[i << 1] &= ~invalidate_bits; - valid_and_gpu_written_pages_[(i << 1) + 1] &= ~invalidate_bits; + SystemPageFlagsBlock& block = system_page_flags_[i]; + block.valid &= ~invalidate_bits; + block.valid_and_gpu_written &= ~invalidate_bits; } FireWatches(page_first, page_last, false); @@ -664,10 +675,11 @@ bool SharedMemory::InitializeTraceSubmitDownloads() { auto global_lock = global_critical_region_.Acquire(); uint32_t fire_watches_range_start = UINT32_MAX; uint32_t gpu_written_range_start = UINT32_MAX; - for (uint32_t i = 0; i * 2 < valid_and_gpu_written_pages_.size(); ++i) { - uint64_t previously_valid_block = valid_and_gpu_written_pages_[i * 2]; - uint64_t gpu_written_block = valid_and_gpu_written_pages_[i * 2 + 1]; - valid_and_gpu_written_pages_[i * 2] = gpu_written_block; + for (uint32_t i = 0; i < system_page_flags_.size(); ++i) { + SystemPageFlagsBlock& page_flags_block = system_page_flags_[i]; + uint64_t previously_valid_block = page_flags_block.valid; + uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written; + page_flags_block.valid = gpu_written_block; // Fire watches on the invalidated pages. 
uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block; @@ -748,8 +760,8 @@ bool SharedMemory::InitializeTraceSubmitDownloads() { &gpu_written_buffer_desc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&trace_gpu_written_buffer_)))) { XELOGE( - "Failed to create a %u KB GPU-written memory download buffer for frame " - "tracing", + "Shared memory: Failed to create a %u KB GPU-written memory download " + "buffer for frame tracing", gpu_written_page_count << page_size_log2_ >> 10); ResetTraceGPUWrittenBuffer(); return false; @@ -761,8 +773,8 @@ bool SharedMemory::InitializeTraceSubmitDownloads() { for (auto& gpu_written_submit_range : trace_gpu_written_ranges_) { // For cases like resolution scale, when the data may not be actually // written, just marked as valid. - if (!MakeTilesResident(gpu_written_submit_range.first, - gpu_written_submit_range.second)) { + if (!EnsureTilesResident(gpu_written_submit_range.first, + gpu_written_submit_range.second)) { gpu_written_submit_range.second = 0; continue; } diff --git a/src/xenia/gpu/d3d12/shared_memory.h b/src/xenia/gpu/d3d12/shared_memory.h index 9b24b01b1..af99fa15b 100644 --- a/src/xenia/gpu/d3d12/shared_memory.h +++ b/src/xenia/gpu/d3d12/shared_memory.h @@ -11,7 +11,6 @@ #define XENIA_GPU_D3D12_SHARED_MEMORY_H_ #include -#include #include #include @@ -44,7 +43,7 @@ class SharedMemory { return buffer_gpu_address_; } - void BeginSubmission(); + void CompletedSubmissionUpdated(); typedef void (*GlobalWatchCallback)(void* context, uint32_t address_first, uint32_t address_last, @@ -57,7 +56,7 @@ class SharedMemory { // example, if the game changes protection level of a memory range containing // the watched range. // - // The callback is called with the mutex locked. + // The callback is called within the global critical region. GlobalWatchHandle RegisterGlobalWatch(GlobalWatchCallback callback, void* callback_context); void UnregisterGlobalWatch(GlobalWatchHandle handle); @@ -84,15 +83,10 @@ class SharedMemory { void* callback_data, uint64_t callback_argument); // Unregisters previously registered watched memory range. void UnwatchMemoryRange(WatchHandle handle); - // Locks the mutex that gets locked when watch callbacks are invoked - must be - // done when checking variables that may be changed by a watch callback. - inline std::unique_lock LockWatchMutex() { - return global_critical_region_.Acquire(); - } // Ensures the buffer tiles backing the range are resident, but doesn't upload // anything. - bool MakeTilesResident(uint32_t start, uint32_t length); + bool EnsureTilesResident(uint32_t start, uint32_t length); // Checks if the range has been updated, uploads new data if needed and // ensures the buffer tiles backing the range are resident. May transition the @@ -105,7 +99,7 @@ class SharedMemory { // (to up to the first GPU-written page, as an access violation exception // count optimization) as modified by the CPU, also invalidating GPU-written // pages directly in the range. - std::pair MemoryWriteCallback( + std::pair MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range); // Marks the range as containing GPU-generated data (such as resolves), @@ -141,8 +135,7 @@ class SharedMemory { bool AreTiledResourcesUsed() const; // Mark the memory range as updated and protect it. 
- void MakeRangeValid(uint32_t valid_page_first, uint32_t valid_page_count, - bool written_by_gpu); + void MakeRangeValid(uint32_t start, uint32_t length, bool written_by_gpu); D3D12CommandProcessor* command_processor_; Memory* memory_; @@ -154,6 +147,7 @@ class SharedMemory { ID3D12Resource* buffer_ = nullptr; D3D12_GPU_VIRTUAL_ADDRESS buffer_gpu_address_ = 0; D3D12_RESOURCE_STATES buffer_state_ = D3D12_RESOURCE_STATE_COPY_DEST; + void TransitionBuffer(D3D12_RESOURCE_STATES new_state); // Heaps are 4 MB, so not too many of them are allocated, but also not to // waste too much memory for padding (with 16 MB there's too much). @@ -166,9 +160,11 @@ class SharedMemory { // Number of the heaps currently resident, for profiling. uint32_t heap_count_ = 0; - // Log2 of system page size. + // Log2 of invalidation granularity (the system page size, but the dependency + // on it is not hard - the access callback takes a range as an argument, and + // touched pages of the buffer of this size will be invalidated). uint32_t page_size_log2_; - // Total physical page count. + // Total buffer page count. uint32_t page_count_; // Non-shader-visible buffer descriptor heap for faster binding (via copying @@ -182,24 +178,46 @@ class SharedMemory { ID3D12DescriptorHeap* buffer_descriptor_heap_ = nullptr; D3D12_CPU_DESCRIPTOR_HANDLE buffer_descriptor_heap_start_; - // Handle of the physical memory write callback. - void* physical_write_watch_handle_ = nullptr; + // First page and length in pages. + typedef std::pair UploadRange; + // Ranges that need to be uploaded, generated by GetRangesToUpload (a + // persistently allocated vector). + std::vector upload_ranges_; + void GetRangesToUpload(uint32_t request_page_first, + uint32_t request_page_last); + std::unique_ptr upload_buffer_pool_ = nullptr; - // Mutex between the exception handler and the command processor, to be locked - // when checking or updating validity of pages/ranges. + // GPU-written memory downloading for traces. + // Start page, length in pages. + std::vector> trace_gpu_written_ranges_; + // Created temporarily, only for downloading. + ID3D12Resource* trace_gpu_written_buffer_ = nullptr; + void ResetTraceGPUWrittenBuffer(); + + void* memory_invalidation_callback_handle_ = nullptr; + void* memory_data_provider_handle_ = nullptr; + + // Mutex between the guest memory subsystem and the command processor, to be + // locked when checking or updating validity of pages/ranges and when firing + // watches. xe::global_critical_region global_critical_region_; // *************************************************************************** - // Things below should be protected by global_critical_region. + // Things below should be fully protected by global_critical_region. // *************************************************************************** - // Bit vector containing: - // - Even block indices - whether physical memory system pages are up to date. - // - Odd block indices - whether phyical memory system pages contain data - // written by the GPU not synchronized with the CPU (subset of valid pages). - std::vector valid_and_gpu_written_pages_; + struct SystemPageFlagsBlock { + // Whether each page is up to date in the GPU buffer. + uint64_t valid; + // Subset of valid pages - whether each page in the GPU buffer contains data + // that was written on the GPU, thus should not be invalidated spuriously. 
+ uint64_t valid_and_gpu_written; + }; + // Flags for each 64 system pages, interleaved as blocks, so bit scan can be + // used to quickly extract ranges. + std::vector system_page_flags_; - static std::pair MemoryWriteCallbackThunk( + static std::pair MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); @@ -259,30 +277,9 @@ class SharedMemory { // watches. void FireWatches(uint32_t page_first, uint32_t page_last, bool invalidated_by_gpu); - // Unlinks and frees the range and its nodes. Call this with the mutex locked. + // Unlinks and frees the range and its nodes. Call this in the global critical + // region. void UnlinkWatchRange(WatchRange* range); - - // *************************************************************************** - // Things above should be protected by global_critical_region. - // *************************************************************************** - - // First page and length in pages. - typedef std::pair UploadRange; - // Ranges that need to be uploaded, generated by GetRangesToUpload (a - // persistently allocated vector). - std::vector upload_ranges_; - void GetRangesToUpload(uint32_t request_page_first, - uint32_t request_page_last); - std::unique_ptr upload_buffer_pool_ = nullptr; - - void TransitionBuffer(D3D12_RESOURCE_STATES new_state); - - // GPU-written memory downloading for traces. - // Start page, length in pages. - std::vector> trace_gpu_written_ranges_; - // Created temporarily, only for downloading. - ID3D12Resource* trace_gpu_written_buffer_ = nullptr; - void ResetTraceGPUWrittenBuffer(); }; } // namespace d3d12 diff --git a/src/xenia/gpu/d3d12/texture_cache.cc b/src/xenia/gpu/d3d12/texture_cache.cc index d4496195a..1d1570dee 100644 --- a/src/xenia/gpu/d3d12/texture_cache.cc +++ b/src/xenia/gpu/d3d12/texture_cache.cc @@ -1702,7 +1702,7 @@ void TextureCache::MarkRangeAsResolved(uint32_t start_unscaled, uint32_t page_last = (start_unscaled + length_unscaled - 1) >> 12; uint32_t block_first = page_first >> 5; uint32_t block_last = page_last >> 5; - auto watch_lock = shared_memory_->LockWatchMutex(); + auto global_lock = global_critical_region_.Acquire(); for (uint32_t i = block_first; i <= block_last; ++i) { uint32_t add_bits = UINT32_MAX; if (i == block_first) { @@ -1812,8 +1812,8 @@ bool TextureCache::TileResolvedTexture( return false; } } else { - if (!shared_memory_->MakeTilesResident(texture_modified_start, - texture_modified_length)) { + if (!shared_memory_->EnsureTilesResident(texture_modified_start, + texture_modified_length)) { return false; } } @@ -2404,7 +2404,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { // See what we need to upload. bool base_in_sync, mips_in_sync; { - auto watch_lock = shared_memory_->LockWatchMutex(); + auto global_lock = global_critical_region_.Acquire(); base_in_sync = texture->base_in_sync; mips_in_sync = texture->mips_in_sync; } @@ -2672,7 +2672,7 @@ bool TextureCache::LoadTextureData(Texture* texture) { // regular texture or a vertex buffer, and thus the scaled resolve version is // not up to date anymore. 
{ - auto watch_lock = shared_memory_->LockWatchMutex(); + auto global_lock = global_critical_region_.Acquire(); texture->base_in_sync = true; texture->mips_in_sync = true; if (!base_in_sync) { @@ -2761,7 +2761,7 @@ bool TextureCache::IsRangeScaledResolved(uint32_t start_unscaled, uint32_t block_last = page_last >> 5; uint32_t l2_block_first = block_first >> 6; uint32_t l2_block_last = block_last >> 6; - auto watch_lock = shared_memory_->LockWatchMutex(); + auto global_lock = global_critical_region_.Acquire(); for (uint32_t i = l2_block_first; i <= l2_block_last; ++i) { uint64_t l2_block = scaled_resolve_pages_l2_[i]; if (i == l2_block_first) { diff --git a/src/xenia/gpu/d3d12/texture_cache.h b/src/xenia/gpu/d3d12/texture_cache.h index 2541d2af3..d93ee962f 100644 --- a/src/xenia/gpu/d3d12/texture_cache.h +++ b/src/xenia/gpu/d3d12/texture_cache.h @@ -11,9 +11,9 @@ #define XENIA_GPU_D3D12_TEXTURE_CACHE_H_ #include -#include #include +#include "xenia/base/mutex.h" #include "xenia/gpu/d3d12/d3d12_shader.h" #include "xenia/gpu/d3d12/shared_memory.h" #include "xenia/gpu/register_file.h" @@ -369,15 +369,14 @@ class TextureCache { static constexpr uint32_t kCachedSRVDescriptorSwizzleMissing = UINT32_MAX; uint32_t cached_srv_descriptor_swizzle; - // Watch handles for the memory ranges (protected by the shared memory watch - // mutex). + // These are to be accessed within the global critical region to synchronize + // with shared memory. + // Watch handles for the memory ranges. SharedMemory::WatchHandle base_watch_handle; SharedMemory::WatchHandle mip_watch_handle; - // Whether the recent base level data has been loaded from the memory - // (protected by the shared memory watch mutex). + // Whether the recent base level data has been loaded from the memory. bool base_in_sync; - // Whether the recent mip data has been loaded from the memory (protected by - // the shared memory watch mutex). + // Whether the recent mip data has been loaded from the memory. bool mips_in_sync; }; @@ -620,16 +619,16 @@ class TextureCache { kScaledResolveHeapSizeLog2] = {}; // Number of currently resident portions of the tiled buffer, for profiling. uint32_t scaled_resolve_heap_count_ = 0; + // Global watch for scaled resolve data invalidation. + SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr; + + xe::global_critical_region global_critical_region_; // Bit vector storing whether each 4 KB physical memory page contains scaled // resolve data. uint32_t rather than uint64_t because parts of it are sent to // shaders. - // PROTECTED BY THE SHARED MEMORY WATCH MUTEX! uint32_t* scaled_resolve_pages_ = nullptr; // Second level of the bit vector for faster rejection of non-scaled textures. - // PROTECTED BY THE SHARED MEMORY WATCH MUTEX! uint64_t scaled_resolve_pages_l2_[(512 << 20) >> (12 + 5 + 6)]; - // Global watch for scaled resolve data invalidation. 
- SharedMemory::GlobalWatchHandle scaled_resolve_global_watch_handle_ = nullptr; }; } // namespace d3d12 diff --git a/src/xenia/gpu/vulkan/texture_cache.cc b/src/xenia/gpu/vulkan/texture_cache.cc index e0fb9a662..92d8f9fc7 100644 --- a/src/xenia/gpu/vulkan/texture_cache.cc +++ b/src/xenia/gpu/vulkan/texture_cache.cc @@ -150,16 +150,18 @@ VkResult TextureCache::Initialize() { device_queue_ = device_->AcquireQueue(device_->queue_family_index()); - physical_write_watch_handle_ = - memory_->RegisterPhysicalWriteWatch(MemoryWriteCallbackThunk, this); + memory_invalidation_callback_handle_ = + memory_->RegisterPhysicalMemoryInvalidationCallback( + MemoryInvalidationCallbackThunk, this); return VK_SUCCESS; } void TextureCache::Shutdown() { - if (physical_write_watch_handle_ != nullptr) { - memory_->UnregisterPhysicalWriteWatch(physical_write_watch_handle_); - physical_write_watch_handle_ = nullptr; + if (memory_invalidation_callback_handle_ != nullptr) { + memory_->UnregisterPhysicalMemoryInvalidationCallback( + memory_invalidation_callback_handle_); + memory_invalidation_callback_handle_ = nullptr; } if (device_queue_) { @@ -411,7 +413,7 @@ void TextureCache::WatchTexture(Texture* texture) { texture->is_watched = true; } - memory_->WatchPhysicalMemoryWrite(address, size); + memory_->EnablePhysicalMemoryAccessCallbacks(address, size, true, false); } void TextureCache::TextureTouched(Texture* texture) { @@ -428,7 +430,7 @@ void TextureCache::TextureTouched(Texture* texture) { texture->pending_invalidation = true; } -std::pair TextureCache::MemoryWriteCallback( +std::pair TextureCache::MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range) { global_critical_region_.Acquire(); if (watched_textures_.empty()) { @@ -468,11 +470,11 @@ std::pair TextureCache::MemoryWriteCallback( return std::make_pair(previous_end, next_start - previous_end); } -std::pair TextureCache::MemoryWriteCallbackThunk( +std::pair TextureCache::MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range) { return reinterpret_cast(context_ptr) - ->MemoryWriteCallback(physical_address_start, length, exact_range); + ->MemoryInvalidationCallback(physical_address_start, length, exact_range); } TextureCache::Texture* TextureCache::DemandResolveTexture( diff --git a/src/xenia/gpu/vulkan/texture_cache.h b/src/xenia/gpu/vulkan/texture_cache.h index 015868209..370d0b925 100644 --- a/src/xenia/gpu/vulkan/texture_cache.h +++ b/src/xenia/gpu/vulkan/texture_cache.h @@ -147,9 +147,9 @@ class TextureCache { void WatchTexture(Texture* texture); void TextureTouched(Texture* texture); - std::pair MemoryWriteCallback( + std::pair MemoryInvalidationCallback( uint32_t physical_address_start, uint32_t length, bool exact_range); - static std::pair MemoryWriteCallbackThunk( + static std::pair MemoryInvalidationCallbackThunk( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); @@ -220,7 +220,7 @@ class TextureCache { std::unordered_map samplers_; std::list pending_delete_textures_; - void* physical_write_watch_handle_ = nullptr; + void* memory_invalidation_callback_handle_ = nullptr; xe::global_critical_region global_critical_region_; std::list watched_textures_; diff --git a/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc b/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc index 4352c7be9..5aba4004f 100644 --- a/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc +++ b/src/xenia/kernel/xboxkrnl/xboxkrnl_io.cc @@ -9,6 +9,7 @@ #include 
"xenia/base/logging.h" #include "xenia/base/memory.h" +#include "xenia/base/mutex.h" #include "xenia/cpu/processor.h" #include "xenia/kernel/kernel_state.h" #include "xenia/kernel/util/shim_utils.h" @@ -172,8 +173,9 @@ dword_result_t NtReadFile(dword_t file_handle, dword_t event_handle, // some games NtReadFile() directly into texture memory auto heap = kernel_memory()->LookupHeap(buffer.guest_address()); if (heap && heap->IsGuestPhysicalHeap()) { - kernel_memory()->TriggerWatches(buffer.guest_address(), buffer_length, - true, true); + kernel_memory()->TriggerPhysicalMemoryCallbacks( + xe::global_critical_region::AcquireDirect(), buffer.guest_address(), + buffer_length, true, true); } // Synchronous. diff --git a/src/xenia/memory.cc b/src/xenia/memory.cc index bd6d58f44..f7dec5366 100644 --- a/src/xenia/memory.cc +++ b/src/xenia/memory.cc @@ -11,7 +11,9 @@ #include #include +#include +#include "xenia/base/assert.h" #include "xenia/base/byte_stream.h" #include "xenia/base/clock.h" #include "xenia/base/cvar.h" @@ -96,8 +98,8 @@ Memory::~Memory() { // requests. mmio_handler_.reset(); - for (auto physical_write_watch : physical_write_watches_) { - delete physical_write_watch; + for (auto invalidation_callback : physical_memory_invalidation_callbacks_) { + delete invalidation_callback; } heaps_.v00000000.Dispose(); @@ -433,13 +435,12 @@ cpu::MMIORange* Memory::LookupVirtualMappedRange(uint32_t virtual_address) { return mmio_handler_->LookupRange(virtual_address); } -bool Memory::AccessViolationCallback(void* host_address, bool is_write) { - if (!is_write) { - // TODO(Triang3l): Handle GPU readback. - return false; - } - // Access via physical_membase_ is special, when need to bypass everything, - // so only watching virtual memory regions. +bool Memory::AccessViolationCallback( + std::unique_lock global_lock_locked_once, + void* host_address, bool is_write) { + // Access via physical_membase_ is special, when need to bypass everything + // (for instance, for a data provider to actually write the data) so only + // triggering callbacks on virtual memory regions. if (reinterpret_cast(host_address) < reinterpret_cast(virtual_membase_) || reinterpret_cast(host_address) >= @@ -448,65 +449,79 @@ bool Memory::AccessViolationCallback(void* host_address, bool is_write) { } uint32_t virtual_address = HostToGuestVirtual(host_address); BaseHeap* heap = LookupHeap(virtual_address); - if (heap->IsGuestPhysicalHeap()) { - // Will be rounded to physical page boundaries internally, so just pass 1 as - // the length - guranteed not to cross page boundaries also. - return static_cast(heap)->TriggerWatches(virtual_address, 1, - is_write, false); + if (!heap->IsGuestPhysicalHeap()) { + return false; } - return false; + // Access violation callbacks from the guest are triggered when the global + // critical region mutex is locked once. + // + // Will be rounded to physical page boundaries internally, so just pass 1 as + // the length - guranteed not to cross page boundaries also. 
+ auto physical_heap = static_cast(heap); + return physical_heap->TriggerCallbacks(std::move(global_lock_locked_once), + virtual_address, 1, is_write, false); } -bool Memory::AccessViolationCallbackThunk(void* context, void* host_address, - bool is_write) { +bool Memory::AccessViolationCallbackThunk( + std::unique_lock global_lock_locked_once, + void* context, void* host_address, bool is_write) { return reinterpret_cast(context)->AccessViolationCallback( - host_address, is_write); + std::move(global_lock_locked_once), host_address, is_write); } -bool Memory::TriggerWatches(uint32_t virtual_address, uint32_t length, - bool is_write, bool unwatch_exact_range, - bool unprotect) { +bool Memory::TriggerPhysicalMemoryCallbacks( + std::unique_lock global_lock_locked_once, + uint32_t virtual_address, uint32_t length, bool is_write, + bool unwatch_exact_range, bool unprotect) { BaseHeap* heap = LookupHeap(virtual_address); if (heap->IsGuestPhysicalHeap()) { - return static_cast(heap)->TriggerWatches( - virtual_address, length, is_write, unwatch_exact_range, unprotect); + auto physical_heap = static_cast(heap); + return physical_heap->TriggerCallbacks(std::move(global_lock_locked_once), + virtual_address, length, is_write, + unwatch_exact_range, unprotect); } return false; } -void* Memory::RegisterPhysicalWriteWatch(PhysicalWriteWatchCallback callback, - void* callback_context) { - PhysicalWriteWatchEntry* entry = new PhysicalWriteWatchEntry; - entry->callback = callback; - entry->callback_context = callback_context; - +void* Memory::RegisterPhysicalMemoryInvalidationCallback( + PhysicalMemoryInvalidationCallback callback, void* callback_context) { + auto entry = new std::pair( + callback, callback_context); auto lock = global_critical_region_.Acquire(); - physical_write_watches_.push_back(entry); - + physical_memory_invalidation_callbacks_.push_back(entry); return entry; } -void Memory::UnregisterPhysicalWriteWatch(void* watch_handle) { - auto entry = reinterpret_cast(watch_handle); +void Memory::UnregisterPhysicalMemoryInvalidationCallback( + void* callback_handle) { + auto entry = + reinterpret_cast*>( + callback_handle); { auto lock = global_critical_region_.Acquire(); - auto it = std::find(physical_write_watches_.begin(), - physical_write_watches_.end(), entry); - assert_false(it == physical_write_watches_.end()); - if (it != physical_write_watches_.end()) { - physical_write_watches_.erase(it); + auto it = std::find(physical_memory_invalidation_callbacks_.begin(), + physical_memory_invalidation_callbacks_.end(), entry); + assert_true(it != physical_memory_invalidation_callbacks_.end()); + if (it != physical_memory_invalidation_callbacks_.end()) { + physical_memory_invalidation_callbacks_.erase(it); } } delete entry; } -void Memory::WatchPhysicalMemoryWrite(uint32_t physical_address, - uint32_t length) { - // Watch independently in all three mappings. 
- heaps_.vA0000000.WatchPhysicalWrite(physical_address, length); - heaps_.vC0000000.WatchPhysicalWrite(physical_address, length); - heaps_.vE0000000.WatchPhysicalWrite(physical_address, length); +void Memory::EnablePhysicalMemoryAccessCallbacks( + uint32_t physical_address, uint32_t length, + bool enable_invalidation_notifications, bool enable_data_providers) { + heaps_.vA0000000.EnableAccessCallbacks(physical_address, length, + enable_invalidation_notifications, + enable_data_providers); + heaps_.vC0000000.EnableAccessCallbacks(physical_address, length, + enable_invalidation_notifications, + enable_data_providers); + heaps_.vE0000000.EnableAccessCallbacks(physical_address, length, + enable_invalidation_notifications, + enable_data_providers); } uint32_t Memory::SystemHeapAlloc(uint32_t size, uint32_t alignment, @@ -798,7 +813,8 @@ bool BaseHeap::Restore(ByteStream* stream) { void BaseHeap::Reset() { // TODO(DrChat): protect pages. std::memset(page_table_.data(), 0, sizeof(PageEntry) * page_table_.size()); - // TODO(Triang3l): Unwatch pages. + // TODO(Triang3l): Remove access callbacks from pages if this is a physical + // memory heap. } bool BaseHeap::Alloc(uint32_t size, uint32_t alignment, @@ -1313,9 +1329,7 @@ void PhysicalHeap::Initialize(Memory* memory, uint8_t* membase, system_page_count_ = (heap_size_ /* already - 1 */ + host_address_offset + system_page_size_) / system_page_size_; - system_pages_watched_write_.resize((system_page_count_ + 63) / 64); - std::memset(system_pages_watched_write_.data(), 0, - system_pages_watched_write_.size() * sizeof(uint64_t)); + system_page_flags_.resize((system_page_count_ + 63) / 64); } bool PhysicalHeap::Alloc(uint32_t size, uint32_t alignment, @@ -1357,7 +1371,7 @@ bool PhysicalHeap::Alloc(uint32_t size, uint32_t alignment, } if (protect & kMemoryProtectWrite) { - TriggerWatches(address, size, true, true, false); + TriggerCallbacks(std::move(global_lock), address, size, true, true, false); } *out_address = address; @@ -1398,7 +1412,7 @@ bool PhysicalHeap::AllocFixed(uint32_t base_address, uint32_t size, } if (protect & kMemoryProtectWrite) { - TriggerWatches(address, size, true, true, false); + TriggerCallbacks(std::move(global_lock), address, size, true, true, false); } return true; @@ -1443,7 +1457,7 @@ bool PhysicalHeap::AllocRange(uint32_t low_address, uint32_t high_address, } if (protect & kMemoryProtectWrite) { - TriggerWatches(address, size, true, true, false); + TriggerCallbacks(std::move(global_lock), address, size, true, true, false); } *out_address = address; @@ -1477,7 +1491,7 @@ bool PhysicalHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, // Only invalidate if making writable again, for simplicity - not when simply // marking some range as immutable, for instance. if (protect & kMemoryProtectWrite) { - TriggerWatches(address, size, true, true, false); + TriggerCallbacks(std::move(global_lock), address, size, true, true, false); } if (!parent_heap_->Protect(GetPhysicalAddress(address), size, protect, @@ -1489,8 +1503,15 @@ bool PhysicalHeap::Protect(uint32_t address, uint32_t size, uint32_t protect, return BaseHeap::Protect(address, size, protect); } -void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address, - uint32_t length) { +void PhysicalHeap::EnableAccessCallbacks(uint32_t physical_address, + uint32_t length, + bool enable_invalidation_notifications, + bool enable_data_providers) { + // TODO(Triang3l): Implement data providers. 
+ assert_false(enable_data_providers); + if (!enable_invalidation_notifications && !enable_data_providers) { + return; + } uint32_t physical_address_offset = GetPhysicalAddress(heap_base_); if (physical_address < physical_address_offset) { if (physical_address_offset - physical_address >= length) { @@ -1516,28 +1537,61 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address, system_page_last = std::min(system_page_last, system_page_count_ - 1); assert_true(system_page_first <= system_page_last); - auto global_lock = global_critical_region_.Acquire(); - - // Protect the pages and mark them as watched. Don't mark non-writable pages - // as watched, so true access violations can still occur there. + // Update callback flags for system pages and make their protection stricter + // if needed. + xe::memory::PageAccess protect_access = + enable_data_providers ? xe::memory::PageAccess::kNoAccess + : xe::memory::PageAccess::kReadOnly; uint8_t* protect_base = membase_ + heap_base_; uint32_t protect_system_page_first = UINT32_MAX; + auto global_lock = global_critical_region_.Acquire(); for (uint32_t i = system_page_first; i <= system_page_last; ++i) { - uint64_t page_bit = uint64_t(1) << (i & 63); - // Check if need to allow writing to this page. - bool add_page_to_watch = - (system_pages_watched_write_[i >> 6] & page_bit) == 0; - if (add_page_to_watch) { - uint32_t page_number = - xe::sat_sub(i * system_page_size_, host_address_offset()) / - page_size_; - if (ToPageAccess(page_table_[page_number].current_protect) != - xe::memory::PageAccess::kReadWrite) { - add_page_to_watch = false; + // Check if need to enable callbacks for the page and raise its protection. + // + // If enabling invalidation notifications: + // - Page writable and not watched for changes yet - protect and enable + // invalidation notifications. + // - Page seen as writable by the guest, but only needs data providers - + // just set the bits to enable invalidation notifications (already has + // even stricter protection than needed). + // - Page not writable as requested by the game - don't do anything (need + // real access violations here). + // If enabling data providers: + // - Page accessible (either read/write or read-only) and didn't need data + // providers initially - protect and enable data providers. + // - Otherwise - do nothing. + // + // It's safe not to await data provider completion here before protecting as + // this never makes protection lighter, so it can't interfere with page + // faults that await data providers. + // + // Enabling data providers doesn't need to be deferred - providers will be + // polled for the last time without releasing the lock. + SystemPageFlagsBlock& page_flags_block = system_page_flags_[i >> 6]; + uint64_t page_flags_bit = uint64_t(1) << (i & 63); + uint32_t guest_page_number = + xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_; + xe::memory::PageAccess current_page_access = + ToPageAccess(page_table_[guest_page_number].current_protect); + bool protect_system_page = false; + // Don't do anything with inaccessible pages - don't protect, don't enable + // callbacks - because real access violations are needed there. And don't + // enable invalidation notifications for read-only pages for the same + // reason. + if (current_page_access != xe::memory::PageAccess::kNoAccess) { + // TODO(Triang3l): Enable data providers. 
+ if (enable_invalidation_notifications) { + if (current_page_access != xe::memory::PageAccess::kReadOnly && + (page_flags_block.notify_on_invalidation & page_flags_bit) == 0) { + // TODO(Triang3l): Check if data providers are already enabled. + // If data providers are already enabled for the page, it has even + // stricter protection. + protect_system_page = true; + page_flags_block.notify_on_invalidation |= page_flags_bit; + } } } - if (add_page_to_watch) { - system_pages_watched_write_[i >> 6] |= page_bit; + if (protect_system_page) { if (protect_system_page_first == UINT32_MAX) { protect_system_page_first = i; } @@ -1546,7 +1600,7 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address, xe::memory::Protect( protect_base + protect_system_page_first * system_page_size_, (i - protect_system_page_first) * system_page_size_, - xe::memory::PageAccess::kReadOnly); + protect_access); protect_system_page_first = UINT32_MAX; } } @@ -1555,13 +1609,14 @@ void PhysicalHeap::WatchPhysicalWrite(uint32_t physical_address, xe::memory::Protect( protect_base + protect_system_page_first * system_page_size_, (system_page_last + 1 - protect_system_page_first) * system_page_size_, - xe::memory::PageAccess::kReadOnly); + protect_access); } } -bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, - bool is_write, bool unwatch_exact_range, - bool unprotect) { +bool PhysicalHeap::TriggerCallbacks( + std::unique_lock global_lock_locked_once, + uint32_t virtual_address, uint32_t length, bool is_write, + bool unwatch_exact_range, bool unprotect) { // TODO(Triang3l): Support read watches. assert_true(is_write); if (!is_write) { @@ -1594,12 +1649,10 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, uint32_t block_index_first = system_page_first >> 6; uint32_t block_index_last = system_page_last >> 6; - auto global_lock = global_critical_region_.Acquire(); - // Check if watching any page, whether need to call the callback at all. bool any_watched = false; for (uint32_t i = block_index_first; i <= block_index_last; ++i) { - uint64_t block = system_pages_watched_write_[i]; + uint64_t block = system_page_flags_[i].notify_on_invalidation; if (i == block_index_first) { block &= ~((uint64_t(1) << (system_page_first & 63)) - 1); } @@ -1633,11 +1686,12 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, heap_size_ + 1 - (physical_address_start - physical_address_offset)); uint32_t unwatch_first = 0; uint32_t unwatch_last = UINT32_MAX; - for (auto physical_write_watch : memory_->physical_write_watches_) { + for (auto invalidation_callback : + memory_->physical_memory_invalidation_callbacks_) { std::pair callback_unwatch_range = - physical_write_watch->callback(physical_write_watch->callback_context, - physical_address_start, physical_length, - unwatch_exact_range); + invalidation_callback->first(invalidation_callback->second, + physical_address_start, physical_length, + unwatch_exact_range); if (!unwatch_exact_range) { unwatch_first = std::max(unwatch_first, callback_unwatch_range.first); unwatch_last = std::min( @@ -1682,13 +1736,13 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, uint32_t unprotect_system_page_first = UINT32_MAX; for (uint32_t i = system_page_first; i <= system_page_last; ++i) { // Check if need to allow writing to this page. 
- bool unprotect_page = (system_pages_watched_write_[i >> 6] & + bool unprotect_page = (system_page_flags_[i >> 6].notify_on_invalidation & (uint64_t(1) << (i & 63))) != 0; if (unprotect_page) { - uint32_t page_number = + uint32_t guest_page_number = xe::sat_sub(i * system_page_size_, host_address_offset()) / page_size_; - if (ToPageAccess(page_table_[page_number].current_protect) != + if (ToPageAccess(page_table_[guest_page_number].current_protect) != xe::memory::PageAccess::kReadWrite) { unprotect_page = false; } @@ -1725,7 +1779,7 @@ bool PhysicalHeap::TriggerWatches(uint32_t virtual_address, uint32_t length, if (i == block_index_last && (system_page_last & 63) != 63) { mask |= ~((uint64_t(1) << ((system_page_last & 63) + 1)) - 1); } - system_pages_watched_write_[i] &= mask; + system_page_flags_[i].notify_on_invalidation &= mask; } return true; diff --git a/src/xenia/memory.h b/src/xenia/memory.h index 8250b1787..9d01af167 100644 --- a/src/xenia/memory.h +++ b/src/xenia/memory.h @@ -12,6 +12,7 @@ #include #include +#include #include #include #include @@ -238,10 +239,14 @@ class PhysicalHeap : public BaseHeap { bool Protect(uint32_t address, uint32_t size, uint32_t protect, uint32_t* old_protect = nullptr) override; - void WatchPhysicalWrite(uint32_t physical_address, uint32_t length); + void EnableAccessCallbacks(uint32_t physical_address, uint32_t length, + bool enable_invalidation_notifications, + bool enable_data_providers); // Returns true if any page in the range was watched. - bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + bool TriggerCallbacks( + std::unique_lock global_lock_locked_once, + uint32_t virtual_address, uint32_t length, bool is_write, + bool unwatch_exact_range, bool unprotect = true); bool IsGuestPhysicalHeap() const override { return true; } uint32_t GetPhysicalAddress(uint32_t address) const; @@ -251,8 +256,15 @@ class PhysicalHeap : public BaseHeap { uint32_t system_page_size_; uint32_t system_page_count_; - // Protected by global_critical_region. - std::vector system_pages_watched_write_; + + struct SystemPageFlagsBlock { + // Whether writing to each page should trigger invalidation + // callbacks. + uint64_t notify_on_invalidation; + }; + // Protected by global_critical_region. Flags for each 64 system pages, + // interleaved as blocks, so bit scan can be used to quickly extract ranges. + std::vector system_page_flags_; }; // Models the entire guest memory system on the console. @@ -347,64 +359,80 @@ class Memory { // Gets the defined MMIO range for the given virtual address, if any. cpu::MMIORange* LookupVirtualMappedRange(uint32_t virtual_address); + // Physical memory access callbacks, two types of them. + // + // This is simple per-system-page protection without reference counting or + // stored ranges. Whenever a watched page is accessed, all callbacks for it + // are triggered. Also the only way to remove callbacks is to trigger them + // somehow. Since there are no references from pages to individual callbacks, + // there's no way to disable only a specific callback for a page. Also + // callbacks may be triggered spuriously, and handlers should properly ignore + // pages they don't care about. + // + // Once callbacks are triggered for a page, the page is not watched anymore + // until requested again later.
It is, however, unwatched only in one guest + // view of physical memory (because different views may have different + // protection for the same memory) - but it's rare for the same memory to be + // used with different guest page sizes, and it's okay to fire a callback more + // than once. + // + // Only accessing the guest virtual memory views of physical memory triggers + // callbacks - data providers, for instance, must write to the host physical + // heap directly, otherwise their threads may infinitely await themselves. + // + // - Invalidation notifications: + // + // Protecting from writing. One-shot callbacks for invalidation of various + // kinds of physical memory caches (such as the GPU copy of the memory). + // + // May be triggered for a single page (in case of a write access violation or + // when need to synchronize data given by data providers) or for multiple + // pages (like when memory is allocated). + // + // Since the granularity of callbacks is a single page, an invalidation + // notification handler must invalidate all the data stored in the touched + // pages. + // + // Because large ranges (like whole framebuffers) may be written to and + // exceptions are expensive, it's better to unprotect multiple pages as a + // result of a write access violation, so the shortest common range returned + // by all the invalidation callbacks (clamped to a sane range and also not to + // touch pages with provider callbacks) is unprotected. + // + // - Data providers: + // + // TODO(Triang3l): Implement data providers - more complicated because they + // will need to be able to release the global lock. + // Returns start and length of the smallest physical memory region surrounding // the watched region that can be safely unwatched, if it doesn't matter, // return (0, UINT32_MAX). - typedef std::pair (*PhysicalWriteWatchCallback)( + typedef std::pair (*PhysicalMemoryInvalidationCallback)( void* context_ptr, uint32_t physical_address_start, uint32_t length, bool exact_range); + // Returns a handle for unregistering or for skipping one notification handler + // while triggering data providers. + void* RegisterPhysicalMemoryInvalidationCallback( + PhysicalMemoryInvalidationCallback callback, void* callback_context); + // Unregisters a physical memory invalidation callback previously added with + // RegisterPhysicalMemoryInvalidationCallback. + void UnregisterPhysicalMemoryInvalidationCallback(void* callback_handle); - // Physical memory write watching, allowing subsystems to invalidate cached - // data that depends on memory contents. - // - // Placing a watch simply marks the pages (of the system page size) as - // watched, individual watched ranges (or which specific subscribers are - // watching specific pages) are not stored. Because of this, callbacks may be - // triggered multiple times for a single range, and for any watched page every - // registered callbacks is triggered. This is a very simple one-shot method - // for use primarily for cache invalidation - there may be spurious firing, - // for example, if the game only makes the pages writable without actually - // writing anything (done for simplicity). - // - // A range of pages can be watched at any time, but pages are only unwatched - // when watches are triggered (since multiple subscribers can depend on the - // same memory, and one subscriber shouldn't interfere with another).
- // - // Callbacks can be triggered for one page (if the guest just stores words) or - // for multiple pages (for file reading, making pages writable). - // - // Only guest physical memory mappings are watched - the host-only mapping is - // not protected so it can be used to bypass the write protection (for file - // reads, for example - in this case, watches are triggered manually). - // - // Note that when a watch is triggered, the watched page is unprotected only - // in the heap where the address is located. Since different virtual memory - // mappings of physical memory can have different protection levels for the - // same pages, and watches must not be placed on read-only or totally - // inaccessible pages, there are significant difficulties with synchronizing - // all the three ranges, but it's generally not needed. - void* RegisterPhysicalWriteWatch(PhysicalWriteWatchCallback callback, - void* callback_context); - - // Unregisters a physical memory write watch previously added with - // RegisterPhysicalWriteWatch. - void UnregisterPhysicalWriteWatch(void* watch_handle); - - // Enables watching of the specified memory range, snapped to system page - // boundaries. When something is written to a watched range (or when the - // protection of it changes in a a way that it becomes writable), the - // registered watch callbacks are triggered for the page (or pages, for file - // reads and protection changes) where something has been written to. This - // protects physical memory only under virtual_membase_, so writing to - // physical_membase_ can be done to bypass the protection placed by the - // watches. - void WatchPhysicalMemoryWrite(uint32_t physical_address, uint32_t length); + // Enables physical memory access callbacks for the specified memory range, + // snapped to system page boundaries. + void EnablePhysicalMemoryAccessCallbacks( + uint32_t physical_address, uint32_t length, + bool enable_invalidation_notifications, bool enable_data_providers); // Forces triggering of watch callbacks for a virtual address range if pages // are watched there and unwatching them. Returns whether any page was - // watched. - bool TriggerWatches(uint32_t virtual_address, uint32_t length, bool is_write, - bool unwatch_exact_range, bool unprotect = true); + // watched. Must be called with global critical region locking depth of 1. + // TODO(Triang3l): Implement data providers - this is why locking depth of 1 + // will be required in the future. + bool TriggerPhysicalMemoryCallbacks( + std::unique_lock global_lock_locked_once, + uint32_t virtual_address, uint32_t length, bool is_write, + bool unwatch_exact_range, bool unprotect = true); // Allocates virtual memory from the 'system' heap. 
// System memory is kept separate from game memory but is still accessible @@ -443,9 +471,12 @@ class Memory { static uint32_t HostToGuestVirtualThunk(const void* context, const void* host_address); - bool AccessViolationCallback(void* host_address, bool is_write); - static bool AccessViolationCallbackThunk(void* context, void* host_address, - bool is_write); + bool AccessViolationCallback( + std::unique_lock global_lock_locked_once, + void* host_address, bool is_write); + static bool AccessViolationCallbackThunk( + std::unique_lock global_lock_locked_once, + void* context, void* host_address, bool is_write); std::wstring file_name_; uint32_t system_page_size_ = 0; @@ -487,12 +518,9 @@ class Memory { friend class BaseHeap; friend class PhysicalHeap; - struct PhysicalWriteWatchEntry { - PhysicalWriteWatchCallback callback; - void* callback_context; - }; xe::global_critical_region global_critical_region_; - std::vector physical_write_watches_; + std::vector*> + physical_memory_invalidation_callbacks_; }; } // namespace xe
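A minimal sketch of how a subsystem might hook into the invalidation-notification API declared in memory.h above. Only the callback contract, RegisterPhysicalMemoryInvalidationCallback, and EnablePhysicalMemoryAccessCallbacks come from this patch; CacheContext, InvalidatePages, and the surrounding glue are hypothetical placeholders.

// Sketch only - a hypothetical subscriber built on the callback contract
// described in memory.h; not code from this patch.
#include <cstdint>
#include <utility>

struct CacheContext {
  // Hypothetical subscriber state, e.g. a validity bitmap kept by a GPU cache.
};

void InvalidatePages(CacheContext* cache, uint32_t physical_address_start,
                     uint32_t length) {
  // Hypothetical: drop cached data overlapping the touched pages. Handlers
  // must tolerate spurious calls and pages they don't track.
}

// Matches the PhysicalMemoryInvalidationCallback shape: returns the start and
// length of the smallest surrounding region that is safe to unwatch, or
// (0, UINT32_MAX) if the handler doesn't care how much gets unprotected.
std::pair<uint32_t, uint32_t> OnPhysicalMemoryInvalidated(
    void* context_ptr, uint32_t physical_address_start, uint32_t length,
    bool exact_range) {
  auto* cache = static_cast<CacheContext*>(context_ptr);
  InvalidatePages(cache, physical_address_start, length);
  return std::make_pair(uint32_t(0), UINT32_MAX);
}

Registration and arming would then look roughly like calling RegisterPhysicalMemoryInvalidationCallback(OnPhysicalMemoryInvalidated, &cache) followed by EnablePhysicalMemoryAccessCallbacks(physical_address, length, true, false); data providers are still asserted off in PhysicalHeap::EnableAccessCallbacks, so only the invalidation-notification flag is meaningful for now.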
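The new system_page_flags_ layout stores one 64-bit mask per 64 system pages precisely so that ranges can be extracted with bit scans, as the comment on the member notes. A self-contained sketch of that extraction, using std::countr_zero and std::countr_one from C++20's <bit> in place of whatever scan helpers the codebase actually uses:

// Sketch only: walk the runs of set bits in one notify_on_invalidation block.
// first_page is the system page index corresponding to bit 0 of this block.
#include <bit>
#include <cstdint>
#include <cstdio>

void ForEachNotifiedRange(uint64_t notify_on_invalidation,
                          uint32_t first_page) {
  uint64_t bits = notify_on_invalidation;
  while (bits) {
    // Lowest page still flagged, then the length of the contiguous run.
    uint32_t range_start = uint32_t(std::countr_zero(bits));
    uint32_t run_length = uint32_t(std::countr_one(bits >> range_start));
    uint32_t range_end = range_start + run_length;  // one past the last page
    std::printf("flagged pages [%u, %u)\n", first_page + range_start,
                first_page + range_end);
    if (range_end >= 64) {
      break;  // run reaches the top bit; avoid an undefined 64-bit shift
    }
    bits &= ~((uint64_t(1) << range_end) - 1);  // clear the processed run
  }
}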
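The header promises that RegisterPhysicalMemoryInvalidationCallback returns an opaque handle and that Memory keeps a vector of pointers to (callback, context) pairs, but the corresponding memory.cc changes are not in this excerpt. One plausible shape for them, assuming the handle is simply the address of the heap-allocated pair, is:

// Sketch only - not the actual memory.cc implementation. Assumes the handle
// is the address of a heap-allocated (callback, context) pair stored in the
// vector; real code would also hold global_critical_region_ while mutating it.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using InvalidationCallback =
    std::pair<uint32_t, uint32_t> (*)(void* context_ptr,
                                      uint32_t physical_address_start,
                                      uint32_t length, bool exact_range);
using CallbackEntry = std::pair<InvalidationCallback, void*>;

std::vector<CallbackEntry*> invalidation_callbacks_;  // stand-in member

void* RegisterInvalidationCallback(InvalidationCallback callback,
                                   void* context) {
  auto* entry = new CallbackEntry(callback, context);
  invalidation_callbacks_.push_back(entry);
  return entry;  // the stable address doubles as the unregistration handle
}

void UnregisterInvalidationCallback(void* handle) {
  auto* entry = static_cast<CallbackEntry*>(handle);
  auto it = std::find(invalidation_callbacks_.begin(),
                      invalidation_callbacks_.end(), entry);
  if (it != invalidation_callbacks_.end()) {
    invalidation_callbacks_.erase(it);
  }
  delete entry;
}

Keeping the entries individually allocated, rather than storing pairs by value in the vector, is what lets a raw pointer serve as a handle that stays valid across later registrations and unregistrations.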