diff --git a/src/xenia/gpu/d3d12/d3d12_command_processor.cc b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
index 3d31e50de..61a5b88bf 100644
--- a/src/xenia/gpu/d3d12/d3d12_command_processor.cc
+++ b/src/xenia/gpu/d3d12/d3d12_command_processor.cc
@@ -2063,7 +2063,7 @@ D3D12CommandProcessor::WriteRegisterRangeFromMem_WithKnownBound(
 #define DO_A_RANGE(start_range, end_range, cb)                        \
   if constexpr (start_range >= register_lower_bound ||                \
-                end_range > register_lower_bound) {                       \
+                end_range > register_lower_bound) {                   \
     if (current_index < (end_range)) {                                \
       uint32_t ntowrite = get_end_before_qty(end_range);              \
       cb((start_range), (end_range), current_index, base, ntowrite);  \
diff --git a/src/xenia/gpu/shared_memory.cc b/src/xenia/gpu/shared_memory.cc
index b891b5f38..a94aa053b 100644
--- a/src/xenia/gpu/shared_memory.cc
+++ b/src/xenia/gpu/shared_memory.cc
@@ -26,9 +26,28 @@ SharedMemory::SharedMemory(Memory& memory) : memory_(memory) {
 SharedMemory::~SharedMemory() { ShutdownCommon(); }
 
 void SharedMemory::InitializeCommon() {
-  system_page_flags_.clear();
-  system_page_flags_.resize(((kBufferSize >> page_size_log2_) + 63) / 64);
+  size_t num_system_page_flags_entries =
+      ((kBufferSize >> page_size_log2_) + 63) / 64;
+  num_system_page_flags_ = static_cast<unsigned>(num_system_page_flags_entries);
+  // In total, on Windows, the page flags take up 2048 entries per field; with 3
+  // fields and 8 bytes per entry that's 49152 bytes. Having page alignment for
+  // them is probably beneficial, though we do waste 16384 bytes with this alloc.
+
+  uint64_t* system_page_flags_base = (uint64_t*)memory::AllocFixed(
+      nullptr, num_system_page_flags_ * 3 * sizeof(uint64_t),
+      memory::AllocationType::kReserveCommit, memory::PageAccess::kReadWrite);
+
+  system_page_flags_valid_ = system_page_flags_base;
+  system_page_flags_valid_and_gpu_resolved_ =
+      system_page_flags_base + (num_system_page_flags_);
+  system_page_flags_valid_and_gpu_written_ =
+      system_page_flags_base + (num_system_page_flags_ * 2);
+  memset(system_page_flags_valid_, 0, 8 * num_system_page_flags_entries);
+  memset(system_page_flags_valid_and_gpu_resolved_, 0,
+         8 * num_system_page_flags_entries);
+  memset(system_page_flags_valid_and_gpu_written_, 0,
+         8 * num_system_page_flags_entries);
   memory_invalidation_callback_handle_ =
       memory_.RegisterPhysicalMemoryInvalidationCallback(
           MemoryInvalidationCallbackThunk, this);
@@ -81,6 +100,12 @@ void SharedMemory::ShutdownCommon() {
   host_gpu_memory_sparse_allocated_.clear();
   host_gpu_memory_sparse_allocated_.shrink_to_fit();
   host_gpu_memory_sparse_granularity_log2_ = UINT32_MAX;
+  memory::DeallocFixed(system_page_flags_valid_, 0,
+                       memory::DeallocationType::kRelease);
+  system_page_flags_valid_ = nullptr;
+  system_page_flags_valid_and_gpu_resolved_ = nullptr;
+  system_page_flags_valid_and_gpu_written_ = nullptr;
+  num_system_page_flags_ = 0;
 }
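Note on the allocation above: the vector of three-field structs becomes three parallel bitmaps carved out of one block, so each flag field is contiguous and scanning one field no longer drags the other two through the cache. A minimal standalone sketch of the same sizing and slicing math, using the Windows numbers from the comment (kNumWords, PageFlags, AllocatePageFlags, and the calloc allocation are illustrative stand-ins, not the xenia memory helpers):

#include <cstdint>
#include <cstdlib>

// Sketch: three parallel page-flag bitmaps in one block, as in the patch.
// 512 MiB of shared memory / 4 KiB pages = 131072 pages; one uint64_t covers
// 64 pages, so each bitmap needs 2048 words (16 KiB), 49152 bytes for all
// three.
constexpr uint32_t kBufferSize = 512 << 20;
constexpr uint32_t kPageSizeLog2 = 12;
constexpr size_t kNumWords = ((kBufferSize >> kPageSizeLog2) + 63) / 64;

struct PageFlags {
  uint64_t* valid;
  uint64_t* valid_and_gpu_resolved;
  uint64_t* valid_and_gpu_written;
};

PageFlags AllocatePageFlags() {
  // One allocation, three equal slices; calloc zeroes them like the memsets.
  uint64_t* base =
      static_cast<uint64_t*>(std::calloc(kNumWords * 3, sizeof(uint64_t)));
  return PageFlags{base, base + kNumWords, base + kNumWords * 2};
}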
@@ -105,8 +130,9 @@ void SharedMemory::ClearCache() {
 
 void SharedMemory::SetSystemPageBlocksValidWithGpuDataWritten() {
   auto global_lock = global_critical_region_.Acquire();
-  for (SystemPageFlagsBlock& block : system_page_flags_) {
-    block.valid = block.valid_and_gpu_written;
+
+  for (unsigned i = 0; i < num_system_page_flags_; ++i) {
+    system_page_flags_valid_[i] = system_page_flags_valid_and_gpu_written_[i];
   }
 }
 
@@ -150,8 +176,8 @@ SharedMemory::WatchHandle SharedMemory::WatchMemoryRange(
       watch_page_first << page_size_log2_ >> kWatchBucketSizeLog2;
   uint32_t bucket_last =
       watch_page_last << page_size_log2_ >> kWatchBucketSizeLog2;
-  //chrispy: Not required the global lock is always held by the caller
-  //  auto global_lock = global_critical_region_.Acquire();
+  // chrispy: Not required, the global lock is always held by the caller.
+  // auto global_lock = global_critical_region_.Acquire();
 
   // Allocate the range.
   WatchRange* range = watch_range_first_free_;
@@ -308,17 +334,17 @@ void SharedMemory::MakeRangeValid(uint32_t start, uint32_t length,
     if (i == valid_block_last && (valid_page_last & 63) != 63) {
       valid_bits &= (uint64_t(1) << ((valid_page_last & 63) + 1)) - 1;
     }
-    SystemPageFlagsBlock& block = system_page_flags_[i];
-    block.valid |= valid_bits;
+    // SystemPageFlagsBlock& block = system_page_flags_[i];
+    system_page_flags_valid_[i] |= valid_bits;
     if (written_by_gpu) {
-      block.valid_and_gpu_written |= valid_bits;
+      system_page_flags_valid_and_gpu_written_[i] |= valid_bits;
     } else {
-      block.valid_and_gpu_written &= ~valid_bits;
+      system_page_flags_valid_and_gpu_written_[i] &= ~valid_bits;
     }
     if (written_by_gpu_resolve) {
-      block.valid_and_gpu_resolved |= valid_bits;
+      system_page_flags_valid_and_gpu_resolved_[i] |= valid_bits;
     } else {
-      block.valid_and_gpu_resolved &= ~valid_bits;
+      system_page_flags_valid_and_gpu_resolved_[i] &= ~valid_bits;
     }
   }
 }
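MakeRangeValid trims valid_bits at both partially covered 64-page blocks with the masks seen above; the same shapes recur in TryFindUploadRange further down. A worked example of the two mask expressions, assuming a range that covers pages 5..9 of a single block:

#include <cstdint>

// Worked example of the partial-block masks in MakeRangeValid, with
// valid_page_first & 63 == 5 and valid_page_last & 63 == 9.
uint64_t ExampleValidBits() {
  uint64_t valid_bits = ~uint64_t(0);
  // First block: clear bits below page 5; ~((1 << 5) - 1) keeps bits 5..63.
  valid_bits &= ~((uint64_t(1) << 5) - 1);
  // Last block: clear bits above page 9; (1 << (9 + 1)) - 1 keeps bits 0..9.
  valid_bits &= (uint64_t(1) << (9 + 1)) - 1;
  return valid_bits;  // 0x3E0: exactly pages 5..9 set.
}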
@@ -384,64 +410,15 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
   bool any_data_resolved = false;
   uint32_t block_first = page_first >> 6;
-  swcache::PrefetchL1(&system_page_flags_[block_first]);
+  // swcache::PrefetchL1(&system_page_flags_[block_first]);
   uint32_t block_last = page_last >> 6;
   uint32_t range_start = UINT32_MAX;
   {
     auto global_lock = global_critical_region_.Acquire();
-    for (uint32_t i = block_first; i <= block_last; ++i) {
-      const SystemPageFlagsBlock& block = system_page_flags_[i];
-      uint64_t block_valid = block.valid;
-      uint64_t block_resolved = block.valid_and_gpu_resolved;
-      // Consider pages in the block outside the requested range valid.
-      if (i == block_first) {
-        uint64_t block_before = (uint64_t(1) << (page_first & 63)) - 1;
-        block_valid |= block_before;
-        block_resolved &= ~block_before;
-      }
-      if (i == block_last && (page_last & 63) != 63) {
-        uint64_t block_inside = (uint64_t(1) << ((page_last & 63) + 1)) - 1;
-        block_valid |= ~block_inside;
-        block_resolved &= block_inside;
-      }
-      if (block_resolved) {
-        any_data_resolved = true;
-      }
-
-      while (true) {
-        uint32_t block_page;
-        if (range_start == UINT32_MAX) {
-          // Check if need to open a new range.
-          if (!xe::bit_scan_forward(~block_valid, &block_page)) {
-            break;
-          }
-          range_start = (i << 6) + block_page;
-        } else {
-          // Check if need to close the range.
-          // Ignore the valid pages before the beginning of the range.
-          uint64_t block_valid_from_start = block_valid;
-          if (i == (range_start >> 6)) {
-            block_valid_from_start &=
-                ~((uint64_t(1) << (range_start & 63)) - 1);
-          }
-          if (!xe::bit_scan_forward(block_valid_from_start, &block_page)) {
-            break;
-          }
-          if (current_upload_range + 1 >= MAX_UPLOAD_RANGES) {
-            xe::FatalError(
-                "Hit max upload ranges in shared_memory.cc, tell a dev to "
-                "raise the limit!");
-          }
-          uploads[current_upload_range++] =
-              std::make_pair(range_start, (i << 6) + block_page - range_start);
-          // In the next iteration within this block, consider this range valid
-          // since it has been queued for upload.
-          block_valid |= (uint64_t(1) << block_page) - 1;
-          range_start = UINT32_MAX;
-        }
-      }
-    }
+    TryFindUploadRange(block_first, block_last, page_first, page_last,
+                       any_data_resolved, range_start, current_upload_range,
+                       uploads);
   }
   if (range_start != UINT32_MAX) {
     uploads[current_upload_range++] =
@@ -457,6 +434,110 @@ bool SharedMemory::RequestRange(uint32_t start, uint32_t length,
   return UploadRanges(uploads, current_upload_range);
 }
 
+template <typename T>
+XE_FORCEINLINE XE_NOALIAS static T mod_shift_left(T value, uint32_t by) {
+#if XE_ARCH_AMD64 == 1
+  // The architecture has modular shifts, so the masking is free.
+  return value << by;
+#else
+  return value << (by % (sizeof(T) * CHAR_BIT));
+#endif
+}
+
+void SharedMemory::TryFindUploadRange(
+    const uint32_t& block_first, const uint32_t& block_last,
+    const uint32_t& page_first, const uint32_t& page_last,
+    bool& any_data_resolved, uint32_t& range_start,
+    unsigned int& current_upload_range,
+    std::pair<uint32_t, uint32_t>* uploads) {
+  for (uint32_t i = block_first; i <= block_last; ++i) {
+    // const SystemPageFlagsBlock& block = system_page_flags_[i];
+    uint64_t block_valid = system_page_flags_valid_[i];
+    // Once any resolved data has been seen, the resolved bits no longer
+    // matter, so skip reading them.
+    uint64_t block_resolved =
+        any_data_resolved ? 0 : system_page_flags_valid_and_gpu_resolved_[i];
+    // Consider pages in the block outside the requested range valid.
+    if (i == block_first) {
+      uint64_t block_before = mod_shift_left(uint64_t(1), page_first) - 1;
+      block_valid |= block_before;
+      block_resolved &= ~block_before;
+    }
+    if (i == block_last && (page_last & 63) != 63) {
+      uint64_t block_inside = mod_shift_left(uint64_t(1), page_last + 1) - 1;
+      block_valid |= ~block_inside;
+      block_resolved &= block_inside;
+    }
+    if (block_resolved) {
+      any_data_resolved = true;
+    }
+    TryGetNextUploadRange(range_start, block_valid, i, current_upload_range,
+                          uploads);
+  }
+}
+
+static bool UploadRange_DoBestScanForward(uint64_t v, uint32_t* out) {
+#if XE_ARCH_AMD64 == 1
+  if (!v) {
+    return false;
+  }
+  if (amd64::GetFeatureFlags() & amd64::kX64EmitBMI1) {
+    *out = static_cast<uint32_t>(_tzcnt_u64(v));
+  } else {
+    unsigned char bsfres = _BitScanForward64((unsigned long*)out, v);
+    XE_MSVC_ASSUME(bsfres == 1);
+  }
+  return true;
+#else
+  return xe::bit_scan_forward(v, out);
+#endif
+}
+
+void SharedMemory::TryGetNextUploadRange(
+    uint32_t& range_start, uint64_t& block_valid, const uint32_t& i,
+    unsigned int& current_upload_range,
+    std::pair<uint32_t, uint32_t>* uploads) {
+  while (true) {
+    uint32_t block_page = 0;
+    if (range_start == UINT32_MAX) {
+      // Check if need to open a new range.
+      if (!UploadRange_DoBestScanForward(~block_valid, &block_page)) {
+        break;
+      }
+      range_start = (i << 6) + block_page;
+    } else {
+      // Check if need to close the range.
+      // Ignore the valid pages before the beginning of the range.
+      uint64_t block_valid_from_start = block_valid;
+      if (i == (range_start >> 6)) {
+        block_valid_from_start &=
+            ~(mod_shift_left(uint64_t(1), range_start) - 1);
+      }
+      if (!UploadRange_DoBestScanForward(block_valid_from_start,
+                                         &block_page)) {
+        break;
+      }
+      if (current_upload_range + 1 < MAX_UPLOAD_RANGES) {
+        uploads[current_upload_range++] =
+            std::make_pair(range_start, (i << 6) + block_page - range_start);
+        // In the next iteration within this block, consider this range valid
+        // since it has been queued for upload.
+        block_valid |= (uint64_t(1) << block_page) - 1;
+        range_start = UINT32_MAX;
+      } else {
+        xe::FatalError(
+            "Hit max upload ranges in shared_memory.cc, tell a dev to "
+            "raise the limit!");
+      }
+    }
+  }
+}
+
 std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallbackThunk(
     void* context_ptr, uint32_t physical_address_start, uint32_t length,
     bool exact_range) {
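TryGetNextUploadRange opens a range at the first clear (invalid) bit of block_valid and closes it at the next set (valid) bit, marking everything below the closing bit valid so the scan can continue within the same word. A self-contained sketch of that run-extraction technique on a single 64-bit block, with C++20 std::countr_zero standing in for UploadRange_DoBestScanForward (PrintInvalidRuns is illustrative only):

#include <bit>
#include <cstdint>
#include <cstdio>

// Extract runs of 0 bits (invalid pages) from one 64-page block, mirroring
// the open/close logic of TryGetNextUploadRange.
void PrintInvalidRuns(uint64_t valid) {
  while (valid != ~uint64_t(0)) {
    uint32_t start = std::countr_zero(~valid);  // first invalid page opens
    // Ignore valid pages below the range start, then find the closing 1 bit.
    uint64_t valid_from_start = valid & ~((uint64_t(1) << start) - 1);
    uint32_t end = valid_from_start ? std::countr_zero(valid_from_start) : 64;
    std::printf("upload pages [%u, %u)\n", start, end);
    if (end == 64) {
      break;  // the run reaches the end of the block
    }
    // Treat the emitted range as valid so the next iteration scans past it.
    valid |= (uint64_t(1) << end) - 1;
  }
}

// PrintInvalidRuns(~uint64_t(0x38)) prints "upload pages [3, 6)".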
@@ -490,14 +571,14 @@ std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallback(
   // 0.7 ms.
   if (page_first & 63) {
     uint64_t gpu_written_start =
-        system_page_flags_[block_first].valid_and_gpu_written;
+        system_page_flags_valid_and_gpu_written_[block_first];
     gpu_written_start &= (uint64_t(1) << (page_first & 63)) - 1;
     page_first =
         (page_first & ~uint32_t(63)) + (64 - xe::lzcnt(gpu_written_start));
   }
   if ((page_last & 63) != 63) {
     uint64_t gpu_written_end =
-        system_page_flags_[block_last].valid_and_gpu_written;
+        system_page_flags_valid_and_gpu_written_[block_last];
     gpu_written_end &= ~((uint64_t(1) << ((page_last & 63) + 1)) - 1);
     page_last = (page_last & ~uint32_t(63)) +
                 (std::max(xe::tzcnt(gpu_written_end), uint8_t(1)) - 1);
@@ -512,10 +593,9 @@ std::pair<uint32_t, uint32_t> SharedMemory::MemoryInvalidationCallback(
     if (i == block_last && (page_last & 63) != 63) {
       invalidate_bits &= (uint64_t(1) << ((page_last & 63) + 1)) - 1;
     }
-    SystemPageFlagsBlock& block = system_page_flags_[i];
-    block.valid &= ~invalidate_bits;
-    block.valid_and_gpu_written &= ~invalidate_bits;
-    block.valid_and_gpu_resolved &= ~invalidate_bits;
+    system_page_flags_valid_[i] &= ~invalidate_bits;
+    system_page_flags_valid_and_gpu_resolved_[i] &= ~invalidate_bits;
+    system_page_flags_valid_and_gpu_written_[i] &= ~invalidate_bits;
   }
 
   FireWatches(page_first, page_last, false);
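The inexact-range path widens the invalidated span but must stop at GPU-written pages: 64 - lzcnt(bits) is the index one past the highest written page below page_first, and the tzcnt on the other side finds the first written page above page_last. A worked example of the page_first adjustment, with C++20 std::countl_zero standing in for xe::lzcnt (values chosen for illustration):

#include <bit>
#include <cstdint>

// Say page_first & 63 == 10 and, within this block, only page 4 below it is
// GPU-written: gpu_written_start has just bit 4 set after masking.
uint32_t ExampleTrimFirst() {
  uint64_t gpu_written_start = uint64_t(1) << 4;
  // countl_zero == 59, so 64 - 59 == 5: page_first snaps down to page 5 of
  // the block, one past the written page, instead of all the way to page 0.
  return 64 - std::countl_zero(gpu_written_start);
}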
@@ -536,11 +616,11 @@ void SharedMemory::PrepareForTraceDownload() {
   uint32_t fire_watches_range_start = UINT32_MAX;
   uint32_t gpu_written_range_start = UINT32_MAX;
   auto global_lock = global_critical_region_.Acquire();
-  for (uint32_t i = 0; i < system_page_flags_.size(); ++i) {
-    SystemPageFlagsBlock& page_flags_block = system_page_flags_[i];
-    uint64_t previously_valid_block = page_flags_block.valid;
-    uint64_t gpu_written_block = page_flags_block.valid_and_gpu_written;
-    page_flags_block.valid = gpu_written_block;
+  for (uint32_t i = 0; i < num_system_page_flags_; ++i) {
+    // SystemPageFlagsBlock& page_flags_block = system_page_flags_[i];
+    uint64_t previously_valid_block = system_page_flags_valid_[i];
+    uint64_t gpu_written_block = system_page_flags_valid_and_gpu_written_[i];
+    system_page_flags_valid_[i] = gpu_written_block;
 
     // Fire watches on the invalidated pages.
     uint64_t fire_watches_block = previously_valid_block & ~gpu_written_block;
@@ -627,45 +707,48 @@ void SharedMemory::ReleaseTraceDownloadRanges() {
 
 bool SharedMemory::EnsureHostGpuMemoryAllocated(uint32_t start,
                                                 uint32_t length) {
-  if (host_gpu_memory_sparse_granularity_log2_ == UINT32_MAX) {
-    return true;
-  }
-  if (!length) {
-    return true;
-  }
-  if (start > kBufferSize || (kBufferSize - start) < length) {
-    return false;
-  }
-  uint32_t page_first = start >> page_size_log2_;
-  uint32_t page_last = (start + length - 1) >> page_size_log2_;
-  uint32_t allocation_first =
-      page_first << page_size_log2_ >> host_gpu_memory_sparse_granularity_log2_;
-  uint32_t allocation_last =
-      page_last << page_size_log2_ >> host_gpu_memory_sparse_granularity_log2_;
-  while (true) {
-    std::pair<size_t, size_t> allocation_range = xe::bit_range::NextUnsetRange(
-        host_gpu_memory_sparse_allocated_.data(), allocation_first,
-        allocation_last - allocation_first + 1);
-    if (!allocation_range.second) {
-      break;
-    }
-    if (!AllocateSparseHostGpuMemoryRange(uint32_t(allocation_range.first),
-                                          uint32_t(allocation_range.second))) {
+  if (host_gpu_memory_sparse_granularity_log2_ != UINT32_MAX && length) {
+    if (start <= kBufferSize && (kBufferSize - start) >= length) {
+      uint32_t page_first = start >> page_size_log2_;
+      uint32_t page_last = (start + length - 1) >> page_size_log2_;
+      uint32_t allocation_first = page_first << page_size_log2_ >>
+                                  host_gpu_memory_sparse_granularity_log2_;
+      uint32_t allocation_last = page_last << page_size_log2_ >>
+                                 host_gpu_memory_sparse_granularity_log2_;
+      while (true) {
+        std::pair<size_t, size_t> allocation_range =
+            xe::bit_range::NextUnsetRange(
+                host_gpu_memory_sparse_allocated_.data(), allocation_first,
+                allocation_last - allocation_first + 1);
+        if (!allocation_range.second) {
+          break;
+        }
+        if (!AllocateSparseHostGpuMemoryRange(
+                uint32_t(allocation_range.first),
+                uint32_t(allocation_range.second))) {
+          return false;
+        }
+        xe::bit_range::SetRange(host_gpu_memory_sparse_allocated_.data(),
+                                allocation_range.first,
+                                allocation_range.second);
+        ++host_gpu_memory_sparse_allocations_;
+        COUNT_profile_set(
+            "gpu/shared_memory/host_gpu_memory_sparse_allocations",
+            host_gpu_memory_sparse_allocations_);
+        host_gpu_memory_sparse_used_bytes_ +=
+            uint32_t(allocation_range.second)
+            << host_gpu_memory_sparse_granularity_log2_;
+        COUNT_profile_set(
+            "gpu/shared_memory/host_gpu_memory_sparse_used_mb",
+            (host_gpu_memory_sparse_used_bytes_ + ((1 << 20) - 1)) >> 20);
+        allocation_first =
+            uint32_t(allocation_range.first + allocation_range.second);
+      }
+    } else {
       return false;
     }
-    xe::bit_range::SetRange(host_gpu_memory_sparse_allocated_.data(),
-                            allocation_range.first, allocation_range.second);
-    ++host_gpu_memory_sparse_allocations_;
-    COUNT_profile_set("gpu/shared_memory/host_gpu_memory_sparse_allocations",
-                      host_gpu_memory_sparse_allocations_);
-    host_gpu_memory_sparse_used_bytes_ +=
-        uint32_t(allocation_range.second)
-        << host_gpu_memory_sparse_granularity_log2_;
-    COUNT_profile_set(
-        "gpu/shared_memory/host_gpu_memory_sparse_used_mb",
-        (host_gpu_memory_sparse_used_bytes_ + ((1 << 20) - 1)) >> 20);
-    allocation_first =
-        uint32_t(allocation_range.first + allocation_range.second);
+  } else {
+    return true;
   }
   return true;
 }
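The sparse-allocation loop above relies only on the contract that NextUnsetRange returns a (first, count) run of unset bits, with count == 0 once everything in the window is allocated. An illustrative re-implementation of that contract as a naive per-bit scan (this is not the actual xe::bit_range::NextUnsetRange, just the behavior the loop assumes):

#include <cstddef>
#include <cstdint>
#include <utility>

// Return the first run of unset bits within [start, start + length), as a
// (first, count) pair; count == 0 means no unset bits remain in the window.
std::pair<size_t, size_t> NextUnsetRangeSketch(const uint64_t* bits,
                                               size_t start, size_t length) {
  size_t end = start + length;
  size_t first = end;
  for (size_t i = start; i < end; ++i) {
    bool set = (bits[i >> 6] >> (i & 63)) & 1;
    if (!set) {
      if (first == end) first = i;  // open the run at the first unset bit
    } else if (first != end) {
      return {first, i - first};  // a set bit closes the run
    }
  }
  return {first, end - first};  // run extends to the window end, or is empty
}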
diff --git a/src/xenia/gpu/shared_memory.h b/src/xenia/gpu/shared_memory.h
index e721fe8af..7100d4df1 100644
--- a/src/xenia/gpu/shared_memory.h
+++ b/src/xenia/gpu/shared_memory.h
@@ -74,6 +74,18 @@ class SharedMemory {
   bool RequestRange(uint32_t start, uint32_t length,
                     bool* any_data_resolved_out = nullptr);
 
+  void TryFindUploadRange(const uint32_t& block_first,
+                          const uint32_t& block_last,
+                          const uint32_t& page_first, const uint32_t& page_last,
+                          bool& any_data_resolved, uint32_t& range_start,
+                          unsigned int& current_upload_range,
+                          std::pair<uint32_t, uint32_t>* uploads);
+
+  void TryGetNextUploadRange(uint32_t& range_start, uint64_t& block_valid,
+                             const uint32_t& i,
+                             unsigned int& current_upload_range,
+                             std::pair<uint32_t, uint32_t>* uploads);
+
   // Marks the range and, if not exact_range, potentially its surroundings
   // (to up to the first GPU-written page, as an access violation exception
   // count optimization) as modified by the CPU, also invalidating GPU-written
@@ -196,10 +208,16 @@ class SharedMemory {
     // contains data written specifically by resolving from EDRAM.
     uint64_t valid_and_gpu_resolved;
   };
+
+  // chrispy: SystemPageFlagsBlock is now split into the 3 separate arrays
+  // below, one per field.
   // Flags for each 64 system pages, interleaved as blocks, so bit scan can be
   // used to quickly extract ranges.
-  std::vector<SystemPageFlagsBlock> system_page_flags_;
+  // std::vector<SystemPageFlagsBlock> system_page_flags_;
+  uint64_t *system_page_flags_valid_ = nullptr,
+           *system_page_flags_valid_and_gpu_written_ = nullptr,
+           *system_page_flags_valid_and_gpu_resolved_ = nullptr;
+  unsigned num_system_page_flags_ = 0;
 
   static std::pair<uint32_t, uint32_t> MemoryInvalidationCallbackThunk(
       void* context_ptr, uint32_t physical_address_start, uint32_t length,
       bool exact_range);
diff --git a/src/xenia/gpu/texture_cache.h b/src/xenia/gpu/texture_cache.h
index 717273275..075b80111 100644
--- a/src/xenia/gpu/texture_cache.h
+++ b/src/xenia/gpu/texture_cache.h
@@ -108,7 +108,7 @@ class TextureCache {
   // Generate a mask of all bits below the first index, and xor it with all
   // bits up to and including the last index; this produces a mask covering
   // only the bits between first and last, inclusive.
-  uint32_t res = ((1U << first_index) - 1) ^ ((1U << (last_index + 1)) - 1);
+  uint32_t res = ((1U << first_index) - 1) ^ static_cast<uint32_t>((1ULL << (last_index + 1)) - 1ULL);
   texture_bindings_in_sync_ &= ~res;
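The widening to 1ULL in the texture_cache.h line resolves the original "todo: check that this is right" trailing comment: with last_index == 31, 1U << 32 is undefined behavior in C++, while (1ULL << 32) - 1 evaluates to 0xFFFFFFFF as intended. A small demonstration of the mask construction (BitsBetween is a hypothetical helper for illustration, not TextureCache code):

#include <cstdint>

// Mask of all bits in [first_index, last_index], built the same way.
uint32_t BitsBetween(uint32_t first_index, uint32_t last_index) {
  uint32_t below_first = (1U << first_index) - 1;  // bits [0, first)
  uint32_t through_last =
      static_cast<uint32_t>((1ULL << (last_index + 1)) - 1ULL);  // bits [0, last]
  return below_first ^ through_last;  // XOR leaves exactly bits [first, last]
}

// BitsBetween(4, 7) == 0xF0; BitsBetween(0, 31) == 0xFFFFFFFF, which the
// old all-32-bit shift could not produce.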